I have the following dictionary:
# Sample input: each document key maps image keys to (page_mark, text) tuples,
# where the mark is "FP" (first page), "LP" (last page) or "Others".
ip_dict = {
    "doc_1": {
        "img_1": ("FP", "some long text"),
        "img_2": ("LP", "another long text"),
        "img_3": ("Others", "long text"),
        "img_4": ("Others", "some loong text"),
        "img_5": ("FP", "one more text"),
        "img_6": ("FP", "another one"),
        "img_7": ("LP", "ANOTHER ONE"),
        "img_8": ("Others", "some text"),
        "img_9": ("Others", "some moretext"),
        "img_10": ("FP", "more text"),
        "img_11": ("Others", "whatever"),
        "img_12": ("Others", "more whatever"),
        "img_13": ("LP", "SoMe TeXt"),
        "img_14": ("Others", "some moretext"),
        "img_15": ("FP", "whatever"),
        "img_16": ("Others", "whatever"),
        "img_17": ("LP", "whateverrr"),
    },
    "doc_2": {
        "img_1": ("FP", "text"),
        "img_2": ("FP", "more text"),
        "img_3": ("LP", "more more text"),
        "img_4": ("Others", "some more"),
        "img_5": ("Others", "text text"),
        "img_6": ("FP", "more more text"),
        "img_7": ("Others", "lot of text"),
        "img_8": ("LP", "still more text"),
    },
}
Here `FP` represents the first page and `LP` the last page. For all the docs I want to extract only the `FP` and `LP` entries. `Others` entries should be extracted only if they lie between an `FP` and the `LP` that follows it, since they represent the pages between `FP` and `LP`; if they lie outside an `FP`–`LP` range, they should be ignored. Also, an `FP` that is not followed by an `LP` should be treated as a single page and extracted on its own. So my output dictionary would look like:
# Expected output: for each document, a list of page groups. Each group is an
# FP..LP range (with the "Others" in between) or a lone FP.
op_dict = {
    "doc_1": [
        {
            "img_1": ("FP", "some long text"),
            "img_2": ("LP", "another long text"),
        },
        {
            "img_5": ("FP", "one more text"),
        },
        {
            "img_6": ("FP", "another one"),
            "img_7": ("LP", "ANOTHER ONE"),
        },
        {
            "img_10": ("FP", "more text"),
            "img_11": ("Others", "whatever"),
            "img_12": ("Others", "more whatever"),
            "img_13": ("LP", "SoMe TeXt"),
        },
        {
            "img_15": ("FP", "whatever"),
            "img_16": ("Others", "whatever"),
            "img_17": ("LP", "whateverrr"),
        },
    ],
    "doc_2": [
        {
            "img_1": ("FP", "text"),
        },
        {
            "img_2": ("FP", "more text"),
            "img_3": ("LP", "more more text"),
        },
        {
            "img_6": ("FP", "more more text"),
            "img_7": ("Others", "lot of text"),
            "img_8": ("LP", "still more text"),
        },
    ],
}
As you can see, all the `FP` and `LP` entries have been extracted, and those `Others` that lie between an `FP` and an `LP` have been extracted along with them and stored in the same dictionary. Those `FP` entries that are not followed by an `LP` have also been extracted.
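PS: an `LP` that is not preceded by an `FP` should also be extracted as a single page. For example, given the input below, the expected output is the `op_dict` that follows it: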
# Edge-case input: the document starts with an LP that has no preceding FP.
ip_dict = {
    "doc_1": {
        "img_1": ("LP", "some long text"),
        "img_2": ("Others", "another long text"),
        "img_3": ("Others", "long text"),
        "img_4": ("FP", "long text"),
        "img_5": ("Others", "long text"),
        "img_6": ("LP", "long text"),
    },
}
# Expected output for the edge-case input: the leading LP forms its own group,
# followed by the FP..LP range.
op_dict = {
    "doc_1": [
        {
            "img_1": ("LP", "some long text"),
        },
        {
            "img_4": ("FP", "long text"),
            "img_5": ("Others", "long text"),
            "img_6": ("LP", "long text"),
        },
    ],
}
Any help is appreciated!
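Here is a solution using extended sequential logic (a single pass over each document that tracks whether an `FP` group is currently open):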
def select_page_ranges(d: dict) -> dict:
    """Group each document's pages into FP..LP ranges.

    Args:
        d: Mapping of document key -> mapping of image key -> tuple whose
            first element is the page mark ('FP', 'LP' or anything else,
            e.g. 'Others'). Image order is the dict's iteration order.

    Returns:
        A dict mapping each document key to a list of groups (dicts of
        image key -> original tuple), in order of appearance:

        * an 'FP' up to the next 'LP', including every page in between;
        * an 'FP' that meets another 'FP' or the end of the document before
          any 'LP' is kept as a single-page group (pages collected after it
          are dropped);
        * an 'LP' with no open 'FP' is kept as a single-page group;
        * any other page outside an open FP..LP range is ignored.
    """
    def _fp_only(group: dict) -> dict:
        # An FP that never met its LP counts as a single page: discard the
        # pages collected after it and keep just the first (FP) entry.
        first_key = next(iter(group))
        return {first_key: group[first_key]}

    res = {}
    for doc_key, pages in d.items():
        groups = []
        open_group = None  # group started by an FP and not yet closed by an LP
        for img_key, page in pages.items():
            mark = page[0]
            if mark == 'FP':
                if open_group is not None:
                    groups.append(_fp_only(open_group))
                open_group = {img_key: page}
            elif mark == 'LP':
                if open_group is None:
                    # Orphan LP: nothing to attach it to, so it stands alone
                    # instead of leaking into the previous, closed group.
                    groups.append({img_key: page})
                else:
                    open_group[img_key] = page
                    groups.append(open_group)
                    open_group = None
            elif open_group is not None:
                open_group[img_key] = page
        if open_group is not None:
            # Document ended while a range was still open.
            groups.append(_fp_only(open_group))
        res[doc_key] = groups
    return res
# Print the page groups computed for ip_dict (the input dictionary, which is
# not defined in this snippet).
# NOTE(review): the sample output that follows contains keys such as img_61 and
# img_540 that are absent from the input shown earlier - presumably it was
# produced from an extended test input; confirm.
print(select_page_ranges(ip_dict))
{'doc_1': [{'img_1': ('FP', 'some long text'),
'img_2': ('LP', 'another long text')},
{'img_5': ('FP', 'one more text')},
{'img_6': ('FP', 'another one'), 'img_7': ('LP', 'ANOTHER ONE')},
{'img_61': ('FP', 'another one'), 'img_71': ('LP', 'ANOTHER ONE')},
{'img_62': ('FP', 'another one'), 'img_72': ('LP', 'ANOTHER ONE')},
{'img_54': ('FP', 'one more text')},
{'img_540': ('FP', 'one more text')},
{'img_541': ('FP', 'one more text')},
{'img_13': ('FP', 'more text'),
'img_14': ('Others', 'whatever'),
'img_140': ('Others', 'whatever'),
'img_141': ('Others', 'whatever'),
'img_142': ('Others', 'whatever'),
'img_15': ('Others', 'more whatever'),
'img_16': ('LP', 'SoMe TeXt')},
{'img_18': ('FP', 'whatever'),
'img_19': ('Others', 'whatever'),
'img_20': ('LP', 'whateverrr')}],
'doc_2': [{'img_1': ('FP', 'text')},
{'img_2': ('FP', 'more text'), 'img_3': ('LP', 'more more text')},
{'img_6': ('FP', 'more more text'),
'img_7': ('Others', 'lot of text'),
'img_8': ('LP', 'still more text')},
{'img_69': ('FP', 'more more text')}]}