I have a Python dictionary as given below:
# Sample input: document name -> {image name -> (page label, extracted text)}.
# Labels are "FP" (first page), "NP" (next page) and "Others" (neither).
# Insertion order of the inner dictionaries is the page order.
ip = {
    "doc1.pdf": {
        "img1.png": ("FP", "text1"),
        "img2.png": ("NP", "text2"),
        "img3.png": ("FP", "text3"),
    },
    "doc2.pdf": {
        "img1.png": ("FP", "text4"),
        "img2.png": ("NP", "text5"),
        "img3.png": ("NP", "text6"),
        "img4.png": ("NP", "text7"),
        "img5.png": ("Others", "text8"),
        "img6.png": ("FP", "text9"),
        "img7.png": ("NP", "text10"),
    },
    "doc3.pdf": {
        "img1.png": ("Others", "text8"),
        "img2.png": ("FP", "text9"),
        "img3.png": ("Others", "text10"),
        "img4.png": ("FP", "text11"),
    },
    "doc4.pdf": {
        "img1.png": ("FP", "text12"),
        "img2.png": ("Others", "text13"),
        "img3.png": ("Others", "text14"),
        "img4.png": ("Others", "text15"),
    },
    "doc5.pdf": {
        "img1.png": ("FP", "text16"),
        "img2.png": ("FP", "text17"),
        "img3.png": ("NP", "text18"),
        "img4.png": ("NP", "text19"),
    },
}
Here the keyword `FP` means FirstPage, `NP` is NextPage and `Others` is OtherPage (which is not a part of the `FP` or `NP`). So `FP` and `NP` are sequential and hence `FP` will appear before `NP`. Now I want to segregate the sequential `FP`'s and `NP`'s from other sequential `FP`'s and `NP`'s.
I want to process the dictionary based on these rules:
1. Remove every entry that has `Others` in its tuple.
2. Combine sequential `FP`'s and `NP`'s: if one or more `NP`'s appear after an `FP`, then the `FP` and those `NP`'s should be combined into one dictionary.
3. If an `FP` has no `NP` following it, or if an `FP` (1) is followed by another `FP` (2), then the (1) `FP` needs to be put in a separate dictionary.

Here is what the output would look like for the above input:
# Expected output: document name -> list of groups, where each group is a
# dictionary holding one "FP" entry followed by the "NP" entries that come
# after it. "Others" entries are dropped.
op = {
    "doc1.pdf": [
        {
            "img1.png": ("FP", "text1"),
            "img2.png": ("NP", "text2"),
        },
        {
            "img3.png": ("FP", "text3"),
        },
    ],
    "doc2.pdf": [
        {
            "img1.png": ("FP", "text4"),
            "img2.png": ("NP", "text5"),
            "img3.png": ("NP", "text6"),
            "img4.png": ("NP", "text7"),
        },
        {
            "img6.png": ("FP", "text9"),
            "img7.png": ("NP", "text10"),
        },
    ],
    "doc3.pdf": [
        {
            "img2.png": ("FP", "text9"),
        },
        {
            "img4.png": ("FP", "text11"),
        },
    ],
    "doc4.pdf": [
        {
            "img1.png": ("FP", "text12"),
        },
    ],
    "doc5.pdf": [
        {
            "img1.png": ("FP", "text16"),
        },
        {
            "img2.png": ("FP", "text17"),
            "img3.png": ("NP", "text18"),
            "img4.png": ("NP", "text19"),
        },
    ],
}
So far I have tried this but it is not working:
def remove_others(ip_dict):
    """Drop "Others" entries and split each document into FP-led groups.

    Args:
        ip_dict: Mapping of document name to a mapping of image name to a
            ``(label, text)`` tuple. The inner mapping's iteration order is
            used as the page order.

    Returns:
        A dictionary mapping each document name to a list of dictionaries.
        Each dictionary holds one group of consecutive entries: the entry
        that opened the group followed by every "NP" entry seen before the
        next group starts. Entries labelled "Others" are omitted and do not
        close the open group.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        temp_list = []
        current_group = []
        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            # An "NP" extends whatever group is open. Requiring the previous
            # entry to be "FP" would split "FP, NP, NP" after the first "NP".
            if label == "NP" and current_group:
                current_group.append((img, (label, text)))
                continue
            # Any other label (or an "NP" with no open group) closes the
            # open group, if there is one, and starts a new one.
            if current_group:
                temp_list.append(dict(current_group))
            current_group = [(img, (label, text))]
        # Flush the group that was still open when the document ended.
        if current_group:
            temp_list.append(dict(current_group))
        op_dict[doc] = temp_list
    return op_dict
Any help is appreciated!
Instead of checking the last label of `current_group`, start a new dictionary whenever you see an `FP` label, and add keys to it for other labels.
def remove_others(ip_dict):
    """Drop "Others" entries and split each document into FP-led groups.

    Args:
        ip_dict: Mapping of document name to a mapping of image name to a
            ``(label, text)`` tuple. The inner mapping's iteration order is
            used as the page order.

    Returns:
        A dictionary mapping each document name to a list of dictionaries.
        A new dictionary is started at every "FP" entry; every other
        non-"Others" entry is added to the most recently started dictionary
        of the same document. If a document has such an entry before any
        "FP", that entry starts a dictionary of its own.
    """
    op_dict = {}
    for doc, img_dict in ip_dict.items():
        current_group = []
        # Reset per document so a leading non-"FP" entry can never be written
        # into the last group of the previous document.
        current_item = None
        for img, (label, text) in img_dict.items():
            if label == "Others":
                continue
            if label == "FP" or current_item is None:
                # Start a new group and remember it, so later entries can be
                # added to the same dictionary object already in the list.
                current_item = {img: (label, text)}
                current_group.append(current_item)
            else:
                current_item[img] = (label, text)
        op_dict[doc] = current_group
    return op_dict