I have the following python script that I would like to modify in order to create a new nested for loop inside the for j in i["attributes"]
loop to retrieve dynamically for each customer_id
the list of his Policy_No
and related attributes (Company_ID
, Branch_ID
and Receipt_No
):
import pandas as pd
df = pd.DataFrame({
'type': ['customer','customer','customer','customer'],
'customer_id': ['1-0000001','1-0000001','1-0000002','1-0000002'],
'u_fn': ['TestUser1_FirstName','TestUser1_FirstName','TestUser2_FirstName','TestUser2_FirstName'],
'u_ln': ['TestUser1_LastName','TestUser1_LastName','TestUser2_LastName','TestUser2_LastName'],
'Customer_Type_ID': ['ΦΥΣΙΚΟ','ΦΥΣΙΚΟ','ΦΥΣΙΚΟ','ΦΥΣΙΚΟ'],
'u_em': ['customer1@otenet.gr','customer1@otenet.gr','customer2@gmail.com','customer2@gmail.com'],
'u_mb': ['6900000001','6900000001','6900000002','6900000002'],
'# of policies':[2,2,2,2],
'Company_ID': [2,2,2,2],
'Branch_ID': [5,5,5,5],
'Policy_No': ['000000001','000000001','000000002','000000002'],
'Receipt_No': ['420000001','420000002','430000001','430000002']
})
grouped = df.groupby('customer_id').apply(lambda x: x[['u_fn', 'u_ln', 'Customer_Type_ID', 'u_em','u_mb',
'Company_ID', 'Branch_ID', 'Policy_No',
'Receipt_No', '# of policies']].to_dict('records'), include_groups=False).reset_index()
grouped.columns = ['customer_id', 'attributes'] # Column names for grouped DataFrame
output = grouped.to_dict('records')
results = []
for i in output:
attributes = {}
for j in i["attributes"]:
attributes["u_fn"] = j["u_fn"]
attributes["u_ln"] = j["u_ln"]
attributes["Customer_Type_ID"] = j["Customer_Type_ID"]
attributes["u_em"] = j["u_em"]
attributes["u_mb"] = j["u_mb"]
attributes["# of policies"] = j["# of policies"]
attributes["Company_ID"] = j["Company_ID"]
attributes["Branch_ID"] = j["Branch_ID"]
attributes["Policy_No"] = j["Policy_No"]
attributes["Receipt_No"] = j["Receipt_No"]
contact = {
"Id": i["customer_id"],
"attributes": attributes
}
results.append(contact)
Running this script, the value of the 'results' list is the following:
[{
'Id': '1-0000001',
'attributes': {
'u_fn': 'TestUser1_FirstName',
'u_ln': 'TestUser1_LastName',
'Customer_Type_ID': 'ΦΥΣΙΚΟ',
'u_em': 'customer1@otenet.gr',
'u_mb': '6900000001',
'# of policies': 2,
'Company_ID': 2,
'Branch_ID': 5,
'Policy_No': '000000001',
'Receipt_No': '420000002'
}
}, {
'Id': '1-0000002',
'attributes': {
'u_fn': 'TestUser2_FirstName',
'u_ln': 'TestUser2_LastName',
'Customer_Type_ID': 'ΦΥΣΙΚΟ',
'u_em': 'customer2@gmail.com',
'u_mb': '6900000002',
'# of policies': 2,
'Company_ID': 2,
'Branch_ID': 5,
'Policy_No': '000000002',
'Receipt_No': '430000002'
}
}
]
As you can see, each Policy_No has only the latest Receipt_No value, because during the iteration the first values retrieved (420000001 and 430000001) are overwritten by the second ones (420000002 and 430000002).
The expected result, should be the following one, where a new list called 'policies details' should be created and it should contain, for each ID, all the Policy_No
with the different Receipt_No
.
[{
'Id': '1-0000001',
'attributes': {
'u_fn': 'TestUser1_FirstName',
'u_ln': 'TestUser1_LastName',
'Customer_Type_ID': 'ΦΥΣΙΚΟ',
'u_em': 'customer1@otenet.gr',
'u_mb': '6900000001',
'# of policies': 2,
'policies details': [{
'Company_ID': 2,
'Branch_ID': 5,
'Policy_No': '000000001',
'Receipt_No': '420000001'
}, {
'Company_ID': 2,
'Branch_ID': 5,
'Policy_No': '000000001',
'Receipt_No': '420000002'
}
]
}
}, {
'Id': '1-0000002',
'attributes': {
'u_fn': 'TestUser2_FirstName',
'u_ln': 'TestUser2_LastName',
'Customer_Type_ID': 'ΦΥΣΙΚΟ',
'u_em': 'customer2@gmail.com',
'u_mb': '6900000002',
'# of policies': 2,
'policies details': [{
'Company_ID': 2,
'Branch_ID': 5,
'Policy_No': '000000002',
'Receipt_No': '430000001'
}, {
'Company_ID': 2,
'Branch_ID': 5,
'Policy_No': '000000002',
'Receipt_No': '430000002'
}
]
}
}
]
To achieve this result, starting from the initial data frame, I executed the following steps:
policies details
Policy_No
for k in i["Policy_No"]
df['policies details'] = df[['Company_ID', 'Branch_ID', 'Policy_No', 'Receipt_No']].to_dict(orient='records')
grouped = df.groupby('customer_id').apply(lambda x: x.groupby('Policy_No').apply(lambda y: y.drop(['customer_id', 'Policy_No'], axis=1).to_dict(orient='records'))).reset_index()
grouped.columns = ['customer_id', 'Policy_No', 'attributes'] # Column names for grouped DataFrame
output = grouped.to_dict('records')
results = []
for i in output:
attributes = {}
for j in i["attributes"]:
attributes["u_fn"] = j["u_fn"]
attributes["u_ln"] = j["u_ln"]
attributes["Customer_Type_ID"] = j["Customer_Type_ID"]
attributes["u_em"] = j["u_em"]
attributes["u_mb"] = j["u_mb"]
attributes["# of policies"] = j["# of policies"]
policies_details = []
for k in i["Policy_No"]:
attributes["policies details"] = j["policies details"]
contact = {
"Id": i["customer_id"],
"attributes": attributes
}
results.append(contact)
But again, the results value was not correct. I overwrote the first Receipt_No
values with the second ones. My main problem is that I don't know how to iterate correctly inside the existing for j in i["attributes"]
and how to append each value without overwrite it.
Any help will be highly appreciated.
Starting with your output
(renamed to contacts
) that looks like:
contacts = [
{
"customer_id": "1-0000001",
"attributes": [
{"u_fn": "TestUser1_FirstName", "u_ln": "TestUser1_LastName", "Customer_Type_ID": "ΦΥΣΙΚΟ", "u_em": "customer1@otenet.gr", "u_mb": "6900000001", "Company_ID": 2, "Branch_ID": 5, "Policy_No": "000000001", "Receipt_No": "420000001", "# of policies": 2},
{"u_fn": "TestUser1_FirstName", "u_ln": "TestUser1_LastName", "Customer_Type_ID": "ΦΥΣΙΚΟ", "u_em": "customer1@otenet.gr", "u_mb": "6900000001", "Company_ID": 2, "Branch_ID": 5, "Policy_No": "000000001", "Receipt_No": "420000002", "# of policies": 2}
]
},
{
"customer_id": "1-0000002",
"attributes": [
{"u_fn": "TestUser2_FirstName", "u_ln": "TestUser2_LastName", "Customer_Type_ID": "ΦΥΣΙΚΟ", "u_em": "customer2@gmail.com", "u_mb": "6900000002", "Company_ID": 2, "Branch_ID": 5, "Policy_No": "000000002", "Receipt_No": "430000001", "# of policies": 2},
{"u_fn": "TestUser2_FirstName", "u_ln": "TestUser2_LastName", "Customer_Type_ID": "ΦΥΣΙΚΟ", "u_em": "customer2@gmail.com", "u_mb": "6900000002", "Company_ID": 2, "Branch_ID": 5, "Policy_No": "000000002", "Receipt_No": "430000002", "# of policies": 2}
]
}
]
I would start by defining two lists of keys to clean things up a bit. Not required but I feel it is cleaner.
attribute_keys = ["u_fn", "u_ln", "Customer_Type_ID", "u_em", "u_mb", "# of policies"]
policy_details_keys = ["Company_ID", "Branch_ID", "Policy_No", "Receipt_No"]
For each record in contacts
and these two list of keys, we can construct a new record off of a base
record and a policy_details
record. We can then merge these two records together via the |
operator to give us the result to append to results
.
Technically results
could be constructed from a single comprehension, but I don't think doing so adds any clarity.
results = []
for contact in contacts:
root = {"customer_id": contact["customer_id"]}
attributes = {key: contact["attributes"][0][key] for key in attribute_keys}
policy_details = [
{key: attribute[key] for key in policy_details_keys}
for attribute in contact["attributes"]
]
attributes = attributes | {"policy_details": policy_details}
root = root | {"attributes": attributes}
results.append(root)
print(json.dumps(results, indent=4, ensure_ascii=False))
giving us:
[
{
"customer_id": "1-0000001",
"attributes": {
"u_fn": "TestUser1_FirstName",
"u_ln": "TestUser1_LastName",
"Customer_Type_ID": "ΦΥΣΙΚΟ",
"u_em": "customer1@otenet.gr",
"u_mb": "6900000001",
"# of policies": 2,
"policy_details": [
{
"Company_ID": 2,
"Branch_ID": 5,
"Policy_No": "000000001",
"Receipt_No": "420000001"
},
{
"Company_ID": 2,
"Branch_ID": 5,
"Policy_No": "000000001",
"Receipt_No": "420000002"
}
]
}
},
{
"customer_id": "1-0000002",
"attributes": {
"u_fn": "TestUser2_FirstName",
"u_ln": "TestUser2_LastName",
"Customer_Type_ID": "ΦΥΣΙΚΟ",
"u_em": "customer2@gmail.com",
"u_mb": "6900000002",
"# of policies": 2,
"policy_details": [
{
"Company_ID": 2,
"Branch_ID": 5,
"Policy_No": "000000002",
"Receipt_No": "430000001"
},
{
"Company_ID": 2,
"Branch_ID": 5,
"Policy_No": "000000002",
"Receipt_No": "430000002"
}
]
}
}
]
Pulling out all the stops to do it in one comprehension and throwing in the walrus operator for good measure, you might do:
results = [
{"customer_id": contact["customer_id"]} |
{"attributes":
{key: attributes[0][key] for key in attribute_keys} |
{"policy_details": [{key: attribute[key] for key in policy_details_keys} for attribute in attributes]}
} for contact in contacts if (attributes := contact["attributes"])
]