How to parse xls data to this struct, both row and column have merged cells, simply use df.index.to_series().ffill()
cannot handle.
{
"time": "time",
"category": "A",
"variety": "A1",
"specification": "S1",
"unit": "U1",
"average": 1.25,
"region": "RegionA",
"market": "MarketA",
"price": 1.1,
}
I figured out this solution:
def test_xls_parse():
file_path = 'test.xls'
df = pd.read_excel(file_path, engine='xlrd')
time_label = df.iloc[0, 0]
categories = df.iloc[1, 2:]
varieties = df.iloc[2, 2:]
specifications = df.iloc[3, 2:]
units = df.iloc[4, 2:]
averages = df.iloc[5, 2:]
regions = df.iloc[6:, 0].ffill()
markets = df.iloc[6:, 1]
prices = df.iloc[6:, 2:]
result = []
for i in range(len(categories)):
for j in range(len(regions)):
obj = {
"date": time_label,
"category": categories.iloc[i],
"variety": varieties.iloc[i],
"specification": specifications.iloc[i],
"unit": units.iloc[i],
"average": None if averages.iloc[i] == '-' else float(averages.iloc[i]),
"region": regions.iloc[j],
"market": markets.iloc[j],
"price": None if prices.iloc[j, i] == '-' else float(prices.iloc[j, i])
}
result.append(obj)
return pd.DataFrame(result)