pythonexcelpandasxls

How to parse xls data including merged cells using python pandas


How to parse xls data to this struct, both row and column have merged cells, simply use df.index.to_series().ffill() cannot handle.

{
  "time": "time",
  "category": "A",
  "variety": "A1",
  "specification": "S1",
  "unit": "U1",
  "average": 1.25,
  "region": "RegionA",
  "market": "MarketA",
  "price": 1.1,
}

enter image description here


Solution

  • I figured out this solution:

    def test_xls_parse():
        file_path = 'test.xls'
        df = pd.read_excel(file_path, engine='xlrd')
    
        time_label = df.iloc[0, 0]
    
    
        categories = df.iloc[1, 2:]
        varieties = df.iloc[2, 2:]
        specifications = df.iloc[3, 2:]
        units = df.iloc[4, 2:]
        averages = df.iloc[5, 2:]
    
    
        regions = df.iloc[6:, 0].ffill()
        markets = df.iloc[6:, 1]
        prices = df.iloc[6:, 2:]
    
        result = []
        for i in range(len(categories)):
            for j in range(len(regions)):
                obj = {
                    "date": time_label,
                    "category": categories.iloc[i],
                    "variety": varieties.iloc[i],
                    "specification": specifications.iloc[i],
                    "unit": units.iloc[i],
                    "average": None if averages.iloc[i] == '-' else float(averages.iloc[i]),
                    "region": regions.iloc[j],
                    "market": markets.iloc[j],
                    "price": None if prices.iloc[j, i] == '-' else float(prices.iloc[j, i])
                }
                result.append(obj)
    
        return pd.DataFrame(result)