pythonpandas

pandas pivot data, fill Mult index column horizontally


i have the following code:

import pandas as pd

data = {
    'name': ['Comp1', 'Comp1', 'Comp2', 'Comp2', 'Comp3'],
    'entity_type': ['type1', 'type1', 'type2', 'type2', 'type3'],
    'code': ['code1', 'code2', 'code3', 'code1', 'code2'],
    'date': ['2024-01-31', '2024-01-31', '2024-01-29', '2024-01-31', '2024-01-29'],
    'value': [10, 10, 100, 10, 200],
    'source': [None, None, 'Estimated', None, 'Reported']
}
df = pd.DataFrame(data)

pivot_df = df.pivot(index='date', columns=['name', 'entity_type', 'source', 'code'], values='value').rename_axis([('name', 'entity_type', 'source', 'date')])
df = pivot_df.reset_index()
df

This produces the following: enter image description here

I am having trouble with the following:

  1. I would like to remove the first column
  2. I would like to fill the first 3 rows horizontally. so, for example, the blank cells above 'code2' should be Comp1, type1, NaN
  3. would be nice to replace those nans in the column headers with empty string

Any help would be appreciated.

EDIT - working hack as this data i want to end up as an array that looks exactly in the dataframe to insert into a spreadsheet..this works. In this case however, there would be any meaningful 'columns'..

out = (df.pivot(index='date', columns=['name', 'entity_type', 'source', 'code'], values='value')
         .rename_axis([('name', 'entity_type', 'source', 'date')])
         .reset_index()
         .fillna('')
      )
out.columns.names = [None, None, None, None]
columns_df = pd.DataFrame(out.columns.tolist()).T
out = pd.concat([columns_df, pd.DataFrame(out.values)], ignore_index=True)
out

Solution

  • You could (1) reset_index with drop=True, (2) set the display.multi_sparse to False (with pandas.option_context) and (3) fillna with '':

    df = pd.DataFrame(data)
    
    out = (df.pivot(index='date', columns=['name', 'entity_type', 'source', 'code'],
                    values='value')
             .rename_axis([('name', 'entity_type', 'source', 'date')])
             .reset_index(drop=True)
             .fillna('')
          )
    
    with pd.option_context('display.multi_sparse', False):
        print(out)
    

    Output:

    name        Comp1 Comp1     Comp2 Comp2    Comp3
    entity_type type1 type1     type2 type2    type3
    source        NaN   NaN Estimated   NaN Reported
    code        code1 code2     code3 code1    code2
    0                           100.0          200.0
    1            10.0  10.0            10.0         
    

    Printing without the index:

    out = (df.pivot(index='date', columns=['name', 'entity_type', 'source', 'code'],
                    values='value')
             .fillna('')
          )
    
    with pd.option_context('display.multi_sparse', False):
        print(out.to_string(index=False))
    

    Output:

    Comp1 Comp1     Comp2 Comp2    Comp3
    type1 type1     type2 type2    type3
      NaN   NaN Estimated   NaN Reported
    code1 code2     code3 code1    code2
                    100.0          200.0
     10.0  10.0            10.0         
    

    Hiding the indices:

    out = (df.pivot(index='date', columns=['name', 'entity_type', 'source', 'code'],
                    values='value')
             .rename_axis(None)
             .rename(lambda x: '')
             .fillna('')
          )
    
    with pd.option_context('display.multi_sparse', False):
        print(out)
    

    Output:

    name        Comp1 Comp1     Comp2 Comp2    Comp3
    entity_type type1 type1     type2 type2    type3
    source        NaN   NaN Estimated   NaN Reported
    code        code1 code2     code3 code1    code2
                                100.0          200.0
                 10.0  10.0            10.0         
    

    Updated answer: header as rows

    out = (df.pivot(index='date', columns=['name', 'entity_type', 'source', 'code'],
                    values='value')
             .rename_axis(index=None, columns=('name', 'entity_type', 'source', 'date'))
             .fillna('')
             .T.reset_index().T
             .reset_index()
          )
    

    Output:

             index      0      1          2      3         4
    0         name  Comp1  Comp1      Comp2  Comp2     Comp3
    1  entity_type  type1  type1      type2  type2     type3
    2       source    NaN    NaN  Estimated    NaN  Reported
    3         date  code1  code2      code3  code1     code2
    4   2024-01-29                    100.0            200.0
    5   2024-01-31   10.0   10.0              10.0