pythonpandasnormalizeflat

flat_table getting ValueError: cannot reindex from a duplicate axis , My problem is different for this error


I have dataframe as below

  behaviour_attributes 
  0 {'className': 'behaviour', 'type': 'behaviour', 'verb': 'can_perform_stw_everything', 'bs': [{'bid': ObjectId('6050da979198a053c3a02484'), 'n': 'Can Perform Spin Wheel Everything', 'ao': datetime.datetime(2021, 4, 6, 0, 0, 0, 266000), 'bs': 'CountLimitException', 'tids': [ObjectId('605073cb9198a053c39d7a4d')], 'tags': [{'tid': ObjectId('605073cb9198a053c39d7a4d'), 'prsn': True}], 'prz': {'ch': False, 'pts': [{'pid': ObjectId('6050d99e9198a053c3a01bee'), 'pts': 0, 'eo': datetime.datetime(2021, 4, 8, 18, 0)}]}}]} 
  1 {'className': 'behaviour', 'type': 'behaviour', 'verb': 'game_escape_run', 'md': [{'n': 'total_score', 'v': '32'}, {'n': 'game_id', 'v': '3'}], 'bs': [{'bid': ObjectId('6050dba29198a053c3a02e4d'), 'n': 'Game Escape Run', 'ao': datetime.datetime(2021, 4, 5, 0, 0, 1, 230000), 'bs': 'OK', 'tids': [ObjectId('605073769198a053c39d77f1'), ObjectId('605071569198a053c39d6ab9')], 'tags': [{'tid': ObjectId('605071569198a053c39d6ab9'), 'prsn': True}, {'tid': ObjectId('605073769198a053c39d77f1'), 'prsn': True}], 'prz': {'ch': False, 'pts': [{'pid': ObjectId('6050d9689198a053c3a019f8'), 'pts': 1, 'eo': datetime.datetime(2021, 4, 5, 18, 0)}], 'at': {'tids': [ObjectId('605073769198a053c39d77f1'), ObjectId('605071569198a053c39d6ab9')], 'tags': [{'tid': ObjectId('605071569198a053c39d6ab9'), 'prsn': True}, {'tid': ObjectId('605073769198a053c39d77f1'), 'prsn': True}]}}}]}

import flat_table
if 'behaviour_attributes' in getDataByDate_df.columns:
    df = pd.DataFrame(getDataByDate_df['behaviour_attributes'])
    getDataByDate_dfA = flat_table.normalize(df)
    getDataByDate_df = pd.concat([getDataByDate_df, getDataByDate_dfA], axis=1)
    getDataByDate_df.drop('index', axis=1, inplace=True)
    getDataByDate_df.drop('behaviour_attributes', axis=1, inplace=True)
    del getDataByDate_dfA
    del df

I tried to drop index then use flat_table , but error remains same at getDataByDate_dfA = flat_table.normalize(df) line


Solution

  • I experience no bug on flat_table.normalize() after replacing the ObjectId() and datetime.datetime() objects by literal strings. Not sure whether this is a library bug or feature.

    Data

    I assume your data is stored in dict type, so I tried to restore your pasted data as dict by ast.literal_eval(). Since this method has trouble for objects, quoting them out is necessary.

    import pandas as pd
    import io
    import ast
    import re
    import flat_table
    
    df = pd.read_csv(io.StringIO("""
       behaviour_attributes 
    0  {'className': 'behaviour', 'type': 'behaviour', 'verb': 'can_perform_stw_everything', 'bs': [{'bid': ObjectId('6050da979198a053c3a02484'), 'n': 'Can Perform Spin Wheel Everything', 'ao': datetime.datetime(2021, 4, 6, 0, 0, 0, 266000), 'bs': 'CountLimitException', 'tids': [ObjectId('605073cb9198a053c39d7a4d')], 'tags': [{'tid': ObjectId('605073cb9198a053c39d7a4d'), 'prsn': True}], 'prz': {'ch': False, 'pts': [{'pid': ObjectId('6050d99e9198a053c3a01bee'), 'pts': 0, 'eo': datetime.datetime(2021, 4, 8, 18, 0)}]}}]} 
    1  {'className': 'behaviour', 'type': 'behaviour', 'verb': 'game_escape_run', 'md': [{'n': 'total_score', 'v': '32'}, {'n': 'game_id', 'v': '3'}], 'bs': [{'bid': ObjectId('6050dba29198a053c3a02e4d'), 'n': 'Game Escape Run', 'ao': datetime.datetime(2021, 4, 5, 0, 0, 1, 230000), 'bs': 'OK', 'tids': [ObjectId('605073769198a053c39d77f1'), ObjectId('605071569198a053c39d6ab9')], 'tags': [{'tid': ObjectId('605071569198a053c39d6ab9'), 'prsn': True}, {'tid': ObjectId('605073769198a053c39d77f1'), 'prsn': True}], 'prz': {'ch': False, 'pts': [{'pid': ObjectId('6050d9689198a053c3a019f8'), 'pts': 1, 'eo': datetime.datetime(2021, 4, 5, 18, 0)}], 'at': {'tids': [ObjectId('605073769198a053c39d77f1'), ObjectId('605071569198a053c39d6ab9')], 'tags': [{'tid': ObjectId('605071569198a053c39d6ab9'), 'prsn': True}, {'tid': ObjectId('605073769198a053c39d77f1'), 'prsn': True}]}}}]}
    """), sep=r"\s{2,}", engine='python')
    
    def restore_dict(s: str):
        """Restore dictionary by quoting out special objects."""
        s1 = re.sub(r"ObjectId\('([^)]*)'\)", r"'ObjectId(\1)'", s)
        s2 = re.sub(r"datetime\.datetime\(([^)]*)\)", r"'datetime.datetime(\1)'", s1)
        return ast.literal_eval(s2)
    df["behaviour_attributes"] = df["behaviour_attributes"].apply(restore_dict)
    

    Result

    df2 = flat_table.normalize(df)
    
    # remove long prefix in column names for printing
    df2.columns = [s.replace("behaviour_attributes.", "") for s in df2.columns]
    print(df2)
    
        index md.v         md.n  ...                        verb       type  className
    0       0  NaN          NaN  ...  can_perform_stw_everything  behaviour  behaviour
    1       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    2       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    3       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    4       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    5       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    6       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    7       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    8       1   32  total_score  ...             game_escape_run  behaviour  behaviour
    9       1    3      game_id  ...             game_escape_run  behaviour  behaviour
    10      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    11      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    12      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    13      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    14      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    15      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    16      1    3      game_id  ...             game_escape_run  behaviour  behaviour
    [17 rows x 20 columns]