pythondataframetilingpandas-groupby

Tiling in groupby on dataframe


I have a data frame that contains returns, size and sedols for a couple of dates.

My goal is to identify the top and bottom values for a certain condition per date, i.e I want the top decile largest size entries and the bottom decile smallest size entries for each date and flag them in a new column by 'xx' and 'yy'.

I am confused how to apply the tiling while grouping as well as creating a new column, here is what I already have.

import pandas as pd
import numpy as np
import datetime as dt

from random import choice
from string import ascii_uppercase

def create_dummy_data(start_date, days, entries_pday):
    date_sequence_lst = [dt.datetime.strptime(start_date,'%Y-%m-%d') + 
dt.timedelta(days=x) for x in range(0,days)]
    date_sequence_lst = date_sequence_lst * entries_pday                
    returns_lst = [round(np.random.uniform(low=-0.10,high=0.20),2) for _ in range(entries_pday*days)]

    size_lst = [round(np.random.uniform(low=10.00,high=10000.00),0) for _ in range(entries_pday*days)]

    rdm_sedol_lst = [(''.join(choice(ascii_uppercase) for i in range(7))) for x in range(entries_pday)] 
    rdm_sedol_lst = rdm_sedol_lst * days

    dates_returns_df = pd.DataFrame({'Date':date_sequence_lst , 'Sedols':rdm_sedol_lst, 'Returns':returns_lst,'Size':size_lst})
    dates_returns_df = dates_returns_df.sort_values('Date',ascending=True)
    dates_returns_df = dates_returns_df.reset_index(drop=True)
    return dates_returns_df


def order_df_by(df_in,column_name):
    df_out = df_in.sort_values(['Date',column_name],ascending=[True,False])
    return df_out


def get_ntile(df_in,ntile):
    df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x : pd.qcut(x,ntile))
    return df_in

if __name__ == "__main__":
    # create dummy returns
    data_df = create_dummy_data('2001-01-01',31,10)
    # sort by attribute
    data_sorted_df = order_df_by(data_df,'Size')
    #ntile data per date
    data_ntiled = get_ntile(data_sorted_df, 10)

    for key, item in data_ntiled:
        print(data_ntiled.get_group(key))

so far I would be expecting deciled results based on 'Size' for each date, the next step would be to filter only for decile 1 and decile 10 and flag the entries 'xx' and 'yy' respectively.

thanks


Solution

  • Consider using transform on the pandas.qcut method with labels 1 through ntile+1 for a decile column, then conditionally set flag with np.where using decile values:

    ...
    def get_ntile(df_in, ntile):
        df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x: pd.qcut(x, ntile, labels=list(range(1, ntile+1))))
        return df_in
    
    if __name__ == "__main__":
        # create dummy returns
        data_df = create_dummy_data('2001-01-01',31,10)
        # sort by attribute
        data_sorted_df = order_df_by(data_df,'Size')
        #ntile data per date
        data_ntiled = get_ntile(data_sorted_df, 10)
    
        data_ntiled['flag'] = np.where(data_ntiled['Tiled']==1.0, 'YY',
                                       np.where(data_ntiled['Tiled']==10.0, 'XX', np.nan))
    
        print(data_ntiled.reset_index(drop=True).head(15))
    
    #          Date  Returns   Sedols    Size   Tiled flag
    # 0  2001-01-01    -0.03  TEEADVJ  8942.0    10.0   XX
    # 1  2001-01-01    -0.03  PDBWGBJ  7142.0     9.0  nan
    # 2  2001-01-01     0.03  QNVVPIC  6995.0     8.0  nan
    # 3  2001-01-01     0.04  NTKEAKB  6871.0     7.0  nan
    # 4  2001-01-01     0.20  ZVVCLSJ  6541.0     6.0  nan
    # 5  2001-01-01     0.12  IJKXLIF  5131.0     5.0  nan
    # 6  2001-01-01     0.14  HVPDRIU  4490.0     4.0  nan
    # 7  2001-01-01    -0.08  XNOGFET  3397.0     3.0  nan
    # 8  2001-01-01    -0.06  JOARYWC  2582.0     2.0  nan
    # 9  2001-01-01     0.12  FVKBQGU   723.0     1.0   YY
    # 10 2001-01-02     0.03  ZVVCLSJ  9291.0    10.0   XX
    # 11 2001-01-02     0.14  HVPDRIU  8875.0     9.0  nan
    # 12 2001-01-02     0.08  PDBWGBJ  7496.0     8.0  nan
    # 13 2001-01-02     0.02  FVKBQGU  7307.0     7.0  nan
    # 14 2001-01-02    -0.01  QNVVPIC  7159.0     6.0  nan