I have a data frame that contains returns, size and sedols for a couple of dates.
My goal is to identify the top and bottom values for a certain condition per date, i.e I want the top decile largest size entries and the bottom decile smallest size entries for each date and flag them in a new column by 'xx' and 'yy'.
I am confused how to apply the tiling while grouping as well as creating a new column, here is what I already have.
import pandas as pd
import numpy as np
import datetime as dt
from random import choice
from string import ascii_uppercase
def create_dummy_data(start_date, days, entries_pday):
date_sequence_lst = [dt.datetime.strptime(start_date,'%Y-%m-%d') +
dt.timedelta(days=x) for x in range(0,days)]
date_sequence_lst = date_sequence_lst * entries_pday
returns_lst = [round(np.random.uniform(low=-0.10,high=0.20),2) for _ in range(entries_pday*days)]
size_lst = [round(np.random.uniform(low=10.00,high=10000.00),0) for _ in range(entries_pday*days)]
rdm_sedol_lst = [(''.join(choice(ascii_uppercase) for i in range(7))) for x in range(entries_pday)]
rdm_sedol_lst = rdm_sedol_lst * days
dates_returns_df = pd.DataFrame({'Date':date_sequence_lst , 'Sedols':rdm_sedol_lst, 'Returns':returns_lst,'Size':size_lst})
dates_returns_df = dates_returns_df.sort_values('Date',ascending=True)
dates_returns_df = dates_returns_df.reset_index(drop=True)
return dates_returns_df
def order_df_by(df_in,column_name):
df_out = df_in.sort_values(['Date',column_name],ascending=[True,False])
return df_out
def get_ntile(df_in,ntile):
df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x : pd.qcut(x,ntile))
return df_in
if __name__ == "__main__":
# create dummy returns
data_df = create_dummy_data('2001-01-01',31,10)
# sort by attribute
data_sorted_df = order_df_by(data_df,'Size')
#ntile data per date
data_ntiled = get_ntile(data_sorted_df, 10)
for key, item in data_ntiled:
print(data_ntiled.get_group(key))
so far I would be expecting deciled results based on 'Size' for each date, the next step would be to filter only for decile 1 and decile 10 and flag the entries 'xx' and 'yy' respectively.
thanks
Consider using transform
on the pandas.qcut method with labels 1 through ntile+1 for a decile column, then conditionally set flag with np.where
using decile values:
...
def get_ntile(df_in, ntile):
df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x: pd.qcut(x, ntile, labels=list(range(1, ntile+1))))
return df_in
if __name__ == "__main__":
# create dummy returns
data_df = create_dummy_data('2001-01-01',31,10)
# sort by attribute
data_sorted_df = order_df_by(data_df,'Size')
#ntile data per date
data_ntiled = get_ntile(data_sorted_df, 10)
data_ntiled['flag'] = np.where(data_ntiled['Tiled']==1.0, 'YY',
np.where(data_ntiled['Tiled']==10.0, 'XX', np.nan))
print(data_ntiled.reset_index(drop=True).head(15))
# Date Returns Sedols Size Tiled flag
# 0 2001-01-01 -0.03 TEEADVJ 8942.0 10.0 XX
# 1 2001-01-01 -0.03 PDBWGBJ 7142.0 9.0 nan
# 2 2001-01-01 0.03 QNVVPIC 6995.0 8.0 nan
# 3 2001-01-01 0.04 NTKEAKB 6871.0 7.0 nan
# 4 2001-01-01 0.20 ZVVCLSJ 6541.0 6.0 nan
# 5 2001-01-01 0.12 IJKXLIF 5131.0 5.0 nan
# 6 2001-01-01 0.14 HVPDRIU 4490.0 4.0 nan
# 7 2001-01-01 -0.08 XNOGFET 3397.0 3.0 nan
# 8 2001-01-01 -0.06 JOARYWC 2582.0 2.0 nan
# 9 2001-01-01 0.12 FVKBQGU 723.0 1.0 YY
# 10 2001-01-02 0.03 ZVVCLSJ 9291.0 10.0 XX
# 11 2001-01-02 0.14 HVPDRIU 8875.0 9.0 nan
# 12 2001-01-02 0.08 PDBWGBJ 7496.0 8.0 nan
# 13 2001-01-02 0.02 FVKBQGU 7307.0 7.0 nan
# 14 2001-01-02 -0.01 QNVVPIC 7159.0 6.0 nan