I have a large data frame (>1m rows, 10+ cols) that I need to do the following to:
A
& B
in example)C
in example)C
(E
in example)D
in example)I have got the following code working, which gives the correct results. However, it is very slow. Can anyone suggest some pandas magic to get improved performance?
import pandas as pd
import numpy as np
np.random.seed(0)
A = list()
B = list()
C = list()
D = list()
E = list()
np_alphabet = np.array(list('ABCEEFGHIJKLMNOPQRSTUVWXYZ'), dtype="|S1")
np_codes = np.random.choice(np_alphabet, [5, 10])
for a in np_codes:
for b in range(2):
for i in range(5):
A.append(''.join(a))
B.append('{}_{}'.format(b, A[-1]))
C.append(np.random.rand())
D.append(i)
E.append(B[-1])
df = pd.DataFrame({
'A': A,
'B': B,
'C': C,
'D': D,
'E': E
})
df.set_index(['A', 'B'], drop=False, inplace=True)
df.sort_index(inplace=True)
print(df)
grouped_sizes = df.groupby(level=[0, 1]).size()
num_indices = grouped_sizes.shape[0]
print_num = max(1, num_indices // 20)
for idx in grouped_sizes.index:
if grouped_sizes[idx] > 1:
tmp_df = df.loc[idx].sort_values('C', inplace=False)
tmp_df['E'] = map(lambda x: '{}_{}'.format(*x), zip(range(1, tmp_df.size + 1), tmp_df['E']))
df.loc[idx] = tmp_df
else:
df.loc[idx, 'E'] = '1_{}'.format(df.loc[idx, 'E'])
print(df)
which gives the following output
# before
A B C D E
A B
AXEJWVIZMZ 0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.954914 0 0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.758615 1 0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.952573 2 0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.903142 3 0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.154262 4 0_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.560586 0 1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.528869 1 1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.115331 2 1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.380718 3 1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.147092 4 1_AXEJWVIZMZ
BVBSPSACVA 0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.824997 0 0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.264456 1 0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.282663 2 0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.678287 3 0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.409996 4 0_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.149984 0 1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.711210 1 1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.840399 2 1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.804939 3 1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.290150 4 1_BVBSPSACVA
FHNMIRQRSP 0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.119058 0 0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.021955 1 0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.299527 2 0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.449371 3 0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.179845 4 0_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.075765 0 1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.413373 1 1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.835250 2 1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.371984 3 1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.265494 4 1_FHNMIRQRSP
TJSECSLWFT 0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.804553 0 0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.376646 1 0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.904908 2 0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.274501 3 0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.820866 4 0_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.886687 0 1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.198887 1 1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.857795 2 1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.326926 3 1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.116743 4 1_TJSECSLWFT
WXEKPQSLQK 0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.249891 0 0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.945414 1 0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.235062 2 0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.082703 3 0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.894169 4 0_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.595575 0 1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.769144 1 1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.917691 2 1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.567448 3 1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.832299 4 1_WXEKPQSLQK
and
# after
A B C D E
A B
AXEJWVIZMZ 0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.154262 4 1_0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.758615 1 2_0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.903142 3 3_0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.952573 2 4_0_AXEJWVIZMZ
0_AXEJWVIZMZ AXEJWVIZMZ 0_AXEJWVIZMZ 0.954914 0 5_0_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.115331 2 1_1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.147092 4 2_1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.380718 3 3_1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.528869 1 4_1_AXEJWVIZMZ
1_AXEJWVIZMZ AXEJWVIZMZ 1_AXEJWVIZMZ 0.560586 0 5_1_AXEJWVIZMZ
BVBSPSACVA 0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.264456 1 1_0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.282663 2 2_0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.409996 4 3_0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.678287 3 4_0_BVBSPSACVA
0_BVBSPSACVA BVBSPSACVA 0_BVBSPSACVA 0.824997 0 5_0_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.149984 0 1_1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.290150 4 2_1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.711210 1 3_1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.804939 3 4_1_BVBSPSACVA
1_BVBSPSACVA BVBSPSACVA 1_BVBSPSACVA 0.840399 2 5_1_BVBSPSACVA
FHNMIRQRSP 0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.021955 1 1_0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.119058 0 2_0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.179845 4 3_0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.299527 2 4_0_FHNMIRQRSP
0_FHNMIRQRSP FHNMIRQRSP 0_FHNMIRQRSP 0.449371 3 5_0_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.075765 0 1_1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.265494 4 2_1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.371984 3 3_1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.413373 1 4_1_FHNMIRQRSP
1_FHNMIRQRSP FHNMIRQRSP 1_FHNMIRQRSP 0.835250 2 5_1_FHNMIRQRSP
TJSECSLWFT 0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.274501 3 1_0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.376646 1 2_0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.804553 0 3_0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.820866 4 4_0_TJSECSLWFT
0_TJSECSLWFT TJSECSLWFT 0_TJSECSLWFT 0.904908 2 5_0_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.116743 4 1_1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.198887 1 2_1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.326926 3 3_1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.857795 2 4_1_TJSECSLWFT
1_TJSECSLWFT TJSECSLWFT 1_TJSECSLWFT 0.886687 0 5_1_TJSECSLWFT
WXEKPQSLQK 0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.082703 3 1_0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.235062 2 2_0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.249891 0 3_0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.894169 4 4_0_WXEKPQSLQK
0_WXEKPQSLQK WXEKPQSLQK 0_WXEKPQSLQK 0.945414 1 5_0_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.567448 3 1_1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.595575 0 2_1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.769144 1 3_1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.832299 4 4_1_WXEKPQSLQK
1_WXEKPQSLQK WXEKPQSLQK 1_WXEKPQSLQK 0.917691 2 5_1_WXEKPQSLQK
from __future__ import print_function, division
from timeit import Timer
import pandas as pd
import numpy as np
def create_df():
np.random.seed(0)
A = list()
B = list()
C = list()
D = list()
E = list()
np_alphabet = np.array(list('ABCEEFGHIJKLMNOPQRSTUVWXYZ'), dtype="|S1")
np_codes = np.random.choice(np_alphabet, [100, 10])
for a in np_codes:
for b in range(2):
for i in range(5):
A.append(''.join(a))
B.append('{}_{}'.format(b, A[-1]))
C.append(np.random.rand())
D.append(i)
E.append(B[-1])
df = pd.DataFrame({
'A': A,
'B': B,
'C': C,
'D': D,
'E': E
})
return df.copy()
def method1(df):
df.set_index(['A', 'B'], drop=False, inplace=True)
df.sort_index(inplace=True)
grouped_sizes = df.groupby(level=[0, 1]).size()
for idx in grouped_sizes.index:
if grouped_sizes[idx] > 1:
tmp_df = df.loc[idx].sort_values('C', inplace=False)
tmp_df['E'] = map(lambda x: '{}_{}'.format(*x), zip(range(1, tmp_df.size + 1), tmp_df['E']))
df.loc[idx] = tmp_df
else:
df.loc[idx, 'E'] = '1_{}'.format(df.loc[idx, 'E'])
return df
def method1a(df):
df.set_index(['A', 'B'], drop=False, inplace=True)
df.sort_values(['A', 'B', 'C'], inplace=True)
grouped_sizes = df.groupby(level=[0, 1]).size()
for idx in grouped_sizes.index:
if grouped_sizes[idx] > 1:
df.loc[idx, 'E'] = map(lambda x: '{}_{}'.format(*x), zip(range(1, grouped_sizes[idx] + 1), df.loc[idx, 'E']))
else:
df.loc[idx, 'E'] = '1_{}'.format(df.loc[idx, 'E'])
return df
def method2(df):
df.set_index(['A', 'B'], drop=False, inplace=True)
df['F'] = 0
df.sort_values(['A', 'B', 'C'], inplace=True)
grouped_sizes = df.groupby(level=[0, 1]).size()
for idx in grouped_sizes.index:
if grouped_sizes[idx] > 1:
df.loc[idx, 'F'] = range(1, grouped_sizes[idx] + 1)
else:
df.loc[idx, 'F'] = 1
df['E'] = df['F'].map(str) + '_' + df['E']
df.drop('F', axis=1, inplace=True)
return df
def method3(df):
df.set_index(['A', 'B'], drop=False, inplace=True)
df['F'] = 0
df.sort_values(['A', 'B', 'C'], inplace=True)
grouped_sizes = df.groupby(level=[0, 1]).size()
for idx in grouped_sizes.index:
if grouped_sizes[idx] > 1:
df.loc[idx, 'F'] = map(str, range(1, grouped_sizes[idx] + 1))
else:
df.loc[idx, 'F'] = '1'
df['E'] = df['F'] + '_' + df['E']
df.drop('F', axis=1, inplace=True)
return df
def method4(df):
prefixes = df.groupby(['A', 'B']).C.apply(pd.Series.argsort).add(1).astype(str)
df['E'] = prefixes + '_' + df.E
return df
def method5(df):
df.set_index(['A', 'B'], drop=False, inplace=True)
df.sort_values(['A', 'B', 'C'], inplace=True)
df['E'] = df.groupby(level=[0, 1]).cumcount().add(1).astype(str) + '_' + df['E']
return df
def assert_success(df):
row = df[(df['A'] == 'AEVFGIURPE') & (df['B'] == '0_AEVFGIURPE') & (df['D'] == 2)].iloc[0]
if not np.allclose(row['C'], 0.381397) or row['E'] != '3_0_AEVFGIURPE':
print('A: method{}() failed: {} != 0.871083 or {} != 5_1_XOYRFZNIJU'.format(func, row['C'], row['E']))
return
row = df[(df['A'] == 'XOYRFZNIJU') & (df['B'] == '1_XOYRFZNIJU') & (df['D'] == 1)].iloc[0]
if not np.allclose(row['C'], 0.871083) or row['E'] != '5_1_XOYRFZNIJU':
print('B: method{}() failed: {} != 0.871083 or {} != 5_1_XOYRFZNIJU'.format(func, row['C'], row['E']))
return
functions = list()
functions.append('1')
functions.append('1a')
functions.append('2')
functions.append('3')
functions.append('4')
functions.append('5')
for func in functions:
print('method{}'.format(func),
Timer(setup='from __main__ import create_df, assert_success, method{} as func'.format(func),
stmt='df = create_df(); df = func(df); assert_success(df)').repeat(number=10))
Which gives the following results:
method1 [6.581194877624512, 6.625822067260742, 6.722187042236328]
method1a [1.9003210067749023, 1.9387969970703125, 1.9142169952392578]
method2 [0.9547598361968994, 0.9532740116119385, 0.9760739803314209]
method3 [1.0121638774871826, 1.0000989437103271, 0.9709858894348145]
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
B: method4() failed: 0.871082572438 != 0.871083 or 1_1_XOYRFZNIJU != 5_1_XOYRFZNIJU
method4 [0.3202958106994629, 0.3348369598388672, 0.33800482749938965]
method5 [0.11518096923828125, 0.10490703582763672, 0.09626197814941406]
I think you need sort_values
first, then groupby
+ cumcount
for counter, then add 1
and convert to str
:
df.sort_values(['A', 'B', 'C'], inplace=True)
df['E'] = df.groupby(level=['A','B']).cumcount().add(1).astype(str) + '_' + df['E']
print(df)
A B C D E
A B
MPVAEEHJTV 0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.264556 3 1_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.414662 2 2_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.521848 1 3_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.774234 4 4_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.944669 0 5_0_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.018790 2 1_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.456150 0 2_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.568434 1 3_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.612096 4 4_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.617635 3 5_1_MPVAEEHJTV
RTTTOHABJZ 0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.096098 2 1_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.097101 0 2_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.468651 4 3_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.837945 1 4_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.976459 3 5_0_RTTTOHABJZ
1_RTTTOHABJZ RTTTOHABJZ 1_RTTTOHABJZ 0.039188 3 1_1_RTTTOHABJZ
...
...
EDIT:
It seems like bug if dont define levels
by columns only if column names are same as index names:
df['E'] = df.groupby(['A','B']).cumcount().add(1).astype(str) + '_' + df['E']
FutureWarning: 'A' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version df['E'] = df.groupby(column=['A','B']).cumcount().add(1).astype(str) + '_' + df['E']FutureWarning: 'B' is both a column name and an index level. Defaulting to column but this will raise an ambiguity error in a future version df['E'] = df.groupby(column=['A','B']).cumcount().add(1).astype(str) + '_' + df['E']
One possible solution is rename_axis
if necessary groupby by columns names (or rename columns which same names as index names):
df.sort_values(['A', 'B', 'C'], inplace=True)
df = df.rename_axis(('A_lev','B_lev'))
df['E'] = df.groupby(['A','B']).cumcount().add(1).astype(str) + '_' + df['E']
print(df)
A B C D E
A_lev B_lev
MPVAEEHJTV 0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.264556 3 1_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.414662 2 2_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.521848 1 3_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.774234 4 4_0_MPVAEEHJTV
0_MPVAEEHJTV MPVAEEHJTV 0_MPVAEEHJTV 0.944669 0 5_0_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.018790 2 1_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.456150 0 2_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.568434 1 3_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.612096 4 4_1_MPVAEEHJTV
1_MPVAEEHJTV MPVAEEHJTV 1_MPVAEEHJTV 0.617635 3 5_1_MPVAEEHJTV
RTTTOHABJZ 0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.096098 2 1_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.097101 0 2_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.468651 4 3_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.837945 1 4_0_RTTTOHABJZ
0_RTTTOHABJZ RTTTOHABJZ 0_RTTTOHABJZ 0.976459 3 5_0_RTTTOHABJZ
1_RTTTOHABJZ RTTTOHABJZ 1_RTTTOHABJZ 0.039188 3 1_1_RTTTOHABJZ
1_RTTTOHABJZ RTTTOHABJZ 1_RTTTOHABJZ 0.282807 4 2_1_RTTTOHABJZ
1_RTTTOHABJZ RTTTOHABJZ 1_RTTTOHABJZ 0.604846 1 3_1_RTTTOHABJZ
1_RTTTOHABJZ RTTTOHABJZ 1_RTTTOHABJZ 0.739264 2 4_1_RTTTOHABJZ
1_RTTTOHABJZ RTTTOHABJZ 1_RTTTOHABJZ 0.976761 0 5_1_RTTTOHABJZ
...
...