I have a Tukey's test table that resulted from `pairwise_tukeyhsd` in Python's `statsmodels.stats.multicomp` module.
group1 group2 meandiff lower upper reject
0 101 102 0.2917 -0.0425 0.6259 False
1 101 103 0.1571 -0.1649 0.4792 False
2 101 104 -0.1333 -0.4675 0.2009 False
3 101 105 0.0833 -0.2509 0.4175 False
4 101 106 -0.0500 -0.3626 0.2626 False
5 102 103 -0.1345 -0.4566 0.1875 False
6 102 104 -0.4250 -0.7592 -0.0908 True
7 102 105 -0.2083 -0.5425 0.1259 False
8 102 106 -0.3417 -0.6543 -0.0290 True
9 103 104 -0.2905 -0.6125 0.0316 False
10 103 105 -0.0738 -0.3959 0.2482 False
11 103 106 -0.2071 -0.5067 0.0924 False
12 104 105 0.2167 -0.1175 0.5509 False
13 104 106 0.0833 -0.2293 0.3960 False
14 105 106 -0.1333 -0.4460 0.1793 False
I have this table as a pandas DataFrame named `df`. I would like to label the groups (101-106) with letters that denote their statistical relationships. For this specific example the desired result would be the following (I don't mind whether the result is a DataFrame, a list, or a dictionary):
group label
101 ab
102 a
103 ab
104 b
105 ab
106 b
As you can see, any two groups that share at least one letter have equal means (reject column = False), and any two groups that share no letter have different means (reject column = True). For example, the mean of group 101 is equal to the means of all the other groups, because group 101 has the label ab and every other group has a, b, or ab. On the other hand, group 106 has only the letter b, which indicates that it is similar to all the groups except group 102, which has only the letter a.
I could not find an automatic Python solution for that. I saw that R has something for this called `multcompLetters`; is there something similar in Python?
So, after a couple of days sitting on it, and with no suggested answers or comments from other users, I think I figured it out. Let's say the table from my question is named `df`. The following script is specific to my needs, but I hope it can help others. I added comments to make it easier to understand.
# Purpose: assign letter labels to the groups of a Tukey HSD result table `df`
# (columns group1, group2, reject) and store them in `dictionary`
# (group name -> string of letters).
#
# NOTE(review): this copy of the script has lost its indentation; the nesting of
# the if/elif/else/for bodies below must be restored before it can run.
# NOTE(review): the script uses Python 2 syntax (`print x` statements and
# dict.iteritems()).
# NOTE(review): `string` (next lines) and `group_names` (near the end) are used
# but not defined in this snippet -- presumably `import string` and a list of
# all group names exist elsewhere; confirm.

# Keep only the rows whose `reject` column is True.
df_True = df.loc[df.reject==True,:]
# Pool of candidate label characters: 'a'..'z'.
letters = list(string.ascii_lowercase)
# Index into `letters` of the letter currently being handed out.
n = 0
group1_list = df_True.group1.tolist() #get the groups from the df with only True (True df) to a list
group2_list = df_True.group2.tolist()
group3 = group1_list+group2_list #concat both lists
group4 = list(set(group3)) #get unique items from the list
group5 = [str(i) for i in group4 ] #convert unicode to a str
group5.sort() #sort the list
# Every group that appears in a rejected pair starts with the placeholder 0,
# which the rest of the script treats as "no label assigned yet".
gen = ((i, 0) for i in group5) #create dict with 0 so the dict won't be empty when starts
dictionary = dict(gen)
# All unordered pairs (i < j) of the sorted group names.
group6 = [(group5[i],group5[j]) for i in range(len(group5)) for j in range(i+1, len(group5))] #get all combination pairs
for pairs in group6: #check for each combination if it is present in df_True
print n
print dictionary
# NOTE(review): pairs[0]/pairs[1] are str (see group5); this lookup assumes the
# group1/group2 columns hold values that compare equal to those strings -- confirm.
# NOTE(review): the bare `except:` only evaluates a comparison whose result is
# discarded; it does not assign `a`, so if the lookup raised on the first pair
# the `a.shape` test below would fail on an undefined name.
try:
a = df_True.loc[(df_True.group1==pairs[0])&(df_True.group2==pairs[1]),:] #check if the pair exists in the df
except:
a.shape[0] == 0
# --- Branch 1: the pair is NOT in df_True, so the two groups are treated as equal.
if a.shape[0] == 0: #it mean that the df is empty as it does not appear in df_True so this pair is equal
print 'equal'
# 1a: copy the first group's label onto the still-unlabelled second group.
if dictionary[pairs[0]] != 0 and dictionary[pairs[1]] == 0: #if the 1st is populated but the 2nd in not populated
print "1st is populated and 2nd is empty"
dictionary[pairs[1]] = dictionary[pairs[0]]
# 1b: both groups already carry labels.
elif dictionary[pairs[0]] != 0 and dictionary[pairs[1]] != 0: #if both are populated, check matching labeles
print "both are populated"
# Already sharing at least one character: nothing to change.
if len(list(set([c for c in dictionary[pairs[0]] if c in dictionary[pairs[1]]]))) >0: #check if they have a common label
print "they have a shared character"
else:
print "equal but have different labels"
#check if the 1st group label doesn't appear in anyother labels, if it is unique then the 2nd group can have the first group label
# As written, m and j are incremented for every OTHER group whose label has
# NO character in common with the 1st / 2nd group's label (the tests are == 0).
m = 0 #count the number of groups that have a shared char with 1st group
j = 0 #count the number of groups that have a shared char with 2nd group
for key, value in dictionary.iteritems():
if key != pairs[0] and len(list(set([c for c in dictionary[pairs[0]] if c in value])))==0:
m+=1
for key, value in dictionary.iteritems():
if key != pairs[1] and len(list(set([c for c in dictionary[pairs[1]] if c in value])))==0:
j+=1
# Neither label overlaps with any other group: overwrite the 2nd group's label
# with the first character of the 1st group's label.
if m == len(dictionary)-1 and j == len(dictionary)-1: #it means that this value is unique because it has no shared char with another group
print "unique"
dictionary[pairs[1]] = dictionary[pairs[0]][0]
else:
# Otherwise keep the 2nd group's label and append the first character of the
# 1st group's label to it.
print "there is at least one group in the dict that shares a char with the 1st group"
dictionary[pairs[1]] = dictionary[pairs[1]] + dictionary[pairs[0]][0]
# 1c: remaining case -- both groups receive the current letter.
else: # if it equals 0, meaning if the 1st is empty (which means that the 2nd must be also empty)
print "both are empty"
dictionary[pairs[0]] = letters[n]
dictionary[pairs[1]] = letters[n]
# --- Branch 2: the pair IS in df_True, so the two groups must not share a letter.
else:
print "not equal"
if dictionary[pairs[0]] != 0: # if the first one is populated (has a value) then give a value only to the second
print '1st is populated'
# if the 2nd is not empty and they don't share a charcter then no change is needed as they already have different labels
if dictionary[pairs[1]] != 0 and len(list(set([c for c in dictionary[pairs[0]] if c in dictionary[pairs[1]]]))) == 0:
print "no change"
elif dictionary[pairs[1]] == 0: #if the 2nd is not populated give it a new letter
dictionary[pairs[1]] = letters[n+1]
#if the 2nd is populated and equal to the 1st, then change the letter of the 2nd to a new one and assign its original letter to all the others that had the same original letter
elif dictionary[pairs[1]] != 0 and len(list(set([c for c in dictionary[pairs[0]] if c in dictionary[pairs[1]]]))) > 0:
#need to check that they don't share a charcter
print "need to add a letter"
# Remember the 2nd group's old label, replace it with letters[n], then rewrite
# every group other than pairs[0] whose label overlaps the old label to
# old label + letters[n] (this loop also visits pairs[1] itself).
original_value = dictionary[pairs[1]]
dictionary[pairs[1]] = letters[n]
for key, value in dictionary.iteritems():
if key != pairs[0] and len(list(set([c for c in original_value if c in value])))>0: #for any given value, check if it had a character from the group that will get a new letter, if so, it means that they are equal and thus the new letter should also appear in the value of the "old" group
dictionary[key] = original_value + letters[n] #add the original letter of the group to all the other groups it was similar to
else:
# 1st group has no label yet: give the two groups consecutive, different letters.
print '1st is empty'
dictionary[pairs[0]] = letters[n]
dictionary[pairs[1]] = letters[n+1]
# NOTE(review): with indentation lost it is unclear which branch these two lines
# belong to; if n advances once per pair, letters[n] / letters[n+1] can run past
# 'z' when there are many pairs -- confirm.
print dictionary
n+=1
# get the letter out the dictionary
# Build `final_label` by joining the sorted, de-duplicated label strings that
# were assigned above.
labels = list(dictionary.values())
labels1 = list(set(labels))
labels1.sort()
final_label = ''.join(labels1)
# Groups listed in `group_names` that never appeared in a rejected pair receive
# `final_label`.
for GroupName in group_names:
if GroupName in dictionary:
print "already exists"
else:
dictionary[GroupName] = final_label
# NOTE(review): ''.join(set(value)) removes duplicate characters but a set has no
# guaranteed order, so the characters inside a label are not necessarily sorted.
for key, value in dictionary.iteritems(): #this keeps only the unique char per group and sort it by group
dictionary[key] = ''.join(set(value))