I have a data_frame like this
Id | A | B | C |
---|---|---|---|
1 | 0 | 0 | 1 |
2 | 1 | 0 | 0 |
3 | 1 | 0 | 1 |
4 | 0 | 0 | 1 |
I would like to create a contingency matrix like below to calculate correlations
Label | T/F | count |
---|---|---|
A | 0 | 2 |
A | 1 | 2 |
B | 0 | 4 |
B | 1 | 0 |
C | 0 | 1 |
C | 1 | 3 |
I was able to get the count for one column using the below code but not sure how to do this for multiple columns.
df = data_frame.groupBy('A').count()
Any help is much appreciated. Thanks!
Narly one. See step by step approach below
#melt all columns except Id
new =df.withColumn('tab', F.array(*[F.struct(F.lit(x).alias('Label'), F.col(x).alias('T/F')) for x in df.columns if x!='Id'])).selectExpr('Id','inline(tab)').drop('tab')
new =(new.groupBy('Label').agg(collect_list('T/F').alias('T/F'))#Create list of T/F per labes
.withColumn('m', array(*[lit(x).cast('integer') for x in [0,1]]))#Add column with array of 0,1
.withColumn('m1',array_except(col('m'),col('T/F')))#add column with array oF ELEMENTS IN 0,1 MISSING ING IN T/F
.withColumn('T/F',explode(flatten(when(size(array_except(col('m'),col('T/F')))>0,concat(array(col('T/F'),array(lit(None))))).otherwise(array('T/F')))))#Ensure one element per row after adding None where an element was found missing in T/F
.groupBy('Label','T/F','m1').agg(count('T/F').alias('count'))#groupby
.withColumn('T/F', coalesce('T/F', col('m1')[0]))#Where there is none, fillna with missing element
.drop('m1')#drop unwanted column
)
+-----+---+-----+
|Label|T/F|count|
+-----+---+-----+
| B| 0| 4|
| B| 1| 0|
| C| 1| 3|
| C| 0| 1|
| A| 0| 2|
| A| 1| 2|
+-----+---+-----+