pythonpandasfor-loopapply

In python, How to find difference between a specific column in one dataframe and numeric columns of another dataframe?


I have two datasets/dataframes df1 and df2, I want to generate df3 by finding the difference between numeric columns of df2 and df1's column_X.

#### copy and paste below to generate df1 and df2

import pandas as pd 
from random import uniform
import numpy as np

# generation of df1

data = np.random.uniform(15,40, size=(60, 2))
df1 = pd.DataFrame(data, columns=['column_A','column_B'])
df1['column_X'] = df1.mean(axis=1)
df1

# generation of df2
data = np.random.uniform(10.5,32.8, size=(60, 30))
df2 = pd.DataFrame(data, columns=['column_1','column_2','column_3','column_4','column_5',
                                'column_6','column_7','column_8','column_9','column_10',
                               'column_11','column_12','column_13','column_14','column_15',
                               'column_16','column_17','column_18','column_19','column_20',
                               'column_21','column_22','column_23','column_24','column_25',
                             'column_26','column_27','column_28','column_29','column_30',])

df2["Group"] = pd.DataFrame(np.repeat(['A','B','C'], 20, axis=0))
# make  "Group" column the first column
col = df2.pop('Group') 
df2.insert(0, 'Group', col)  

df2

I want to generate df3 by substracting df2's numeric columns (column_1 to column_30) from df1's Column_X while retaining the "Group" column

# Step 1: create an empty df3 and then append df2['Group']

df3 = pd.DataFrame()

# substract "column_X from each numeric column 

df3['col_1X_sub'] = df2['column_1'] - df1['column_X']
df3['col_2X_sub'] = df2['column_2'] - df1['column_X']
df3['col_3X_sub'] = df2['column_3'] - df1['column_X']
.
.
.
df3['col_30X_sub'] = df2['column_30'] - df1['column_X']

final df3 should look something like this for all 30 columns

enter image description here


Solution

  • Copy and then broadcast-subtract:

    import pandas as pd
    import numpy as np
    
    np.random.seed(0)
    a, b = ab = np.random.uniform(low=15, high=40, size=(2, 60))
    df1 = pd.DataFrame({
        'column_A': a,
        'column_B': b,
        'column_X': ab.mean(axis=0),
    })
    
    df2 = pd.DataFrame(
        data=np.random.uniform(low=10.5, high=32.8, size=(60, 30)),
        columns=[f'column_{i}' for i in range(1, 31)],
    )
    df2.insert(
        loc=0, column='Group',
        value=np.repeat(['A','B','C'], 20, axis=0),
    )
    
    df3 = df2.copy()
    df3.iloc[:, 1:] -= df1['column_X'].values[:, np.newaxis]