python · pandas · dataframe · oversampling

2D Gaussian oversampling over large dataframe


I currently have a dataframe in the following format:

step  tag_id  x_pos   y_pos
1     1         5      3
1     2         3      4
2     1         2      2
2     3         1      6
.........................
.........................
N     1         5      7

For each row in the df, I am aiming to add an additional m rows oversampling from a Gaussian distribution for the x and y values (independent). Thus, a df of N = 100 and m = 10 would result in a df of length 1100 (100 × (1 + 10)), including the original and oversampled values.

The code I have for this works, but it is extremely slow over a large dataset (N > 100k). There are many operations (creating new arrays/ dfs, use of itertuples, etc.) that I'm sure are hampering performance; I would appreciate any help as to how I can improve the performance so I can generate higher m values over the whole dataset. For instance: input data is from a pandas dataframe, but the multi-variate normal function operates on numpy arrays. Is there a more natural way to implement this through pandas without the copying between numpy arrays and dataframes? Thanks!

Reproducible example:

import pandas as pd
import numpy as np
import random


def gaussianOversample2(row, n):
    """Return the original (x, y) point plus n Gaussian samples around it.

    Parameters
    ----------
    row : object with `step`, `tag_id`, `x_pos`, `y_pos` attributes
        One record (e.g. from `DataFrame.itertuples`).
    n : int
        Number of oversampled points to draw.

    Returns
    -------
    tuple of 4 numpy arrays of length n + 1:
        (x values, y values, repeated step, repeated tag_id),
        each with the original point first followed by the n samples.
    """
    mean_x = float(getattr(row, 'x_pos'))
    mean_y = float(getattr(row, 'y_pos'))
    step = getattr(row, 'step')
    tag_id = getattr(row, 'tag_id')
    # Independent unit-variance noise on each axis (diagonal covariance).
    # NOTE: the original code assigned `sigma = 2` and then immediately
    # overwrote it with the array below; the dead assignment is removed.
    sigma = np.array([1, 1])
    cov = np.diag(sigma ** 2)
    x, y = np.random.multivariate_normal([mean_x, mean_y], cov, n).T
    # Keep the original point alongside the sampled ones.
    x = np.concatenate(([mean_x], x))
    y = np.concatenate(([mean_y], y))
    # np.full replaces the empty-then-fill pattern.
    steps = np.full(n + 1, step, dtype=float)
    tags = np.full(n + 1, tag_id, dtype=float)
    return x, y, steps, tags
    
    
def oversampleDf(df, n):
    """Oversample every row of `df` with n Gaussian samples per row.

    Parameters
    ----------
    df : pandas.DataFrame with columns ['step', 'tag_id', 'x_pos', 'y_pos']
    n : int
        Number of extra samples per row.

    Returns
    -------
    pandas.DataFrame of length len(df) * (n + 1), same columns as `df`.

    Notes
    -----
    The original implementation called `DataFrame.append` inside the loop,
    which is quadratic (each append copies all accumulated rows) and was
    removed entirely in pandas 2.0. Collecting the per-row frames in a list
    and concatenating once is linear and works on all pandas versions.
    """
    frames = []
    for count, row in enumerate(df.itertuples(index=False), start=1):
        oversample_x, oversample_y, steps, tags = gaussianOversample2(row, n)
        frames.append(pd.DataFrame(
            np.column_stack((steps, tags, oversample_x, oversample_y)),
            columns=df.columns))
        # Lightweight progress indicator for large inputs.
        if count % 1000 == 0:
            print("Row: ", count)
    # Single concat avoids the O(N^2) copy cost of appending in the loop.
    if not frames:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(frames)

# Reproducible example input. The original line was a syntax error: the
# `columns=` keyword was placed inside the list literal and the data list
# was never closed.
df = pd.DataFrame([[1, 1, 5, 3], [1, 2, 3, 4], [2, 1, 2, 2], [2, 3, 1, 6]],
                  columns=['step', 'tag_id', 'x_pos', 'y_pos'])

res = oversampleDf(df, 20)

"""
# Result should be:
    step  tag_id     x_pos     y_pos
0    1.0     1.0  5.000000  3.000000
1    1.0     1.0  3.423492  3.886602
2    1.0     1.0  5.404581  2.177559
3    1.0     1.0  4.023274  2.883737
4    1.0     1.0  3.390710  3.038782
..   ...     ...       ...       ...
16   2.0     3.0  1.894151  5.510321
17   2.0     3.0  1.110932  5.281578
18   2.0     3.0  1.623538  4.529825
19   2.0     3.0 -0.576756  7.476872
20   2.0     3.0 -0.866123  5.898048
"""

Solution

  • This is the solution I have found for myself; it is more of a workaround than a technique using quicker methods. I instead write out to a csv file, which I then read in once complete, as so:

    def gaussianOversample3(row, n):
        """Append the original point and n Gaussian samples to oversample.csv.

        Writes n + 1 comma-separated rows (pandas index, step, tag_id, x, y)
        with no header, so repeated calls stream results to disk instead of
        growing a DataFrame in memory.
        """
        step = getattr(row, 'step')
        tag_id = getattr(row, 'tag_id')
        mu_x = float(getattr(row, 'x_pos'))
        mu_y = float(getattr(row, 'y_pos'))
        # Independent unit-variance noise on each axis.
        sigma = np.array([1, 1])
        cov = np.diag(sigma ** 2)
        samples = np.random.multivariate_normal([mu_x, mu_y], cov, n)
        # Original point first, then the n sampled points.
        xs = np.concatenate(([mu_x], samples[:, 0]))
        ys = np.concatenate(([mu_y], samples[:, 1]))
        steps = np.full(n + 1, step, dtype=float)
        tags = np.full(n + 1, tag_id, dtype=float)
        block = np.column_stack((steps, tags, xs, ys))
        pd.DataFrame(data=block).to_csv("oversample.csv", mode='a', header=False)
     
    def oversampleDf2(df, n):
        """Oversample every row of `df`, streaming results to oversample.csv.

        Instead of accumulating a DataFrame in memory, each row's samples are
        appended to the CSV by gaussianOversample3; read the file back
        afterwards to obtain the oversampled frame.
        """
        filename = "oversample.csv"
        # Truncate/create the output file so repeated runs don't accumulate.
        pd.DataFrame(list()).to_csv(filename)
        # (Commented-out progress-counter code removed.)
        for row in df.itertuples(index=False):
            gaussianOversample3(row, n)
    
    

    Because of how it is reading the file, I have to do the following:

    oversampleDf2(defensive_df2, num_oversamples)
    # The writer produces "oversample.csv" (the original read "oversample_10.csv",
    # a filename mismatch). The file is comma-separated with no header and the
    # pandas index as a throwaway first column, so it can be read directly
    # instead of reading space-separated and re-splitting each line on commas.
    oversampled_df = pd.read_csv("oversample.csv", header=None,
                                 names=['temp', 'step', 'tag_id', 'x_pos', 'y_pos'])
    oversampled_df = oversampled_df.drop(['temp'], axis=1)
    oversampled_df = oversampled_df.astype(float)