python, py-datatable

Remove rows which have NA values


I have the following datatable in Python:

#          A        B               B_lag_1         B_lag_2         B_lag_3         B_lag_4
#0         0        −0.342855       NA              NA              NA              NA
#1         0        0.0706784       −0.342855       NA              NA              NA
#2         0        0.0470259       0.0706784       −0.342855       NA              NA
#3         0        −0.0522357      0.0470259       0.0706784       −0.342855       NA
#4         0        −0.610938       −0.0522357      0.0470259       0.0706784       −0.342855
#5         1        −2.62617        NA              NA              NA              NA
#6         1        0.550128        −2.62617        NA              NA              NA
#7         1        0.538717        0.550128        −2.62617        NA              NA
#8         1        −0.487166       0.538717        0.550128        −2.62617        NA
#9         1        0.996788        −0.487166       0.538717        0.550128        −2.62617

From this, I want to remove all the rows which have any NA values in them. How can I do this?

Thanks in advance


Solution

  • I have never used datatable, but pandas.DataFrame has isna() to select rows with NA and drop() to remove rows (or del can be used for this), and I found similar functions for datatable.
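
    For comparison, in pandas it could look like this - only a small sketch with a made-up frame, not the datatable solution itself: dropna() removes every row that has at least one NaN.

    import pandas as pd
    import numpy as np
    
    # tiny example frame, only to show dropna()
    pdf = pd.DataFrame({'A': [0, 0, 1], 'B_lag_1': [np.nan, 0.07, np.nan]})
    
    pdf = pdf.dropna()   # drop every row that has at least one NaN
    
    print(pdf)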


    datatable can use del to remove selected rows. It can also use .isna() or == None to select rows with NA. The problem is that this filters only one column at a time, so it may need a for-loop to check different columns.

    columns = dt.f[:]   # all columns
    
    for n in range(1, 5):
        rows = (dt.f[f'B_lag_{n}'] == None)   # rows where this lag column is NA
        del df[rows, columns]
    
    print(df)
    

    This removes values from the datatable but not the rows, and it creates empty (all-NA) rows like this:

       |     A          B    B_lag_1    B_lag_2    B_lag_3    B_lag_4
       | int64    float64    float64    float64    float64    float64
    -- + -----  ---------  ---------  ---------  ---------  ---------
     0 |    NA  NA         NA         NA         NA         NA       
     1 |    NA  NA         NA         NA         NA         NA       
     2 |    NA  NA         NA         NA         NA         NA       
     3 |    NA  NA         NA         NA         NA         NA       
     4 |     0  -0.234153   1.52303    0.647689  -0.138264   0.496714
     5 |    NA  NA         NA         NA         NA         NA       
     6 |    NA  NA         NA         NA         NA         NA       
     7 |    NA  NA         NA         NA         NA         NA       
     8 |    NA  NA         NA         NA         NA         NA       
     9 |     1   0.54256   -0.469474   0.767435   1.57921   -0.234137
    [10 rows x 6 columns]
    

    It can be better to keep the rows which don't have None:

    columns = dt.f[:]
    
    for n in range(1, 5):
        rows = (dt.f[f'B_lag_{n}'] != None)
        df = df[rows, columns]
    
    print(df)
    

    Result:

       |     A          B    B_lag_1   B_lag_2    B_lag_3    B_lag_4
       | int64    float64    float64   float64    float64    float64
    -- + -----  ---------  ---------  --------  ---------  ---------
     0 |     0  -0.234153   1.52303   0.647689  -0.138264   0.496714
     1 |     1   0.54256   -0.469474  0.767435   1.57921   -0.234137
    [2 rows x 6 columns]
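
    As a side note, the same rows can be selected with isna() from datatable.math instead of comparing with None. This is only a sketch - it assumes dt.math.isna() and ~ (logical NOT on a boolean expression) behave the way I expect, and it uses the same df and imports as in the full code below.

    columns = dt.f[:]
    
    for n in range(1, 5):
        # keep rows where this lag column is not NA
        rows = ~dt.math.isna(dt.f[f'B_lag_{n}'])
        df = df[rows, columns]
    
    print(df)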
    

    But you can use & (as operator AND) and | (as operator OR) to do the same without a for-loop.

    columns = dt.f[:]
    
    rows = (dt.f['B_lag_1'] != None) & (dt.f['B_lag_2'] != None) & (dt.f['B_lag_3'] != None) & (dt.f['B_lag_4'] != None)
    
    df = df[rows, columns]
    
    print(df)
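
    If there are many lag columns then this chain of & conditions can also be built in code instead of writing it by hand - a sketch with functools.reduce (the list of column names is an assumption, adjust it to your frame):

    from functools import reduce
    from operator import and_
    
    lag_columns = [f'B_lag_{n}' for n in range(1, 5)]
    
    # combine the (column != None) conditions with & for every lag column
    rows = reduce(and_, [dt.f[col] != None for col in lag_columns])
    
    df = df[rows, dt.f[:]]
    
    print(df)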
    

    But later I found that datatable has dt.rowall() and dt.rowany() to work with many columns, so the code can be simpler.

    rowall() works like operator AND, rowany() works like operator OR.

    columns = dt.f[:]
    
    rows = dt.rowall(dt.f['B_lag_1', 'B_lag_2', 'B_lag_3', 'B_lag_4'] != None)
    #rows = dt.rowall(dt.f['B_lag_1':'B_lag_4'] != None)  # range of columns
    #rows = dt.rowall(dt.f[:] != None)                    # all columns
    
    df = df[rows, columns]
    
    print(df)
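
    rowany() can be used the other way around - to look at the rows which still have NA in at least one column, for example to check how many rows would be removed. A small sketch:

    # rows which have NA in at least one column
    rows_with_na = df[dt.rowany(dt.f[:] == None), :]
    
    print(rows_with_na.nrows, 'rows have at least one NA')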
    

    Full working code:

    I took the code from my previous answer, Create many lagged variables.

    import datatable as dt
    import numpy as np
    
    def test1(df):
        print('\n--- test 1 ---\n')
        
        df = df.copy()
    
        #columns = dt.f['A', 'B', 'B_lag_1', 'B_lag_2', 'B_lag_3', 'B_lag_4']
        #columns = df.keys()
        columns = dt.f[:]
        
        for n in range(1, 5):
            rows = (dt.f[f'B_lag_{n}'] == None)
            del df[rows, columns]
    
        print(df)    
            
    def test2(df):
        print('\n--- test 2 ---\n')
        
        df = df.copy()
    
        #columns = dt.f['A', 'B', 'B_lag_1', 'B_lag_2', 'B_lag_3', 'B_lag_4']
        #columns = df.keys()
        columns = dt.f[:]
    
        for n in range(1, 5):
            rows = (dt.f[f'B_lag_{n}'] != None)
            df = df[rows, columns]
        
        print(df)
    
    def test3(df):
        print('\n--- test 3 ---\n')
        
        df = df.copy()
        
        rows = (dt.f['B_lag_1'] != None) & (dt.f['B_lag_2'] != None) & (dt.f['B_lag_3'] != None) & (dt.f['B_lag_4'] != None)
    
        columns = dt.f[:]
        
        df = df[rows, columns]
        
        print(df)
    
    def test4(df):
        print('\n--- test 4 ---\n')
        
        df = df.copy()
    
        columns = dt.f[:]
        
        #rows = dt.rowall(dt.f['B_lag_1', 'B_lag_2', 'B_lag_3', 'B_lag_4'] != None) # use selected columns
        #rows = dt.rowall(dt.f['B_lag_1':'B_lag_4'] != None) # use columns in some range
        #rows = dt.rowall(dt.f[float] != None)               # use columns which have float values
        rows = dt.rowall(dt.f[:] != None)                    # use all columns
    
        df = df[rows, columns]
        
        print(df)
        
        
    # --- main ---
    
    np.random.seed(42)
    
    df = dt.Frame({
        "A": np.repeat(np.arange(0, 2), 5), 
        "B": np.random.normal(0, 1, 10)
    })
    
    for n in range(1, 5):
        df[f'B_lag_{n}'] = df[:, dt.shift(dt.f.B, n), dt.by('A')]['B']
    
    # --- tests ---
    
    test1(df)
    test2(df)
    test3(df)
    test4(df)
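
    So if all columns should be checked then the short answer to the question is the single line from test4:

    df = df[dt.rowall(dt.f[:] != None), :]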