pythonpandasdataframegroup-byapply

KeyError: 0 # If we have a listlike key, _check_indexing_error will raise after applying a function to a pandas dataframe


I have a rather complicated function f(featureList) that takes a list of arbitrary length as input and gives another list of the same length as output:

import math
import random
import time

def survivalNormalcdf(x):
    return (1-math.erf(x/math.sqrt(2)))/2

def normalcdf(x):
    return (1+math.erf(x/math.sqrt(2)))/2

def normalpdf(x): 
    return math.exp(-x*x/2)/math.sqrt(2*math.pi)

def abss(p):
    q=[]
    for k in range(len(p)):
        q.append(abs(p[k]))
    return q;

def mult(a,p):
    q=[]
    for k in range(len(p)):
        q.append(a*p[k])
    return q;

def add(a,p):
    q=[]
    for k in range(len(p)):
        q.append(a[k]+p[k])
    return q

def dot(u,v,pp):
    s=0
    for k in range(len(u)):
       s+=u[k]*v[k]*pp[k]
    return s;

def grad(t,pp):
    h=math.sqrt(1/5/(len(pp)+1))
    g=[]

    for k in range(len(pp)):
        g.append(-pp[k])
        beg=t[k] - 10
        end=t[k] + 10
        qq=math.ceil((end-beg)/h)
        for q in range(qq):
            x=beg+q*h
            ss=survivalNormalcdf(x)
            for m in range(len(pp)):
                if k==m:
                    ss*=normalpdf(x-t[m])
                else:
                    ss*=survivalNormalcdf(x-t[m])
            g[k]+=ss*h;
    for k in range(len(pp)):
        g[k]/=pp[k]
    return g

def iint(t,pp):
    h=0.1
    ss=0
    for k in range(1):
        beg=min(min(t),0) - 10
        end=max(max(t),0) + 10
        qq=int((end-beg)/h)
        for q in range(qq):
            x=beg+q*h
            s=1; 
            for m in range(len(pp)):
                s*=survivalNormalcdf(x-t[m])
            ss+=(s-1)*survivalNormalcdf(x)*h
        for k in range(len(pp)):
            ss-=pp[k]*t[k]
    return ss

def f(ppp):
    kk=0
    maxx=ppp[0]
    for k in range(len(ppp)):
        if ppp[k]>maxx:
            kk=k
            maxx=ppp[k]
    pp=ppp[:kk]+ppp[kk+1:]
    
    t=[]
    for k in range(len(pp)):
        t.append(math.sqrt(2*math.log(1/pp[k])))
        
    u=grad(t,pp)
    mm=0
    while mm<=50*len(pp) and sum(abss(grad(t,pp)))>1/10.0**12:
        mm+=1
        if mm%len(pp)==1:
            pass
        s=min(1,1/sum(abss(u)))
        cnt=0
        while dot(u, grad(add(t,mult(s,u)),pp),pp)>0 and s*sum(abss(u))<len(u):
            s*=2
            cnt+=1
        a=0
        b=s
        beg=a
        end=b
        A=dot(u, grad(add(t,mult(a,u)),pp),pp)
        B=dot(u, grad(add(t,mult(b,u)),pp),pp)
        k=0
        while k<20 and abs(A-B)>(1/10.0**12)*max(abs(A),abs(B)):
            mid=(beg+end)/2
            if dot(u, grad(add(t,mult(mid,u)),pp),pp)>0:
                beg=mid
            else:
                end=mid
            
            c=max(beg-(1/10.0**12),min(end+(1/10.0**12),b+(B/(A-B))*(b-a)))
            C=dot(u, grad(add(t,mult(c,u)),pp),pp)
            if abs(a-c)>abs(b-c) and abs(b-c)>0:
                a=b
                A=B
                b=c
                B=C
            else:
                b=a
                B=A
                a=c
                A=C
            if C>0:
                beg=max(beg,c)
            else:
                end=min(end,c)
            k+=1
                
        s=c
            
        oldgrad=grad(t,pp)
        t=add(t,mult(s,u))
        newgrad=grad(t,pp)
        uold=mult(1,u)
        u=mult(1,newgrad)
        if mm%len(pp)!=1:
            u=add(u,mult(dot(newgrad, add(newgrad,mult(-1,oldgrad)),pp)/dot(oldgrad,oldgrad,pp),uold))

    
    ss=sum(abss(grad(t,pp)))
    tt=t[:kk]
    tt.append(0)
    t=tt+t[kk:]
    if ss>1/10.0**12:
        x=str(input("Failed"))
    return t

So for example, we have

f([0.2,0.1,0.55,0.15]) = [0.7980479577400461, 1.2532153405902076, 0, 0.9944188436386611]

f([0.02167131,0.17349148,0.08438952,0.04143787,0.02589056,0.03866752,0.0461553,0.09212758,0.10879326,0.186921,0.02990676,0.02731904,0.06020158,0.06302721]) = 
[1.174313198960376,
 0.04892832217716259,
 0.4858149215364752,
 0.864373517094786,
 1.0921431988531611,
 0.8989070806156786,
 0.8098127832637683,
 0.4358011113129989,
 0.3387512959281985,
 0,
 1.0239882119094197,
 1.0669265516784823,
 0.671235053100702,
 0.6466856803321204]

And I have a pandas dataframe that looks like

Class_ID  Date           Student_ID      feature  
1         1/1/2023       3               0.02167131     
1         1/1/2023       4               0.17349148     
1         1/1/2023       6               0.08438952     
1         1/1/2023       8               0.04143787     
1         1/1/2023       9               0.02589056
1         1/1/2023       1               0.03866752     
1         1/1/2023       10              0.0461553                                   
3         17/4/2022      5               0.2     
3         17/4/2022      2               0.1     
3         17/4/2022      3               0.55     
3         17/4/2022      4               0.15     

and I would like to apply the function f(featureList) to the feature column groupby Class_ID and generate a new column called New_feature. And here's my code:

df['New_feature'] = df.groupby('Class_ID', group_keys=False)['feature'].apply(f)

So the desired outcome looks like:

df_outcome = pd.read_fwf(io.StringIO("""Class_ID   Date          Student_ID      feature        New_feature
1         1/1/2023       3               0.02167131     2.385963956274992
1         1/1/2023       4               0.17349148     0
1         1/1/2023       6               0.08438952     1.6510552553095719
1         1/1/2023       8               0.04143787     2.054792417419151
1         1/1/2023       9               0.02589056     2.298129663961289
1         1/1/2023       1               0.03866752     2.0916706205231286
1         1/1/2023       10              0.0461553       1.9965409929949391
3         17/4/2022      5               0.2            0.7980479577400461
3         17/4/2022      2               0.1            1.2532153405902076
3         17/4/2022      3               0.55           0
3         17/4/2022      4               0.15           0.9944188436386611"""))

However it gives the following error:

KeyError: 0

The above exception was the direct cause of the following exception: 
# If we have a listlike key, _check_indexing_error will raise

Here is the code:

import io
import numpy as np
import pandas as pd
import math

df = pd.read_fwf(io.StringIO("""Class_ID  Date           Student_ID      feature  
1         1/1/2023       3               0.02167131     
1         1/1/2023       4               0.17349148     
1         1/1/2023       6               0.08438952     
1         1/1/2023       8               0.04143787     
1         1/1/2023       9               0.02589056
1         1/1/2023       1               0.03866752     
1         1/1/2023       10              0.0461553                                   
3         17/4/2022      5               0.2     
3         17/4/2022      2               0.1     
3         17/4/2022      3               0.55     
3         17/4/2022      4               0.15"""))     

def survivalNormalcdf(x):
    return (1-math.erf(x/math.sqrt(2)))/2

def normalcdf(x):
    return (1+math.erf(x/math.sqrt(2)))/2

def normalpdf(x): 
    return math.exp(-x*x/2)/math.sqrt(2*math.pi)

def abss(p):
    q=[]
    for k in range(len(p)):
        q.append(abs(p[k]))
    return q;

def mult(a,p):
    q=[]
    for k in range(len(p)):
        q.append(a*p[k])
    return q;

def add(a,p):
    q=[]
    for k in range(len(p)):
        q.append(a[k]+p[k])
    return q

def dot(u,v,pp):
    s=0
    for k in range(len(u)):
       s+=u[k]*v[k]*pp[k]
    return s;

def grad(t,pp):
    h=math.sqrt(1/5/(len(pp)+1))
    g=[]

    for k in range(len(pp)):
        g.append(-pp[k])
        beg=t[k] - 10
        end=t[k] + 10
        qq=math.ceil((end-beg)/h)
        for q in range(qq):
            x=beg+q*h
            ss=survivalNormalcdf(x)
            for m in range(len(pp)):
                if k==m:
                    ss*=normalpdf(x-t[m])
                else:
                    ss*=survivalNormalcdf(x-t[m])
            g[k]+=ss*h;
    for k in range(len(pp)):
        g[k]/=pp[k]
    return g

def iint(t,pp):
    h=0.1
    ss=0
    for k in range(1):
        beg=min(min(t),0) - 10
        end=max(max(t),0) + 10
        qq=int((end-beg)/h)
        for q in range(qq):
            x=beg+q*h
            s=1; 
            for m in range(len(pp)):
                s*=survivalNormalcdf(x-t[m])
            ss+=(s-1)*survivalNormalcdf(x)*h
        for k in range(len(pp)):
            ss-=pp[k]*t[k]
    return ss

def f(ppp):
    kk=0
    maxx=ppp[0]
    for k in range(len(ppp)):
        if ppp[k]>maxx:
            kk=k
            maxx=ppp[k]
    pp=ppp[:kk]+ppp[kk+1:]
    
    t=[]
    for k in range(len(pp)):
        t.append(math.sqrt(2*math.log(1/pp[k])))
        
    u=grad(t,pp)
    mm=0
    while mm<=50*len(pp) and sum(abss(grad(t,pp)))>1/10.0**12:
        mm+=1
        if mm%len(pp)==1:
            pass
        s=min(1,1/sum(abss(u)))
        cnt=0
        while dot(u, grad(add(t,mult(s,u)),pp),pp)>0 and s*sum(abss(u))<len(u):
            s*=2
            cnt+=1
        a=0
        b=s
        beg=a
        end=b
        A=dot(u, grad(add(t,mult(a,u)),pp),pp)
        B=dot(u, grad(add(t,mult(b,u)),pp),pp)
        k=0
        while k<20 and abs(A-B)>(1/10.0**12)*max(abs(A),abs(B)):
            mid=(beg+end)/2
            if dot(u, grad(add(t,mult(mid,u)),pp),pp)>0:
                beg=mid
            else:
                end=mid
            
            c=max(beg-(1/10.0**12),min(end+(1/10.0**12),b+(B/(A-B))*(b-a)))
            C=dot(u, grad(add(t,mult(c,u)),pp),pp)
            if abs(a-c)>abs(b-c) and abs(b-c)>0:
                a=b
                A=B
                b=c
                B=C
            else:
                b=a
                B=A
                a=c
                A=C
            if C>0:
                beg=max(beg,c)
            else:
                end=min(end,c)
            k+=1
                
        s=c
            
        oldgrad=grad(t,pp)
        t=add(t,mult(s,u))
        newgrad=grad(t,pp)
        uold=mult(1,u)
        u=mult(1,newgrad)
        if mm%len(pp)!=1:
            u=add(u,mult(dot(newgrad, add(newgrad,mult(-1,oldgrad)),pp)/dot(oldgrad,oldgrad,pp),uold))

    
    ss=sum(abss(grad(t,pp)))
    tt=t[:kk]
    tt.append(0)
    t=tt+t[kk:]
    if ss>1/10.0**12:
        x=str(input("Failed"))
    return t

df['New_feature'] = df.groupby('Class_ID', group_keys=False)['feature'].apply(f)
df

Did I do anything wrong? Thanks in advance.

Edit Here is a sample dataframe:

df = pd.read_fwf(io.StringIO("""Class_ID  Date           Student_ID      feature  
1         1/1/2023       3               0.02167131     
1         1/1/2023       4               0.17349148     
1         1/1/2023       6               0.08438952     
1         1/1/2023       8               0.04143787     
1         1/1/2023       9               0.02589056
1         1/1/2023       1               0.03866752     
1         1/1/2023       10              0.0461553                                   
3         17/4/2022      5               0.2     
3         17/4/2022      2               0.1     
3         17/4/2022      3               0.55     
3         17/4/2022      4               0.15
7         12/2/2019      3               0.1
7         12/2/2019      5               0.1
7         12/2/2019      12              0.05
7         12/2/2019      8               0.45
7         12/2/2019      6               0.3""")) 

and the desired output:

df_outcome = pd.read_fwf(io.StringIO("""Class_ID   Date          Student_ID      feature        New_feature
1         1/1/2023       3               0.02167131     2.385963956274992
1         1/1/2023       4               0.17349148     0
1         1/1/2023       6               0.08438952     1.6510552553095719
1         1/1/2023       8               0.04143787     2.054792417419151
1         1/1/2023       9               0.02589056     2.298129663961289
1         1/1/2023       1               0.03866752     2.0916706205231286
1         1/1/2023       10              0.0461553       1.9965409929949391
3         17/4/2022      5               0.2            0.7980479577400461
3         17/4/2022      2               0.1            1.2532153405902076
3         17/4/2022      3               0.55           0
3         17/4/2022      4               0.15           0.9944188436386611
7         12/2/2019      3               0.1            1.07079092
7         12/2/2019      5               0.1            1.07079092
7         12/2/2019      12              0.05           1.46861021
7         12/2/2019      8               0.45           0
7         12/2/2019      6               0.3            0.32415155"""))

Solution

  • The method to create pp is wrong. By your code, pp is nan.

    I fixed the method f below:

    def f(ppp):
        kk = 0
        maxx = ppp.iloc[0]
        print(f"maxx is {maxx}")
        print(f"ppp is {ppp}")
        for k in range(len(ppp)):
            if ppp.iloc[k] > maxx:
                kk = k
                maxx = ppp.iloc[k]
        pp = pd.concat([ppp.iloc[:kk], ppp.iloc[kk + 1 :]]).reset_index(drop=True)
        print(f"pp is {pp}")
        t = []
        for k in range(len(pp)):
            t.append(math.sqrt(2 * math.log(1 / pp.iloc[k])))
    
        u = grad(t, pp)
        mm = 0
        while mm <= 50 * len(pp) and sum(abss(grad(t, pp))) > 1 / 10.0**12:
            mm += 1
            if mm % len(pp) == 1:
                pass
            s = min(1, 1 / sum(abss(u)))
            cnt = 0
            while dot(u, grad(add(t, mult(s, u)), pp), pp) > 0 and s * sum(abss(u)) < len(
                u
            ):
                s *= 2
                cnt += 1
            a = 0
            b = s
            beg = a
            end = b
            A = dot(u, grad(add(t, mult(a, u)), pp), pp)
            B = dot(u, grad(add(t, mult(b, u)), pp), pp)
            k = 0
            while k < 20 and abs(A - B) > (1 / 10.0**12) * max(abs(A), abs(B)):
                mid = (beg + end) / 2
                if dot(u, grad(add(t, mult(mid, u)), pp), pp) > 0:
                    beg = mid
                else:
                    end = mid
    
                c = max(
                    beg - (1 / 10.0**12),
                    min(end + (1 / 10.0**12), b + (B / (A - B)) * (b - a)),
                )
                C = dot(u, grad(add(t, mult(c, u)), pp), pp)
                if abs(a - c) > abs(b - c) and abs(b - c) > 0:
                    a = b
                    A = B
                    b = c
                    B = C
                else:
                    b = a
                    B = A
                    a = c
                    A = C
                if C > 0:
                    beg = max(beg, c)
                else:
                    end = min(end, c)
                k += 1
    
            s = c
    
            oldgrad = grad(t, pp)
            t = add(t, mult(s, u))
            newgrad = grad(t, pp)
            uold = mult(1, u)
            u = mult(1, newgrad)
            if mm % len(pp) != 1:
                u = add(
                    u,
                    mult(
                        dot(newgrad, add(newgrad, mult(-1, oldgrad)), pp)
                        / dot(oldgrad, oldgrad, pp),
                        uold,
                    ),
                )
    
        ss = sum(abss(grad(t, pp)))
        tt = t[:kk]
        tt.append(0)
        t = tt + t[kk:]
        if ss > 1 / 10.0**12:
            x = str(input("Failed"))
        return t
    
    
    

    Actually, this maybe a part of function, but this can run to get a result by the MRE you provided. Maybe more error was caused by the usage of pandas.