I have a rather complicated function f(featureList)
that takes a list of arbitrary length as input and gives another list of the same length as output:
import math
import random
import time
def survivalNormalcdf(x):
return (1-math.erf(x/math.sqrt(2)))/2
def normalcdf(x):
return (1+math.erf(x/math.sqrt(2)))/2
def normalpdf(x):
return math.exp(-x*x/2)/math.sqrt(2*math.pi)
def abss(p):
q=[]
for k in range(len(p)):
q.append(abs(p[k]))
return q;
def mult(a,p):
q=[]
for k in range(len(p)):
q.append(a*p[k])
return q;
def add(a,p):
q=[]
for k in range(len(p)):
q.append(a[k]+p[k])
return q
def dot(u,v,pp):
s=0
for k in range(len(u)):
s+=u[k]*v[k]*pp[k]
return s;
def grad(t,pp):
h=math.sqrt(1/5/(len(pp)+1))
g=[]
for k in range(len(pp)):
g.append(-pp[k])
beg=t[k] - 10
end=t[k] + 10
qq=math.ceil((end-beg)/h)
for q in range(qq):
x=beg+q*h
ss=survivalNormalcdf(x)
for m in range(len(pp)):
if k==m:
ss*=normalpdf(x-t[m])
else:
ss*=survivalNormalcdf(x-t[m])
g[k]+=ss*h;
for k in range(len(pp)):
g[k]/=pp[k]
return g
def iint(t,pp):
h=0.1
ss=0
for k in range(1):
beg=min(min(t),0) - 10
end=max(max(t),0) + 10
qq=int((end-beg)/h)
for q in range(qq):
x=beg+q*h
s=1;
for m in range(len(pp)):
s*=survivalNormalcdf(x-t[m])
ss+=(s-1)*survivalNormalcdf(x)*h
for k in range(len(pp)):
ss-=pp[k]*t[k]
return ss
def f(ppp):
kk=0
maxx=ppp[0]
for k in range(len(ppp)):
if ppp[k]>maxx:
kk=k
maxx=ppp[k]
pp=ppp[:kk]+ppp[kk+1:]
t=[]
for k in range(len(pp)):
t.append(math.sqrt(2*math.log(1/pp[k])))
u=grad(t,pp)
mm=0
while mm<=50*len(pp) and sum(abss(grad(t,pp)))>1/10.0**12:
mm+=1
if mm%len(pp)==1:
pass
s=min(1,1/sum(abss(u)))
cnt=0
while dot(u, grad(add(t,mult(s,u)),pp),pp)>0 and s*sum(abss(u))<len(u):
s*=2
cnt+=1
a=0
b=s
beg=a
end=b
A=dot(u, grad(add(t,mult(a,u)),pp),pp)
B=dot(u, grad(add(t,mult(b,u)),pp),pp)
k=0
while k<20 and abs(A-B)>(1/10.0**12)*max(abs(A),abs(B)):
mid=(beg+end)/2
if dot(u, grad(add(t,mult(mid,u)),pp),pp)>0:
beg=mid
else:
end=mid
c=max(beg-(1/10.0**12),min(end+(1/10.0**12),b+(B/(A-B))*(b-a)))
C=dot(u, grad(add(t,mult(c,u)),pp),pp)
if abs(a-c)>abs(b-c) and abs(b-c)>0:
a=b
A=B
b=c
B=C
else:
b=a
B=A
a=c
A=C
if C>0:
beg=max(beg,c)
else:
end=min(end,c)
k+=1
s=c
oldgrad=grad(t,pp)
t=add(t,mult(s,u))
newgrad=grad(t,pp)
uold=mult(1,u)
u=mult(1,newgrad)
if mm%len(pp)!=1:
u=add(u,mult(dot(newgrad, add(newgrad,mult(-1,oldgrad)),pp)/dot(oldgrad,oldgrad,pp),uold))
ss=sum(abss(grad(t,pp)))
tt=t[:kk]
tt.append(0)
t=tt+t[kk:]
if ss>1/10.0**12:
x=str(input("Failed"))
return t
So for example, we have
f([0.2,0.1,0.55,0.15]) = [0.7980479577400461, 1.2532153405902076, 0, 0.9944188436386611]
f([0.02167131,0.17349148,0.08438952,0.04143787,0.02589056,0.03866752,0.0461553,0.09212758,0.10879326,0.186921,0.02990676,0.02731904,0.06020158,0.06302721]) =
[1.174313198960376,
0.04892832217716259,
0.4858149215364752,
0.864373517094786,
1.0921431988531611,
0.8989070806156786,
0.8098127832637683,
0.4358011113129989,
0.3387512959281985,
0,
1.0239882119094197,
1.0669265516784823,
0.671235053100702,
0.6466856803321204]
And I have a pandas dataframe that looks like
Class_ID Date Student_ID feature
1 1/1/2023 3 0.02167131
1 1/1/2023 4 0.17349148
1 1/1/2023 6 0.08438952
1 1/1/2023 8 0.04143787
1 1/1/2023 9 0.02589056
1 1/1/2023 1 0.03866752
1 1/1/2023 10 0.0461553
3 17/4/2022 5 0.2
3 17/4/2022 2 0.1
3 17/4/2022 3 0.55
3 17/4/2022 4 0.15
and I would like to apply the function f(featureList)
to the feature
column groupby
Class_ID
and generate a new column called New_feature
. And here's my code:
df['New_feature'] = df.groupby('Class_ID', group_keys=False)['feature'].apply(f)
So the desired outcome looks like:
df_outcome = pd.read_fwf(io.StringIO("""Class_ID Date Student_ID feature New_feature
1 1/1/2023 3 0.02167131 2.385963956274992
1 1/1/2023 4 0.17349148 0
1 1/1/2023 6 0.08438952 1.6510552553095719
1 1/1/2023 8 0.04143787 2.054792417419151
1 1/1/2023 9 0.02589056 2.298129663961289
1 1/1/2023 1 0.03866752 2.0916706205231286
1 1/1/2023 10 0.0461553 1.9965409929949391
3 17/4/2022 5 0.2 0.7980479577400461
3 17/4/2022 2 0.1 1.2532153405902076
3 17/4/2022 3 0.55 0
3 17/4/2022 4 0.15 0.9944188436386611"""))
However it gives the following error:
KeyError: 0
The above exception was the direct cause of the following exception:
# If we have a listlike key, _check_indexing_error will raise
Here is the code:
import io
import numpy as np
import pandas as pd
import math
df = pd.read_fwf(io.StringIO("""Class_ID Date Student_ID feature
1 1/1/2023 3 0.02167131
1 1/1/2023 4 0.17349148
1 1/1/2023 6 0.08438952
1 1/1/2023 8 0.04143787
1 1/1/2023 9 0.02589056
1 1/1/2023 1 0.03866752
1 1/1/2023 10 0.0461553
3 17/4/2022 5 0.2
3 17/4/2022 2 0.1
3 17/4/2022 3 0.55
3 17/4/2022 4 0.15"""))
def survivalNormalcdf(x):
return (1-math.erf(x/math.sqrt(2)))/2
def normalcdf(x):
return (1+math.erf(x/math.sqrt(2)))/2
def normalpdf(x):
return math.exp(-x*x/2)/math.sqrt(2*math.pi)
def abss(p):
q=[]
for k in range(len(p)):
q.append(abs(p[k]))
return q;
def mult(a,p):
q=[]
for k in range(len(p)):
q.append(a*p[k])
return q;
def add(a,p):
q=[]
for k in range(len(p)):
q.append(a[k]+p[k])
return q
def dot(u,v,pp):
s=0
for k in range(len(u)):
s+=u[k]*v[k]*pp[k]
return s;
def grad(t,pp):
h=math.sqrt(1/5/(len(pp)+1))
g=[]
for k in range(len(pp)):
g.append(-pp[k])
beg=t[k] - 10
end=t[k] + 10
qq=math.ceil((end-beg)/h)
for q in range(qq):
x=beg+q*h
ss=survivalNormalcdf(x)
for m in range(len(pp)):
if k==m:
ss*=normalpdf(x-t[m])
else:
ss*=survivalNormalcdf(x-t[m])
g[k]+=ss*h;
for k in range(len(pp)):
g[k]/=pp[k]
return g
def iint(t,pp):
h=0.1
ss=0
for k in range(1):
beg=min(min(t),0) - 10
end=max(max(t),0) + 10
qq=int((end-beg)/h)
for q in range(qq):
x=beg+q*h
s=1;
for m in range(len(pp)):
s*=survivalNormalcdf(x-t[m])
ss+=(s-1)*survivalNormalcdf(x)*h
for k in range(len(pp)):
ss-=pp[k]*t[k]
return ss
def f(ppp):
kk=0
maxx=ppp[0]
for k in range(len(ppp)):
if ppp[k]>maxx:
kk=k
maxx=ppp[k]
pp=ppp[:kk]+ppp[kk+1:]
t=[]
for k in range(len(pp)):
t.append(math.sqrt(2*math.log(1/pp[k])))
u=grad(t,pp)
mm=0
while mm<=50*len(pp) and sum(abss(grad(t,pp)))>1/10.0**12:
mm+=1
if mm%len(pp)==1:
pass
s=min(1,1/sum(abss(u)))
cnt=0
while dot(u, grad(add(t,mult(s,u)),pp),pp)>0 and s*sum(abss(u))<len(u):
s*=2
cnt+=1
a=0
b=s
beg=a
end=b
A=dot(u, grad(add(t,mult(a,u)),pp),pp)
B=dot(u, grad(add(t,mult(b,u)),pp),pp)
k=0
while k<20 and abs(A-B)>(1/10.0**12)*max(abs(A),abs(B)):
mid=(beg+end)/2
if dot(u, grad(add(t,mult(mid,u)),pp),pp)>0:
beg=mid
else:
end=mid
c=max(beg-(1/10.0**12),min(end+(1/10.0**12),b+(B/(A-B))*(b-a)))
C=dot(u, grad(add(t,mult(c,u)),pp),pp)
if abs(a-c)>abs(b-c) and abs(b-c)>0:
a=b
A=B
b=c
B=C
else:
b=a
B=A
a=c
A=C
if C>0:
beg=max(beg,c)
else:
end=min(end,c)
k+=1
s=c
oldgrad=grad(t,pp)
t=add(t,mult(s,u))
newgrad=grad(t,pp)
uold=mult(1,u)
u=mult(1,newgrad)
if mm%len(pp)!=1:
u=add(u,mult(dot(newgrad, add(newgrad,mult(-1,oldgrad)),pp)/dot(oldgrad,oldgrad,pp),uold))
ss=sum(abss(grad(t,pp)))
tt=t[:kk]
tt.append(0)
t=tt+t[kk:]
if ss>1/10.0**12:
x=str(input("Failed"))
return t
df['New_feature'] = df.groupby('Class_ID', group_keys=False)['feature'].apply(f)
df
Did I do anything wrong? Thanks in advance.
Edit Here is a sample dataframe:
df = pd.read_fwf(io.StringIO("""Class_ID Date Student_ID feature
1 1/1/2023 3 0.02167131
1 1/1/2023 4 0.17349148
1 1/1/2023 6 0.08438952
1 1/1/2023 8 0.04143787
1 1/1/2023 9 0.02589056
1 1/1/2023 1 0.03866752
1 1/1/2023 10 0.0461553
3 17/4/2022 5 0.2
3 17/4/2022 2 0.1
3 17/4/2022 3 0.55
3 17/4/2022 4 0.15
7 12/2/2019 3 0.1
7 12/2/2019 5 0.1
7 12/2/2019 12 0.05
7 12/2/2019 8 0.45
7 12/2/2019 6 0.3"""))
and the desired output:
df_outcome = pd.read_fwf(io.StringIO("""Class_ID Date Student_ID feature New_feature
1 1/1/2023 3 0.02167131 2.385963956274992
1 1/1/2023 4 0.17349148 0
1 1/1/2023 6 0.08438952 1.6510552553095719
1 1/1/2023 8 0.04143787 2.054792417419151
1 1/1/2023 9 0.02589056 2.298129663961289
1 1/1/2023 1 0.03866752 2.0916706205231286
1 1/1/2023 10 0.0461553 1.9965409929949391
3 17/4/2022 5 0.2 0.7980479577400461
3 17/4/2022 2 0.1 1.2532153405902076
3 17/4/2022 3 0.55 0
3 17/4/2022 4 0.15 0.9944188436386611
7 12/2/2019 3 0.1 1.07079092
7 12/2/2019 5 0.1 1.07079092
7 12/2/2019 12 0.05 1.46861021
7 12/2/2019 8 0.45 0
7 12/2/2019 6 0.3 0.32415155"""))
The method to create pp
is wrong. By your code, pp is nan.
I fixed the method f below:
def f(ppp):
kk = 0
maxx = ppp.iloc[0]
print(f"maxx is {maxx}")
print(f"ppp is {ppp}")
for k in range(len(ppp)):
if ppp.iloc[k] > maxx:
kk = k
maxx = ppp.iloc[k]
pp = pd.concat([ppp.iloc[:kk], ppp.iloc[kk + 1 :]]).reset_index(drop=True)
print(f"pp is {pp}")
t = []
for k in range(len(pp)):
t.append(math.sqrt(2 * math.log(1 / pp.iloc[k])))
u = grad(t, pp)
mm = 0
while mm <= 50 * len(pp) and sum(abss(grad(t, pp))) > 1 / 10.0**12:
mm += 1
if mm % len(pp) == 1:
pass
s = min(1, 1 / sum(abss(u)))
cnt = 0
while dot(u, grad(add(t, mult(s, u)), pp), pp) > 0 and s * sum(abss(u)) < len(
u
):
s *= 2
cnt += 1
a = 0
b = s
beg = a
end = b
A = dot(u, grad(add(t, mult(a, u)), pp), pp)
B = dot(u, grad(add(t, mult(b, u)), pp), pp)
k = 0
while k < 20 and abs(A - B) > (1 / 10.0**12) * max(abs(A), abs(B)):
mid = (beg + end) / 2
if dot(u, grad(add(t, mult(mid, u)), pp), pp) > 0:
beg = mid
else:
end = mid
c = max(
beg - (1 / 10.0**12),
min(end + (1 / 10.0**12), b + (B / (A - B)) * (b - a)),
)
C = dot(u, grad(add(t, mult(c, u)), pp), pp)
if abs(a - c) > abs(b - c) and abs(b - c) > 0:
a = b
A = B
b = c
B = C
else:
b = a
B = A
a = c
A = C
if C > 0:
beg = max(beg, c)
else:
end = min(end, c)
k += 1
s = c
oldgrad = grad(t, pp)
t = add(t, mult(s, u))
newgrad = grad(t, pp)
uold = mult(1, u)
u = mult(1, newgrad)
if mm % len(pp) != 1:
u = add(
u,
mult(
dot(newgrad, add(newgrad, mult(-1, oldgrad)), pp)
/ dot(oldgrad, oldgrad, pp),
uold,
),
)
ss = sum(abss(grad(t, pp)))
tt = t[:kk]
tt.append(0)
t = tt + t[kk:]
if ss > 1 / 10.0**12:
x = str(input("Failed"))
return t
Actually, this maybe a part of function, but this can run to get a result by the MRE you provided. Maybe more error was caused by the usage of pandas.