I want to impute missing data on abortion rates of US-Counties. I´m comparing different Methods, one of them is Kriging. I used Pykrige for External Dirft Kriging:
def Kriging_with_SGS(df, variable, kriged_var, variogram, lags, drifts=None):
interpolation_points = df[~df[variable].isnull()]
interpolation_points.reset_index(drop=True, inplace=True)
known_x = interpolation_points.geometry.centroid.x.values
known_y = interpolation_points.geometry.centroid.y.values
values = interpolation_points[variable].values
external_drifts = []
if drifts is not None:
for drift in drifts:
if drift in interpolation_points.columns:
external_drift = interpolation_points[drift].values
external_drifts.append(external_drift)
else:
print(f"The drift variable '{drift}' is missing.")
if drifts:
uk_all = UniversalKriging(
known_x,
known_y,
values,
variogram_model=variogram,
drift_terms=["point_log"],
point_drift=external_drifts if external_drifts else None,
enable_plotting=True,
nlags=lags,
verbose=True
)
else:
uk_all = UniversalKriging(
known_x,
known_y,
values,
variogram_model=variogram,
enable_plotting=True,
nlags=lags,
verbose=True
)
df[kriged_var] = uk_all.execute(
'points',
df.geometry.centroid.x.values,
df.geometry.centroid.y.values
)[0]
With this code I managed to get the kriging results for the Counties which have missing values in the target variable. But I have the feeling, that I´m loosing to much variance in the far field. Therefore I want to include conditioning in my method:
def cond_krig(df, variable, iterations, drift):
# create a df with only those values which exist:
interpolation_points = df[~df[variable].isnull()]
interpolation_points.reset_index(drop=True, inplace=True)
known_x = interpolation_points.geometry.centroid.x.values
known_y = interpolation_points.geometry.centroid.y.values
cond_val = interpolation_points[variable].values
# define the grid with all points that I have
all_x = df.geometry.centroid.x.values
all_y = df.geometry.centroid.y.values
ext_drift = interpolation_points[drift].values
grid_drift = df[drift].values
model = gs.Matern(dim=2,
var=0.09,
len_scale=11284.23,
nugget=0,
nu=0.20)
krig = gs.krige.ExtDrift(model = model,
cond_pos = (known_x, known_y),
cond_val = cond_val,
ext_drift = ext_drift)
krig.unstructured([all_x, all_y], ext_drift=grid_drift)
cond_srf = gs.CondSRF(krig)
cond_srf.set_pos([all_x, all_y],
"unstructured")
seed = gs.random.MasterRNG(20170519)
for i in range(iterations):
cond_srf(seed=seed(), store=[f"f{i}", False, False])
print("___")
print(i)
print("___")
fields = [cond_srf[f"f{i}"] for i in range(iterations)]
filtered_df["ensemble mean"] = np.mean(fields, axis=0)
filtered_df["kriged field"] = cond_srf.krig.field
I´m getting: Value Error:Krige: wrong number of ext. drift values.
Thus I have two questions:
1.) What am I doing wrong in the second code, I provided? I know that I can make some fine-tuning with latlon and other parameters but there seems to be a substential error in my code.
2.) Is it possible to use the kriging model from Pykrige for conditioning? Something along these lines:
krig = uk_all.execute(
'points',
df.geometry.centroid.x.values,
df.geometry.centroid.y.values
)[0]
cond_srf = gs.CondSRF(krig)
cond_srf.set_pos([df.geometry.centroid.x.values,
df.geometry.centroid.y.values],
"unstructured")
I´m happy about any answer or advice. Many thanks!
For the record
This was answered here: https://github.com/GeoStat-Framework/GSTools/discussions/324
TL;DR:
The crucial part is, that ext_drift=ext_drift_all
needs to be added to the arguments when calling cond_srf
for the target locations.
Since it is an external drift, it needs to be provided at the target grid beforehand.