I am running an ML pipeline, and the training component/step (see code below) keeps failing with the following error: "RuntimeError: Unknown return type: <class 'inspect._empty'>. Must be one of `str`, `int`, `float`, a subclass of `Artifact`, or a NamedTuple collection of these types."
Any ideas on what might be causing the issue/error and how to resolve it?
Thank you!
# This component builds an XGBoost regressor from a BigQuery view.
@component(
    packages_to_install=[
        "google-cloud-bigquery",
        "xgboost",
        "pandas",
        # "sklearn" is only a deprecated placeholder name on PyPI; the real
        # distribution is "scikit-learn".
        "scikit-learn",
        "joblib",
        "pyarrow",
        "db_dtypes",
    ],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)
def build_xgb_xgboost(
    project: str,
    bq_dataset: str,
    test_view_name: str,
    bq_location: str,
    metrics: Output[Metrics],
    model: Output[Model],
):
    """Train an XGBoost regressor on a BigQuery view and export model + metrics.

    Reads every row of ``{project}.{bq_dataset}.{test_view_name}``, encodes the
    categorical columns, splits 70/30 into train/test, standardizes the
    features, fits an ``XGBRegressor`` predicting ``overall_rating``, logs
    evaluation metrics and writes the fitted model with joblib.

    Args:
        project: Google Cloud project used for the BigQuery client and as the
            first part of the view identifier.
        bq_dataset: BigQuery dataset containing the view.
        test_view_name: Name of the view to read the data from.
        bq_location: Currently unused by this function; kept so the component
            interface stays unchanged.
        metrics: Output artifact that receives "RMSE", "MAE", "framework" and
            "dataset_size" via ``log_metric``.
        model: Output artifact; the model is written to
            ``model.path + ".joblib"``.

    Returns:
        Nothing. All results are delivered through the ``metrics`` and
        ``model`` output artifacts.
        NOTE(review): the reported "Unknown return type:
        <class 'inspect._empty'>" error looks like what the KFP executor raises
        when a component without a return annotation returns a value -- confirm
        against the installed SDK version. This function must therefore not
        return anything.
    """
    # Imports live inside the function because the function body is what runs
    # inside the component's container.
    from google.cloud import bigquery
    import pandas as pd
    from xgboost import XGBRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import mean_absolute_error
    import joblib
    # Not referenced directly below; kept from the original code, presumably so
    # that a missing BigQuery-to-pandas dependency fails early -- TODO confirm.
    import pyarrow
    import db_dtypes

    client = bigquery.Client(project=project)
    view_uri = f"{project}.{bq_dataset}.{test_view_name}"
    build_df_for_xgboost = "\nSELECT * FROM `{view_uri}`\n".format(view_uri=view_uri)
    raw_df = client.query(build_df_for_xgboost).to_dataframe()
    df = raw_df.drop(['int64_field_0'], axis=1)

    def onehot_encode(frame, column):
        """Return a copy of ``frame`` with ``column`` replaced by dummy columns."""
        frame = frame.copy()
        dummies = pd.get_dummies(frame[column], prefix=column)
        frame = pd.concat([frame, dummies], axis=1)
        return frame.drop(column, axis=1)

    # Binary encoding
    df['preferred_foot'] = df['preferred_foot'].replace({'left': 0, 'right': 1})

    # One-hot encoding
    for column in ['attacking_work_rate', 'defensive_work_rate']:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['overall_rating']
    X = df.drop('overall_rating', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Scale X: fit on the training split only, then apply to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(
        scaler.transform(X_train), index=X_train.index, columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X_test.columns
    )

    # Define the model. Hyperparameters are passed as numbers (they were
    # strings before), and 'reg:squarederror' replaces the deprecated alias
    # 'reg:linear' for the same squared-error objective.
    bst = XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.1,
        alpha=0.001,
    )

    # Fit the model
    bst.fit(X_train, y_train)

    # Evaluate on the held-out split. The original referenced `np` without
    # importing it; the already-imported sklearn metric gives the same RMSE.
    y_pred = bst.predict(X_test)
    rmse = float(mean_squared_error(y_test, y_pred) ** 0.5)
    mae = float(mean_absolute_error(y_test, y_pred))

    metrics.log_metric("RMSE", rmse)
    metrics.log_metric("framework", "xgboost")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("MAE", mae)

    # The original called a bare `dump`, which was never imported.
    joblib.dump(bst, model.path + ".joblib")
I think this might just be a bug in the version of the KFP v2 SDK you're using.
I mostly use the stable KFP v1 methods to avoid such problems:
from kfp.components import InputPath, OutputPath, create_component_from_func
# Sketch of the training step as a plain function intended to be wrapped with
# `create_component_from_func`. Outputs are declared as `OutputPath(...)`
# parameters: the function receives file paths and writes its results there
# instead of returning them.
# NOTE(review): indentation was lost in this paste; the lines from
# `import json` through `dump(bst, model_path)` belong inside the function body.
def train_xgboost_model(
project: str,
bq_dataset: str,
test_view_name: str,
bq_location: str,
# Path the metrics JSON is written to.
metrics_path: OutputPath(Metrics),
# Path the fitted model is written to.
model_path: OutputPath(Model),
):
import json
from pathlib import Path
# NOTE(review): the data loading / training code is not shown here --
# presumably elided; `bst` and `dump` below are not defined in this snippet.
# The `...` is a placeholder for the real metric entries: as written,
# `{...}` is a set containing Ellipsis, which `json.dumps` cannot serialize.
metrics = {
...
}
# Serialize the metrics as JSON text into the metrics output file.
Path(metrics_path).write_text(json.dumps(metrics))
# Write the model to the model output path.
dump(bst, model_path)
# Turn the Python function into a pipeline component; the component definition
# is also written to the YAML file named by `output_component_file`.
train_xgboost_model_op = create_component_from_func(
    func=train_xgboost_model,
    packages_to_install=[
        "google-cloud-bigquery",
        "xgboost",
        "pandas",
        # "sklearn" is only a deprecated placeholder name on PyPI; the real
        # distribution is "scikit-learn".
        "scikit-learn",
        "joblib",
        "pyarrow",
        "db_dtypes",
    ],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)
You can also find many examples of real-world components in this repo: https://github.com/Ark-kun/pipeline_components/tree/master/components
These include an XGBoost trainer: https://github.com/Ark-kun/pipeline_components/blob/d8c4cf5/components/XGBoost/Train/component.py
There is also a full XGBoost pipeline: https://github.com/Ark-kun/pipeline_components/blob/4f19be6f26eaaf85ba251110d10d103b17e54a17/samples/Google_Cloud_Vertex_AI/Train_tabular_regression_model_using_XGBoost_and_import_to_Vertex_AI/pipeline.py