kubeflow google-cloud-vertex-ai kubeflow-pipelines mlops

Kubeflow Pipeline Training Component Failing | Unknown return type: <class 'inspect._empty'>

I am running an ML pipeline and the training component/step (see code below) continues to fail with the following error: "RuntimeError: Unknown return type: <class 'inspect._empty'>. Must be one of str, int, float, a subclass of Artifact, or a NamedTuple collection of these types."

Any ideas on what might be causing the issue/error and how to resolve it?

Thank you!


@component(
    # This component trains an XGBoost regressor on a BigQuery view.
    # "scikit-learn" is the real PyPI distribution name; the "sklearn" alias
    # is deprecated and can no longer be installed.
    packages_to_install=[
        "google-cloud-bigquery",
        "xgboost",
        "pandas",
        "scikit-learn",
        "joblib",
        "pyarrow",
        "db_dtypes",
    ],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)
def build_xgb_xgboost(project: str,
                      bq_dataset: str,
                      test_view_name: str,
                      bq_location: str,
                      metrics: Output[Metrics],
                      model: Output[Model]):
    """Train an XGBoost regressor on a BigQuery view and log its metrics.

    The view ``{project}.{bq_dataset}.{test_view_name}`` is loaded into a
    DataFrame, categorical columns are encoded, features are standardised,
    and an ``XGBRegressor`` is fitted to predict ``overall_rating``.

    Args:
        project: Project used for the BigQuery client and the view URI.
        bq_dataset: Dataset part of the view URI.
        test_view_name: View (table) part of the view URI.
        bq_location: Accepted for interface compatibility; not used here.
        metrics: Output artifact that receives RMSE, MAE, framework name and
            dataset size via ``log_metric``.
        model: Output artifact; the fitted model is written with joblib to
            ``model.path + ".joblib"``.

    The function deliberately returns nothing: it declares no return
    annotation, so every result is delivered through the ``Output[...]``
    artifacts.
    """
    # Imports live inside the function because the function body is what gets
    # executed in the component's container.
    from google.cloud import bigquery
    import joblib
    import pandas as pd
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from xgboost import XGBRegressor

    def onehot_encode(frame, column):
        """Return a copy of ``frame`` with ``column`` replaced by dummy columns."""
        dummies = pd.get_dummies(frame[column], prefix=column)
        encoded = pd.concat([frame, dummies], axis=1)
        # The return must stay inside this helper; at the outer level it would
        # end the component early and hand back a DataFrame.
        return encoded.drop(column, axis=1)

    client = bigquery.Client(project=project)

    view_uri = f"{project}.{bq_dataset}.{test_view_name}"
    query = f"SELECT * FROM `{view_uri}`"
    raw_df = client.query(query).to_dataframe()

    # Drop the index column carried over from the source table.
    df = raw_df.drop(['int64_field_0'], axis=1)

    # Binary encoding
    df['preferred_foot'] = df['preferred_foot'].replace({'left': 0, 'right': 1})

    # One-hot encoding
    for column in ['attacking_work_rate', 'defensive_work_rate']:
        df = onehot_encode(df, column=column)

    # Split df into X and y
    y = df['overall_rating']
    X = df.drop('overall_rating', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Scale X; the scaler is fitted on the training split only so that no
    # information from the test split leaks into training.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    # Hyperparameters are numeric (not strings); "reg:squarederror" is the
    # current name of the deprecated "reg:linear" objective.
    bst = XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.1,
        reg_alpha=0.001,
    )

    # Fit the model
    bst.fit(X_train, y_train)

    # Evaluate on the held-out split; plain floats keep the metrics serialisable.
    y_pred = bst.predict(X_test)
    rmse = float(mean_squared_error(y_test, y_pred) ** 0.5)
    mae = float(mean_absolute_error(y_test, y_pred))

    metrics.log_metric("RMSE", rmse)
    metrics.log_metric("framework", "xgboost")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("MAE", mae)

    joblib.dump(bst, model.path + ".joblib")

Solution

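I think this might just be a bug in the version of the KFP v2 SDK code you're using. That said, this error is typically raised when the component function returns a value but declares no return type annotation, so also check that nothing in the function body returns a value unintentionally (for example, a mis-indented `return` that was meant to belong to a nested helper function).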

I mostly use the stable KFPv1 methods to avoid problems.


from kfp.components import InputPath, OutputPath, create_component_from_func


def train_xgboost_model(
    project: str, 
    bq_dataset: str, 
    test_view_name: str,
    bq_location: str,
    metrics_path: OutputPath(Metrics),
    model_path: OutputPath(Model),
):
    """Skeleton of a KFP v1-style training component.

    Outputs are delivered through files rather than return values: the
    metrics dict is serialised as JSON to ``metrics_path`` and the model is
    dumped to ``model_path``.

    NOTE(review): this is an abbreviated sketch. ``Metrics``, ``Model``,
    ``bst`` and ``dump`` are not defined anywhere in this snippet; presumably
    the elided training code creates ``bst`` and ``dump`` comes from joblib.
    Confirm before use.
    """
    import json
    from pathlib import Path

    # Placeholder: fill in the real metric names and values here.
    metrics = {
       ...
    }
    # Write the metrics as JSON text to the path supplied for this output.
    Path(metrics_path).write_text(json.dumps(metrics))

    # Persist the trained model to the path supplied for this output.
    dump(bst, model_path)

# Turn the Python function into a component and also write its spec to the
# given YAML file.
# "scikit-learn" is the real PyPI distribution name; the "sklearn" alias is
# deprecated and can no longer be installed.
train_xgboost_model_op = create_component_from_func(
    func=train_xgboost_model,
    packages_to_install=[
        "google-cloud-bigquery",
        "xgboost",
        "pandas",
        "scikit-learn",
        "joblib",
        "pyarrow",
        "db_dtypes",
    ],
    base_image="python:3.9",
    output_component_file="create_xgb_model_xgboost.yaml",
)

You can also find many examples of real-world components in this repo: https://github.com/Ark-kun/pipeline_components/tree/master/components

including an XGBoost trainer https://github.com/Ark-kun/pipeline_components/blob/d8c4cf5/components/XGBoost/Train/component.py

and a full XGBoost pipeline: https://github.com/Ark-kun/pipeline_components/blob/4f19be6f26eaaf85ba251110d10d103b17e54a17/samples/Google_Cloud_Vertex_AI/Train_tabular_regression_model_using_XGBoost_and_import_to_Vertex_AI/pipeline.py