Tags: python, google-cloud-platform, google-cloud-ml, google-cloud-scheduler, google-cloud-vertex-ai

Jobs-Cloud Scheduler (Google Cloud) fails to run scheduled pipelines


I'm facing a problem with scheduled jobs in Google Cloud. In Vertex AI Workbench, I created a Python 3 notebook that builds a pipeline which trains an AutoML model on the public credit card dataset. If I run the job right after creating it, everything works. However, if I schedule the run via Cloud Scheduler as described in the documentation, the pipeline schedule is created but the scheduled run fails.

Here is the code that I have:

import os
# import sys
import google.cloud.aiplatform as aip
import kfp
# from kfp.v2.dsl import component
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
# from kfp.v2.google.client import AIPlatformClient

# GCP project and region that host all Vertex AI resources below.
PROJECT_ID = "fraud-detection-project-329506"
REGION = "us-central1"

# Point the Google auth libraries at a service-account key file.
# NOTE(review): this Windows-local key path only exists on this machine —
# a scheduled/server-side run will not find it; confirm how credentials
# are provided in the scheduled environment.
credential_path = r"C:\Users\...\fraud-detection-project-329506-4d16889a494a.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

# GCS staging bucket and the service account used for the pipeline.
BUCKET_NAME = "gs://..."
SERVICE_ACCOUNT = "...@fraud-detection-project-329506.iam.gserviceaccount.com"

# Regional Vertex AI API endpoint and the GCS root for pipeline artifacts.
API_ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)
PIPELINE_ROOT = "{}/dataset".format(BUCKET_NAME)

# Initialise the Vertex AI SDK for this project/staging bucket (side effect).
aip.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)

# file names
TRAIN_FILE_NAME = "creditcard_train.csv"
TEST_FILE_NAME = "creditcard_test.csv"

# path for train and test dataset 
gcs_csv_path_train = f"{PIPELINE_ROOT}/{TRAIN_FILE_NAME}"
gcs_csv_path_test = f"{PIPELINE_ROOT}/{TEST_FILE_NAME}"

#gcs location where the output is to be written to
gcs_destination_prefix = "{}/output".format(BUCKET_NAME)

@kfp.dsl.pipeline(name="automl-tab-training-v2")
def pipeline(project: str = PROJECT_ID):
    """Dataset creation -> AutoML tabular training -> batch prediction.

    Args:
        project: GCP project in which the pipeline components run.
    """
    # Create a Vertex AI tabular dataset from the training CSV in GCS.
    dataset_create_op = gcc_aip.TabularDatasetCreateOp(
        project=project, display_name="creditcard", gcs_source=gcs_csv_path_train
    )

    # Every feature column is numeric: "Time", "V1".."V28", "Amount".
    # Generating the list avoids 30 hand-written near-identical dict literals.
    feature_columns = ["Time", *(f"V{i}" for i in range(1, 29)), "Amount"]

    # Train an AutoML tabular classification model on the dataset.
    training_op = gcc_aip.AutoMLTabularTrainingJobRunOp(
        project=project,
        display_name="train-automl-fraud-detection",
        optimization_prediction_type="classification",
        column_transformations=[
            {"numeric": {"column_name": name}} for name in feature_columns
        ],
        dataset=dataset_create_op.outputs["dataset"],
        target_column="Class",
        budget_milli_node_hours=1000,
    )

    # Run batch prediction on the held-out test CSV with the trained model.
    batchprediction_op = gcc_aip.ModelBatchPredictOp(
        model=training_op.outputs["model"],
        job_display_name='prediction1',
        gcs_source=gcs_csv_path_test,
        project=project,
        machine_type="n1-standard-2",
        gcs_destination_prefix=gcs_destination_prefix,
    )
    

# Output path for the compiled pipeline spec, plus scheduling metadata.
# NOTE(review): a Windows-local path only works when compiling on this machine.
COMPILED_PIPELINE_PATH = r"C:\Users\...\tabular_classification_pipeline.json"
SCHEDULE = "5 5 * * *"  # cron expression: every day at 05:05
DISPLAY_NAME = 'fraud_detection'

# compile pipeline
# Serialize the pipeline function above into a JSON pipeline spec on disk.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=COMPILED_PIPELINE_PATH,
)

# job run after its creation
# Submit the compiled spec as a Vertex AI PipelineJob; run() blocks until done.
job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=COMPILED_PIPELINE_PATH,
    pipeline_root=PIPELINE_ROOT,
)
job.run()

# api_client = AIPlatformClient(project_id=PROJECT_ID, region=REGION)

# schedule training/prediction every day at a certain hour
# api_client.create_schedule_from_job_spec(
#    job_spec_path=COMPILED_PIPELINE_PATH,
#    pipeline_root=PIPELINE_ROOT,
#    schedule=SCHEDULE,
# )

Looking at the error log, I found:

{
httpRequest: {
status: 404
}
insertId: "13yj575g2rylrz9"
jsonPayload: {
@type: "type.googleapis.com/google.cloud.scheduler.logging.AttemptFinished"
jobName: "projects/fraud-detection-project-329506/locations/us-central1/jobs/pipeline_pipeline_179e648c_0-11-a-a-a"
status: "NOT_FOUND"
targetType: "HTTP"
url: "https://us-central1-fraud-detection-project-329506.cloudfunctions.net/templated_http_request-v1"
}
logName: "projects/fraud-detection-project-329506/logs/cloudscheduler.googleapis.com%2Fexecutions"
receiveTimestamp: "2021-10-19T18:00:00.309225533Z"
resource: {
labels: {
job_id: "pipeline_pipeline_179e648c_0-11-a-a-a"
location: "us-central1"
project_id: "fraud-detection-project-329506"
}
type: "cloud_scheduler_job"
}
severity: "ERROR"
timestamp: "2021-10-19T18:00:00.309225533Z"
}

Does it mean that I have to create the URL before running the notebook? I have no idea how to go on. Thank you in advance.


Solution

  • From the error you shared, it appears that the Cloud Function backing the scheduled job could not be found (HTTP 404), so the scheduled run failed.

    status: "NOT_FOUND" 
    targetType: "HTTP" 
    url: "https://us-central1-fraud-detection-project-329506.cloudfunctions.net/templated_http_request-v1"
    

    A possible reason on the Cloud Functions side is that the Cloud Build API has never been used in your project, or has been disabled. Check whether it is enabled and try again. If you enabled this API only recently, wait a few minutes for the change to propagate through the systems, then retry.