What could be the point I am missing here? All datasets and training are on GCP, and my setup is basic.
All steps looked fine until the last one (4). I tried other pre-built PyTorch images recommended by Google, but the error persists. The error log is shown below:
---------------------------------------------------------------------------
_InactiveRpcError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/google/api_core/grpc_helpers.py in error_remapped_callable(*args, **kwargs)
65 try:
---> 66 return callable_(*args, **kwargs)
67 except grpc.RpcError as exc:
/opt/conda/lib/python3.7/site-packages/grpc/_channel.py in __call__(self, request, timeout, metadata, credentials, wait_for_ready, compression)
945 wait_for_ready, compression)
--> 946 return _end_unary_response_blocking(state, call, False, None)
947
/opt/conda/lib/python3.7/site-packages/grpc/_channel.py in _end_unary_response_blocking(state, call, with_call, deadline)
848 else:
--> 849 raise _InactiveRpcError(state)
850
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
status = StatusCode.INVALID_ARGUMENT
details = "Invalid image "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest" for deployment. Please use a Model with a valid image."
debug_error_string = "{"created":"@1652032269.328842405","description":"Error received from peer ipv4:142.250.148.95:443","file":"src/core/lib/surface/call.cc","file_line":903,"grpc_message":"Invalid image "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest" for deployment. Please use a Model with a valid image.","grpc_status":3}"
>
The above exception was the direct cause of the following exception:
InvalidArgument Traceback (most recent call last)
/tmp/ipykernel_2924/2180059764.py in <module>
5 machine_type = DEPLOY_COMPUTE,
6 min_replica_count = 1,
----> 7 max_replica_count = 1
8 )
/opt/conda/lib/python3.7/site-packages/google/cloud/aiplatform/models.py in deploy(self, model, deployed_model_display_name, traffic_percentage, traffic_split, machine_type, min_replica_count, max_replica_count, accelerator_type, accelerator_count, service_account, explanation_metadata, explanation_parameters, metadata, sync)
697 explanation_parameters=explanation_parameters,
698 metadata=metadata,
--> 699 sync=sync,
700 )
701
/opt/conda/lib/python3.7/site-packages/google/cloud/aiplatform/base.py in wrapper(*args, **kwargs)
728 if self:
729 VertexAiResourceNounWithFutureManager.wait(self)
--> 730 return method(*args, **kwargs)
731
732 # callbacks to call within the Future (in same Thread)
/opt/conda/lib/python3.7/site-packages/google/cloud/aiplatform/models.py in _deploy(self, model, deployed_model_display_name, traffic_percentage, traffic_split, machine_type, min_replica_count, max_replica_count, accelerator_type, accelerator_count, service_account, explanation_metadata, explanation_parameters, metadata, sync)
812 explanation_metadata=explanation_metadata,
813 explanation_parameters=explanation_parameters,
--> 814 metadata=metadata,
815 )
816
/opt/conda/lib/python3.7/site-packages/google/cloud/aiplatform/models.py in _deploy_call(cls, api_client, endpoint_resource_name, model_resource_name, endpoint_resource_traffic_split, deployed_model_display_name, traffic_percentage, traffic_split, machine_type, min_replica_count, max_replica_count, accelerator_type, accelerator_count, service_account, explanation_metadata, explanation_parameters, metadata)
979 deployed_model=deployed_model,
980 traffic_split=traffic_split,
--> 981 metadata=metadata,
982 )
983
/opt/conda/lib/python3.7/site-packages/google/cloud/aiplatform_v1/services/endpoint_service/client.py in deploy_model(self, request, endpoint, deployed_model, traffic_split, retry, timeout, metadata)
1155
1156 # Send the request.
-> 1157 response = rpc(request, retry=retry, timeout=timeout, metadata=metadata,)
1158
1159 # Wrap the response in an operation future.
/opt/conda/lib/python3.7/site-packages/google/api_core/gapic_v1/method.py in __call__(self, timeout, retry, *args, **kwargs)
152 kwargs["metadata"] = metadata
153
--> 154 return wrapped_func(*args, **kwargs)
155
156
/opt/conda/lib/python3.7/site-packages/google/api_core/grpc_helpers.py in error_remapped_callable(*args, **kwargs)
66 return callable_(*args, **kwargs)
67 except grpc.RpcError as exc:
---> 68 raise exceptions.from_grpc_error(exc) from exc
69
70 return error_remapped_callable
InvalidArgument: 400 Invalid image "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest" for deployment. Please use a Model with a valid image.
Below are the details of how I create the model and the endpoint, and deploy the model to the endpoint.
from google.cloud import aiplatform as aip

DEPLOY_COMPUTE = 'n1-standard-4'
DEPLOY_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest'

# Upload the trained model artifacts, pointing at the container image
model = aip.Model.upload(
    display_name = f'{NOTEBOOK}_{TIMESTAMP}',
    serving_container_image_uri = DEPLOY_IMAGE,
    artifact_uri = URI,
    labels = {'notebook': f'{NOTEBOOK}'}
)

# Create the endpoint that will serve predictions
endpoint = aip.Endpoint.create(
    display_name = f'{NOTEBOOK}_{TIMESTAMP}',
    labels = {'notebook': f'{NOTEBOOK}'}
)

# Deploy the model to the endpoint -- this is the step that fails
endpoint.deploy(
    model = model,
    deployed_model_display_name = f'{NOTEBOOK}_{TIMESTAMP}',
    traffic_percentage = 100,
    machine_type = DEPLOY_COMPUTE,
    min_replica_count = 1,
    max_replica_count = 1
)
You are using the us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest
container image for importing models. However, models trained in PyTorch
cannot use the pre-built containers when importing models because, as mentioned in this documentation,
you can use a pre-built container only if your model meets the following requirements:
- Trained in Python 3.7 or later
- Trained using TensorFlow, scikit-learn, or XGBoost
- Exported to meet the framework-specific requirements for one of the pre-built prediction containers
I suggest two workarounds for your use case:
You can create a custom prediction container image for your PyTorch-trained
model by referring to this documentation, as sketched below.
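Here is a minimal sketch of what the upload could look like with a custom container. The image URI, routes, and port are illustrative placeholders (the routes and port follow TorchServe defaults), not values from your project:

from google.cloud import aiplatform as aip

# Hypothetical custom TorchServe image you build and push to Artifact Registry
CUSTOM_IMAGE = 'us-central1-docker.pkg.dev/YOUR_PROJECT/YOUR_REPO/pytorch-serve:latest'

model = aip.Model.upload(
    display_name = f'{NOTEBOOK}_{TIMESTAMP}',
    serving_container_image_uri = CUSTOM_IMAGE,
    artifact_uri = URI,
    # These must match what your container actually serves; the values
    # below assume the TorchServe defaults for a model named "model".
    serving_container_predict_route = '/predictions/model',
    serving_container_health_route = '/ping',
    serving_container_ports = [8080],
    labels = {'notebook': f'{NOTEBOOK}'}
)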
or
Re-train your model with one of the frameworks listed above so that you can use a pre-built container.
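For example, a scikit-learn model could be uploaded with a pre-built prediction container roughly like this. Note that the image comes from the prediction path, not the training path; the exact tag depends on your framework version, so the one below is an assumption to verify against the pre-built containers list:

from google.cloud import aiplatform as aip

# Pre-built scikit-learn prediction container (check the docs for the
# tag that matches your scikit-learn version)
SKLEARN_IMAGE = 'us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest'

# artifact_uri must point to a Cloud Storage directory containing the
# exported model file (model.joblib or model.pkl for scikit-learn)
model = aip.Model.upload(
    display_name = f'{NOTEBOOK}_{TIMESTAMP}',
    serving_container_image_uri = SKLEARN_IMAGE,
    artifact_uri = URI,
    labels = {'notebook': f'{NOTEBOOK}'}
)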