google-cloud-platform, google-cloud-vertex-ai, llama, google-generativeai

Performing inference on Llama-2 from Vertex AI Model Garden


I deployed llama2-chat-13b from Model Garden. However, I am getting an error when trying to perform inference.

Configuration:

project="X";
endpoint_id="Y";
location="us-east1";
64 VCPUs, 57.6 GB RAM;
GPU= 4 T4;

I tried three approaches, but all of them are returning some kind of error:

Approach 1:

from typing import Dict, List, Union

from google.cloud import aiplatform
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value


def predict_custom_trained_model_sample(
    project: str,
    endpoint_id: str,
    instances: Union[Dict, List[Dict]],
    location: str = "us-east1",
    api_endpoint: str = "us-east1-aiplatform.googleapis.com",
):
    """Call ``predict`` on a Vertex AI endpoint and print what comes back.

    Args:
        project: Project component of the endpoint resource path.
        endpoint_id: Endpoint component of the endpoint resource path.
        instances: A single instance dict, or a list of instance dicts.
        location: Location component of the endpoint resource path.
        api_endpoint: Host handed to the client as its ``api_endpoint`` option.
            It is independent of ``location``; the caller must keep the two
            in the same region.

    Returns:
        None. The deployed model id and each prediction are printed.
    """
    # The AI Platform services require regional API endpoints, so the host is
    # passed explicitly. The client can be reused across requests.
    client = aiplatform.gapic.PredictionServiceClient(
        client_options={"api_endpoint": api_endpoint}
    )

    # Accept a bare dict as shorthand for a one-element batch.
    batch = instances if isinstance(instances, list) else [instances]

    # Each instance must conform to the deployed model's prediction input
    # schema; here it is converted to a protobuf Value.
    payload = []
    for raw_instance in batch:
        payload.append(json_format.ParseDict(raw_instance, Value()))

    # No request-level parameters are sent: an empty dict is converted.
    request_parameters = json_format.ParseDict({}, Value())

    endpoint_name = client.endpoint_path(
        project=project,
        location=location,
        endpoint=endpoint_id,
    )
    response = client.predict(
        endpoint=endpoint_name,
        instances=payload,
        parameters=request_parameters,
    )

    print("response")
    print(" deployed_model_id:", response.deployed_model_id)
    # NOTE(review): dict() assumes every prediction is mapping-like — confirm
    # against the deployed model's output schema.
    for prediction in response.predictions:
        print(" prediction:", dict(prediction))


# [END aiplatform_predict_custom_trained_model_sample]


# Approach 1 driver: one instance whose "prompt" field is a one-element list.
predict_custom_trained_model_sample(
    project="",
    endpoint_id="",
    instances=[{"prompt": ["hello"]}],
    location="us-east1",
)

I get the following error:

_InactiveRpcError                         Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/google/api_core/grpc_helpers.py in error_remapped_callable(*args, **kwargs)
     71         try:
---> 72             return callable_(*args, **kwargs)
     73         except grpc.RpcError as exc:

6 frames
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    status = StatusCode.INTERNAL
    details = "Internal Server Error"
    debug_error_string = "UNKNOWN:Error received from peer ipv4:172.217.15.234:443 {created_time:"2023-10-31T20:13:00.233826088+00:00", grpc_status:13, grpc_message:"Internal Server Error"}"
>

The above exception was the direct cause of the following exception:

InternalServerError                       Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/google/api_core/grpc_helpers.py in error_remapped_callable(*args, **kwargs)
     72             return callable_(*args, **kwargs)
     73         except grpc.RpcError as exc:
---> 74             raise exceptions.from_grpc_error(exc) from exc
     75 
     76     return error_remapped_callable

InternalServerError: 500 Internal Server Error

Approach 2:

import vertexai
from vertexai.language_models import TextGenerationModel
# Point the SDK at the project and region used for this attempt.
vertexai.init(project="X", location="us-east1")

# Generation settings.
# NOTE(review): nothing in the lines shown here reads `parameters`.
parameters = dict(
    candidate_count=1,
    max_output_tokens=1024,
    temperature=0.2,
    top_p=0.8,
    top_k=40,
)

# Look the model up by name through the text-generation model class.
model = TextGenerationModel.from_pretrained("Llama2-13B-chat-001")

I get the following error:

_InactiveRpcError                         Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/google/api_core/grpc_helpers.py in error_remapped_callable(*args, **kwargs)
     71         try:
---> 72             return callable_(*args, **kwargs)
     73         except grpc.RpcError as exc:

11 frames
_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
    status = StatusCode.INVALID_ARGUMENT
    details = "Invalid publisher model resource format."
    debug_error_string = "UNKNOWN:Error received from peer ipv4:172.217.0.74:443 {created_time:"2023-10-31T20:23:51.049594923+00:00", grpc_status:3, grpc_message:"Invalid publisher model resource format."}"
>

The above exception was the direct cause of the following exception:

InvalidArgument                           Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/google/api_core/grpc_helpers.py in error_remapped_callable(*args, **kwargs)
     72             return callable_(*args, **kwargs)
     73         except grpc.RpcError as exc:
---> 74             raise exceptions.from_grpc_error(exc) from exc
     75 
     76     return error_remapped_callable

InvalidArgument: 400 Invalid publisher model resource format.

Finally, I tried with langchain. Approach 3:

from langchain.llms.vertexai import VertexAIModelGarden, VertexAI

# Wrap the deployed endpoint in LangChain's Model Garden LLM class and send
# a single prompt by calling the object directly.
llm = VertexAIModelGarden(project="X", endpoint_id="Y", location="us-east1")
llm("hello")

I get the following error: TypeError: string indices must be integers


Solution

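  • Well, this is working for me. Compared with Approach 1 in the question, the prompt is passed as a plain string rather than a list, the API endpoint is derived from `location` instead of being hard-coded, and each prediction is printed as-is instead of being wrapped in `dict()`.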

    from typing import Dict, List, Union
    
    from google.cloud import aiplatform
    from google.protobuf import json_format
    from google.protobuf.struct_pb2 import Value
    
    
    def predict_custom_trained_model_sample(
        project: str,
        endpoint_id: str,
        instances: Union[Dict, List[Dict]],
        location: str = "us-central1",
    ):
        """Call ``predict`` on a Vertex AI endpoint and print what comes back.

        Args:
            project: Project component of the endpoint resource path.
            endpoint_id: Endpoint component of the endpoint resource path.
            instances: A single instance dict, or a list of instance dicts.
            location: Region; used both for the endpoint resource path and to
                build the regional API host name.

        Returns:
            None. The deployed model id and each prediction are printed.
        """
        # The AI Platform services require regional API endpoints, so the host
        # is built from the requested location. The client can be reused
        # across requests.
        regional_host = f"{location}-aiplatform.googleapis.com"
        client = aiplatform.gapic.PredictionServiceClient(
            client_options={"api_endpoint": regional_host}
        )

        # Accept a bare dict as shorthand for a one-element batch.
        batch = instances if isinstance(instances, list) else [instances]

        # Each instance must conform to the deployed model's prediction input
        # schema; here it is converted to a protobuf Value.
        payload = []
        for raw_instance in batch:
            payload.append(json_format.ParseDict(raw_instance, Value()))

        # No request-level parameters are sent: an empty dict is converted.
        request_parameters = json_format.ParseDict({}, Value())

        endpoint_name = client.endpoint_path(
            project=project,
            location=location,
            endpoint=endpoint_id,
        )
        response = client.predict(
            endpoint=endpoint_name,
            instances=payload,
            parameters=request_parameters,
        )

        print("Response")
        print("Deployed Model ID:", response.deployed_model_id)
        # Each prediction is printed as returned, without further conversion.
        for prediction in response.predictions:
            print("prediction:", prediction)
    
    # Driver: one instance carrying the prompt as a plain string, with
    # "temperature" supplied inside the instance itself.
    query = "Who is Albert Einstein?"
    predict_custom_trained_model_sample(
        project="X",
        endpoint_id="Y",
        instances=[{"prompt": query, "temperature": 0.0}],
        location="us-east1",
    )