Tags: python, visual-studio-code, split, azure-functions, azure-durable-functions

How to check the execution time and amount of data for each line of code in Python


I am trying to optimally split Python code on FaaS to improve response time.

To split the code at the optimal location, I need the execution time of each line and the size of the data on which each line depends. Is there an appropriate way to obtain these two?

The environment I am using is: (the environment list appears to be missing from this excerpt)

By the way, it may not be necessary to split them, but I am going to try to split the activity function in the code below into multiple functions.

import azure.functions as func
import azure.durable_functions as df
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing  # Dataset
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import Lasso  
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error  # MSE(Mean Squared Error)
from sklearn.preprocessing import StandardScaler 

# Durable Functions app; anonymous auth level means the HTTP starter needs no function key.
app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
### client function ###
@app.route(route="orchestrators/client_function")
@app.durable_client_input(client_name="client")
async def client_function(req: func.HttpRequest, client: df.DurableOrchestrationClient) -> func.HttpResponse:
    """HTTP starter: launch the orchestration and wait for it to finish.

    Fix: the original awaited wait_for_completion_or_create_check_status_response
    but discarded its result and then returned a bare check-status response, so
    the caller never saw the orchestration output despite having waited for it.
    """
    instance_id = await client.start_new("orchestrator", None, {})
    # Returns the orchestration output if it completes within the default
    # timeout, otherwise the standard 202 check-status response.
    return await client.wait_for_completion_or_create_check_status_response(req, instance_id)

### orchestrator function ###
@app.orchestration_trigger(context_name="context")
def orchestrator(context: df.DurableOrchestrationContext) -> str:
    """Orchestrator: run the single analysis activity, then report completion."""
    # The activity's return value is not used here; we only wait for it.
    yield context.call_activity("origin_analysis", '')
    return "finished"


### activity function ###
@app.blob_output(arg_name="outputblob", path="newblob/test.txt", connection="BlobStorageConnection")
@app.activity_trigger(input_name="blank")
def origin_analysis(blank: str, outputblob: func.Out[str]):
    """Run simple/multiple/Ridge/Lasso regressions on the California housing data.

    Parameters
    ----------
    blank : str
        Unused activity input (the orchestrator passes an empty string).
    outputblob : func.Out[str]
        Blob output binding; receives the final comparison table as text.

    Returns
    -------
    str
        String rendering of a DataFrame comparing train/test MSE and R^2 for
        plain multiple regression, Ridge, and Lasso.

    Fixes vs. the original: the duplicated MedInc outlier filter is applied
    once, the no-op ``y_pred[:10]`` is removed, the test matrix is scaled once
    and reused for Lasso, and the declared blob output binding is now written.
    """
    # prepare data
    california_housing = fetch_california_housing()

    exp_data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
    tar_data = pd.DataFrame(california_housing.target, columns=['HousingPrices'])
    data = pd.concat([exp_data, tar_data], axis=1)

    # Delete anomalous values
    data = data[data['HouseAge'] != 52]
    data = data[data['HousingPrices'] != 5.00001]

    # Create useful variables
    data['Household'] = data['Population'] / data['AveOccup']
    data['AllRooms'] = data['AveRooms'] * data['Household']
    data['AllBedrms'] = data['AveBedrms'] * data['Household']

    ### simple regression analysis ###
    exp_var = 'MedInc'
    tar_var = 'HousingPrices'

    # Remove outliers above the 95th percentile.
    # (The original applied this identical filter twice; the second pass was a no-op.)
    q_95 = data['MedInc'].quantile(0.95)
    data = data[data['MedInc'] < q_95]

    # Split data into explanatory and objective variables
    X = data[[exp_var]]
    y = data[[tar_var]]

    # learn
    model = LinearRegression()
    model.fit(X, y)

    ### multiple regression analysis ###
    exp_vars = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
    tar_var = 'HousingPrices'

    # Remove outliers per explanatory variable
    for exp_var in exp_vars:
        q_95 = data[exp_var].quantile(0.95)
        data = data[data[exp_var] < q_95]

    # Split data into explanatory and objective variables
    X = data[exp_vars]
    y = data[[tar_var]]

    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Standardize using statistics learned from the training split only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=exp_vars)

    # learn
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predicted values on the (scaled) training data
    y_pred = model.predict(X_train_scaled)

    # MSE for test data (test data scaled with the training-data scaler)
    X_test_scaled = scaler.transform(X_test)
    y_test_pred = model.predict(X_test_scaled)
    mse_test = mean_squared_error(y_test, y_test_pred)

    # Ridge regression
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train_scaled, y_train)
    ridge_y_pred = ridge.predict(X_train_scaled)

    # Checking partial regression coefficients
    ridge_w = pd.DataFrame(ridge.coef_.T, index=exp_vars, columns=['Ridge'])
    for xi, wi in zip(exp_vars, ridge.coef_[0]):
        print('{0:7s}: {1:6.3f}'.format(xi, wi))

    # Mean Squared Error (MSE) for training data
    mse_train = mean_squared_error(y_train, y_pred)
    ridge_mse_train = mean_squared_error(y_train, ridge_y_pred)

    # MSE for test data
    ridge_y_test_pred = ridge.predict(X_test_scaled)
    ridge_mse_test = mean_squared_error(y_test, ridge_y_test_pred)

    # Lasso regression
    lasso = Lasso(alpha=1.0)
    lasso.fit(X_train_scaled, y_train)
    lasso_y_pred = lasso.predict(X_train_scaled)

    # Checking partial regression coefficients
    lasso_w = pd.Series(index=exp_vars, data=lasso.coef_)

    lasso_mse_train = mean_squared_error(y_train, lasso_y_pred)

    # Reuse the already-scaled test matrix (the original re-ran scaler.transform)
    lasso_y_pred_test = lasso.predict(X_test_scaled)
    lasso_mse_test = mean_squared_error(y_test, lasso_y_pred_test)

    # Comparison of the accuracy with and without regularization
    summary = {'Training data MSE': [mse_train, ridge_mse_train, lasso_mse_train],
               'Test Data MSE': [mse_test, ridge_mse_test, lasso_mse_test],
               'coefficient of determination': [model.score(X_test_scaled, y_test),
                                                ridge.score(X_test_scaled, y_test),
                                                lasso.score(X_test_scaled, y_test)]}
    df_mse = pd.DataFrame(data=summary, index=['multiple regression', 'Ridge regression', 'Lasso regression'])

    # Fix: the blob output binding was declared but never written to.
    outputblob.set(str(df_mse))
    return str(df_mse)

Solution

  • You can use the `time` module to check the execution time of code blocks in your code. I have imported the `time` module and made the following changes to your code:

    My function_app.py

    import azure.functions as func
    import azure.durable_functions as df
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.datasets import fetch_california_housing  # Dataset
    from sklearn.model_selection import train_test_split 
    from sklearn.linear_model import Lasso  
    from sklearn.linear_model import Ridge 
    from sklearn.metrics import mean_squared_error  # MSE(Mean Squared Error)
    from sklearn.preprocessing import StandardScaler 
    import sys
    import cProfile
    import time
    
    # Durable Functions app; anonymous auth level means the HTTP starter needs no function key.
    app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
    ### client function ###
    
    @app.route(route="orchestrators/client_function")
    @app.durable_client_input(client_name="client")
    async def client_function(req: func.HttpRequest, client: df.DurableOrchestrationClient) -> func.HttpResponse:
        """HTTP starter: launch the orchestration and wait for it to finish.

        Fix: the original awaited wait_for_completion_or_create_check_status_response
        but discarded its result and then returned a bare check-status response, so
        the caller never saw the orchestration output despite having waited for it.
        """
        instance_id = await client.start_new("orchestrator", None, {})
        # Returns the orchestration output if it completes within the default
        # timeout, otherwise the standard 202 check-status response.
        return await client.wait_for_completion_or_create_check_status_response(req, instance_id)
       
    ### orchestrator function ###
    @app.orchestration_trigger(context_name="context")
    def orchestrator(context: df.DurableOrchestrationContext) -> str:
        """Orchestrator: run the single analysis activity, then report completion."""
        # The activity's return value is not used here; we only wait for it.
        yield context.call_activity("origin_analysis", '')
        return "finished"
    
    
    ### activity function ###
    @app.blob_output(arg_name="outputblob", path="newblob/test.txt", connection="BlobStorageConnection")
    @app.activity_trigger(input_name="blank")
    def origin_analysis(blank: str, outputblob: func.Out[str]):
        """Activity: run the regression pipeline and print its total wall-clock time.

        NOTE(review): time.time() brackets the whole function, so this yields one
        total, not the per-line timings the question asks for; a line profiler
        would be needed for that.
        NOTE(review): the 'outputblob' binding is declared but never written to.
        """
        start_time = time.time()
        # prepare data
        california_housing = fetch_california_housing()

        exp_data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
        tar_data = pd.DataFrame(california_housing.target, columns=['HousingPrices'])
        data = pd.concat([exp_data, tar_data], axis=1)

        # Delete anomalous values
        data = data[data['HouseAge'] != 52]
        data = data[data['HousingPrices'] != 5.00001]

        # Create useful variables
        data['Household'] = data['Population']/data['AveOccup']
        data['AllRooms'] = data['AveRooms']*data['Household']
        data['AllBedrms'] = data['AveBedrms']*data['Household']


        ### simple regression analysis ###
        exp_var = 'MedInc'
        tar_var = 'HousingPrices'

        # Remove outliers
        q_95 = data['MedInc'].quantile(0.95)

        data = data[data['MedInc'] < q_95]

        # NOTE(review): identical filter applied twice; this second pass is a no-op.
        data = data[data['MedInc'] < q_95]

        # Split data into explanatory and objective variables
        X = data[[exp_var]]
        y = data[[tar_var]]

        # learn
        model = LinearRegression()
        model.fit(X, y)

        ### multiple regression analysis ###
        exp_vars = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
        tar_var = 'HousingPrices'

        # Remove outliers
        for exp_var in exp_vars:
            q_95 = data[exp_var].quantile(0.95)
            data = data[data[exp_var] < q_95]

        # Split data into explanatory and objective variables
        X = data[exp_vars]
        y = data[[tar_var]]

        # Split into training and test data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        #  Standardize X_train
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_train_scaled = pd.DataFrame(X_train_scaled, columns = exp_vars)

        # learn
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)

        # Calculate predicted values
        y_pred = model.predict(X_train_scaled)
        y_pred[:10]  # NOTE(review): no-op; the slice is computed and discarded

        # MSE for test data
        X_test_scaled = scaler.transform(X_test) # Test data standardized by mean and standard deviation obtained from training data
        y_test_pred = model.predict(X_test_scaled) # Predicting against test data
        mse_test = mean_squared_error(y_test, y_test_pred)

        # Ridge regression
        ridge = Ridge(alpha=1.0)
        ridge.fit(X_train_scaled, y_train)
        ridge_y_pred = ridge.predict(X_train_scaled)

        # Checking Partial Regression Coefficients
        ridge_w = pd.DataFrame(ridge.coef_.T, index=exp_vars, columns=['Ridge'])
        for xi, wi in zip(exp_vars, ridge.coef_[0]):
            print('{0:7s}: {1:6.3f}'.format(xi, wi))

        # Mean Squared Error (MSE) for training data
        mse_train = mean_squared_error(y_train, y_pred)

        # Mean Squared Error (MSE) for training data
        ridge_mse_train = mean_squared_error(y_train, ridge_y_pred)

        # MSE for test data
        ridge_y_test_pred = ridge.predict(X_test_scaled) # Predicting against test data
        ridge_mse_test = mean_squared_error(y_test, ridge_y_test_pred)

        # Lasso regression
        lasso = Lasso(alpha=1.0)
        lasso.fit(X_train_scaled, y_train)
        lasso_y_pred = lasso.predict(X_train_scaled)

        # Checking Partial Regression Coefficients
        lasso_w = pd.Series(index=exp_vars, data=lasso.coef_)

        lasso_mse_train = mean_squared_error(y_train, lasso_y_pred)

        # NOTE(review): recomputes what X_test_scaled already holds.
        lasso_X_test_scaled = scaler.transform(X_test)
        lasso_y_pred_test = lasso.predict(lasso_X_test_scaled)
        lasso_mse_test = mean_squared_error(y_test, lasso_y_pred_test)

        # Comparison of the accuracy of multiple regression analysis with and without regularization
        data = {'Training data MSE':[mse_train, ridge_mse_train, lasso_mse_train],
                'Test Data MSE':[mse_test, ridge_mse_test, lasso_mse_test],
                'coefficient of determination':[model.score(X_test_scaled, y_test), ridge.score(X_test_scaled, y_test), lasso.score(X_test_scaled, y_test)]}
        df_mse = pd.DataFrame(data=data, index=['multiple regression', 'Ridge regression', 'Lasso regression'])

        end_time = time.time()
        execution_time = end_time - start_time
        # Measure the size of 'data' here
        # NOTE(review): 'data' is a plain dict at this point, and sys.getsizeof is
        # shallow — it does not count the lists/floats inside, so this number is
        # misleading as a data-size measurement.
        data_size = sys.getsizeof(data)
        print(f"Execution time: {execution_time} seconds")
        print(f"Size of 'data': {data_size} bytes")
        return str(df_mse)
        # test = cProfile.run(origin_analysis)
        # print(test)
    

    Output:-

    (Screenshots of the console output — execution time and data size — were attached here.)

    You can also make use of `cProfile` to get the execution time of each code block. Here's the function_app.py code utilizing `cProfile`:

    function_app.py:-

    import azure.functions as func
    import azure.durable_functions as df
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.datasets import fetch_california_housing  # Dataset
    from sklearn.model_selection import train_test_split 
    from sklearn.linear_model import Lasso  
    from sklearn.linear_model import Ridge 
    from sklearn.metrics import mean_squared_error  # MSE(Mean Squared Error)
    from sklearn.preprocessing import StandardScaler 
    import sys
    import cProfile
    import time
    
    # Durable Functions app; anonymous auth level means the HTTP starter needs no function key.
    app = df.DFApp(http_auth_level=func.AuthLevel.ANONYMOUS)
    ### client function ###
    
    @app.route(route="orchestrators/client_function")
    @app.durable_client_input(client_name="client")
    async def client_function(req: func.HttpRequest, client: df.DurableOrchestrationClient) -> func.HttpResponse:
        """HTTP starter: launch the orchestration and wait for it to finish.

        Fix: the original awaited wait_for_completion_or_create_check_status_response
        but discarded its result and then returned a bare check-status response, so
        the caller never saw the orchestration output despite having waited for it.
        """
        instance_id = await client.start_new("orchestrator", None, {})
        # Returns the orchestration output if it completes within the default
        # timeout, otherwise the standard 202 check-status response.
        return await client.wait_for_completion_or_create_check_status_response(req, instance_id)
       
    ### orchestrator function ###
    @app.orchestration_trigger(context_name="context")
    def orchestrator(context: df.DurableOrchestrationContext) -> str:
        """Orchestrator: run the single analysis activity, then report completion."""
        # The activity's return value is not used here; we only wait for it.
        yield context.call_activity("origin_analysis", '')
        return "finished"
    
    
    ### activity function ###
    @app.blob_output(arg_name="outputblob", path="newblob/test.txt", connection="BlobStorageConnection")
    @app.activity_trigger(input_name="blank")
    def origin_analysis(blank: str, outputblob: func.Out[str]):
        """Activity: run the regression pipeline under cProfile and report timings.

        Fixes vs. the earlier listing:
        * the profiler now wraps the whole pipeline — it was previously enabled
          only around the final summary-DataFrame construction, so the printed
          stats covered almost none of the work;
        * the duplicated MedInc outlier filter is applied once;
        * the already-scaled test matrix is reused for Lasso;
        * the result is written to the declared blob output binding.
        """
        start_time = time.time()
        profiler = cProfile.Profile()
        profiler.enable()  # profile the entire pipeline, not just the summary step

        # prepare data
        california_housing = fetch_california_housing()

        exp_data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
        tar_data = pd.DataFrame(california_housing.target, columns=['HousingPrices'])
        data = pd.concat([exp_data, tar_data], axis=1)

        # Delete anomalous values
        data = data[data['HouseAge'] != 52]
        data = data[data['HousingPrices'] != 5.00001]

        # Create useful variables
        data['Household'] = data['Population'] / data['AveOccup']
        data['AllRooms'] = data['AveRooms'] * data['Household']
        data['AllBedrms'] = data['AveBedrms'] * data['Household']

        ### simple regression analysis ###
        exp_var = 'MedInc'
        tar_var = 'HousingPrices'

        # Remove outliers above the 95th percentile (applied once; the original
        # ran this identical filter twice, the second pass being a no-op).
        q_95 = data['MedInc'].quantile(0.95)
        data = data[data['MedInc'] < q_95]

        # Split data into explanatory and objective variables
        X = data[[exp_var]]
        y = data[[tar_var]]

        # learn
        model = LinearRegression()
        model.fit(X, y)

        ### multiple regression analysis ###
        exp_vars = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
        tar_var = 'HousingPrices'

        # Remove outliers per explanatory variable
        for exp_var in exp_vars:
            q_95 = data[exp_var].quantile(0.95)
            data = data[data[exp_var] < q_95]

        # Split data into explanatory and objective variables
        X = data[exp_vars]
        y = data[[tar_var]]

        # Split into training and test data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # Standardize using statistics learned from the training split only
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=exp_vars)

        # learn
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)

        # Predicted values on the (scaled) training data
        y_pred = model.predict(X_train_scaled)

        # MSE for test data (test data scaled with the training-data scaler)
        X_test_scaled = scaler.transform(X_test)
        y_test_pred = model.predict(X_test_scaled)
        mse_test = mean_squared_error(y_test, y_test_pred)

        # Ridge regression
        ridge = Ridge(alpha=1.0)
        ridge.fit(X_train_scaled, y_train)
        ridge_y_pred = ridge.predict(X_train_scaled)

        # Checking partial regression coefficients
        ridge_w = pd.DataFrame(ridge.coef_.T, index=exp_vars, columns=['Ridge'])
        for xi, wi in zip(exp_vars, ridge.coef_[0]):
            print('{0:7s}: {1:6.3f}'.format(xi, wi))

        # Mean Squared Error (MSE) for training data
        mse_train = mean_squared_error(y_train, y_pred)
        ridge_mse_train = mean_squared_error(y_train, ridge_y_pred)

        # MSE for test data
        ridge_y_test_pred = ridge.predict(X_test_scaled)
        ridge_mse_test = mean_squared_error(y_test, ridge_y_test_pred)

        # Lasso regression
        lasso = Lasso(alpha=1.0)
        lasso.fit(X_train_scaled, y_train)
        lasso_y_pred = lasso.predict(X_train_scaled)

        # Checking partial regression coefficients
        lasso_w = pd.Series(index=exp_vars, data=lasso.coef_)

        lasso_mse_train = mean_squared_error(y_train, lasso_y_pred)

        # Reuse the already-scaled test matrix (the original re-ran scaler.transform)
        lasso_y_pred_test = lasso.predict(X_test_scaled)
        lasso_mse_test = mean_squared_error(y_test, lasso_y_pred_test)

        # Comparison of the accuracy with and without regularization
        data = {'Training data MSE':[mse_train, ridge_mse_train, lasso_mse_train],
                'Test Data MSE':[mse_test, ridge_mse_test, lasso_mse_test],
                'coefficient of determination':[model.score(X_test_scaled, y_test), ridge.score(X_test_scaled, y_test), lasso.score(X_test_scaled, y_test)]}
        df_mse = pd.DataFrame(data=data, index=['multiple regression', 'Ridge regression', 'Lasso regression'])

        profiler.disable()
        profiler.print_stats(sort='cumtime')  # per-call timings, most expensive first

        end_time = time.time()
        execution_time = end_time - start_time
        # NOTE(review): sys.getsizeof is shallow — it does not count the lists and
        # floats inside this dict; use df_mse.memory_usage(deep=True).sum() for a
        # real size of the summary table.
        data_size = sys.getsizeof(data)
        print(f"Execution time: {execution_time} seconds")
        print(f"Size of 'data': {data_size} bytes")
        # Fix: the blob output binding was declared but never written to.
        outputblob.set(str(df_mse))
        return str(df_mse)
    

    Output:-

    (Screenshots of the cProfile statistics output were attached here.)