Tags: pyspark, scatter-plot

How to plot the R squared value from a pyspark DataFrame?


To recreate the issue:

import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import Window, DataFrame
from pyspark.sql.functions import col
from sklearn.metrics import r2_score
# from recommenders.utils.spark_utils import start_or_get_spark
# from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
import numpy as np
import os

COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"

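# Keyword arguments for SparkRatingEvaluation, mapping to the column names above.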
HEADER = {
    "col_user": COL_USER,
    "col_item": COL_ITEM,
    "col_rating": COL_RATING,
    "col_prediction": COL_PREDICTION,
}

class SparkRatingEvaluation:
    """Spark Rating Evaluator"""

    def __init__(
        self,
        rating_true,
        rating_pred,
        col_user = COL_USER,
        col_item = COL_ITEM,
        col_rating = COL_RATING,
        col_prediction = COL_PREDICTION,
    ):
        """Initializer.

        This is the Spark version of rating metrics evaluator.
        The methods of this class calculate rating metrics such as root mean squared error, mean absolute error,
        R squared, and explained variance.

        Args:
            rating_true (pyspark.sql.DataFrame): True labels.
            rating_pred (pyspark.sql.DataFrame): Predicted labels.
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction

        # Check if inputs are Spark DataFrames.
        if not isinstance(self.rating_true, DataFrame):
            raise TypeError(
                "rating_true should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        if not isinstance(self.rating_pred, DataFrame):
            raise TypeError(
                "rating_pred should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        # Check if columns exist.
        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns

        if rating_true.count() == 0:
            raise ValueError("Empty input dataframe")
        if rating_pred.count() == 0:
            raise ValueError("Empty input dataframe")

        if self.col_user not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing User Col")
        if self.col_item not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Item Col")
        if self.col_rating not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Rating Col")

        if self.col_user not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing User Col"
            )  # pragma : No Cover
        if self.col_item not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing Item Col"
            )  # pragma : No Cover
        if self.col_prediction not in pred_columns:
            raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")

        self.rating_true = self.rating_true.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_rating).cast("double").alias("label"),
        )
        self.rating_pred = self.rating_pred.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_prediction).cast("double").alias("prediction"),
        )

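        # Keep only (user, item) pairs present in both frames, then drop the
        # key columns so that only the label/prediction columns remain.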
        self.y_pred_true = (
            self.rating_true.join(
                self.rating_pred, [self.col_user, self.col_item], "inner"
            )
            .drop(self.col_user)
            .drop(self.col_item)
        )

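        # pyspark.mllib's RegressionMetrics expects an RDD of (prediction, observation) pairs.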
        self.metrics = RegressionMetrics(
            self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label))
        )

    def rmse(self):
        """Calculate Root Mean Squared Error.

        Returns:
            float: Root mean squared error.
        """
        return self.metrics.rootMeanSquaredError

    def mae(self):
        """Calculate Mean Absolute Error.

        Returns:
            float: Mean Absolute Error.
        """
        return self.metrics.meanAbsoluteError

    def rsquared(self):
        """Calculate R squared.

        Returns:
            float: R squared.
        """
        return self.metrics.r2

    def exp_var(self):
        """Calculate explained variance.

        Note:
           Spark MLLib's implementation is buggy (can lead to values > 1), hence we use var().

        Returns:
            float: Explained variance (min=0, max=1).
        """
        # Explained variance = 1 - Var(label - prediction) / Var(label).
        var1 = self.y_pred_true.selectExpr("variance(label-prediction)").collect()[0][0]
        var2 = self.y_pred_true.selectExpr("variance(label)").collect()[0][0]

        if var1 is None or var2 is None:
            return -np.inf
        else:
            # numpy divide is more tolerant to var2 being zero
            return 1 - np.divide(var1, var2)

def start_or_get_spark(
    app_name="Sample",
    url="local[*]",
    memory="10g",
    config=None,
    packages=None,
    jars=None,
    repositories=None,
):
    """Start Spark if not started

    Args:
        app_name (str): set name of the application
        url (str): URL for spark master
        memory (str): size of memory for spark driver. This will be ignored if spark.driver.memory is set in config.
        config (dict): dictionary of configuration options
        packages (list): list of packages to install
        jars (list): list of jar files to add
        repositories (list): list of maven repositories

    Returns:
        object: Spark context.
    """

    submit_args = ""
    if packages is not None:
        submit_args = "--packages {} ".format(",".join(packages))
    if jars is not None:
        submit_args += "--jars {} ".format(",".join(jars))
    if repositories is not None:
        submit_args += "--repositories {}".format(",".join(repositories))
    if submit_args:
        os.environ["PYSPARK_SUBMIT_ARGS"] = "{} pyspark-shell".format(submit_args)

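    # Assemble the SparkSession builder chain as strings; the joined
    # expression is eval-ed at the end to create the session.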
    spark_opts = [
        'SparkSession.builder.appName("{}")'.format(app_name),
        'master("{}")'.format(url),
    ]

    if config is not None:
        for key, raw_value in config.items():
            value = (
                '"{}"'.format(raw_value) if isinstance(raw_value, str) else raw_value
            )
            spark_opts.append('config("{key}", {value})'.format(key=key, value=value))

    if config is None or "spark.driver.memory" not in config:
        spark_opts.append('config("spark.driver.memory", "{}")'.format(memory))

    # Set larger stack size
    spark_opts.append('config("spark.executor.extraJavaOptions", "-Xss4m")')
    spark_opts.append('config("spark.driver.extraJavaOptions", "-Xss4m")')

    spark_opts.append("getOrCreate()")
    return eval(".".join(spark_opts))

I have the actual and predicted data frames below, which are then used to calculate the R squared value:

x_true = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [9, 10, 11, 12]})
x_pred = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [13, 14, 15, 16]})
spark = start_or_get_spark("EvaluationTesting", "local")
X_true = spark.createDataFrame(x_true)
X_pred = spark.createDataFrame(x_pred)
sre = SparkRatingEvaluation(X_true, X_pred, **HEADER)
r2 = sre.rsquared()
print(r2)

plt.annotate("r-squared = {:.3f}".format(r2_score(X_true, x_pred)), (0, 1))
plt.show()

I want to visualize the result in a scatter plot. I have tried annotating, but it is not working; it shows: InvalidParameterError: The 'y_true' parameter of r2_score must be an array-like. Got DataFrame[UserId: bigint, MovieId: bigint, Rating: bigint] instead.


Solution

  • In your code you passed X_true and x_pred: the first, X_true, is a Spark DataFrame, while x_pred is a pandas DataFrame. I think you wanted to use the Spark DataFrames for both (the answer is the same if both are pandas DataFrames). The other mistake was passing the whole DataFrames, whereas r2_score only takes list/array-like inputs.

    Select just the Rating columns:

    X_true_Rating = [row[COL_RATING] for row in X_true.select(COL_RATING).collect()]
    X_pred_Rating = [row[COL_RATING] for row in X_pred.select(COL_RATING).collect()]
    

    It would also have helped to create the scatter plot first:

    plt.scatter(X_true_Rating, X_pred_Rating)
    

    Then, your annotation was placed at (0, 1), but your x-axis starts at 9 and your y-axis at 13, so the annotation fell outside the visible data range. Anchor it to the data instead (see also the fuller sketch below):

    plt.annotate("r-squared = {:.3f}".format(r2_score(X_true_Rating, X_pred_Rating)), (min(X_true_Rating), max(X_pred_Rating)))
    plt.show()
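
As a side note, collecting the two Rating columns separately only lines up rows here because the toy DataFrames happen to share the same row order. On real data it is safer to join true and predicted ratings on the user/item keys first, which is what SparkRatingEvaluation does internally. Below is a minimal sketch under that assumption, reusing X_true, X_pred, and the constants from the question, and assuming the joined result fits in driver memory:

    # Rename the prediction column to avoid a name clash on "Rating" after the join.
    pred_renamed = X_pred.withColumnRenamed(COL_RATING, "prediction")
    # Join on (user, item) so true and predicted ratings line up row by row.
    joined = X_true.join(pred_renamed, [COL_USER, COL_ITEM], "inner").toPandas()

    y_true = joined[COL_RATING].to_numpy()
    y_pred = joined["prediction"].to_numpy()

    plt.scatter(y_true, y_pred)
    plt.xlabel("true rating")
    plt.ylabel("predicted rating")
    # Anchor the annotation at a corner of the data range so it stays visible.
    plt.annotate("r-squared = {:.3f}".format(r2_score(y_true, y_pred)),
                 (y_true.min(), y_pred.max()))
    plt.show()

Note that toPandas() pulls everything to the driver, so for large data you would sample or aggregate before plotting.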