To recreate the issue:
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import Window, DataFrame
from pyspark.sql.functions import col
from sklearn.metrics import r2_score
# from recommenders.utils.spark_utils import start_or_get_spark
# from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
import numpy as np
import os
COL_USER = "UserId"
COL_ITEM = "MovieId"
COL_RATING = "Rating"
COL_PREDICTION = "Rating"
HEADER = {
    "col_user": COL_USER,
    "col_item": COL_ITEM,
    "col_rating": COL_RATING,
    "col_prediction": COL_PREDICTION,
}

class SparkRatingEvaluation:
    """Spark Rating Evaluator"""

    def __init__(
        self,
        rating_true,
        rating_pred,
        col_user=COL_USER,
        col_item=COL_ITEM,
        col_rating=COL_RATING,
        col_prediction=COL_PREDICTION,
    ):
        """Initializer.
        This is the Spark version of rating metrics evaluator.
        The methods of this class calculate rating metrics such as root mean squared error, mean absolute error,
        R squared, and explained variance.
        Args:
            rating_true (pyspark.sql.DataFrame): True labels.
            rating_pred (pyspark.sql.DataFrame): Predicted labels.
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction

        # Check if inputs are Spark DataFrames.
        if not isinstance(self.rating_true, DataFrame):
            raise TypeError(
                "rating_true should be but is not a Spark DataFrame"
            )  # pragma : No Cover
        if not isinstance(self.rating_pred, DataFrame):
            raise TypeError(
                "rating_pred should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        # Check if columns exist.
        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns

        if rating_true.count() == 0:
            raise ValueError("Empty input dataframe")
        if rating_pred.count() == 0:
            raise ValueError("Empty input dataframe")

        if self.col_user not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing User Col")
        if self.col_item not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Item Col")
        if self.col_rating not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Rating Col")

        if self.col_user not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing User Col"
            )  # pragma : No Cover
        if self.col_item not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing Item Col"
            )  # pragma : No Cover
        if self.col_prediction not in pred_columns:
            raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")

        self.rating_true = self.rating_true.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_rating).cast("double").alias("label"),
        )
        self.rating_pred = self.rating_pred.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_prediction).cast("double").alias("prediction"),
        )

        self.y_pred_true = (
            self.rating_true.join(
                self.rating_pred, [self.col_user, self.col_item], "inner"
            )
            .drop(self.col_user)
            .drop(self.col_item)
        )

        self.metrics = RegressionMetrics(
            self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label))
        )
    def rmse(self):
        """Calculate Root Mean Squared Error.
        Returns:
            float: Root mean squared error.
        """
        return self.metrics.rootMeanSquaredError

    def mae(self):
        """Calculate Mean Absolute Error.
        Returns:
            float: Mean Absolute Error.
        """
        return self.metrics.meanAbsoluteError

    def rsquared(self):
        """Calculate R squared.
        Returns:
            float: R squared.
        """
        return self.metrics.r2

    def exp_var(self):
        """Calculate explained variance.
        Note:
            Spark MLLib's implementation is buggy (can lead to values > 1), hence we use var().
        Returns:
            float: Explained variance (min=0, max=1).
        """
        var1 = self.y_pred_true.selectExpr("variance(label-prediction)").collect()[0][0]
        var2 = self.y_pred_true.selectExpr("variance(label)").collect()[0][0]

        if var1 is None or var2 is None:
            return -np.inf
        else:
            # numpy divide is more tolerant to var2 being zero
            return 1 - np.divide(var1, var2)

def start_or_get_spark(
    app_name="Sample",
    url="local[*]",
    memory="10g",
    config=None,
    packages=None,
    jars=None,
    repositories=None,
):
    """Start Spark if not started
    Args:
        app_name (str): set name of the application
        url (str): URL for spark master
        memory (str): size of memory for spark driver. This will be ignored if spark.driver.memory is set in config.
        config (dict): dictionary of configuration options
        packages (list): list of packages to install
        jars (list): list of jar files to add
        repositories (list): list of maven repositories
    Returns:
        object: Spark context.
    """
    submit_args = ""
    if packages is not None:
        submit_args = "--packages {} ".format(",".join(packages))
    if jars is not None:
        submit_args += "--jars {} ".format(",".join(jars))
    if repositories is not None:
        submit_args += "--repositories {}".format(",".join(repositories))
    if submit_args:
        os.environ["PYSPARK_SUBMIT_ARGS"] = "{} pyspark-shell".format(submit_args)

    spark_opts = [
        'SparkSession.builder.appName("{}")'.format(app_name),
        'master("{}")'.format(url),
    ]

    if config is not None:
        for key, raw_value in config.items():
            value = (
                '"{}"'.format(raw_value) if isinstance(raw_value, str) else raw_value
            )
            spark_opts.append('config("{key}", {value})'.format(key=key, value=value))

    if config is None or "spark.driver.memory" not in config:
        spark_opts.append('config("spark.driver.memory", "{}")'.format(memory))

    # Set larger stack size
    spark_opts.append('config("spark.executor.extraJavaOptions", "-Xss4m")')
    spark_opts.append('config("spark.driver.extraJavaOptions", "-Xss4m")')

    spark_opts.append("getOrCreate()")
    return eval(".".join(spark_opts))
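As a side note, exp_var() above computes explained variance as 1 - Var(label - prediction) / Var(label) using Spark SQL's variance (the sample variance) rather than Spark MLlib. A minimal NumPy sketch of the same formula, with made-up numbers purely for illustration:

import numpy as np

# Toy values, only to illustrate the formula used by exp_var().
label = np.array([3.0, 4.0, 5.0, 6.0])
prediction = np.array([2.5, 4.5, 4.0, 6.5])

# ddof=1 matches Spark SQL's variance(), which is the sample variance.
explained_variance = 1 - np.divide(
    np.var(label - prediction, ddof=1), np.var(label, ddof=1)
)
print(explained_variance)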
I have the following actual and predicted data frames, which are then used to calculate the r2 value:
x_true = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [9, 10, 11, 12]})
x_pred = pd.DataFrame({COL_USER: [1, 2, 3, 4], COL_ITEM: [5, 6, 7, 8], COL_RATING: [13, 14, 15, 16]})
spark = start_or_get_spark("EvaluationTesting", "local")
X_true = spark.createDataFrame(x_true)
X_pred = spark.createDataFrame(x_pred)
sre = SparkRatingEvaluation(X_true, X_pred, **HEADER)
r2 = sre.rsquared()
print(r2)
plt.annotate("r-squared = {:.3f}".format(r2_score(X_true, x_pred)), (0, 1))
plt.show()
I want to visualize the result in a scatter plot. I have tried annotating, but it is not working. It shows: InvalidParameterError: The 'y_true' parameter of r2_score must be an array-like. Got DataFrame[UserId: bigint, MovieId: bigint, Rating: bigint] instead.
In your code you passed X_true and x_pred to r2_score. The first, X_true, is a Spark DataFrame, while x_pred is a pandas DataFrame; I think you wanted to use both Spark DataFrames (the answer will be the same if both are pandas DataFrames). The other mistake was to pass the whole DataFrames, whereas r2_score only takes lists/arrays as inputs.
Selecting just the Rating columns:
X_true_Rating = [row[COL_RATING] for row in X_true.select(COL_RATING).collect()]
X_pred_Rating = [row[COL_RATING] for row in X_pred.select(COL_RATING).collect()]
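An equivalent way to build those lists, assuming the DataFrames are small enough to bring to the driver, is to convert the selected column to pandas first (just a sketch, not required for the fix):

# Alternative: pull the single column down as a NumPy array via pandas.
X_true_Rating = X_true.select(COL_RATING).toPandas()[COL_RATING].to_numpy()
X_pred_Rating = X_pred.select(COL_RATING).toPandas()[COL_RATING].to_numpy()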
It would also have helped to create the scatter plot first:
plt.scatter(X_true_Rating, X_pred_Rating)
Finally, your annotation was placed at (0, 1), but your x-axis starts at 9 and your y-axis at 13, so the annotation fell outside the visible area of the plot. Anchoring it to the data range fixes that:
plt.annotate("r-squared = {:.3f}".format(r2_score(X_true_Rating, X_pred_Rating)), (min(X_true_Rating), max(X_pred_Rating)))
plt.show()
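Putting the whole fix together as one sketch (reusing the X_true, X_pred and column constants defined above):

X_true_Rating = [row[COL_RATING] for row in X_true.select(COL_RATING).collect()]
X_pred_Rating = [row[COL_RATING] for row in X_pred.select(COL_RATING).collect()]

fig, ax = plt.subplots()
ax.scatter(X_true_Rating, X_pred_Rating)
ax.set_xlabel("Actual rating")
ax.set_ylabel("Predicted rating")
# Anchor the annotation at the top-left of the data range so it stays inside the axes.
ax.annotate(
    "r-squared = {:.3f}".format(r2_score(X_true_Rating, X_pred_Rating)),
    (min(X_true_Rating), max(X_pred_Rating)),
)
plt.show()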