Tags: python, pyspark, vector, recommendation-engine, cosine-similarity

Cosine similarity in PySpark


I converted the dataset's strings into arrays and then into feature vectors like this:

from pyspark.ml.feature import HashingTF, IDF

# Create a HashingTF object to convert the "text" column to feature vectors
hashing_tf = HashingTF(inputCol="combined_features", outputCol="raw_features")

# Transform the DataFrame to create the raw feature vectors
df = hashing_tf.transform(combarray)

# Create an IDF object to calculate the inverse document frequency for the raw feature vectors
idf = IDF(inputCol="raw_features", outputCol="features")

# Fit the IDF on the DataFrame and transform it to create the final feature vectors
df = idf.fit(df).transform(df)

# View the resulting feature vectors
df.select("features").show(truncate=False)

Output:

+-------------------------------------+
|features                             |
+-------------------------------------+
|(262144,[243082],[7.785305182539862])|
|(262144,[90558],[7.785305182539862]) |
|(262144,[9277],[7.785305182539862])  |
|(262144,[55279],[7.785305182539862]) |
|(262144,[114098],[7.785305182539862])|
|(262144,[106982],[7.785305182539862])|
|(262144,[248513],[7.785305182539862])|
+-------------------------------------+

How do I compute cosine similarity in PySpark from my features?

Update

I combine the data:

from pyspark.sql.functions import concat, lit, col
selected_feature = selected_feature.withColumn('combined_features',
                                               concat(col('genres'),
                                                      col('keywords'),
                                                      col('tagline'),
                                                      col('cast'),
                                                      col('director')))
combine = selected_feature.select("combined_features")

The data looks like this:

+--------------------------------------------------+
|                                 combined_features|
+--------------------------------------------------+
|Action Adventure Fantasy Science Fictionculture...|
|Adventure Fantasy Actionocean drug abuse exotic...|
|Action Adventure Crimespy based on novel secret...|
+--------------------------------------------------+
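
Note that concat joins the columns without any separator, which is why words run together above (e.g. "Fictionculture"). If space-separated text is wanted, concat_ws could be used instead; a minimal sketch, assuming the same column names:

from pyspark.sql.functions import concat_ws, col

# concat_ws inserts the given separator (here a space) between the columns
selected_feature = selected_feature.withColumn('combined_features',
                                               concat_ws(' ',
                                                         col('genres'),
                                                         col('keywords'),
                                                         col('tagline'),
                                                         col('cast'),
                                                         col('director')))
combine = selected_feature.select("combined_features")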

I wrote code like the answer and still got an error, as described in the comment:

import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, IDF
from pyspark.ml.feature import HashingTF, Tokenizer
from sklearn.pipeline import Pipeline

regex_tokenizer = RegexTokenizer(gaps=False, pattern="\w+", inputCol="combined_features", outputCol="tokens")
count_vectorizer = CountVectorizer(inputCol="tokens", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="idf")
tf_idf_pipeline = Pipeline(stages=[regex_tokenizer, count_vectorizer, idf])
combine = tf_idf_pipeline.fit(combine).transform(combine).drop("news", "tokens", "tf")
combine = combarray.crossJoin(combine.withColumnRenamed("idf", "idf2"))

@F.udf(returnType=FloatType())
def cos_sim(u, v):
  return float(u.dot(v) / (u.norm(2) * v.norm(2)))

df.withColumn("cos_sim", cos_sim(F.col("idf"), F.col("idf2")))

Solution

  • There are multiple corrections required in your code: use pyspark.ml.Pipeline instead of sklearn.pipeline.Pipeline, import FloatType from pyspark.sql.types for the UDF, cross-join combine with itself rather than with combarray, drop only columns that actually exist (there is no "news" column), and assign the result of withColumn back to the DataFrame.

    The following is a working version:

    import pyspark.sql.functions as F
    from pyspark.sql.types import FloatType
    from pyspark.ml.feature import RegexTokenizer, CountVectorizer, IDF
    from pyspark.ml import Pipeline
    
    regex_tokenizer = RegexTokenizer(gaps=False, pattern=r"\w+", inputCol="combined_features", outputCol="tokens")
    count_vectorizer = CountVectorizer(inputCol="tokens", outputCol="tf")
    idf = IDF(inputCol="tf", outputCol="idf")
    tf_idf_pipeline = Pipeline(stages=[regex_tokenizer, count_vectorizer, idf])
    combine = tf_idf_pipeline.fit(combine).transform(combine).drop("tokens", "tf")
    combine = combine.crossJoin(combine.withColumnRenamed("idf", "idf2"))
    
    @F.udf(returnType=FloatType())
    def cos_sim(u, v):
      return float(u.dot(v) / (u.norm(2) * v.norm(2)))
    
    combine = combine.withColumn("cos_sim", cos_sim(F.col("idf"), F.col("idf2")))
    combine.drop("idf", "idf2").show(truncate=False)
    
    +-----------------------------------------------+-----------------------------------------------+-----------+
    |combined_features                              |combined_features                              |cos_sim    |
    +-----------------------------------------------+-----------------------------------------------+-----------+
    |Action Adventure Fantasy Science Fictionculture|Action Adventure Fantasy Science Fictionculture|1.0        |
    |Action Adventure Fantasy Science Fictionculture|Adventure Fantasy Actionocean drug abuse exotic|0.05507607 |
    |Action Adventure Fantasy Science Fictionculture|Action Adventure Crimespy based on novel secret|0.049466185|
    |Adventure Fantasy Actionocean drug abuse exotic|Action Adventure Fantasy Science Fictionculture|0.05507607 |
    |Adventure Fantasy Actionocean drug abuse exotic|Adventure Fantasy Actionocean drug abuse exotic|1.0        |
    |Adventure Fantasy Actionocean drug abuse exotic|Action Adventure Crimespy based on novel secret|0.0        |
    |Action Adventure Crimespy based on novel secret|Action Adventure Fantasy Science Fictionculture|0.049466185|
    |Action Adventure Crimespy based on novel secret|Adventure Fantasy Actionocean drug abuse exotic|0.0        |
    |Action Adventure Crimespy based on novel secret|Action Adventure Crimespy based on novel secret|1.0        |
    +-----------------------------------------------+-----------------------------------------------+-----------+
    

    Sample dataset used:

    combine = spark.createDataFrame(
        data=[["Action Adventure Fantasy Science Fictionculture"],
              ["Adventure Fantasy Actionocean drug abuse exotic"],
              ["Action Adventure Crimespy based on novel secret"]],
        schema=["combined_features"])
    combine.show(truncate=False)
    
    +-----------------------------------------------+
    |combined_features                              |
    +-----------------------------------------------+
    |Action Adventure Fantasy Science Fictionculture|
    |Adventure Fantasy Actionocean drug abuse exotic|
    |Action Adventure Crimespy based on novel secret|
    +-----------------------------------------------+
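
    A possible follow-up, going beyond the answer above: to turn the pairwise similarities into simple recommendations, each row can be tagged with an id before the cross join so that self-pairs can be dropped and only the most similar match per item is kept. The sketch below reuses tf_idf_pipeline and the cos_sim UDF defined above and rebuilds the features from the sample dataset; the docs, feats, pairs, id and match names are illustrative assumptions:

    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # Start again from the sample "combined_features" DataFrame
    docs = spark.createDataFrame(
        data=[["Action Adventure Fantasy Science Fictionculture"],
              ["Adventure Fantasy Actionocean drug abuse exotic"],
              ["Action Adventure Crimespy based on novel secret"]],
        schema=["combined_features"])

    # TF-IDF features plus a row id so self-pairs can be filtered out later
    feats = (tf_idf_pipeline.fit(docs).transform(docs)
                            .drop("tokens", "tf")
                            .withColumn("id", F.monotonically_increasing_id()))

    # Cross join every row with every other row and score each pair
    pairs = (feats.crossJoin(feats.select(F.col("id").alias("id2"),
                                          F.col("idf").alias("idf2"),
                                          F.col("combined_features").alias("match")))
                  .filter(F.col("id") != F.col("id2"))
                  .withColumn("cos_sim", cos_sim(F.col("idf"), F.col("idf2"))))

    # Keep only the most similar other row for each item
    w = Window.partitionBy("id").orderBy(F.col("cos_sim").desc())
    (pairs.withColumn("rank", F.row_number().over(w))
          .filter("rank = 1")
          .select("combined_features", "match", "cos_sim")
          .show(truncate=False))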