I convert the dataset's strings to arrays and then convert them into vectors like this:
from pyspark.ml.feature import HashingTF, IDF

# Create a HashingTF object to convert the "combined_features" column to
# raw term-frequency feature vectors
hashing_tf = HashingTF(inputCol="combined_features", outputCol="raw_features")
# Transform the DataFrame to create the raw feature vectors
df = hashing_tf.transform(combarray)
# Create an IDF object to calculate the inverse document frequency for the raw feature vectors
idf = IDF(inputCol="raw_features", outputCol="features")
# Fit the IDF on the DataFrame and transform it to create the final feature vectors
df = idf.fit(df).transform(df)
# View the resulting feature vectors
df.select("features").show(truncate=False)
Output :
|features |
|(262144,[90558],[7.785305182539862]) |
|(262144,[9277],[7.785305182539862]) |
|(262144,[55279],[7.785305182539862]) |
How do I compute cosine similarity in PySpark from my features?
I combine the data like this:
from pyspark.sql.functions import concat, lit, col

# NOTE(review): the original concat(...) argument list was lost when this
# snippet was extracted. "genres" and "keywords" are ASSUMED column names,
# guessed from the sample rows (genre words run straight into keyword words
# with no separator) -- confirm against the real dataset.
selected_feature = selected_feature.withColumn(
    'combined_features',
    concat(col('genres'), col('keywords')),
)
# Keep only the combined text column for the similarity computation.
combine = selected_feature.select("combined_features")
The data looks like this:
| combined_features|
|Action Adventure Fantasy Science Fictionculture...|
|Adventure Fantasy Actionocean drug abuse exotic...|
|Action Adventure Crimespy based on novel secret...|
I wrote code like the one in the answer and still got an error, as described in the comment:
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, IDF
from pyspark.ml.feature import HashingTF, Tokenizer
# NOTE: this is the asker's failing attempt, kept as posted. sklearn's Pipeline
# is not a Spark ML Pipeline; the Spark one is `from pyspark.ml import Pipeline`.
from sklearn.pipeline import Pipeline

# Tokenize "combined_features" on runs of word characters, count the terms,
# then weight the counts by inverse document frequency.
regex_tokenizer = RegexTokenizer(gaps=False, pattern=r"\w+", inputCol="combined_features", outputCol="tokens")
count_vectorizer = CountVectorizer(inputCol="tokens", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="idf")
tf_idf_pipeline = Pipeline(stages=[regex_tokenizer, count_vectorizer, idf])
# NOTE(review): the start of this statement was lost in extraction; fitting the
# pipeline on `combine` and transforming it is the presumed original -- confirm.
combine = tf_idf_pipeline.fit(combine).transform(combine).drop("news", "tokens", "tf")
# Pair every row of `combarray` with every row of `combine`; the right-hand
# TF-IDF column is renamed to "idf2" so both vectors can be referenced.
combine = combarray.crossJoin(combine.withColumnRenamed("idf", "idf2"))
def cos_sim(u, v):
    """Return the cosine similarity of vectors *u* and *v* as a float.

    Both arguments must expose ``dot(other)`` and ``norm(p)``. The result is
    the dot product divided by the product of the two ``norm(2)`` values.
    """
    return float(u.dot(v) / (u.norm(2) * v.norm(2)))
# NOTE(review): the DataFrame returned by withColumn is not assigned, and the
# call is made on `df` rather than on the cross-joined `combine` that was just
# given the "idf2" column -- part of the asker's failing attempt.
df.withColumn("cos_sim", cos_sim(F.col("idf"), F.col("idf2")))
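There are multiple corrections required in your code:
1. The correct import is `from pyspark.ml import Pipeline`.
2. Cross-join `combine` with itself (not `combarray`), and assign the result of `withColumn` back to `combine`.
The following is working code: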
import pyspark.sql.functions as F
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, IDF
from pyspark.ml import Pipeline

# Tokenize "combined_features" on runs of word characters, count the terms,
# then weight the counts by inverse document frequency.
regex_tokenizer = RegexTokenizer(gaps=False, pattern=r"\w+", inputCol="combined_features", outputCol="tokens")
count_vectorizer = CountVectorizer(inputCol="tokens", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="idf")
tf_idf_pipeline = Pipeline(stages=[regex_tokenizer, count_vectorizer, idf])
# Fit the pipeline, apply it, and drop the intermediate columns so only the
# text and its "idf" vector remain.
combine = tf_idf_pipeline.fit(combine).transform(combine).drop("tokens", "tf")
# Pair every row with every row (itself included); the right-hand vector is
# renamed to "idf2" so both sides can be referenced.
combine = combine.crossJoin(combine.withColumnRenamed("idf", "idf2"))
# Registered as a Spark UDF so it can be applied to DataFrame columns; a plain
# Python function cannot be called with F.col(...) arguments.
@F.udf(returnType="float")
def cos_sim(u, v):
    """Return the cosine similarity of vectors *u* and *v* as a float.

    Both arguments must expose ``dot(other)`` and ``norm(p)``. The result is
    the dot product divided by the product of the two ``norm(2)`` values.
    Returns 0.0 when either vector has zero norm, where the similarity is
    otherwise undefined.
    """
    denominator = u.norm(2) * v.norm(2)
    if denominator == 0:
        return 0.0
    return float(u.dot(v) / denominator)
# Score each row pair from its two TF-IDF vectors, then display the pairs
# without the vector columns.
similarity = cos_sim(F.col("idf"), F.col("idf2"))
combine = combine.withColumn("cos_sim", similarity)
combine.drop("idf", "idf2").show(truncate=False)
|combined_features |combined_features |cos_sim |
|Action Adventure Fantasy Science Fictionculture|Action Adventure Fantasy Science Fictionculture|1.0 |
|Action Adventure Fantasy Science Fictionculture|Adventure Fantasy Actionocean drug abuse exotic|0.05507607 |
|Action Adventure Fantasy Science Fictionculture|Action Adventure Crimespy based on novel secret|0.049466185|
|Adventure Fantasy Actionocean drug abuse exotic|Action Adventure Fantasy Science Fictionculture|0.05507607 |
|Adventure Fantasy Actionocean drug abuse exotic|Adventure Fantasy Actionocean drug abuse exotic|1.0 |
|Adventure Fantasy Actionocean drug abuse exotic|Action Adventure Crimespy based on novel secret|0.0 |
|Action Adventure Crimespy based on novel secret|Action Adventure Fantasy Science Fictionculture|0.049466185|
|Action Adventure Crimespy based on novel secret|Adventure Fantasy Actionocean drug abuse exotic|0.0 |
|Action Adventure Crimespy based on novel secret|Action Adventure Crimespy based on novel secret|1.0 |
Sample dataset used:
# Three single-column sample rows used to produce the output above.
sample_rows = [
    ["Action Adventure Fantasy Science Fictionculture"],
    ["Adventure Fantasy Actionocean drug abuse exotic"],
    ["Action Adventure Crimespy based on novel secret"],
]
combine = spark.createDataFrame(data=sample_rows, schema=["combined_features"])
|combined_features |
|Action Adventure Fantasy Science Fictionculture|
|Adventure Fantasy Actionocean drug abuse exotic|
|Action Adventure Crimespy based on novel secret|