I have a DataFrame with 1000 rows of text in df['text'].
I also have 5 words, and for each one of them I want to know how well it represents the text (a score between 0 and 1).
Each score will be stored in its own column: df["word1"]
, df["word2"]
, and so on.
I would be glad for recommendations on how to do that.
Edit:
By "represent" I mean the semantic similarity between the word and the text (i.e. how small the semantic distance between them is).
For example, let's say the text in row 1 is "i want to eat" and I have 2 words: food and house.
Then the score in df["food"]
would be higher than the score in df["house"].
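You could use a pre-trained sentence-transformer model from the sentence_transformers package and score each word by the cosine similarity between its embedding and the embedding of the text. Note that cosine similarity technically ranges from -1 to 1, so clip or rescale the scores if you need them strictly within 0 to 1.
Example: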
import pandas as pd
from sentence_transformers import SentenceTransformer, util
class SemanticSimilarityCalculator:
    """Score how semantically close a fixed set of words is to pieces of text.

    Words and texts are embedded with a ``SentenceTransformer`` model and
    compared with cosine similarity. Call :meth:`encode_words` once with the
    words of interest, then score a single text with
    :meth:`calculate_similarity` or a whole DataFrame column with
    :meth:`add_similarity_scores_to_df`.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2') -> None:
        """Load the sentence-transformer model called *model_name*."""
        self.model = SentenceTransformer(model_name)
        # Both attributes stay None until encode_words() has run. Creating
        # them here means the "not encoded yet" guards below raise the
        # intended ValueError rather than an AttributeError.
        self.word_embeddings = None
        self.words = None

    def encode_words(self, words: list[str]) -> None:
        """Embed *words* and remember them for later similarity scoring."""
        self.word_embeddings = self.model.encode(words, convert_to_tensor=True)
        self.words = words

    def calculate_similarity(self, text: str) -> list[float]:
        """Return the cosine similarity of *text* to every encoded word.

        The result has one score per word, in the order the words were
        passed to :meth:`encode_words`.

        Raises:
            ValueError: If :meth:`encode_words` has not been called yet.
        """
        if self.word_embeddings is None:
            raise ValueError('Words must be encoded before calculating similarity.')
        text_embedding = self.model.encode(text, convert_to_tensor=True)
        # cos_sim returns a (1, n_words) matrix for a single text; take row 0.
        return util.cos_sim(text_embedding, self.word_embeddings)[0].tolist()

    def add_similarity_scores_to_df(
        self, df: pd.DataFrame, text_column: str
    ) -> pd.DataFrame:
        """Add one ``word_<word>`` score column per encoded word to *df*.

        *df* is modified in place and also returned. Each new column holds
        the cosine similarity between the text in *text_column* and that
        word.

        Raises:
            ValueError: If :meth:`encode_words` has not been called yet.
        """
        if self.words is None:
            raise ValueError(
                'Words must be encoded before adding scores to the DataFrame.'
            )
        similarity_columns = ['word_' + word for word in self.words]
        texts = df[text_column].tolist()
        if not texts:
            # Nothing to embed: still add the (empty) score columns so the
            # resulting frame has the same layout as a non-empty one.
            for column in similarity_columns:
                df[column] = pd.Series(dtype=float)
            return df
        # Embed every text in one batched call instead of invoking the model
        # once per row, then compare all texts with all words at once.
        text_embeddings = self.model.encode(texts, convert_to_tensor=True)
        scores = util.cos_sim(text_embeddings, self.word_embeddings)
        # scores has one row per text and one column per word.
        for position, column in enumerate(similarity_columns):
            df[column] = scores[:, position].tolist()
        return df
def main():
    """Demo: score three sample sentences against five words and print them."""
    target_words = ['food', 'house', 'sleep', 'drink', 'run']
    frame = pd.DataFrame(
        {'text': ['I want to eat', 'The house is big', 'I need to sleep']}
    )

    # The words must be encoded before any text can be scored against them.
    scorer = SemanticSimilarityCalculator()
    scorer.encode_words(target_words)

    scored = scorer.add_similarity_scores_to_df(frame, text_column='text')
    print(scored)


if __name__ == '__main__':
    main()
Output:
text word_food word_house word_sleep word_drink word_run
0 I want to eat 0.592410 0.215032 0.254065 0.370329 0.259350
1 The house is big 0.243262 0.672110 0.170785 0.213780 0.119716
2 I need to sleep 0.253703 0.222462 0.725105 0.358372 0.303838