I'm attempting to remove bi-grams that are created by TfidfVectorizer.
I'm using text.TfidfVectorizer so that I can use my own preprocessor function.
Test strings and preprocessor function:
# Sample corpus: two short documents containing the phrases we want stripped.
doc2 = ['this is a test past performance here is another that has aa aa adding builing cat dog horse hurricane',
        'another that has aa aa and start date and hurricane hitting south carolina']

def remove_bigrams(doc):
    """Return a new list where each record in *doc* has a fixed set of
    two-word phrases removed (replaced with the empty string)."""
    phrases_to_drop = ('past performance', 'start date', 'aa aa')

    def _strip(text):
        # Apply each replacement in order on the same string.
        for phrase in phrases_to_drop:
            text = text.replace(phrase, "")
        return text

    return [_strip(record) for record in doc]

remove_bigrams(doc2)
My TfidfVectorizer instantiation and fit_transform call:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
# Materialize the frozenset of stop words into a plain list.
custom_stop_words = [i for i in stop_words]
# NOTE(review): sklearn calls `preprocessor` once per *document*, passing a
# single string — not once with the whole corpus. remove_bigrams above expects
# a list of records and iterates it, so each "record" here would be a single
# character-free mismatch: str.replace is then attempted on the wrong type
# downstream, producing the TypeError shown below.
vec = text.TfidfVectorizer(stop_words=custom_stop_words,
analyzer='word',
ngram_range=(2, 2),
preprocessor=remove_bigrams,
)
# Raises TypeError: expected string or bytes-like object (see traceback).
features = vec.fit_transform(doc2)
Here is my error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Input In [49], in <cell line: 5>()
3 #t3_cv = CountVectorizer(t2, stop_words = stop_words)
4 vec = text.TfidfVectorizer(stop_words=custom_stop_words, analyzer='word', ngram_range = (2,2), preprocessor = remove_bigrams)
----> 5 features = vec.fit_transform(doc2)
File c:\Development_Solutions\Sandbox\SBVE\lib\site-packages\sklearn\feature_extraction\text.py:2079, in TfidfVectorizer.fit_transform(self, raw_documents, y)
2072 self._check_params()
2073 self._tfidf = TfidfTransformer(
2074 norm=self.norm,
2075 use_idf=self.use_idf,
2076 smooth_idf=self.smooth_idf,
2077 sublinear_tf=self.sublinear_tf,
2078 )
-> 2079 X = super().fit_transform(raw_documents)
2080 self._tfidf.fit(X)
2081 # X is already a transformed view of raw_documents so
2082 # we set copy to False
File c:\Development_Solutions\Sandbox\SBVE\lib\site-packages\sklearn\feature_extraction\text.py:1338, in CountVectorizer.fit_transform(self, raw_documents, y)
1330 warnings.warn(
1331 "Upper case characters found in"
1332 " vocabulary while 'lowercase'"
1333 " is True. These entries will not"
1334 " be matched with any documents"
1335 )
1336 break
-> 1338 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
1340 if self.binary:
1341 X.data.fill(1)
File c:\Development_Solutions\Sandbox\SBVE\lib\site-packages\sklearn\feature_extraction\text.py:1209, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
1207 for doc in raw_documents:
1208 feature_counter = {}
-> 1209 for feature in analyze(doc):
1210 try:
1211 feature_idx = vocabulary[feature]
File c:\Development_Solutions\Sandbox\SBVE\lib\site-packages\sklearn\feature_extraction\text.py:113, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
111 doc = preprocessor(doc)
112 if tokenizer is not None:
--> 113 doc = tokenizer(doc)
114 if ngrams is not None:
115 if stop_words is not None:
TypeError: expected string or bytes-like object
How do I resolve this error?
The preprocessor should handle individual documents, not the whole corpus. (The clues are the "expected string or bytes-like object" in the error, and the fact that the TfidfVectorizer
docs describe the preprocessor as "the preprocessing (string transformation) stage". The docs could definitely be clearer.)
This should fix it:
def remove_bigrams(doc: str) -> str:
    """Return *doc* with a fixed set of two-word phrases removed."""
    unwanted = ['past performance', 'start date', 'aa aa']
    result = doc
    # Replace each phrase in turn; order matches the original list.
    for phrase in unwanted:
        result = result.replace(phrase, "")
    return result