I am building a model using customized transformers (KeyError: "None of [Index([('A','B','C')], dtype='object')] are in the [columns]"). When I run the code below, .fit raises an error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-165-289e1d466eb9> in <module>
10
11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
13
14 # metrics
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1000 transformers = list(self._iter())
1001
-> 1002 return Parallel(n_jobs=self.n_jobs)(delayed(func)(
1003 transformer, X, y, weight,
1004 message_clsname='FeatureUnion',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1042 self._iterating = self._original_iterator is not None
1043
-> 1044 while self.dispatch_one_batch(iterator):
1045 pass
1046
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
The code is:
# MODEL
from sklearn import tree
from sklearn.pipeline import Pipeline

# Decision Tree
decision_tree = tree.DecisionTreeClassifier()

# define full pipeline --> preprocessing + model
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('model', decision_tree)])

# fit on the complete pipeline
training = full_pipeline.fit(X, y)  # <- this step raises the error
I have also tried .fit_transform, but I get the same error. I read this: AttributeError: 'numpy.ndarray' object has no attribute 'lower' fitting logistic model data, but it seems that I am not passing X or y into the decision tree as in that example, though maybe I am wrong. Adding
# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    ('text_transformer', TextTransformer()),
    ('cv', CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=False))])
I get this new error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-159-289e1d466eb9> in <module>
10
11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
13
14 # metrics
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
978 sum of n_components (output dimension) over transformers.
979 """
--> 980 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
981 if not results:
982 # All transformers are None
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
1000 transformers = list(self._iter())
1001
-> 1002 return Parallel(n_jobs=self.n_jobs)(delayed(func)(
1003 transformer, X, y, weight,
1004 message_clsname='FeatureUnion',
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1042 self._iterating = self._original_iterator is not None
1043
-> 1044 while self.dispatch_one_batch(iterator):
1045 pass
1046
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
385 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
386 if hasattr(last_step, 'fit_transform'):
--> 387 return last_step.fit_transform(Xt, y, **fit_params_last_step)
388 else:
389 return last_step.fit(Xt, y,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1200 max_features = self.max_features
1201
-> 1202 vocabulary, X = self._count_vocab(raw_documents,
1203 self.fixed_vocabulary_)
1204
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1112 for doc in raw_documents:
1113 feature_counter = {}
-> 1114 for feature in analyze(doc):
1115 try:
1116 feature_idx = vocabulary[feature]
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
104 doc = preprocessor(doc)
105 if tokenizer is not None:
--> 106 doc = tokenizer(doc)
107 if ngrams is not None:
108 if stop_words is not None:
TypeError: cannot use a string pattern on a bytes-like object
If I remove text_pipeline, the error does not occur, so it seems that something is going wrong in the way CountVectorizer is used. An example of the text is:
an example
example number 1
this is another small example
I have other columns that are numerical and categorical.
Have you experienced a similar issue? If yes, how did you handle it?
A common error with sklearn's text transformers involves the shape of the data: unlike most other sklearn preprocessors, text transformers generally expect one-dimensional input, and Python's duck typing causes weird errors because both arrays and strings are iterable.
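For illustration, here is a minimal reproduction of the shape problem (the two-row DataFrame and the 'Tweet' column are just stand-ins for your data):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.DataFrame({'Tweet': ['an example', 'example number 1']})

# 1-D input (a Series of strings): works as expected
CountVectorizer().fit_transform(df['Tweet'])

# 2-D input (a single-column array): each "document" is now a whole row,
# i.e. a numpy array, and preprocessing fails with
# AttributeError: 'numpy.ndarray' object has no attribute 'lower'
CountVectorizer().fit_transform(df[['Tweet']].values)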
Your TextTransformer.transform returns X[['Tweet']], which is two-dimensional and will cause problems with the subsequent CountVectorizer. (Converting to a numpy array with .values doesn't change the dimensionality problem, and there's also no compelling reason to do that conversion.) Returning X['Tweet'] instead should cure that problem.
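In other words, something like the sketch below: the class body is an assumed reconstruction, since TextTransformer itself wasn't posted, but the only change that matters is the return value of transform.

from sklearn.base import BaseEstimator, TransformerMixin

class TextTransformer(BaseEstimator, TransformerMixin):
    # Hypothetical reconstruction: the original TextTransformer was not
    # shown in the question, so the details here are assumptions.
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # return X[['Tweet']]   # 2-D DataFrame: breaks CountVectorizer downstream
        return X['Tweet']       # 1-D Series of strings: what CountVectorizer expects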