pythonscikit-learnscikit-learn-pipeline

AttributeError and TypeError using CustomTransformers


I am building a model using customized transformers (KeyError: "None of [Index([('A','B','C')] , dtype='object')] are in the [columns]). When I run the below code, I get an error because of .fit:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-165-289e1d466eb9> in <module>
     10 
     11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
     13 
     14 # metrics

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1000         transformers = list(self._iter())
   1001 
-> 1002         return Parallel(n_jobs=self.n_jobs)(delayed(func)(
   1003             transformer, X, y, weight,
   1004             message_clsname='FeatureUnion',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    102     else:
    103         if preprocessor is not None:
--> 104             doc = preprocessor(doc)
    105         if tokenizer is not None:
    106             doc = tokenizer(doc)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
     67     """
     68     if lower:
---> 69         doc = doc.lower()
     70     if accent_function is not None:
     71         doc = accent_function(doc)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

The code is

# MODEL
from sklearn import tree

# Decision Tree
decision_tree = tree.DecisionTreeClassifier()
# define full pipeline --> preprocessing + model
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('model', decision_tree)])

# fit on the complete pipeline
training = full_pipeline.fit(X, y) # <- this step returns the error

I have also tried with .fit_transform but I get the same error. I read this: AttributeError: 'numpy.ndarray' object has no attribute 'lower' fitting logistic model data but it seems that I am not passing X or y in the Decision tree like in that example, but maybe I am wrong. Adding

# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    ('text_transformer', TextTransformer()),
    ('cv', CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=False))])

I get this new error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-159-289e1d466eb9> in <module>
     10 
     11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
     13 
     14 # metrics

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1000         transformers = list(self._iter())
   1001 
-> 1002         return Parallel(n_jobs=self.n_jobs)(delayed(func)(
   1003             transformer, X, y, weight,
   1004             message_clsname='FeatureUnion',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    104             doc = preprocessor(doc)
    105         if tokenizer is not None:
--> 106             doc = tokenizer(doc)
    107         if ngrams is not None:
    108             if stop_words is not None:

TypeError: cannot use a string pattern on a bytes-like object

If I remove text_pipeline, the error does not occur, so it seems that something is going wrong because of the way to use countVectorizer. An example of text is

an example
example number 1
this is another small example

I have other columns that are numerical and categorical.

Have you experienced a similar issue? If yes, how did you handle it?


Solution

  • A common error in text transformers of sklearn involves the shape of the data: unlike most other sklearn preprocessors, text transformers generally expect a one-dimensional input, and python's duck-typing causes weird errors from both arrays and strings being iterables.

    Your TextTransformer.transform returns X[['Tweet']], which is 2-dimensional, and will cause problems with the subsequent CountVectorizer. (Converting to a numpy array with .values doesn't change the dimensionality problem, but there's also no compelling reason to do that conversion.) Returning X['Tweet'] instead should cure that problem.