python machine-learning deep-learning auto-keras

TypeError: Unsupported type <class 'scipy.sparse.csr.csr_matrix'> for StructuredDataAdapter


Can anyone help me resolve the above error?

### Preprocess the columns with scikit-learn transformers
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Text columns ('CompanyName', 'Termination_Reason_Desc') get TF-IDF features,
# categorical columns ('state', 'TermType') are one-hot encoded, and every
# column not listed is handed to the `remainder` estimator (MinMaxScaler).
column_trans = ColumnTransformer(
         [
          ('CompanyName_bow', TfidfVectorizer(), 'CompanyName'),
          ('state_category', OneHotEncoder(), ['state']),
          ('Termination_Reason_Desc_bow', TfidfVectorizer(), 'Termination_Reason_Desc'),
          ('TermType_category', OneHotEncoder(), ['TermType'])
         ],
         remainder=MinMaxScaler()
        )
# Only the first 100 rows are transformed.
# NOTE: the transformed data ends up as a SciPy sparse matrix -- the
# type(X_train) check further down reports scipy.sparse.csr.csr_matrix.
X = column_trans.fit_transform(X.head(100))

# Encode the target labels as integers for the same first 100 rows.
from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(y.head(100))

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)

X_train.shape  #(80, 92)
X_test.shape   #(20, 92)
y_train.shape  #(80,)
# todense() returns a new dense matrix; the result is not assigned here,
# so X_train itself is left unchanged (it is still sparse afterwards).
X_train.todense()
matrix([[0.        , 0.        , 0.        , ..., 0.26921709, 1.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         1.        ],
        [0.        , 0.        , 0.        , ..., 0.46148896, 1.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.46148896, 1.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         1.        ],
        [0.        , 0.        , 0.        , ..., 0.46148896, 1.        ,
         0.        ]])

type(X_train)
--> scipy.sparse.csr.csr_matrix

print(y_train)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 
type(y_train)
numpy.ndarray

# use autokeras to search for a classifier on the transformed data
from numpy import asarray  # unused in this snippet
from pandas import read_csv  # unused in this snippet
from sklearn.model_selection import train_test_split  # already imported above
from sklearn.preprocessing import LabelEncoder  # already imported above
from autokeras import StructuredDataClassifier

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# define the search
search = StructuredDataClassifier(max_trials=15)
# perform the search
search.fit(x=(X_train), y=y_train, verbose=0)
# evaluate the model
loss, acc = search.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %.3f' % acc)

Error

(80, 92) (20, 92) (80,) (20,)
INFO:tensorflow:Reloading Oracle from existing project .\structured_data_classifier\oracle.json
INFO:tensorflow:Reloading Tuner from .\structured_data_classifier\tuner0.json
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-106-94708e5d279d> in <module>
     10 search = StructuredDataClassifier(max_trials=15)
     11 # perform the search
---> 12 search.fit(x=(X_train), y=y_train, verbose=0)
     13 # evaluate the model
     14 loss, acc = search.evaluate(X_test, y_test, verbose=0)

~\anaconda3\lib\site-packages\autokeras\tasks\structured_data.py in fit(self, x, y, epochs, callbacks, validation_split, validation_data, **kwargs)
    313                 [keras.Model.fit](https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit).
    314         """
--> 315         super().fit(
    316             x=x,
    317             y=y,

~\anaconda3\lib\site-packages\autokeras\tasks\structured_data.py in fit(self, x, y, epochs, callbacks, validation_split, validation_data, **kwargs)
    132         self.check_in_fit(x)
    133 
--> 134         super().fit(
    135             x=x,
    136             y=y,

~\anaconda3\lib\site-packages\autokeras\auto_model.py in fit(self, x, y, batch_size, epochs, callbacks, validation_split, validation_data, **kwargs)
    259             validation_split = 0
    260 
--> 261         dataset, validation_data = self._convert_to_dataset(
    262             x=x, y=y, validation_data=validation_data, batch_size=batch_size
    263         )

~\anaconda3\lib\site-packages\autokeras\auto_model.py in _convert_to_dataset(self, x, y, validation_data, batch_size)
    373             x = dataset.map(lambda x, y: x)
    374             y = dataset.map(lambda x, y: y)
--> 375         x = self._adapt(x, self.inputs, batch_size)
    376         y = self._adapt(y, self._heads, batch_size)
    377         dataset = tf.data.Dataset.zip((x, y))

~\anaconda3\lib\site-packages\autokeras\auto_model.py in _adapt(self, dataset, hms, batch_size)
    287         adapted = []
    288         for source, hm in zip(sources, hms):
--> 289             source = hm.get_adapter().adapt(source, batch_size)
    290             adapted.append(source)
    291         if len(adapted) == 1:

~\anaconda3\lib\site-packages\autokeras\engine\adapter.py in adapt(self, dataset, batch_size)
     65             tf.data.Dataset. The converted dataset.
     66         """
---> 67         self.check(dataset)
     68         dataset = self.convert_to_dataset(dataset, batch_size)
     69         return dataset

~\anaconda3\lib\site-packages\autokeras\adapters\input_adapters.py in check(self, x)
     63     def check(self, x):
     64         if not isinstance(x, (pd.DataFrame, np.ndarray, tf.data.Dataset)):
---> 65             raise TypeError(
     66                 "Unsupported type {type} for "
     67                 "{name}.".format(type=type(x), name=self.__class__.__name__)

TypeError: Unsupported type <class 'scipy.sparse.csr.csr_matrix'> for StructuredDataAdapter.

Solution

  • As noted in the GitHub issue you opened in parallel with this thread, sparse matrices are not (currently) supported in AutoKeras, and the advice is to convert them to dense NumPy arrays. Indeed, according to the documentation of the AutoKeras StructuredDataClassifier, the training data x passed to its .fit method is expected to be:

    String, numpy.ndarray, pandas.DataFrame or tensorflow.Dataset

    and not a SciPy sparse matrix.

    Given that here your X_train is really small:

    X_train.shape  
    # (80, 92)
    

    you have absolutely no reason whatsoever to use a sparse matrix. And although you seem to try to convert X_train to a dense one here, you do not reassign it: .todense() returns a new object rather than modifying X_train in place, so X_train remains sparse. From your own code above:

    X_train.todense()
    # ...
    type(X_train)
    # scipy.sparse.csr.csr_matrix
    

    What you need to do is simply to reassign it to a dense array:

    # NOTE: this import is not needed for the conversion itself; toarray()
    # is a method of the existing sparse matrix.
    from scipy.sparse import csr_matrix
    # toarray() returns a numpy.ndarray; assign it back so X_train is dense.
    X_train = X_train.toarray()
    

    Here is a short demo that this works with dummy data:

    import numpy as np
    from scipy.sparse import csr_matrix

    # Dummy all-zero 3x4 sparse matrix. np.float64 is used because the
    # np.float alias was deprecated in NumPy 1.20 and removed in 1.24.
    X_train = csr_matrix((3, 4), dtype=np.float64)

    type(X_train)
    # scipy.sparse.csr.csr_matrix

    # this will not work: todense() returns a new object and the result
    # is discarded, so X_train is left untouched
    X_train.todense()
    type(X_train)
    # scipy.sparse.csr.csr_matrix # still sparse

    # this will work: rebind X_train to the dense array
    X_train = X_train.toarray()
    type(X_train)
    # numpy.ndarray
    

    You should follow a similar procedure for your X_test data (your y_train and y_test seem to be already dense Numpy arrays).