pythonpandasdataframevectorscikit-learn

Convert Pandas dataframe of objects to a dataframe of vectors


I have a Pandas dataframe (over 1k of rows). There are numbers, objects, strings, and Boolean values in my dataframe. I want to convert each 'cell' of the dataframe to a vector, and work with the resulting vectors. I then plan to compare each row of vectors for similarities.

For example, My data is:

Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10

12,65e1e35b7fe333,harry Joe,1,FALSE,swe,1,142.158.0.2,10.10.0.2,text1,0
13,65e1e35b7fe599,allen,1,FALSE,swe,1,142.158.0.20,10.10.0.20,text2,0
14,65e1e35b7fe165,carter,1,FALSE,swe,1,142.158.0.21,10.10.0.21,text3,0

I want to end up with a dataframe of vectors that looks like:

Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10

Vect1,Vect2,Vect3,Vect4,Vect5,Vect6,Vect7,Vect8,Vect9,Vect10,Vect11

Vect12,Vect 13,Vect 14,Vect4,Vect5,Vect6,Vect7,Vect 15,Vect 16,Vect 17,Vect11

Vect18,Vect 19,Vect 20,Vect4,Vect5,Vect6,Vect7,Vect 21,Vect 22,Vect 23,Vect11

Is there a good way to do this in Python w SciKit maybe?

I have tried:

import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

df=mydata

vectorizer = TfidfVectorizer()

# Transform the data to feature vectors
X = vectorizer.fit_transform(df)
X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
X.head()

# Labels
y = df['label']

What I got was:

TypeError                                 Traceback (most recent call last)
Cell In[11], line 14
     11 vectorizer = TfidfVectorizer()
     13 # Transform the text data to feature vectors
---> 14 X = vectorizer.fit_transform(df)
     16 X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
     17 X.head()

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:2079, in TfidfVectorizer.fit_transform(self, raw_documents, y)
   2072 self._check_params()
   2073 self._tfidf = TfidfTransformer(
   2074     norm=self.norm,
   2075     use_idf=self.use_idf,
   2076     smooth_idf=self.smooth_idf,
   2077     sublinear_tf=self.sublinear_tf,
   2078 )
-> 2079 X = super().fit_transform(raw_documents)
   2080 self._tfidf.fit(X)
   2081 # X is already a transformed view of raw_documents so
   2082 # we set copy to False

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1338, in CountVectorizer.fit_transform(self, raw_documents, y)
   1330             warnings.warn(
   1331                 "Upper case characters found in"
   1332                 " vocabulary while 'lowercase'"
   1333                 " is True. These entries will not"
   1334                 " be matched with any documents"
   1335             )
   1336             break
-> 1338 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1340 if self.binary:
   1341     X.data.fill(1)

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1207, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
   1205 values = _make_int_array()
   1206 indptr.append(0)
-> 1207 for doc in raw_documents:
   1208     feature_counter = {}
   1209     for feature in analyze(doc):

TypeError: 'Data' object is not iterable

Solution

  • Go through the code. I think , This is what you want.

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    
    
    df = pd.DataFrame({
        'Col 0': [10, 100, 1000],
        'Col 1': ['65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165'],
        'Col 2': [True, False, False],
        'Col 3': ['Protijayi', 'Roopa', 'Gina'],
    
    })
    print(df)
    '''
      Col 0           Col 1  Col 2      Col 3
    0     10  65e1e35b7fe333   True  Protijayi
    1    100  65e1e35b7fe599  False      Roopa
    2   1000  65e1e35b7fe165  False       Gina
    '''
    # Define transformations for different types of data
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['Col 0']),  # For numerical columns
            ('cat', OneHotEncoder(), ['Col 1']),  # For categorical columns
            ('bool', 'passthrough', ['Col 2']),  # For Boolean columns (already 0 or 1)
            ('text', TfidfVectorizer(), 'Col 3')  # For text data
        ],
        remainder='drop'  # Drop untransformed columns
    )
    
    # Apply transformations
    X = preprocessor.fit_transform(df)
    
    # Convert the resulting matrix to a DataFrame for easy inspection
    X_df = pd.DataFrame(X.toarray() if hasattr(X, 'toarray') else X)
    
    print(X_df)
    '''
             0    1    2    3    4    5    6    7
    0 -0.805387  0.0  1.0  0.0  1.0  0.0  1.0  0.0
    1 -0.604040  0.0  0.0  1.0  0.0  0.0  0.0  1.0
    2  1.409428  1.0  0.0  0.0  0.0  1.0  0.0  0.0
    '''
    

    Method 2 :

    import pandas as pd
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.decomposition import PCA
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.preprocessing import OneHotEncoder
    
    #DataFrame with 20 rows.df.shape = (20, 4)
    df = pd.DataFrame({
        'Col_0': [10, 100, 1000, 500, 200, 50, 600, 300, 800, 400, 150, 700, 900, 100,
                   350, 250, 50, 450, 750, 850],
        'Col_1': [
            '65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165', '65e1e35b7fe444',
            '65e1e35b7fe555', '65e1e35b7fe666', '65e1e35b7fe777', '65e1e35b7fe888',
            '65e1e35b7fe999', '65e1e35b7fea11', '65e1e35b7fea22', '65e1e35b7fea33',
            '65e1e35b7fea44', '65e1e35b7fea55', '65e1e35b7fea66', '65e1e35b7fea77',
            '65e1e35b7fea88', '65e1e35b7fea99', '65e1e35b7fea10', '65e1e35b7fea20'
        ],
        'Col_2': [
            True, False, False, True, False, True, False, True,
            False, True, False, True, False, True, False, True,
            True, False, True, False
        ],
        'Col_3': [
            'Protijayi', 'Roopa', 'Gina', 'Anna', 'John', 'Bob', 'Alice', 'Zara',
            'Tom', 'Jerry', 'Donald', 'Mickey', 'HeMan', 'Soudipta', 'Phantom', 'YogiBear',
            'Thundarr', 'GummyBears', 'Tabu', 'Stanley'
        ],
    })
    print(df.shape)#(20, 4)
    print(df)
    # Label Encoding for categorical data
    label_encoder = LabelEncoder()
    df['Col_1'] = label_encoder.fit_transform(df['Col_1'])
    
    # Pipeline for text transformation and dimensionality reduction
    text_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=50)),  # Adjust max_features if needed
        ('pca', PCA(n_components=16))  # Reduce to 16 components
    ])
    
    # Preprocessor with ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['Col_0']),
            ('cat', 'passthrough', ['Col_1']),  # Already label encoded
            ('bool', 'passthrough', ['Col_2']),
            ('text', text_pipeline, 'Col_3')
        ]
    )
    
    # Apply transformations
    X = preprocessor.fit_transform(df)
    
    # Create the appropriate number of column names
    columns = ['Col_0_scaled', 'Col_1_encoded', 'Col_2'] + [f'Col_3_pca_{i+1}' for i in range(16)]
    
    # Convert the resulting matrix to a DataFrame
    X_df = pd.DataFrame(X, columns=columns)
    
    print(X_df.to_string())
    '''
       Col_0_scaled  Col_1_encoded  Col_2  Col_3_pca_1  Col_3_pca_2  Col_3_pca_3  Col_3_pca_4  Col_3_pca_5  Col_3_pca_6  Col_3_pca_7  Col_3_pca_8  Col_3_pca_9  Col_3_pca_10  Col_3_pca_11  Col_3_pca_12  Col_3_pca_13  Col_3_pca_14  Col_3_pca_15  Col_3_pca_16
    0      -1.334401            1.0    1.0    -0.073105    -0.146901     0.067336     0.125044     0.090760     0.009809     0.305958     0.139551     0.156774      0.527018      0.060816     -0.203797     -0.374745     -0.141668     -0.421094      0.188830
    1      -1.045361            4.0    0.0     0.115282    -0.015415    -0.150784    -0.360184    -0.080453     0.019688    -0.031309     0.255278     0.108915     -0.311483     -0.167477      0.008409     -0.300929     -0.220409      0.084341     -0.017631
    2       1.845039            0.0    0.0     0.160889    -0.020035     0.330186     0.127126     0.273287    -0.071395    -0.319679     0.219527    -0.279286     -0.004423     -0.239776     -0.416249      0.067402      0.081037     -0.241028     -0.213331
    3       0.239261            2.0    1.0     0.554679     0.130482     0.081359     0.118546     0.214038    -0.363213     0.385183    -0.148279    -0.027544     -0.001442     -0.244361      0.424055      0.026407     -0.058623      0.014459      0.048063
    4      -0.724206            3.0    0.0     0.307266     0.050949     0.310680    -0.297831    -0.383361     0.129143    -0.158281    -0.122024     0.075167      0.182113      0.224166      0.004674      0.072569      0.380125     -0.010579      0.439877
    5      -1.205939            5.0    1.0    -0.113145     0.538810    -0.043185    -0.150704     0.057516     0.046128     0.343291     0.039415    -0.118799     -0.155313      0.381745     -0.250619      0.343292     -0.315427     -0.122863      0.068318
    6       0.560416            6.0    0.0    -0.204690     0.181405     0.543654     0.340886    -0.275855    -0.037939    -0.050138    -0.239225    -0.052624     -0.239889      0.117819      0.052465     -0.373745     -0.149760      0.156353     -0.179371
    7      -0.403050            7.0    1.0    -0.178138    -0.281203    -0.081624     0.090432     0.090038    -0.427198    -0.084569    -0.062534     0.569291     -0.134243      0.239196     -0.085040      0.016309     -0.078131      0.113432      0.097394
    8       1.202728            8.0    0.0     0.071775    -0.180170     0.087681    -0.318030    -0.142572     0.139888    -0.008269     0.168043     0.011826     -0.299417     -0.171258      0.052936     -0.101520     -0.211515     -0.154248      0.049600
    9      -0.081895           10.0    1.0    -0.353030     0.077459    -0.044642     0.225100     0.078206     0.115056    -0.117477    -0.203697    -0.136310      0.074003     -0.313551      0.184199      0.171788     -0.074015      0.060184      0.428194
    10     -0.884784           12.0    0.0    -0.183930     0.011794     0.095495     0.182962    -0.045277     0.198398     0.415487     0.507330     0.169621     -0.066478     -0.165655     -0.000376      0.133879      0.459423      0.324964     -0.064214
    11      0.881572           13.0    1.0    -0.195712     0.218775    -0.177670    -0.200639    -0.208203    -0.025052     0.042678    -0.401426     0.275074      0.075878     -0.359991     -0.039838      0.072103      0.257290     -0.342703     -0.426637
    12      1.523883           14.0    0.0     0.128607     0.480509    -0.385878     0.091855     0.056152    -0.134789    -0.352402     0.208674    -0.017009      0.200666      0.185478      0.041582     -0.337446      0.178371      0.190868     -0.087371
    13     -1.045361           15.0    1.0     0.166597    -0.089110    -0.109782     0.104984     0.528854     0.529754    -0.056245    -0.326489     0.153271     -0.298661      0.157056     -0.061705     -0.097035      0.178674     -0.027044      0.043495
    14     -0.242472           16.0    0.0    -0.085249     0.037130     0.142847    -0.251928     0.195658     0.027961    -0.248846    -0.007671     0.135104      0.343121     -0.248098     -0.064966      0.228748     -0.321116      0.422975      0.018823
    15     -0.563628           17.0    1.0    -0.190203    -0.053749    -0.160313     0.149640    -0.063512    -0.206393    -0.264280     0.238996    -0.146314     -0.244733      0.020880      0.278250      0.181423      0.106724     -0.400457      0.237057
    16     -1.205939           18.0    1.0    -0.083474    -0.194554     0.005645    -0.045870     0.003321     0.321345    -0.048387     0.106808    -0.108053      0.278396      0.292693      0.555157      0.063186     -0.166453     -0.074729     -0.392409
    17      0.078683           19.0    0.0     0.138741    -0.283597     0.057301    -0.004913    -0.023166    -0.249826    -0.017610    -0.051725    -0.066780      0.012460      0.282140     -0.096769      0.381560      0.099963      0.068412     -0.286344
    18      1.042150            9.0    1.0     0.283151    -0.208560    -0.415019     0.392379    -0.470813     0.187894     0.039962    -0.102994    -0.140439      0.044455     -0.102141     -0.281541      0.093164     -0.231794      0.139206     -0.001186
    19      1.363305           11.0    0.0    -0.266310    -0.254019    -0.153286    -0.318855     0.105382    -0.209260     0.224933    -0.217558    -0.561887      0.017975      0.050319     -0.100827     -0.266410      0.227304      0.219550      0.048843
    
    '''
    

    Undertsanding ColumnTransformer :

    ColumnTransformer applies its transformers in parallel. When using a ColumnTransformer, if you pass the same column to multiple transformers, that column will appear multiple times in the output, once for each transformation applied to it.

     from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    import pandas as pd
    
    data = pd.DataFrame({
        'numeric_feature': [1, 2, 3],
        'categorical_feature': ['A', 'B', 'A']
    })
    print(data)
    '''
       numeric_feature categorical_feature
    0                1                   A
    1                2                   B
    2                3                   A
    '''
    ct = ColumnTransformer(
    transformers = [
    ('num_identity','passthrough',['numeric_feature']),
    #passing numeric_feature again
    ('num',StandardScaler(),['numeric_feature']),
    ('cat',OneHotEncoder(),['categorical_feature'])] )
    transformedData = ct.fit_transform(data)
    df_transformed = pd.DataFrame(
    transformedData,columns = ['original_numeric','scaled_numeric','catA','catB'])
    print(df_transformed)
    '''
       original_numeric  scaled_numeric  catA  catB
    0               1.0       -1.224745   1.0   0.0
    1               2.0        0.000000   0.0   1.0
    2               3.0        1.224745   1.0   0.0
    '''
    

    method 3 :

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.decomposition import IncrementalPCA
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.feature_extraction.text import TfidfVectorizer
    from scipy.sparse import hstack, csr_matrix
    import scipy.sparse
    
    df = pd.DataFrame({
        'num_col1': [10, 20, 30, 40, 50],  # Numeric column to be standardized and reduced
        'lookupTable': [1, 2, 3, 4, 5],    # Lookup table (should not be standardized)
        'cat_col': ['R', 'A', 'S', 'R', 'S'],  # Categorical column
        'text_col': ['R world', 'S fun', 'Data science', 'Text data', 'Preprocessing text']  # Text column
    })
    def selectNumCols(df1):
        aa = df1.select_dtypes(include=['int64']).columns.difference(['lookupTable'])
        return aa 
    
    num_cols = selectNumCols(df)
    
    n_components = min(2,len(num_cols))
    
    if len(num_cols) > 1 :
        num_pipeline = Pipeline([
        ('scaler',StandardScaler()),
        ('IPCA',IncrementalPCA(n_components = min(2,len(num_cols))))
            
        ])
    else:
        num_pipeline = Pipeline([('scaler',StandardScaler())])  
    
    # Define the column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),  # Apply the num_pipeline to numeric columns
            ('cat', OneHotEncoder(), ['cat_col']),   # Apply OneHotEncoder to the categorical column
            ('text', TfidfVectorizer(), 'text_col'),  # Apply TfidfVectorizer to the text column
            ('lookup', 'passthrough', ['lookupTable'])  # Pass through the lookup table column
        ]
        #sparse_threshold=0.3  # Keep sparse matrices when the data is more than 30% sparse
    )
    
    X = preprocessor.fit_transform(df)
    
    num_cols = [f'num_ipca{i}' for i in range(n_components)] if len(num_cols) > 1 else list(num_cols)
    
    oneHotColNames = preprocessor.named_transformers_['cat'].get_feature_names_out()
    print(oneHotColNames)
    
    tfidColNames = preprocessor.named_transformers_['text'].get_feature_names_out()
    print(tfidColNames)
    
    cols =  ['lookupTable'] + num_cols + list(oneHotColNames) + list(tfidColNames) 
    print(cols)
    X_df = pd.DataFrame(X,columns = cols)
    print(X_df.to_string())
    '''
       lookupTable  num_col1  cat_col_A  cat_col_R  cat_col_S  data       fun  preprocessing   science  text  world
    0    -1.414214       0.0        1.0        0.0   0.000000   0.0  0.000000       0.000000  0.000000   1.0    1.0
    1    -0.707107       1.0        0.0        0.0   0.000000   1.0  0.000000       0.000000  0.000000   0.0    2.0
    2     0.000000       0.0        0.0        1.0   0.627914   0.0  0.000000       0.778283  0.000000   0.0    3.0
    3     0.707107       0.0        1.0        0.0   0.707107   0.0  0.000000       0.000000  0.707107   0.0    4.0
    4     1.414214       0.0        0.0        1.0   0.000000   0.0  0.778283       0.000000  0.627914   0.0    5.0
    
    '''
    

    method 5 :

    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.decomposition import IncrementalPCA
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.feature_extraction.text import TfidfVectorizer
    from scipy.sparse import hstack, csr_matrix
    from scipy.sparse import csr_matrix
    from Only_Testing1.test1 import aa
    
    df = pd.DataFrame({
        'num_col1': [10, 20, 30, 40, 50],  # Numeric column to be standardized and reduced
        'lookupTable': [1, 2, 3, 4, 5],    # Lookup table (should not be standardized)
        'cat_col': ['R', 'A', 'S', 'R', 'S'],  # Categorical column
        'text_col': ['R world', 'S fun', 'Data science', 'Text data', 'Preprocessing text']  # Text column
    })
    # Custom function to select numeric columns except 'lookupTable'
    def select_numeric_columns(X):
        return X.select_dtypes(include=['int64']).columns.difference(['lookupTable'])
    
    # Determine the number of numeric columns after excluding 'lookupTable'
    numeric_columns = select_numeric_columns(df)
    n_numeric_features = len(numeric_columns)
    
    # Set n_components to the minimum of 2 or the number of numeric features
    n_components = min(2, n_numeric_features)
    
    # Conditionally include IncrementalPCA in the pipeline
    if n_numeric_features > 1:
        num_pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Standardize numeric data
            ('ipca', IncrementalPCA(n_components=n_components))   # Apply IncrementalPCA to reduce to n_components
        ])
    else:
        num_pipeline = Pipeline([
            ('scaler', StandardScaler())  # Standardize numeric data only (no PCA)
        ])
    
    # Define the column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, numeric_columns),  # Apply the num_pipeline to numeric columns
            ('cat', OneHotEncoder(sparse_output=True), ['cat_col']),   # Apply OneHotEncoder to the categorical column
            ('text', TfidfVectorizer(), 'text_col'),  # Apply TfidfVectorizer to the text column
            ('lookup', 'passthrough', ['lookupTable'])  # Pass through the lookup table column
        ],
        sparse_threshold=0.3  # Keep sparse matrices when the data is more than 30% sparse
    )
    
    # Apply transformations
    X = preprocessor.fit_transform(df)
    
    # Convert dense matrix to sparse if it's not already sparse
    if not isinstance(X, csr_matrix):
        X = csr_matrix(X)
    
    
    # Automatically generate column names from transformers
    num_cols = [f'num_ipca_{i+1}' for i in range(n_components)] if n_numeric_features > 1 else list(numeric_columns)
    onehot_col_names = preprocessor.named_transformers_['cat'].get_feature_names_out(['cat_col'])
    tfidf_feature_names = preprocessor.named_transformers_['text'].get_feature_names_out()
    
    # Combine all the column names
    cols = num_cols + list(onehot_col_names) + list(tfidf_feature_names) + ['lookupTable']
    
    # Convert the resulting sparse matrix to a sparse DataFrame
    X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=cols)
    
    print(X_df.to_string())
    '''
       num_col1  cat_col_A  cat_col_R  cat_col_S      data  fun  preprocessing   science      text  world  lookupTable
    0 -1.414214          0        1.0          0         0    0              0         0         0    1.0          1.0
    1 -0.707107        1.0          0          0         0  1.0              0         0         0      0          2.0
    2         0          0          0        1.0  0.627914    0              0  0.778283         0      0          3.0
    3  0.707107          0        1.0          0  0.707107    0              0         0  0.707107      0          4.0
    4  1.414214          0          0        1.0         0    0       0.778283         0  0.627914      0          5.0
    
    '''