python pandas dataframe vector scikit-learn

Convert Pandas dataframe of objects to a dataframe of vectors

I have a Pandas dataframe (over 1k of rows). There are numbers, objects, strings, and Boolean values in my dataframe. I want to convert each 'cell' of the dataframe to a vector, and work with the resulting vectors. I then plan to compare each row of vectors for similarities.

For example, My data is:

Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10

12,65e1e35b7fe333,harry Joe,1,FALSE,swe,1,142.158.0.2,10.10.0.2,text1,0
13,65e1e35b7fe599,allen,1,FALSE,swe,1,142.158.0.20,10.10.0.20,text2,0
14,65e1e35b7fe165,carter,1,FALSE,swe,1,142.158.0.21,10.10.0.21,text3,0

I want to end up with a dataframe of vectors that looks like:

Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10

Vect1,Vect2,Vect3,Vect4,Vect5,Vect6,Vect7,Vect8,Vect9,Vect10,Vect11

Vect12,Vect 13,Vect 14,Vect4,Vect5,Vect6,Vect7,Vect 15,Vect 16,Vect 17,Vect11

Vect18,Vect 19,Vect 20,Vect4,Vect5,Vect6,Vect7,Vect 21,Vect 22,Vect 23,Vect11

Is there a good way to do this in Python w SciKit maybe?

I have tried:

import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

df=mydata

vectorizer = TfidfVectorizer()

# Transform the data to feature vectors
X = vectorizer.fit_transform(df)
X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
X.head()

# Labels
y = df['label']

What I got was:

TypeError                                 Traceback (most recent call last)
Cell In[11], line 14
     11 vectorizer = TfidfVectorizer()
     13 # Transform the text data to feature vectors
---> 14 X = vectorizer.fit_transform(df)
     16 X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
     17 X.head()

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:2079, in TfidfVectorizer.fit_transform(self, raw_documents, y)
   2072 self._check_params()
   2073 self._tfidf = TfidfTransformer(
   2074     norm=self.norm,
   2075     use_idf=self.use_idf,
   2076     smooth_idf=self.smooth_idf,
   2077     sublinear_tf=self.sublinear_tf,
   2078 )
-> 2079 X = super().fit_transform(raw_documents)
   2080 self._tfidf.fit(X)
   2081 # X is already a transformed view of raw_documents so
   2082 # we set copy to False

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1338, in CountVectorizer.fit_transform(self, raw_documents, y)
   1330             warnings.warn(
   1331                 "Upper case characters found in"
   1332                 " vocabulary while 'lowercase'"
   1333                 " is True. These entries will not"
   1334                 " be matched with any documents"
   1335             )
   1336             break
-> 1338 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
   1340 if self.binary:
   1341     X.data.fill(1)

File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1207, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
   1205 values = _make_int_array()
   1206 indptr.append(0)
-> 1207 for doc in raw_documents:
   1208     feature_counter = {}
   1209     for feature in analyze(doc):

TypeError: 'Data' object is not iterable

Solution

Go through the code. I think , This is what you want.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


df = pd.DataFrame({
    'Col 0': [10, 100, 1000],
    'Col 1': ['65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165'],
    'Col 2': [True, False, False],
    'Col 3': ['Protijayi', 'Roopa', 'Gina'],

})
print(df)
'''
  Col 0           Col 1  Col 2      Col 3
0     10  65e1e35b7fe333   True  Protijayi
1    100  65e1e35b7fe599  False      Roopa
2   1000  65e1e35b7fe165  False       Gina
'''
# Define transformations for different types of data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Col 0']),  # For numerical columns
        ('cat', OneHotEncoder(), ['Col 1']),  # For categorical columns
        ('bool', 'passthrough', ['Col 2']),  # For Boolean columns (already 0 or 1)
        ('text', TfidfVectorizer(), 'Col 3')  # For text data
    ],
    remainder='drop'  # Drop untransformed columns
)

# Apply transformations
X = preprocessor.fit_transform(df)

# Convert the resulting matrix to a DataFrame for easy inspection
X_df = pd.DataFrame(X.toarray() if hasattr(X, 'toarray') else X)

print(X_df)
'''
         0    1    2    3    4    5    6    7
0 -0.805387  0.0  1.0  0.0  1.0  0.0  1.0  0.0
1 -0.604040  0.0  0.0  1.0  0.0  0.0  0.0  1.0
2  1.409428  1.0  0.0  0.0  0.0  1.0  0.0  0.0
'''

Method 2 :

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

#DataFrame with 20 rows.df.shape = (20, 4)
df = pd.DataFrame({
    'Col_0': [10, 100, 1000, 500, 200, 50, 600, 300, 800, 400, 150, 700, 900, 100,
               350, 250, 50, 450, 750, 850],
    'Col_1': [
        '65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165', '65e1e35b7fe444',
        '65e1e35b7fe555', '65e1e35b7fe666', '65e1e35b7fe777', '65e1e35b7fe888',
        '65e1e35b7fe999', '65e1e35b7fea11', '65e1e35b7fea22', '65e1e35b7fea33',
        '65e1e35b7fea44', '65e1e35b7fea55', '65e1e35b7fea66', '65e1e35b7fea77',
        '65e1e35b7fea88', '65e1e35b7fea99', '65e1e35b7fea10', '65e1e35b7fea20'
    ],
    'Col_2': [
        True, False, False, True, False, True, False, True,
        False, True, False, True, False, True, False, True,
        True, False, True, False
    ],
    'Col_3': [
        'Protijayi', 'Roopa', 'Gina', 'Anna', 'John', 'Bob', 'Alice', 'Zara',
        'Tom', 'Jerry', 'Donald', 'Mickey', 'HeMan', 'Soudipta', 'Phantom', 'YogiBear',
        'Thundarr', 'GummyBears', 'Tabu', 'Stanley'
    ],
})
print(df.shape)#(20, 4)
print(df)
# Label Encoding for categorical data
label_encoder = LabelEncoder()
df['Col_1'] = label_encoder.fit_transform(df['Col_1'])

# Pipeline for text transformation and dimensionality reduction
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=50)),  # Adjust max_features if needed
    ('pca', PCA(n_components=16))  # Reduce to 16 components
])

# Preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Col_0']),
        ('cat', 'passthrough', ['Col_1']),  # Already label encoded
        ('bool', 'passthrough', ['Col_2']),
        ('text', text_pipeline, 'Col_3')
    ]
)

# Apply transformations
X = preprocessor.fit_transform(df)

# Create the appropriate number of column names
columns = ['Col_0_scaled', 'Col_1_encoded', 'Col_2'] + [f'Col_3_pca_{i+1}' for i in range(16)]

# Convert the resulting matrix to a DataFrame
X_df = pd.DataFrame(X, columns=columns)

print(X_df.to_string())
'''
   Col_0_scaled  Col_1_encoded  Col_2  Col_3_pca_1  Col_3_pca_2  Col_3_pca_3  Col_3_pca_4  Col_3_pca_5  Col_3_pca_6  Col_3_pca_7  Col_3_pca_8  Col_3_pca_9  Col_3_pca_10  Col_3_pca_11  Col_3_pca_12  Col_3_pca_13  Col_3_pca_14  Col_3_pca_15  Col_3_pca_16
0      -1.334401            1.0    1.0    -0.073105    -0.146901     0.067336     0.125044     0.090760     0.009809     0.305958     0.139551     0.156774      0.527018      0.060816     -0.203797     -0.374745     -0.141668     -0.421094      0.188830
1      -1.045361            4.0    0.0     0.115282    -0.015415    -0.150784    -0.360184    -0.080453     0.019688    -0.031309     0.255278     0.108915     -0.311483     -0.167477      0.008409     -0.300929     -0.220409      0.084341     -0.017631
2       1.845039            0.0    0.0     0.160889    -0.020035     0.330186     0.127126     0.273287    -0.071395    -0.319679     0.219527    -0.279286     -0.004423     -0.239776     -0.416249      0.067402      0.081037     -0.241028     -0.213331
3       0.239261            2.0    1.0     0.554679     0.130482     0.081359     0.118546     0.214038    -0.363213     0.385183    -0.148279    -0.027544     -0.001442     -0.244361      0.424055      0.026407     -0.058623      0.014459      0.048063
4      -0.724206            3.0    0.0     0.307266     0.050949     0.310680    -0.297831    -0.383361     0.129143    -0.158281    -0.122024     0.075167      0.182113      0.224166      0.004674      0.072569      0.380125     -0.010579      0.439877
5      -1.205939            5.0    1.0    -0.113145     0.538810    -0.043185    -0.150704     0.057516     0.046128     0.343291     0.039415    -0.118799     -0.155313      0.381745     -0.250619      0.343292     -0.315427     -0.122863      0.068318
6       0.560416            6.0    0.0    -0.204690     0.181405     0.543654     0.340886    -0.275855    -0.037939    -0.050138    -0.239225    -0.052624     -0.239889      0.117819      0.052465     -0.373745     -0.149760      0.156353     -0.179371
7      -0.403050            7.0    1.0    -0.178138    -0.281203    -0.081624     0.090432     0.090038    -0.427198    -0.084569    -0.062534     0.569291     -0.134243      0.239196     -0.085040      0.016309     -0.078131      0.113432      0.097394
8       1.202728            8.0    0.0     0.071775    -0.180170     0.087681    -0.318030    -0.142572     0.139888    -0.008269     0.168043     0.011826     -0.299417     -0.171258      0.052936     -0.101520     -0.211515     -0.154248      0.049600
9      -0.081895           10.0    1.0    -0.353030     0.077459    -0.044642     0.225100     0.078206     0.115056    -0.117477    -0.203697    -0.136310      0.074003     -0.313551      0.184199      0.171788     -0.074015      0.060184      0.428194
10     -0.884784           12.0    0.0    -0.183930     0.011794     0.095495     0.182962    -0.045277     0.198398     0.415487     0.507330     0.169621     -0.066478     -0.165655     -0.000376      0.133879      0.459423      0.324964     -0.064214
11      0.881572           13.0    1.0    -0.195712     0.218775    -0.177670    -0.200639    -0.208203    -0.025052     0.042678    -0.401426     0.275074      0.075878     -0.359991     -0.039838      0.072103      0.257290     -0.342703     -0.426637
12      1.523883           14.0    0.0     0.128607     0.480509    -0.385878     0.091855     0.056152    -0.134789    -0.352402     0.208674    -0.017009      0.200666      0.185478      0.041582     -0.337446      0.178371      0.190868     -0.087371
13     -1.045361           15.0    1.0     0.166597    -0.089110    -0.109782     0.104984     0.528854     0.529754    -0.056245    -0.326489     0.153271     -0.298661      0.157056     -0.061705     -0.097035      0.178674     -0.027044      0.043495
14     -0.242472           16.0    0.0    -0.085249     0.037130     0.142847    -0.251928     0.195658     0.027961    -0.248846    -0.007671     0.135104      0.343121     -0.248098     -0.064966      0.228748     -0.321116      0.422975      0.018823
15     -0.563628           17.0    1.0    -0.190203    -0.053749    -0.160313     0.149640    -0.063512    -0.206393    -0.264280     0.238996    -0.146314     -0.244733      0.020880      0.278250      0.181423      0.106724     -0.400457      0.237057
16     -1.205939           18.0    1.0    -0.083474    -0.194554     0.005645    -0.045870     0.003321     0.321345    -0.048387     0.106808    -0.108053      0.278396      0.292693      0.555157      0.063186     -0.166453     -0.074729     -0.392409
17      0.078683           19.0    0.0     0.138741    -0.283597     0.057301    -0.004913    -0.023166    -0.249826    -0.017610    -0.051725    -0.066780      0.012460      0.282140     -0.096769      0.381560      0.099963      0.068412     -0.286344
18      1.042150            9.0    1.0     0.283151    -0.208560    -0.415019     0.392379    -0.470813     0.187894     0.039962    -0.102994    -0.140439      0.044455     -0.102141     -0.281541      0.093164     -0.231794      0.139206     -0.001186
19      1.363305           11.0    0.0    -0.266310    -0.254019    -0.153286    -0.318855     0.105382    -0.209260     0.224933    -0.217558    -0.561887      0.017975      0.050319     -0.100827     -0.266410      0.227304      0.219550      0.048843

'''

Undertsanding ColumnTransformer :

ColumnTransformer applies its transformers in parallel. When using a ColumnTransformer, if you pass the same column to multiple transformers, that column will appear multiple times in the output, once for each transformation applied to it.

 from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

data = pd.DataFrame({
    'numeric_feature': [1, 2, 3],
    'categorical_feature': ['A', 'B', 'A']
})
print(data)
'''
   numeric_feature categorical_feature
0                1                   A
1                2                   B
2                3                   A
'''
ct = ColumnTransformer(
transformers = [
('num_identity','passthrough',['numeric_feature']),
#passing numeric_feature again
('num',StandardScaler(),['numeric_feature']),
('cat',OneHotEncoder(),['categorical_feature'])] )
transformedData = ct.fit_transform(data)
df_transformed = pd.DataFrame(
transformedData,columns = ['original_numeric','scaled_numeric','catA','catB'])
print(df_transformed)
'''
   original_numeric  scaled_numeric  catA  catB
0               1.0       -1.224745   1.0   0.0
1               2.0        0.000000   0.0   1.0
2               3.0        1.224745   1.0   0.0
'''

method 3 :

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import scipy.sparse

df = pd.DataFrame({
    'num_col1': [10, 20, 30, 40, 50],  # Numeric column to be standardized and reduced
    'lookupTable': [1, 2, 3, 4, 5],    # Lookup table (should not be standardized)
    'cat_col': ['R', 'A', 'S', 'R', 'S'],  # Categorical column
    'text_col': ['R world', 'S fun', 'Data science', 'Text data', 'Preprocessing text']  # Text column
})
def selectNumCols(df1):
    aa = df1.select_dtypes(include=['int64']).columns.difference(['lookupTable'])
    return aa 

num_cols = selectNumCols(df)

n_components = min(2,len(num_cols))

if len(num_cols) > 1 :
    num_pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('IPCA',IncrementalPCA(n_components = min(2,len(num_cols))))
        
    ])
else:
    num_pipeline = Pipeline([('scaler',StandardScaler())])  

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),  # Apply the num_pipeline to numeric columns
        ('cat', OneHotEncoder(), ['cat_col']),   # Apply OneHotEncoder to the categorical column
        ('text', TfidfVectorizer(), 'text_col'),  # Apply TfidfVectorizer to the text column
        ('lookup', 'passthrough', ['lookupTable'])  # Pass through the lookup table column
    ]
    #sparse_threshold=0.3  # Keep sparse matrices when the data is more than 30% sparse
)

X = preprocessor.fit_transform(df)

num_cols = [f'num_ipca{i}' for i in range(n_components)] if len(num_cols) > 1 else list(num_cols)

oneHotColNames = preprocessor.named_transformers_['cat'].get_feature_names_out()
print(oneHotColNames)

tfidColNames = preprocessor.named_transformers_['text'].get_feature_names_out()
print(tfidColNames)

cols =  ['lookupTable'] + num_cols + list(oneHotColNames) + list(tfidColNames) 
print(cols)
X_df = pd.DataFrame(X,columns = cols)
print(X_df.to_string())
'''
   lookupTable  num_col1  cat_col_A  cat_col_R  cat_col_S  data       fun  preprocessing   science  text  world
0    -1.414214       0.0        1.0        0.0   0.000000   0.0  0.000000       0.000000  0.000000   1.0    1.0
1    -0.707107       1.0        0.0        0.0   0.000000   1.0  0.000000       0.000000  0.000000   0.0    2.0
2     0.000000       0.0        0.0        1.0   0.627914   0.0  0.000000       0.778283  0.000000   0.0    3.0
3     0.707107       0.0        1.0        0.0   0.707107   0.0  0.000000       0.000000  0.707107   0.0    4.0
4     1.414214       0.0        0.0        1.0   0.000000   0.0  0.778283       0.000000  0.627914   0.0    5.0

'''

method 5 :

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from scipy.sparse import csr_matrix
from Only_Testing1.test1 import aa

df = pd.DataFrame({
    'num_col1': [10, 20, 30, 40, 50],  # Numeric column to be standardized and reduced
    'lookupTable': [1, 2, 3, 4, 5],    # Lookup table (should not be standardized)
    'cat_col': ['R', 'A', 'S', 'R', 'S'],  # Categorical column
    'text_col': ['R world', 'S fun', 'Data science', 'Text data', 'Preprocessing text']  # Text column
})
# Custom function to select numeric columns except 'lookupTable'
def select_numeric_columns(X):
    return X.select_dtypes(include=['int64']).columns.difference(['lookupTable'])

# Determine the number of numeric columns after excluding 'lookupTable'
numeric_columns = select_numeric_columns(df)
n_numeric_features = len(numeric_columns)

# Set n_components to the minimum of 2 or the number of numeric features
n_components = min(2, n_numeric_features)

# Conditionally include IncrementalPCA in the pipeline
if n_numeric_features > 1:
    num_pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize numeric data
        ('ipca', IncrementalPCA(n_components=n_components))   # Apply IncrementalPCA to reduce to n_components
    ])
else:
    num_pipeline = Pipeline([
        ('scaler', StandardScaler())  # Standardize numeric data only (no PCA)
    ])

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_columns),  # Apply the num_pipeline to numeric columns
        ('cat', OneHotEncoder(sparse_output=True), ['cat_col']),   # Apply OneHotEncoder to the categorical column
        ('text', TfidfVectorizer(), 'text_col'),  # Apply TfidfVectorizer to the text column
        ('lookup', 'passthrough', ['lookupTable'])  # Pass through the lookup table column
    ],
    sparse_threshold=0.3  # Keep sparse matrices when the data is more than 30% sparse
)

# Apply transformations
X = preprocessor.fit_transform(df)

# Convert dense matrix to sparse if it's not already sparse
if not isinstance(X, csr_matrix):
    X = csr_matrix(X)


# Automatically generate column names from transformers
num_cols = [f'num_ipca_{i+1}' for i in range(n_components)] if n_numeric_features > 1 else list(numeric_columns)
onehot_col_names = preprocessor.named_transformers_['cat'].get_feature_names_out(['cat_col'])
tfidf_feature_names = preprocessor.named_transformers_['text'].get_feature_names_out()

# Combine all the column names
cols = num_cols + list(onehot_col_names) + list(tfidf_feature_names) + ['lookupTable']

# Convert the resulting sparse matrix to a sparse DataFrame
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=cols)

print(X_df.to_string())
'''
   num_col1  cat_col_A  cat_col_R  cat_col_S      data  fun  preprocessing   science      text  world  lookupTable
0 -1.414214          0        1.0          0         0    0              0         0         0    1.0          1.0
1 -0.707107        1.0          0          0         0  1.0              0         0         0      0          2.0
2         0          0          0        1.0  0.627914    0              0  0.778283         0      0          3.0
3  0.707107          0        1.0          0  0.707107    0              0         0  0.707107      0          4.0
4  1.414214          0          0        1.0         0    0       0.778283         0  0.627914      0          5.0

'''