I have a Pandas DataFrame (over 1,000 rows) containing numbers, objects, strings, and Boolean values. I want to convert each 'cell' of the DataFrame to a vector and work with the resulting vectors; I then plan to compare each row of vectors for similarities.
For example, my data is:
Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10
12,65e1e35b7fe333,harry Joe,1,FALSE,swe,1,142.158.0.2,10.10.0.2,text1,0
13,65e1e35b7fe599,allen,1,FALSE,swe,1,142.158.0.20,10.10.0.20,text2,0
14,65e1e35b7fe165,carter,1,FALSE,swe,1,142.158.0.21,10.10.0.21,text3,0
I want to end up with a dataframe of vectors that looks like:
Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,Col 8,Col 9,Col 10
Vect1,Vect2,Vect3,Vect4,Vect5,Vect6,Vect7,Vect8,Vect9,Vect10,Vect11
Vect12,Vect13,Vect14,Vect4,Vect5,Vect6,Vect7,Vect15,Vect16,Vect17,Vect11
Vect18,Vect19,Vect20,Vect4,Vect5,Vect6,Vect7,Vect21,Vect22,Vect23,Vect11
Is there a good way to do this in Python, maybe with scikit-learn?
I have tried:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
df=mydata
vectorizer = TfidfVectorizer()
# Transform the data to feature vectors
X = vectorizer.fit_transform(df)
X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
X.head()
# Labels
y = df['label']
What I got was:
TypeError Traceback (most recent call last)
Cell In[11], line 14
11 vectorizer = TfidfVectorizer()
13 # Transform the text data to feature vectors
---> 14 X = vectorizer.fit_transform(df)
16 X = pd.DataFrame(_X.todense(), index=df.index, columns=vectorizer.vocabulary_)
17 X.head()
File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:2079, in TfidfVectorizer.fit_transform(self, raw_documents, y)
2072 self._check_params()
2073 self._tfidf = TfidfTransformer(
2074 norm=self.norm,
2075 use_idf=self.use_idf,
2076 smooth_idf=self.smooth_idf,
2077 sublinear_tf=self.sublinear_tf,
2078 )
-> 2079 X = super().fit_transform(raw_documents)
2080 self._tfidf.fit(X)
2081 # X is already a transformed view of raw_documents so
2082 # we set copy to False
File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1338, in CountVectorizer.fit_transform(self, raw_documents, y)
1330 warnings.warn(
1331 "Upper case characters found in"
1332 " vocabulary while 'lowercase'"
1333 " is True. These entries will not"
1334 " be matched with any documents"
1335 )
1336 break
-> 1338 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
1340 if self.binary:
1341 X.data.fill(1)
File /anaconda/envs/xxx_py38/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:1207, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
1205 values = _make_int_array()
1206 indptr.append(0)
-> 1207 for doc in raw_documents:
1208 feature_counter = {}
1209 for feature in analyze(doc):
TypeError: 'Data' object is not iterable
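The TypeError is the clue here: TfidfVectorizer.fit_transform expects an iterable of raw text documents (one string per document), but iterating over a DataFrame yields its column labels, not its rows, so the vectorizer cannot analyze them. If all you need is one vector per row, here is a minimal sketch of the quick fix, reusing your df and assuming every cell can be cast to str (the joined-text approach is just one illustration, not the only option):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Collapse each row into a single text document, so the vectorizer
# receives an iterable of strings, one per row
docs = df.astype(str).agg(' '.join, axis=1)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)  # sparse matrix: one row vector per DataFrame row

That throws away the per-column structure, though, which is why the per-column transformers below are usually the better fit.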
Go through the code below. I think this is what you want.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
df = pd.DataFrame({
'Col 0': [10, 100, 1000],
'Col 1': ['65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165'],
'Col 2': [True, False, False],
'Col 3': ['Protijayi', 'Roopa', 'Gina'],
})
print(df)
'''
Col 0 Col 1 Col 2 Col 3
0 10 65e1e35b7fe333 True Protijayi
1 100 65e1e35b7fe599 False Roopa
2 1000 65e1e35b7fe165 False Gina
'''
# Define transformations for different types of data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Col 0']),   # Numerical column: standardize
        ('cat', OneHotEncoder(), ['Col 1']),    # Categorical column: one-hot encode
        ('bool', 'passthrough', ['Col 2']),     # Boolean column: passed through (becomes 1.0/0.0 in the float output)
        ('text', TfidfVectorizer(), 'Col 3')    # Text column: TF-IDF
    ],
    remainder='drop'  # Drop untransformed columns
)
# Apply transformations
X = preprocessor.fit_transform(df)
# Convert the resulting matrix to a DataFrame for easy inspection
X_df = pd.DataFrame(X.toarray() if hasattr(X, 'toarray') else X)
print(X_df)
'''
0 1 2 3 4 5 6 7
0 -0.805387 0.0 1.0 0.0 1.0 0.0 1.0 0.0
1 -0.604040 0.0 0.0 1.0 0.0 0.0 0.0 1.0
2 1.409428 1.0 0.0 0.0 0.0 1.0 0.0 0.0
'''
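Since the end goal is to compare rows for similarity, the transformed matrix can go straight into a pairwise similarity function. A minimal sketch, reusing X from above (cosine similarity is one common choice, not the only one):

from sklearn.metrics.pairwise import cosine_similarity

# sim[i, j] close to 1 means rows i and j were mapped to similar vectors
sim = cosine_similarity(X)
print(pd.DataFrame(sim))

cosine_similarity accepts dense arrays as well as sparse matrices, so it works whatever the ColumnTransformer returns.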
Method 2:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# DataFrame with 20 rows; df.shape == (20, 4)
df = pd.DataFrame({
'Col_0': [10, 100, 1000, 500, 200, 50, 600, 300, 800, 400, 150, 700, 900, 100,
350, 250, 50, 450, 750, 850],
'Col_1': [
'65e1e35b7fe333', '65e1e35b7fe599', '65e1e35b7fe165', '65e1e35b7fe444',
'65e1e35b7fe555', '65e1e35b7fe666', '65e1e35b7fe777', '65e1e35b7fe888',
'65e1e35b7fe999', '65e1e35b7fea11', '65e1e35b7fea22', '65e1e35b7fea33',
'65e1e35b7fea44', '65e1e35b7fea55', '65e1e35b7fea66', '65e1e35b7fea77',
'65e1e35b7fea88', '65e1e35b7fea99', '65e1e35b7fea10', '65e1e35b7fea20'
],
'Col_2': [
True, False, False, True, False, True, False, True,
False, True, False, True, False, True, False, True,
True, False, True, False
],
'Col_3': [
'Protijayi', 'Roopa', 'Gina', 'Anna', 'John', 'Bob', 'Alice', 'Zara',
'Tom', 'Jerry', 'Donald', 'Mickey', 'HeMan', 'Soudipta', 'Phantom', 'YogiBear',
'Thundarr', 'GummyBears', 'Tabu', 'Stanley'
],
})
print(df.shape)  # (20, 4)
print(df)
# Label encode the ID-like column (note: LabelEncoder is documented for target
# labels; OrdinalEncoder is the feature-side equivalent -- see the sketch after
# the output below)
label_encoder = LabelEncoder()
df['Col_1'] = label_encoder.fit_transform(df['Col_1'])
# Pipeline for text transformation and dimensionality reduction
text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=50)),  # Adjust max_features if needed
    # Depending on your scikit-learn version, PCA may reject the sparse TF-IDF
    # output; densify first or swap in TruncatedSVD if that happens
    ('pca', PCA(n_components=16))  # Reduce to 16 components
])
# Preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Col_0']),
        ('cat', 'passthrough', ['Col_1']),  # Already label encoded
        ('bool', 'passthrough', ['Col_2']),
        ('text', text_pipeline, 'Col_3')
    ]
)
# Apply transformations
X = preprocessor.fit_transform(df)
# Create the appropriate number of column names
columns = ['Col_0_scaled', 'Col_1_encoded', 'Col_2'] + [f'Col_3_pca_{i+1}' for i in range(16)]
# Convert the resulting matrix to a DataFrame
X_df = pd.DataFrame(X, columns=columns)
print(X_df.to_string())
'''
Col_0_scaled Col_1_encoded Col_2 Col_3_pca_1 Col_3_pca_2 Col_3_pca_3 Col_3_pca_4 Col_3_pca_5 Col_3_pca_6 Col_3_pca_7 Col_3_pca_8 Col_3_pca_9 Col_3_pca_10 Col_3_pca_11 Col_3_pca_12 Col_3_pca_13 Col_3_pca_14 Col_3_pca_15 Col_3_pca_16
0 -1.334401 1.0 1.0 -0.073105 -0.146901 0.067336 0.125044 0.090760 0.009809 0.305958 0.139551 0.156774 0.527018 0.060816 -0.203797 -0.374745 -0.141668 -0.421094 0.188830
1 -1.045361 4.0 0.0 0.115282 -0.015415 -0.150784 -0.360184 -0.080453 0.019688 -0.031309 0.255278 0.108915 -0.311483 -0.167477 0.008409 -0.300929 -0.220409 0.084341 -0.017631
2 1.845039 0.0 0.0 0.160889 -0.020035 0.330186 0.127126 0.273287 -0.071395 -0.319679 0.219527 -0.279286 -0.004423 -0.239776 -0.416249 0.067402 0.081037 -0.241028 -0.213331
3 0.239261 2.0 1.0 0.554679 0.130482 0.081359 0.118546 0.214038 -0.363213 0.385183 -0.148279 -0.027544 -0.001442 -0.244361 0.424055 0.026407 -0.058623 0.014459 0.048063
4 -0.724206 3.0 0.0 0.307266 0.050949 0.310680 -0.297831 -0.383361 0.129143 -0.158281 -0.122024 0.075167 0.182113 0.224166 0.004674 0.072569 0.380125 -0.010579 0.439877
5 -1.205939 5.0 1.0 -0.113145 0.538810 -0.043185 -0.150704 0.057516 0.046128 0.343291 0.039415 -0.118799 -0.155313 0.381745 -0.250619 0.343292 -0.315427 -0.122863 0.068318
6 0.560416 6.0 0.0 -0.204690 0.181405 0.543654 0.340886 -0.275855 -0.037939 -0.050138 -0.239225 -0.052624 -0.239889 0.117819 0.052465 -0.373745 -0.149760 0.156353 -0.179371
7 -0.403050 7.0 1.0 -0.178138 -0.281203 -0.081624 0.090432 0.090038 -0.427198 -0.084569 -0.062534 0.569291 -0.134243 0.239196 -0.085040 0.016309 -0.078131 0.113432 0.097394
8 1.202728 8.0 0.0 0.071775 -0.180170 0.087681 -0.318030 -0.142572 0.139888 -0.008269 0.168043 0.011826 -0.299417 -0.171258 0.052936 -0.101520 -0.211515 -0.154248 0.049600
9 -0.081895 10.0 1.0 -0.353030 0.077459 -0.044642 0.225100 0.078206 0.115056 -0.117477 -0.203697 -0.136310 0.074003 -0.313551 0.184199 0.171788 -0.074015 0.060184 0.428194
10 -0.884784 12.0 0.0 -0.183930 0.011794 0.095495 0.182962 -0.045277 0.198398 0.415487 0.507330 0.169621 -0.066478 -0.165655 -0.000376 0.133879 0.459423 0.324964 -0.064214
11 0.881572 13.0 1.0 -0.195712 0.218775 -0.177670 -0.200639 -0.208203 -0.025052 0.042678 -0.401426 0.275074 0.075878 -0.359991 -0.039838 0.072103 0.257290 -0.342703 -0.426637
12 1.523883 14.0 0.0 0.128607 0.480509 -0.385878 0.091855 0.056152 -0.134789 -0.352402 0.208674 -0.017009 0.200666 0.185478 0.041582 -0.337446 0.178371 0.190868 -0.087371
13 -1.045361 15.0 1.0 0.166597 -0.089110 -0.109782 0.104984 0.528854 0.529754 -0.056245 -0.326489 0.153271 -0.298661 0.157056 -0.061705 -0.097035 0.178674 -0.027044 0.043495
14 -0.242472 16.0 0.0 -0.085249 0.037130 0.142847 -0.251928 0.195658 0.027961 -0.248846 -0.007671 0.135104 0.343121 -0.248098 -0.064966 0.228748 -0.321116 0.422975 0.018823
15 -0.563628 17.0 1.0 -0.190203 -0.053749 -0.160313 0.149640 -0.063512 -0.206393 -0.264280 0.238996 -0.146314 -0.244733 0.020880 0.278250 0.181423 0.106724 -0.400457 0.237057
16 -1.205939 18.0 1.0 -0.083474 -0.194554 0.005645 -0.045870 0.003321 0.321345 -0.048387 0.106808 -0.108053 0.278396 0.292693 0.555157 0.063186 -0.166453 -0.074729 -0.392409
17 0.078683 19.0 0.0 0.138741 -0.283597 0.057301 -0.004913 -0.023166 -0.249826 -0.017610 -0.051725 -0.066780 0.012460 0.282140 -0.096769 0.381560 0.099963 0.068412 -0.286344
18 1.042150 9.0 1.0 0.283151 -0.208560 -0.415019 0.392379 -0.470813 0.187894 0.039962 -0.102994 -0.140439 0.044455 -0.102141 -0.281541 0.093164 -0.231794 0.139206 -0.001186
19 1.363305 11.0 0.0 -0.266310 -0.254019 -0.153286 -0.318855 0.105382 -0.209260 0.224933 -0.217558 -0.561887 0.017975 0.050319 -0.100827 -0.266410 0.227304 0.219550 0.048843
'''
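A caveat on Method 2: LabelEncoder is documented for encoding target labels (y), not feature columns. It works, but OrdinalEncoder is the feature-side equivalent and slots directly into the ColumnTransformer, so the encoding happens inside fit_transform instead of mutating df beforehand. A sketch of that variant, assuming 'Col_1' still holds the raw strings:

from sklearn.preprocessing import OrdinalEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Col_0']),
        ('cat', OrdinalEncoder(), ['Col_1']),  # integer-codes the IDs inside the transformer
        ('bool', 'passthrough', ['Col_2']),
        ('text', text_pipeline, 'Col_3')
    ]
)

Either way the integer codes are arbitrary, so distances between them carry no meaning; one-hot encoding (as in Method 1) is safer when the IDs have no natural order.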
Understanding ColumnTransformer:
ColumnTransformer applies its transformers in parallel, each to the original input. If you pass the same column to multiple transformers, that column appears multiple times in the output, once for each transformation applied to it.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
data = pd.DataFrame({
'numeric_feature': [1, 2, 3],
'categorical_feature': ['A', 'B', 'A']
})
print(data)
'''
numeric_feature categorical_feature
0 1 A
1 2 B
2 3 A
'''
ct = ColumnTransformer(
    transformers=[
        ('num_identity', 'passthrough', ['numeric_feature']),
        # passing numeric_feature again
        ('num', StandardScaler(), ['numeric_feature']),
        ('cat', OneHotEncoder(), ['categorical_feature'])
    ]
)
transformedData = ct.fit_transform(data)
df_transformed = pd.DataFrame(
    transformedData,
    columns=['original_numeric', 'scaled_numeric', 'catA', 'catB'])
print(df_transformed)
'''
original_numeric scaled_numeric catA catB
0 1.0 -1.224745 1.0 0.0
1 2.0 0.000000 0.0 1.0
2 3.0 1.224745 1.0 0.0
'''
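Instead of hand-writing the output column names, you can let the fitted ColumnTransformer generate them. get_feature_names_out prefixes each name with its transformer's name, which also disambiguates a column that was passed to several transformers. A quick sketch, reusing ct and transformedData from above:

# Produces names like 'num_identity__numeric_feature', 'num__numeric_feature',
# 'cat__categorical_feature_A', 'cat__categorical_feature_B'
df_transformed = pd.DataFrame(transformedData, columns=ct.get_feature_names_out())
print(df_transformed)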
Method 3:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
df = pd.DataFrame({
'num_col1': [10, 20, 30, 40, 50], # Numeric column to be standardized and reduced
'lookupTable': [1, 2, 3, 4, 5], # Lookup table (should not be standardized)
'cat_col': ['R', 'A', 'S', 'R', 'S'], # Categorical column
'text_col': ['R world', 'S fun', 'Data science', 'Text data', 'Preprocessing text'] # Text column
})
def selectNumCols(df1):
    aa = df1.select_dtypes(include=['int64']).columns.difference(['lookupTable'])
    return aa

num_cols = selectNumCols(df)
n_components = min(2, len(num_cols))

if len(num_cols) > 1:
    num_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('IPCA', IncrementalPCA(n_components=n_components))
    ])
else:
    num_pipeline = Pipeline([('scaler', StandardScaler())])
# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),            # Numeric pipeline on the numeric columns
        ('cat', OneHotEncoder(), ['cat_col']),      # One-hot encode the categorical column
        ('text', TfidfVectorizer(), 'text_col'),    # TF-IDF on the text column
        ('lookup', 'passthrough', ['lookupTable'])  # Pass the lookup table through untouched
    ]
    # sparse_threshold=0.3  # Return a sparse matrix when overall density is below 0.3
)
X = preprocessor.fit_transform(df)
num_cols = [f'num_ipca{i}' for i in range(n_components)] if len(num_cols) > 1 else list(num_cols)
oneHotColNames = preprocessor.named_transformers_['cat'].get_feature_names_out()
print(oneHotColNames)
tfidColNames = preprocessor.named_transformers_['text'].get_feature_names_out()
print(tfidColNames)
# Column order must match the transformer order inside the ColumnTransformer
cols = num_cols + list(oneHotColNames) + list(tfidColNames) + ['lookupTable']
print(cols)
X_df = pd.DataFrame(X,columns = cols)
print(X_df.to_string())
'''
   num_col1  cat_col_A  cat_col_R  cat_col_S      data  fun  preprocessing   science      text  world  lookupTable
0 -1.414214 0.0 1.0 0.0 0.000000 0.0 0.000000 0.000000 0.000000 1.0 1.0
1 -0.707107 1.0 0.0 0.0 0.000000 1.0 0.000000 0.000000 0.000000 0.0 2.0
2 0.000000 0.0 0.0 1.0 0.627914 0.0 0.000000 0.778283 0.000000 0.0 3.0
3 0.707107 0.0 1.0 0.0 0.707107 0.0 0.000000 0.000000 0.707107 0.0 4.0
4 1.414214 0.0 0.0 1.0 0.000000 0.0 0.778283 0.000000 0.627914 0.0 5.0
'''
Method 4:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
df = pd.DataFrame({
'num_col1': [10, 20, 30, 40, 50], # Numeric column to be standardized and reduced
'lookupTable': [1, 2, 3, 4, 5], # Lookup table (should not be standardized)
'cat_col': ['R', 'A', 'S', 'R', 'S'], # Categorical column
'text_col': ['R world', 'S fun', 'Data science', 'Text data', 'Preprocessing text'] # Text column
})
# Custom function to select numeric columns except 'lookupTable'
def select_numeric_columns(X):
    return X.select_dtypes(include=['int64']).columns.difference(['lookupTable'])
# Determine the number of numeric columns after excluding 'lookupTable'
numeric_columns = select_numeric_columns(df)
n_numeric_features = len(numeric_columns)
# Set n_components to the minimum of 2 or the number of numeric features
n_components = min(2, n_numeric_features)
# Conditionally include IncrementalPCA in the pipeline
if n_numeric_features > 1:
    num_pipeline = Pipeline([
        ('scaler', StandardScaler()),                        # Standardize numeric data
        ('ipca', IncrementalPCA(n_components=n_components))  # Reduce to n_components
    ])
else:
    num_pipeline = Pipeline([
        ('scaler', StandardScaler())  # Standardize only; a single column leaves nothing to reduce
    ])
# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_columns),                   # Numeric pipeline on the numeric columns
        ('cat', OneHotEncoder(sparse_output=True), ['cat_col']),  # One-hot encode the categorical column
        ('text', TfidfVectorizer(), 'text_col'),                  # TF-IDF on the text column
        ('lookup', 'passthrough', ['lookupTable'])                # Pass the lookup table through untouched
    ],
    sparse_threshold=0.3  # Return a sparse matrix when overall density is below 0.3
)
# Apply transformations
X = preprocessor.fit_transform(df)
# Convert dense matrix to sparse if it's not already sparse
if not isinstance(X, csr_matrix):
    X = csr_matrix(X)
# Automatically generate column names from transformers
num_cols = [f'num_ipca_{i+1}' for i in range(n_components)] if n_numeric_features > 1 else list(numeric_columns)
onehot_col_names = preprocessor.named_transformers_['cat'].get_feature_names_out(['cat_col'])
tfidf_feature_names = preprocessor.named_transformers_['text'].get_feature_names_out()
# Combine all the column names
cols = num_cols + list(onehot_col_names) + list(tfidf_feature_names) + ['lookupTable']
# Convert the resulting sparse matrix to a sparse DataFrame
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=cols)
print(X_df.to_string())
'''
num_col1 cat_col_A cat_col_R cat_col_S data fun preprocessing science text world lookupTable
0 -1.414214 0 1.0 0 0 0 0 0 0 1.0 1.0
1 -0.707107 1.0 0 0 0 1.0 0 0 0 0 2.0
2 0 0 0 1.0 0.627914 0 0 0.778283 0 0 3.0
3 0.707107 0 1.0 0 0.707107 0 0 0 0.707107 0 4.0
4 1.414214 0 0 1.0 0 0 0.778283 0 0.627914 0 5.0
'''
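Finally, back to the similarity comparison: with over 1,000 rows a full pairwise similarity matrix grows quadratically, so a neighbor search is often more practical than computing every pair. A minimal sketch, reusing the sparse X from above (n_neighbors=5 is an arbitrary choice):

from sklearn.neighbors import NearestNeighbors

# Brute-force cosine neighbor search; works directly on the sparse matrix
nn = NearestNeighbors(n_neighbors=5, metric='cosine').fit(X)
distances, indices = nn.kneighbors(X)  # cosine distance = 1 - cosine similarity
# indices[i] lists the rows whose vectors are closest to row i (row i itself included)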