I created a text classifier with sklearn that uses Tf-Idf features, and I want to use BERT and ELMo embeddings instead of Tf-Idf.
How would one do that?
I'm getting BERT embeddings with the code below:
from flair.data import Sentence
from flair.embeddings import TransformerWordEmbeddings
# init embedding
embedding = TransformerWordEmbeddings('bert-base-uncased')
# create a sentence
sentence = Sentence('The grass is green .')
# embed words in sentence
embedding.embed(sentence)
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
column_trans = ColumnTransformer([
    ('tfidf', TfidfVectorizer(), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])
# Initialize data
data = [
    ['This process, however, afforded me no means of.', 20, 1],
    ['another long description', 21, 1],
    ['It never once occurred to me that the fumbling', 19, 0],
    ['How lovely is spring As we looked from Windsor', 18, 0]
]
# Create DataFrame
df = pd.DataFrame(data, columns=['text', 'number', 'target'])
X = column_trans.fit_transform(df)
X = X.toarray()
y = df.loc[:, "target"].values
# Perform classification
classifier = LogisticRegression(random_state=0)
classifier.fit(X, y)
Sklearn offers the possibility to write custom data transformers (not related to the "transformers" machine learning models).
I implemented a custom sklearn data transformer that uses the flair library you are already using. Please note that I used TransformerDocumentEmbeddings instead of TransformerWordEmbeddings, since you want a single vector per text rather than one per word. I also implemented one that works with the transformers library directly.
I'm adding a SO question that discusses which transformer layer is interesting to use here.
I'm not familiar with ELMo, though I found this that uses tensorflow. You may be able to modify the code I shared to make ELMo work; a rough sketch is included after the two classes below.
import torch
import numpy as np
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin

class FlairTransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=None, layers=None):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.model_kw_args = {'batch_size': batch_size, 'layers': layers}
        self.model_kw_args = {k: v for k, v in self.model_kw_args.items()
                              if v is not None}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        model = TransformerDocumentEmbeddings(
            self.model_name, fine_tune=False,
            **self.model_kw_args)

        sentences = [Sentence(text) for text in X]
        embedded = model.embed(sentences)
        embedded = [e.get_embedding().reshape(1, -1) for e in embedded]
        return np.array(torch.cat(embedded).cpu())
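For reference, this is how I would use it on its own, and it is also where you can experiment with the layers parameter mentioned in the SO question above (a quick sanity check, assuming bert-base-uncased downloads fine; the 768 dimension is specific to that model):
embedding = FlairTransformerEmbedding('bert-base-uncased', layers='-1')
vectors = embedding.fit_transform(["The grass is green .", "another long description"])
print(vectors.shape)  # (2, 768) for bert-base-uncased
The second class, below, does the same thing with the transformers library directly: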
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModel
from more_itertools import chunked

class TransformerEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self, model_name='bert-base-uncased', batch_size=1, layer=-1):
        # From https://lvngd.com/blog/spacy-word-vectors-as-features-in-scikit-learn/
        # For pickling reason you should not load models in __init__
        self.model_name = model_name
        self.layer = layer
        self.batch_size = batch_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModel.from_pretrained(self.model_name)

        res = []
        for batch in chunked(X, self.batch_size):
            encoded_input = tokenizer.batch_encode_plus(
                batch, return_tensors='pt', padding=True, truncation=True)
            output = model(**encoded_input)
            # take the hidden state of the token at position `layer`
            # (default -1, i.e. the last token) as the text representation
            embed = output.last_hidden_state[:, self.layer].detach().numpy()
            res.append(embed)

        return np.concatenate(res)
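As for ELMo, here is a rough, untested sketch of how the flair class above could be adapted. It assumes flair's ELMoEmbeddings class (which requires the allennlp package to be installed) and mean pooling of the word vectors via DocumentPoolEmbeddings; check both against your flair version:
import torch
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings, DocumentPoolEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin

class ElmoEmbedding(TransformerMixin, BaseEstimator):

    def __init__(self):
        # As above, do not load the model in __init__ so the estimator stays picklable
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # mean-pool the contextual word embeddings into one vector per text
        model = DocumentPoolEmbeddings([ELMoEmbeddings()])
        sentences = [Sentence(text) for text in X]
        model.embed(sentences)
        embedded = [s.get_embedding().reshape(1, -1) for s in sentences]
        return torch.cat(embedded).cpu().numpy()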
In your case, replace your column transformer with this one:
column_trans = ColumnTransformer([
    ('embedding', FlairTransformerEmbedding(), 'text'),
    ('number_scaler', MinMaxScaler(), ['number'])
])
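Note that column_trans.fit_transform(df) will now return a dense numpy array, so drop the X.toarray() line. If you prefer, you can also wrap everything in a Pipeline so the features and the classifier are fit in one call; a minimal sketch reusing the df from your question:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('features', column_trans),
    ('classifier', LogisticRegression(random_state=0))
])

pipeline.fit(df[['text', 'number']], df['target'])
print(pipeline.predict(df[['text', 'number']]))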