I'm new to NLP and have started learning how to train a custom NER model in spaCy.
# Every training example is the same question template with one product name
# substituted in; the single 'Product' entity span covers exactly that name.
_QUESTION_PREFIX = 'what is the price of '
_PRODUCTS = [
    'polo', 'ball', 'jegging', 't-shirt', 'jeans', 'bat', 'shirt', 'bag',
    'cup', 'jug', 'plate', 'glass', 'moniter', 'desktop', 'bottle', 'mouse',
    'keyboad', 'chair', 'table', 'watch',
]
TRAIN_DATA = [
    (
        _QUESTION_PREFIX + product + '?',
        {
            'entities': [
                # (start offset, end offset, label) of the product name
                (
                    len(_QUESTION_PREFIX),
                    len(_QUESTION_PREFIX) + len(product),
                    'Product',
                ),
            ],
        },
    )
    for product in _PRODUCTS
]
Training the blank spacy model for the first time:
def train_spacy(data, iterations):
    """Train a NER model from a blank English pipeline (spaCy v2 API).

    Args:
        data: Sequence of ``(text, {'entities': [(start, end, label), ...]})``
            pairs. The caller's sequence is left untouched.
        iterations: Number of passes to make over the training data.

    Returns:
        The trained spaCy ``Language`` object.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    examples = list(data)

    nlp = spacy.blank('en')  # create blank Language class

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keeps `ner` bound even if the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in examples:
        for ent in annotations.get('entities', ()):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp
# Run 20 training passes over the product examples; the result is the trained pipeline.
start_training = train_spacy(data=TRAIN_DATA, iterations=20)
Saving my trained spaCy model:
# Save the trained model to disk under "spacy_start_model" so it can be reloaded later
start_training.to_disk("spacy_start_model")
My question is: how do I update the saved model with new training data? Here is the new training data:
# New examples that introduce the PERSON and LOC labels.
TRAIN_DATA_2 = [
    ('Who is Chaka Khan?', {"entities": [(7, 17, 'PERSON')]}),
    # Both cities are annotated: a token left out of 'entities' is trained as
    # "not an entity", which would teach the model to ignore 'Berlin'.
    ('I like London and Berlin.', {"entities": [(7, 13, 'LOC'), (18, 24, 'LOC')]}),
]
Could anyone help me with a solution and any tips for this? Thanks in advance!
As far as I know, you could retrain your model using your new data examples, but instead of starting from a blank model, you would now start from your existing model.
To achieve this, first remove the following line from your train_spacy
method, and have the method receive the model as a parameter instead:
nlp = spacy.blank('en') # create a blank English Language object (no trained components)
Then, to retrain your model, instead of creating a blank spaCy model and passing it to your training method, load your existing model using the load
method and pass that to your training method (read more about spaCy save/load here).
# Load the model previously saved with to_disk("spacy_start_model") to use as the base model
start_training = spacy.load("spacy_start_model")
One final suggestion: in my experience, I have obtained better results by retraining an existing spaCy NER model such as en_core_web_md or en_core_web_lg and adding my custom entities to it than by training from scratch from a blank spaCy model.
ALL TOGETHER:
def train_spacy(data, iterations, nlp):  # <-- Add model as nlp parameter
    """Train or update the NER component of ``nlp`` on ``data`` (spaCy v2 API).

    Args:
        data: Sequence of ``(text, {'entities': [(start, end, label), ...]})``
            pairs. The caller's sequence is left untouched.
        iterations: Number of passes to make over the training data.
        nlp: spaCy ``Language`` object to train: either a blank pipeline or a
            previously trained model that already contains a 'ner' pipe.

    Returns:
        The same ``nlp`` object, after training.
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    examples = list(data)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in examples:
        for ent in annotations.get('entities', ()):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # begin_training() (re)initialises the model weights, which would throw
        # away what an existing NER pipe has learned. Only use it for a freshly
        # created pipe; for an existing one continue with resume_training().
        if ner_is_new:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp
# Step 1: train a fresh NER model on the product examples.
nlp = spacy.blank('en')  # create blank Language class
start_training = train_spacy(TRAIN_DATA, 20, nlp)
# Save the trained model; without this the spacy.load() call below has
# nothing to load (or silently picks up a stale model from an earlier run).
start_training.to_disk("spacy_start_model")

# Step 2: update the saved model with new examples.
TRAIN_DATA_2 = [
    ('Who is Chaka Khan?', {"entities": [(7, 17, 'PERSON')]}),
    # Both cities are annotated: a token left out of 'entities' is trained as
    # "not an entity", which would teach the model to ignore 'Berlin'.
    ('I like London and Berlin.', {"entities": [(7, 13, 'LOC'), (18, 24, 'LOC')]}),
]
# NOTE: updating only on the new examples can make the model forget the
# earlier 'Product' label; consider mixing some of TRAIN_DATA back in.
nlp = spacy.load("spacy_start_model")  # <-- Now your base model is your custom model
start_training = train_spacy(TRAIN_DATA_2, 20, nlp)
I hope this works for you!