As the title states: I believe I have followed the documentation as described, and I have looked all over the web for a useful answer, but so far I have not found much. Any help is much appreciated! Thank you!
I am running the command:
python -m spacy debug config config.cfg --code 'matcher.py' --code 'sentence.py'
and
python -m spacy train 'config.cfg' --output 'config\' --code 'sentence.py' --code 'matcher.py'
Both get the same error:
ValueError: [E002] Can't find factory for 'sentence_splitter' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).
Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, matcher, en.lemmatizer
Here is my config file:
[paths]
# Corpus locations and optional pretrained inputs; the [corpora] and
# [initialize] sections interpolate these values.
# NOTE(review): train and dev point at the same file, so evaluation would be
# run on the training data - confirm this is intended.
train = "output_data.spacy"
dev = "output_data.spacy"
vectors = null
init_tok2vec = null
[system]
# Global runtime settings; [training] interpolates both values from here.
gpu_allocator = null
seed = 0
[nlp]
# Language and pipeline definition.
# "sentence_splitter" and "matcher" are custom components defined in the
# Python files shown later in this post, and "spacytextblob" comes from the
# spacytextblob package; their registering code has to be imported (for
# example with the CLI's --code option) before this config can be resolved.
lang = "en"
pipeline = ["tok2vec","ner","tagger","sentence_splitter", "parser", "senter","attribute_ruler","matcher","lemmatizer","spacytextblob"]
# Every name listed here also appears in `pipeline` above.
disabled = ["senter", "tagger", "attribute_ruler","spacytextblob"]
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 256
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
[components]
# Custom component: the factory string must match the name passed to
# @Language.component("sentence_splitter") in the imported Python code.
[components.sentence_splitter]
factory = "sentence_splitter"
[components.attribute_ruler]
# Listed in [nlp].disabled.
factory = "attribute_ruler"
scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"}
validate = false
[components.lemmatizer]
# Lemmatizer configured in "rule" mode with no model set.
factory = "lemmatizer"
mode = "rule"
model = null
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}
[components.ner]
# Named-entity recognizer. Unlike parser and tagger, which use a
# Tok2VecListener on the shared "tok2vec" component, this model declares its
# own spacy.Tok2Vec.v2 sub-network below.
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[components.ner.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 96
# `attrs` and `rows` are parallel lists (four entries each).
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,1000,2500,2500]
# NOTE(review): static vectors are enabled here while [paths].vectors is
# null - confirm that vectors are supplied some other way.
include_static_vectors = true
[components.ner.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[components.parser]
# Dependency parser; its token representation comes from the shared
# "tok2vec" component through the listener declared at the end of this block.
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
# Width is interpolated from the shared tok2vec encoder so the two stay in sync.
width = ${components.tok2vec.model.encode:width}
upstream = "tok2vec"
[components.senter]
# Sentence recognizer; listed in [nlp].disabled. It declares its own, smaller
# Tok2Vec (width 16) instead of listening to the shared "tok2vec" component.
factory = "senter"
overwrite = false
scorer = {"@scorers":"spacy.senter_scorer.v1"}
[components.senter.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.senter.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"
[components.senter.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 16
# `attrs` and `rows` are parallel lists (five entries each).
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
rows = [1000,500,500,500,50]
# NOTE(review): static vectors are enabled here while [paths].vectors is
# null - confirm that vectors are supplied some other way.
include_static_vectors = true
[components.senter.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 16
depth = 2
window_size = 1
maxout_pieces = 2
[components.spacytextblob]
# Listed in [nlp].disabled. Per the answer later in this post, SpacyTextBlob
# has to be imported for this factory name to be known.
factory = "spacytextblob"
blob_only = false
custom_blob = null
[components.tagger]
# Tagger; listed in [nlp].disabled. Uses a listener on the shared "tok2vec"
# component rather than its own embedding network.
factory = "tagger"
label_smoothing = 0.0
neg_prefix = "!"
overwrite = false
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
normalize = false
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
# Width is interpolated from the shared tok2vec encoder.
width = ${components.tok2vec.model.encode:width}
upstream = "tok2vec"
[components.matcher]
# Custom component: the factory string must match the name passed to
# @Language.factory("matcher") in the imported Python code.
factory = "matcher"
[components.tok2vec]
# Shared token-to-vector component that the parser and tagger listeners
# reference through upstream = "tok2vec".
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
# Embedding width follows the encoder width defined below (96).
width = ${components.tok2vec.model.encode:width}
# `attrs` and `rows` are parallel lists (six entries each).
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY","IS_SPACE"]
rows = [5000,1000,2500,2500,50,50]
# NOTE(review): static vectors are enabled here while [paths].vectors is
# null - confirm that vectors are supplied some other way.
include_static_vectors = true
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[corpora]
# Two readers with identical settings apart from `path`, which is
# interpolated from [paths].dev and [paths].train respectively.
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null
[training]
# Training loop settings. train_corpus / dev_corpus name the [corpora]
# sub-sections defined above; seed and gpu_allocator are interpolated from
# [system].
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system:seed}
gpu_allocator = ${system:gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 5000
max_epochs = 0
max_steps = 100000
eval_frequency = 1000
frozen_components = []
before_to_disk = null
annotating_components = []
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null
[training.batcher.size]
# Batch size follows a compounding schedule from `start` to `stop`.
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 0.00000001
learn_rate = 0.001
[training.score_weights]
# The non-null, non-zero weights below (tag_acc, dep_las, sents_f, lemma_acc,
# ents_f) sum to 1.0.
tag_acc = 0.16
dep_uas = 0.0
dep_las = 0.16
dep_las_per_type = null
sents_p = null
sents_r = null
sents_f = 0.02
lemma_acc = 0.5
ents_f = 0.16
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
speed = 0.0
[pretraining]
# Intentionally empty: no pretraining settings are given in this config.
[initialize]
# Initialization settings; vectors and init_tok2vec are interpolated from
# [paths] (both are null there).
vocab_data = null
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
before_init = null
after_init = null
[initialize.components]
# Label files for ner, parser and tagger; require = false on each reader.
[initialize.components.ner]
[initialize.components.ner.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/ner.json"
require = false
[initialize.components.parser]
[initialize.components.parser.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/parser.json"
require = false
[initialize.components.tagger]
[initialize.components.tagger.labels]
@readers = "spacy.read_labels.v1"
path = "corpus/labels/tagger.json"
require = false
[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]
# No tokenizer-specific initialization settings are given.
[initialize.tokenizer]
sentence.py file
import spacy
from spacy.language import Language
import re
@Language.component("sentence_splitter")  # stateless
def sentence_splitter(doc):
    """Set sentence boundaries at line-break tokens.

    The first token of every consecutive run of delimiter tokens is flagged as
    a sentence start. Every other token in ``doc``, including the remaining
    delimiters of that run, is explicitly flagged as not starting a sentence.

    Args:
        doc: The document to annotate; it is modified in place.

    Returns:
        The same ``doc`` object, with ``sent_start`` assigned on every token.
    """
    # A delimiter token consists solely of line breaks, each optionally
    # followed by further whitespace.
    delimiter_pattern = re.compile(r"(\r?\n\s*)+|(\n\s*)+")

    previous_was_delimiter = False
    for token in doc:
        is_delimiter = delimiter_pattern.fullmatch(token.text) is not None
        # Only the leading delimiter of a run opens a new sentence.
        token.sent_start = is_delimiter and not previous_was_delimiter
        previous_was_delimiter = is_delimiter
    return doc
# Used to add the custom component to the pipeline
# NOTE(review): these statements run whenever this file is imported, so
# importing it only to register the component (for example through the CLI's
# --code option) also loads en_core_web_lg. Consider moving them under
# `if __name__ == "__main__":` or into a separate script.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("sentence_splitter", name="sentence_splitter", after='ner')
matcher.py file
import spacy
from spacy.language import Language
import re
from spacy.matcher import Matcher
from spacy.tokens import Token
@Language.factory("matcher")  # stateful
def create_template_matcher(nlp, name):
    """Factory registered under the name "matcher".

    Args:
        nlp: The pipeline object the component is created for; only its
            ``vocab`` is used.
        name: Name given to this component instance (not used here).

    Returns:
        A new ``TemplateMatcher`` built on ``nlp.vocab``.
    """
    shared_vocab = nlp.vocab
    return TemplateMatcher(shared_vocab)
class TemplateMatcher:
# Stateful pipeline component: wraps a Matcher and, when called on a doc,
# sets the custom extension `token._.templates` to True on every token that
# lies inside a match.
def __init__(self, vocab):
# Build the Matcher on `vocab` and register four named pattern groups.
# Define multiple patterns
# (the pattern contents are elided as "blar blar blar" placeholders here)
patterns1 = [blar blar blar ]
patterns2 = [blar blar blar ]
patterns3 = [blar blar blar ]
patterns4 = [blar blar blar ]
# force=True re-registers the extension each time a TemplateMatcher is built.
Token.set_extension("templates", default=False, force=True) # Register a new token extension to flag matched patterns
self.matcher = Matcher(vocab)
self.matcher.add("patterns1", patterns1)
self.matcher.add("patterns2", patterns2)
self.matcher.add("patterns3", patterns3)
self.matcher.add("patterns4", patterns4)
def __call__(self, doc):
# Run the matcher and flag each token in every matched span; the match id
# is not used. Returns the same doc.
matches = self.matcher(doc)
for match_id, start, end in matches:
for token in doc[start:end]:
token._.templates = True
return doc
# Used to add the custom component to the pipeline
# NOTE(review): these statements run whenever this file is imported, so
# importing it only to register the factory (for example through the CLI's
# --code option) also loads en_core_web_lg. Consider moving them under
# `if __name__ == "__main__":` or into a separate script.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("matcher", name="matcher", after ='parser')
Per the docs:
The --code argument can be used to provide a Python file that’s imported before the training process starts.
It is easy to miss, but it does say "file" (singular) rather than "files". I don't think you can supply the --code argument more than once with a different Python file each time.
However, if you add both of your custom components to the same module, e.g. custom_components.py, and run python -m spacy init fill-config config.cfg config.cfg --code custom_components.py
this should tell you if anything is wrong with your config. If nothing is wrong, you can then proceed with training using python -m spacy train config.cfg --output config/ --code custom_components.py
"""Contents of custom_components.py
Notes:
- I had to import `SpacyTextBlob` for the config to know what
factory to use
- I had to modify your `TemplateMatcher` to get the `fill-config`
command to work.
"""
import re
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
# NOTE: have to import `SpacyTextBlob` for config file to work
from spacytextblob.spacytextblob import SpacyTextBlob
@Language.component("sentence_splitter")  # stateless
def sentence_splitter(doc):
    """Set sentence boundaries at line-break tokens.

    The first token of every consecutive run of delimiter tokens is flagged as
    a sentence start. Every other token in ``doc``, including the remaining
    delimiters of that run, is explicitly flagged as not starting a sentence.

    Args:
        doc: The document to annotate; it is modified in place.

    Returns:
        The same ``doc`` object, with ``sent_start`` assigned on every token.
    """
    # A delimiter token consists solely of line breaks, each optionally
    # followed by further whitespace.
    delimiter_pattern = re.compile(r"(\r?\n\s*)+|(\n\s*)+")

    previous_was_delimiter = False
    for token in doc:
        is_delimiter = delimiter_pattern.fullmatch(token.text) is not None
        # Only the leading delimiter of a run opens a new sentence.
        token.sent_start = is_delimiter and not previous_was_delimiter
        previous_was_delimiter = is_delimiter
    return doc
@Language.factory("matcher")  # stateful
def create_template_matcher(nlp, name):
    """Factory registered under the name "matcher".

    Args:
        nlp: The pipeline object the component is created for; only its
            ``vocab`` is used.
        name: Name given to this component instance (not used here).

    Returns:
        A new ``TemplateMatcher`` built on ``nlp.vocab``.
    """
    shared_vocab = nlp.vocab
    return TemplateMatcher(shared_vocab)
class TemplateMatcher:
    """Pipeline component that flags tokens covered by a template pattern.

    Tokens inside any match get the custom extension ``token._.templates`` set
    to ``True``; the extension's registered default is ``False``.
    """

    def __init__(self, vocab):
        """Register the token extension and build the underlying ``Matcher``.

        Args:
            vocab: Vocabulary the ``Matcher`` operates on.
        """
        # Define multiple patterns
        # NOTE *** modifications made here ***
        blar = {"ORTH": "blar"}

        # Register a new token extension to flag matched patterns.
        Token.set_extension("templates", default=False, force=True)

        self.matcher = Matcher(vocab)
        # Each rule gets a single one-token pattern matching the text "blar".
        for rule_name in ("patterns1", "patterns2", "patterns3", "patterns4"):
            self.matcher.add(rule_name, [[blar]])

    def __call__(self, doc):
        """Flag every token inside a match and return the same ``doc``."""
        for _match_id, first, last in self.matcher(doc):
            for token in doc[first:last]:
                token._.templates = True
        return doc
After running the python -m spacy init fill-config command...
python -m spacy init fill-config config.cfg config.cfg --code custom_components.py
We get the green checkmark.
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
You should now be good to run python -m spacy train config.cfg --output config/ --code custom_components.py