您需要创建自定义Tokenizer.
您的自定义Tokenizer将与spaCy的标记器完全相同,但它将具有’‘从前缀和后缀中删除的符号,还会添加一个新前缀和一个新后缀规则.
码:
import spacy
from spacy.tokens import Token
Token.set_extension('tag', default=False)
def create_custom_tokenizer(nlp):
from spacy import util
from spacy.tokenizer import Tokenizer
from spacy.lang.tokenizer_exceptions import TOKEN_MATCH
prefixes = nlp.Defaults.prefixes + ('^',)
suffixes = nlp.Defaults.suffixes + ('$',)
# remove the tag symbols from prefixes and suffixes
prefixes = list(prefixes)
prefixes.remove('
prefixes = tuple(prefixes)
suffixes = list(suffixes)
suffixes.remove('>')
suffixes = tuple(suffixes)
infixes = nlp.Defaults.infixes
rules = nlp.Defaults.tokenizer_exceptions