AllenNLP usually relies on spaCy for tokenization, but spaCy cannot segment Chinese, so I wanted to try adding a Chinese word_splitter. A while ago I added a THUNLPSplitter; today I am adding jieba as well.
Test code (pos_tags controls whether POS tags are attached; only_tokens controls whether only the token text is kept in the end, dropping the POS tag and other attributes; user_dict is the path to a user-defined dictionary, a UTF-8 txt file with one word per line):
from allennlp.data.tokenizers.word_splitter import JIEBASplitter
from allennlp.data.tokenizers.token import show_token

# plain segmentation: only the token text is kept
splitter = JIEBASplitter(pos_tags=False)
print(splitter.split_words("武汉市长江大桥"))

# segmentation with POS tags kept on each Token
splitter2 = JIEBASplitter(pos_tags=True, only_tokens=False)
tokens = splitter2.split_words("武汉市长江大桥")
for token in tokens:
    print(show_token(token))

# segmentation with a user dictionary loaded
splitter3 = JIEBASplitter(pos_tags=False, user_dict='F:\\test\\userdict.txt')
print(splitter3.split_words("中美合拍,文体两开花。皮皮虾我们走"))
The output is as follows:
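The exact printout depends on the jieba version and dictionary, but with jieba's default dictionary the first sentence is typically segmented as 武汉市 / 长江大桥, and in the POS-tagged case both words should carry the flag ns (place name).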
The user dictionary is a UTF-8 txt file with one custom word per line.
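For example, to make jieba keep the meme phrases in the third test sentence together, the F:\test\userdict.txt file could simply contain (the exact word list is up to you):

中美合拍
文体两开花
皮皮虾

jieba.load_userdict also accepts optional frequency and POS columns after each word, but the one-word-per-line form is enough here.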
The complete code is as follows:
import os
from typing import List, Optional

import jieba
import jieba.posseg as poss
from overrides import overrides

from allennlp.data.tokenizers.token import Token
from allennlp.data.tokenizers.word_splitter import WordSplitter


@WordSplitter.register('jieba')
class JIEBASplitter(WordSplitter):
    """
    A ``WordSplitter`` that uses jieba's tokenizer to split Chinese sentences.

    pos_tags : if True, segment with jieba.posseg so each word gets a POS flag.
    only_tokens : if True, keep only the token text and drop the POS flag.
    user_dict : path to a user dictionary, a UTF-8 txt file with one word per line.
    """
    def __init__(self,
                 pos_tags: bool = False,
                 only_tokens: bool = True,
                 user_dict: Optional[str] = None) -> None:
        self._pos_tags = pos_tags
        self._only_tokens = only_tokens
        if user_dict and os.path.exists(user_dict):
            jieba.load_userdict(user_dict)

    def _sanitize(self, tokens) -> List[Token]:
        """
        Converts jieba's output into allennlp ``Token`` objects.
        """
        sanitized_tokens = []
        if self._pos_tags:
            # jieba.posseg yields pair objects that unpack into (word, flag)
            for text, pos in tokens:
                if self._only_tokens:
                    token = Token(text)
                else:
                    token = Token(text=text, pos_=pos)
                sanitized_tokens.append(token)
        else:
            # jieba.cut yields plain strings
            for text in tokens:
                sanitized_tokens.append(Token(text))
        return sanitized_tokens

    @overrides
    def batch_split_words(self, sentences: List[str]) -> List[List[Token]]:
        # sanitize each sentence's tokens into one list of allennlp Tokens
        split_words = []
        if self._pos_tags:
            for sent in sentences:
                split_words.append(self._sanitize(poss.cut(sent)))
        else:
            for sent in sentences:
                split_words.append(self._sanitize(jieba.cut(sent)))
        return split_words

    @overrides
    def split_words(self, sentence: str) -> List[Token]:
        if self._pos_tags:
            return self._sanitize(poss.cut(sentence))
        else:
            return self._sanitize(jieba.cut(sentence))
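Since the class is registered under the name jieba, it can also be referenced from an AllenNLP experiment config via "word_splitter": {"type": "jieba"}. As a quick sanity check in Python, here is a minimal usage sketch that wraps the splitter in AllenNLP's WordTokenizer, the standard way dataset readers consume a word_splitter:

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import JIEBASplitter

# hand the splitter to a WordTokenizer, as an AllenNLP dataset reader would
tokenizer = WordTokenizer(word_splitter=JIEBASplitter(pos_tags=True, only_tokens=False))
tokens = tokenizer.tokenize("武汉市长江大桥")
print([(t.text, t.pos_) for t in tokens])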
In a few days I will try this splitter out on a Chinese text-classification task.