注意:运行前请先确保已安装所需的库(pymorphy2、nltk、pandas)。
1.分词:
import pymorphy2
import nltk
import pandas as pd
import time
def text_tokenizer(text):
# 转换为正常形式
morph = pymorphy2.MorphAnalyzer()
# 删除除а-яА-ЯЁё以外的所有字符
regex_tokenizer = nltk.tokenize.RegexpTokenizer('[а-яА-ЯЁё]+')
# 转化为小写
注意:运行前请先确保已安装所需的库(pymorphy2、nltk、pandas)。
1.分词:
import pymorphy2
import nltk
import pandas as pd
import time
def text_tokenizer(text):
# 转换为正常形式
morph = pymorphy2.MorphAnalyzer()
# 删除除а-яА-ЯЁё以外的所有字符
regex_tokenizer = nltk.tokenize.RegexpTokenizer('[а-яА-ЯЁё]+')
# 转化为小写