文本预处理一般包括以下几个流程:
- 分词(主要是中文分词,英文分词较简单)
- 去除停用词 (中英文停用词表)
- 词干提取、词性转换 (针对英文,英文还有大小写转换的问题)
- 词性标注
- 文本向量化 (词袋模型、TF-IDF、分布式词向量表示)
以下是 Python 实现的文本预处理主要流程:
import numpy as np
import nltk
import jieba
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Tokenization
def fenci_eng(text):
    """Tokenize English text by splitting on whitespace runs."""
    tokens = text.split()
    return tokens
def fenci_cn(text):
    """Tokenize Chinese text with jieba in accurate mode (jieba's default).

    NOTE: a custom dictionary (e.g. a Sogou cell lexicon) can be loaded
    with jieba.load_userdict before calling this function.
    """
    segments = jieba.cut(text, cut_all=False)  # accurate mode
    return [seg for seg in segments]
# Stop-word removal (dropping stop terms)
def get_eng_stopwords():
    """Return NLTK's built-in English stop-word list (function words, very common words)."""
    words = nltk.corpus.stopwords.words('english')
    return words
def get_cn_stopwords(filename=r"stopwords-master/scu_stopwords.txt"):
    """Load a Chinese stop-word list from a text file (one word per line).

    Well-known lists: cn_stopwords, hit_stopwords.txt (HIT),
    baidu_stopwords.txt (Baidu), scu_stopwords.txt (SCU machine
    intelligence lab).

    Args:
        filename: path to a UTF-8 stop-word file, one word per line.
            Defaults to the SCU list, preserving the original behaviour.

    Returns:
        list[str]: stripped, non-empty stop words in file order.
        (Tip: deduplicate lists by merging through a set.)
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Strip line endings; skip blank lines so "" never ends up
        # in the stop-word list (the original appended empty strings).
        return [line.strip() for line in f if line.strip()]
# Normalization: stemming and lemmatization
def stemming_eng(words_list):
    """Stem English words with the Snowball (Porter2) stemmer.

    Args:
        words_list: iterable of English word strings.

    Returns:
        list[str]: the stemmed words, in input order.
    """
    # Build the stemmer once; the original re-instantiated
    # SnowballStemmer inside the comprehension for every word.
    stemmer = SnowballStemmer(language='english')
    return [stemmer.stem(word) for word in words_list]
def Lemmatization_eng(words_list):
    """Lemmatize English words with NLTK's WordNet lemmatizer.

    Args:
        words_list: iterable of English word strings.

    Returns:
        list[str]: the lemmatized words, in input order.
    """
    # Build the lemmatizer once; the original re-instantiated
    # WordNetLemmatizer inside the comprehension for every word.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words_list]
# Part-of-speech tagging
def words_tag_eng(words_list):
    """POS-tag English tokens using NLTK's universal tagset.

    Returns a list of (token, tag) pairs.
    """
    tagged = nltk.pos_tag(words_list, tagset='universal')
    return tagged
# Building the document matrix
# bag-of-words (BOW)
# TF-IDF
def Tf_IDf(documents_list):
    """Compute a dense TF-IDF matrix via CountVectorizer + TfidfTransformer.

    Args:
        documents_list: list of document strings.

    Returns:
        numpy.ndarray: TF-IDF matrix of shape (n_documents, n_terms).
    """
    vectorizer = CountVectorizer(min_df=1)  # term counts; punctuation is not counted
    # BUG FIX: the original called fit_transform on the module-level
    # global `corpus` instead of the `documents_list` argument.
    count = vectorizer.fit_transform(documents_list)
    # vectorizer.get_feature_names_out() -> vocabulary terms
    # vectorizer.vocabulary_             -> term -> column index mapping
    # count.toarray()                    -> raw term-frequency matrix
    transformer = TfidfTransformer()  # converts counts to TF-IDF weights
    tfidf_matrix = transformer.fit_transform(count)
    return tfidf_matrix.toarray()
def TF_IDF_2(documents_list):
    """Compute a dense TF-IDF matrix directly with TfidfVectorizer.

    Args:
        documents_list: list of document strings.

    Returns:
        numpy.ndarray: TF-IDF matrix of shape (n_documents, n_terms).
    """
    vec = TfidfVectorizer(min_df=1)
    # vec.get_feature_names_out() -> vocabulary terms
    # vec.vocabulary_             -> term -> column index mapping
    matrix = vec.fit_transform(documents_list)
    return matrix.toarray()
# Demo corpus for a quick smoke test of the TF-IDF pipeline.
corpus = [
    'This is the first document',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

if __name__ == "__main__":
    # Guarded so importing this module does not print the demo output
    # as a side effect (the original ran print at import time).
    print(TF_IDF_2(corpus))