# Text classification code (compiles/runs successfully)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
# 文本清洗预处理
def preprocessing(text):
# text = text.encode("gbk").decode("utf8")
# tokons to word 句子标记解析 单词标记解析
tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
# print("单词标记解析:", tokens)
# 停用词删除
stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]
# print("停用词删除", tokens)
# 单词字数小于3删除 并转换成小写
tokens = [word.lower() for word in tokens if len(word) >= 3]
# print("字数小于3删除", tokens)
# lemmatize
# 把(文中的词)按屈折变化形式(或异体形式)进行归类
lemter = WordNetLemmatizer()