# -*- encoding=utf-8 -*-
from sklearn import svm
from sklearn import neighbors, linear_model
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import jieba
import pyltp
from classification_practice.practice_one.search_count_auto_words import AutoWordsCounter
from statistic.default_string_search import DefaultStringSearch as ACSearcher
import pandas as pd
from sklearn import metrics
import jieba
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier
from sklearn.externals import joblib
# segmentor = pyltp.Segmentor()
# segmentor.load("\\LTPModel\\cws.model")
def read_files(filename):
    """Load a tab-separated training/test file and return (sentences, labels).

    Each line is expected to look like ``...\t<label>\t<text>`` (label in
    column 1, raw text in column 2; earlier columns are ignored).  The text is
    word-segmented with jieba and joined with single spaces so it can be fed
    straight into a TfidfVectorizer.

    Args:
        filename: path to a UTF-8 text file, one sample per line.

    Returns:
        (x_train, y_train): parallel lists of space-joined segmented
        sentences and integer labels.
    """
    x_train = []
    y_train = []
    with open(filename, 'r', encoding="utf-8") as lines:
        for line in lines:
            item = line.replace("\n", "").strip().split("\t")
            # item[2] is the raw sentence, item[1] the integer class label.
            sentence = " ".join(jieba.cut(item[2]))
            x_train.append(sentence)
            y_train.append(int(item[1]))
    return x_train, y_train
def read_files1(filename):
    """Load a labeled Excel sentiment file and return (sentences, labels).

    Duplicated rows are dropped; the ``sentiment`` column is mapped from the
    Chinese labels 负面/中性/正面 to -1/0/1, and each ``sentence`` cell is
    word-segmented with jieba and joined with single spaces.

    Args:
        filename: path to an Excel file with ``sentiment`` and ``sentence``
            columns.

    Returns:
        (x_train, y_train): parallel lists of space-joined segmented
        sentences and mapped numeric labels.
    """
    df = pd.read_excel(filename).drop_duplicates()
    labels = df["sentiment"].map({"负面": -1, "中性": 0, "正面": 1})
    x_train = [" ".join(jieba.cut(text)) for text in df['sentence']]
    y_train = [label for label in labels]
    return x_train, y_train
# ------------------------------------------------------------------
# Train and compare several classifiers on TF-IDF features built from
# jieba-segmented sentences.  Fix vs. the original: the training file
# was parsed twice (once to fit the vectorizer, once for X_train) and
# unused knn/logistic instances were created up front — the data is now
# read once and fit_transform is used, which yields identical features.
# ------------------------------------------------------------------
vect = TfidfVectorizer(min_df=2, max_df=0.8)

X_train_raw, y_train = read_files("train1.5.1.txt")
X_train = vect.fit_transform(X_train_raw)
X_test_raw, y_test = read_files("test1.2.0.txt")
X_test = vect.transform(X_test_raw)

nb = MultinomialNB()
nb.fit(X_train, y_train)
X_predict = nb.predict(X_test)
print("NaiveBayes Result:")
print(metrics.classification_report(y_test, X_predict))
###################################################
knn = neighbors.KNeighborsClassifier()
knn.fit(X_train, y_train)
X_predict = knn.predict(X_test)
print('KNN score: %f' % knn.score(X_test, y_test))
print(metrics.classification_report(y_test, X_predict))
#################################################
print("LR ")
logistic = linear_model.LogisticRegression(solver='newton-cg')
logistic.fit(X_train, y_train)
X_predict = logistic.predict(X_test)
print(metrics.classification_report(y_test, X_predict))
#################################################
print("SVM-linear ")
C = 1.0  # SVM regularization parameter, shared by all SVM variants below
X_predict = svm.SVC(kernel='linear', C=C).fit(X_train, y_train).predict(X_test)
print(metrics.classification_report(y_test, X_predict))
#################################################
print("SVM-Rbf ")
X_predict = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X_train, y_train).predict(X_test)
print(metrics.classification_report(y_test, X_predict))
#################################################
print("SVM-poly ")
X_predict = svm.SVC(kernel='poly', degree=3, C=C).fit(X_train, y_train).predict(X_test)
print(metrics.classification_report(y_test, X_predict))
#################################################
print("SVM-svc ")
X_predict = svm.LinearSVC(C=C).fit(X_train, y_train).predict(X_test)
print(metrics.classification_report(y_test, X_predict))
#################################################
print("finish! ")
# TODO: 相关介绍后续补充…… (detailed write-up to follow)