以之前抓取的“科技”和“娱乐”文章为例
‘articls.csv’中的内容如图(每行一篇文章,格式为“标签,正文”,科技类文章的标签为 news_tech):

分类
import jieba
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
# Load the stop-word list.
def get_stop_words():
    """Read the stop words stored one per line in ``stop.txt``.

    Returns:
        list: every line of the file, in file order, with surrounding
        whitespace stripped (blank lines yield empty strings).
    """
    with open(r'stop.txt', encoding='utf-8') as f:
        return [line.strip() for line in f]
# Load the stop words once at module level and show them space-separated.
stop_hanzi = get_stop_words()
print(*stop_hanzi)
# Sample output:
# '$ 0 1 2 3 4 5 6 7 8 9 ? _ “ ” 、 。 《 》 一 一些 一何 一切......'
# Clean the articles and convert them into TF-IDF features.
article_path = r'articls.csv'


def get_TFIDF():
    """Build a TF-IDF matrix and binary labels from the article CSV.

    Each non-blank line of ``article_path`` is read as ``label,content``.
    The content is segmented with jieba, tokens found in ``stop_hanzi`` are
    dropped, and articles with no remaining tokens are skipped.

    Returns:
        tuple: ``(weight, labels)`` where ``weight`` is the dense TF-IDF
        array with one row per kept article, and ``labels`` is a NumPy array
        holding 0 for ``'news_tech'`` and 1 for every other label.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    labels = []  # numeric labels, used later for validation
    corpus = []  # one space-joined token string per article
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop_words = set(stop_hanzi)
    # One line is one document; the context manager closes the file.
    with open(article_path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first comma only: the article text may itself
            # contain commas.
            label, content = line.split(',', 1)
            tokens = [x for x in jieba.cut(content) if x not in stop_words]
            # Keep the article only if something is left after filtering.
            if tokens:
                corpus.append(' '.join(tokens))
                labels.append(0 if label == 'news_tech' else 1)
    # CountVectorizer turns the documents into a term-count matrix where
    # element [i][j] is the count of word j in document i.
    vectorizer = CountVectorizer()
    # TfidfTransformer re-weights those counts into tf-idf values.
    transformer = TfidfTransformer()
    # Inner fit_transform: text -> counts; outer fit_transform: counts -> tf-idf.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # Densify so element [i][j] is the tf-idf weight of word j in document i.
    weight = tfidf.toarray()
    return weight, np.array(labels)
# Build the feature matrix and the label vector, then report their shapes.
weight, labels = get_TFIDF()
print(f'weight.shape: {weight.shape}, lables.shape: {labels.shape}')
# Sample output: weight.shape: (884, 44870), lables.shape: (884,)
# i.e. 884 articles and 44870 extracted feature words.

# Split into training and test sets at a 6:4 ratio.
x_train, x_test, y_train, y_test = train_test_split(
    weight,
    labels,
    test_size=0.4,
    random_state=0,
)
# Train an SVM classifier with each kernel and report its test accuracy.
kernels = ['sigmoid', 'linear', 'rbf']
for kernel in kernels:
    clf = svm.SVC(kernel=kernel).fit(x_train, y_train)
    print('svm_{}_accuracy:{}'.format(kernel, clf.score(x_test, y_test)))
# Sample output:
# svm_sigmoid_accuracy:0.576271186440678
# svm_linear_accuracy:0.9971751412429378
# svm_rbf_accuracy:0.576271186440678
# Decision tree classifier.
clf = tree.DecisionTreeClassifier()
clf.fit(x_train, y_train)
print(f'DecisionTreeClassifier:{clf.score(x_test, y_test)}')
# Sample output: DecisionTreeClassifier:0.9293785310734464

# Random forest classifier.
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
clf.fit(x_train, y_train)
print(f'RandomForestClassifier:{clf.score(x_test, y_test)}')
# Sample output: RandomForestClassifier:0.9717514124293786

# AdaBoost classifier.
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(x_train, y_train)
print(f'AdaBoostClassifier:{clf.score(x_test, y_test)}')
# Sample output: AdaBoostClassifier:0.9717514124293786

# Gradient boosting classifier.
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)
print(f'GradientBoostingClassifier:{clf.score(x_test, y_test)}')
# Sample output: GradientBoostingClassifier:0.9548022598870056
# Evaluate the linear SVM with 5-fold cross-validation on the full data set.
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, weight, labels, cv=5)
# A bare expression is only echoed in a REPL/notebook; print it so the
# scores also appear when this runs as a script.
print(scores)
# Sample output:
# array([ 0.98876404, 0.97740113, 0.97740113, 0.97727273, 0.98863636])
# Tune hyper-parameters with GridSearchCV (5-fold, scored by accuracy).
from sklearn.model_selection import GridSearchCV

# SVC: search over the kernel and the regularisation parameter C.
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 5]}
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight, labels)
# Print the results explicitly; bare expressions show nothing in a script.
print(clf.best_params_)
# Sample output: {'C': 1, 'kernel': 'linear'}
print(clf.best_score_)
# Sample output: 0.98190045248868774

# Random forest: search over the number of trees (10, 20, ..., 60).
parameters = {'n_estimators': range(10, 61, 10)}
clf = GridSearchCV(
    estimator=RandomForestClassifier(min_samples_split=2, random_state=0),
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
clf.fit(weight, labels)
print(clf.best_params_)
# Sample output: {'n_estimators': 40}
print(clf.best_score_)
# Sample output: 0.98190045248868774
# Use xgboost, a Kaggle favourite.
# Download address: https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost
import xgboost as xgb
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(x_train, y_train)
print('{}_accuracy:{}'.format('xgboost', clf.score(x_test, y_test)))
# Sample output: xgboost_accuracy:0.9745762711864406

# Tune the XGBClassifier learning rate with GridSearchCV.
# Requires GridSearchCV to be imported already (sklearn.model_selection).
parameters = {'learning_rate': [0.001, 0.01, 0.1, 0.4, 0.7]}
clf = GridSearchCV(estimator=XGBClassifier(), param_grid=parameters, scoring="neg_log_loss", n_jobs=-1, cv=5)
clf.fit(weight, labels)
# Print the results explicitly; bare expressions show nothing in a script.
print(clf.best_params_)
# Sample output: {'learning_rate': 0.1}
print(clf.best_score_)
# Sample output: 0.97285067873303166
# NOTE(review): with scoring="neg_log_loss" the best score is a negated log
# loss (<= 0), so the positive sample value above looks like it came from an
# accuracy-scored run -- confirm which scoring was intended.
聚类
from sklearn.cluster import KMeans
weight, labels = get_TFIDF()
# Cluster the articles around 2 centres.
clf = KMeans(n_clusters=2)
# fit() runs the clustering on the feature matrix.
clf.fit(weight)
# Print the 2 cluster centres.
print('cluster_center:')
print(clf.cluster_centers_)
# Sample output:
# cluster_center:
# [[ 2.65940140e-04 8.00510130e-05 7.36860595e-05 ..., 5.35179931e-05
#  3.78464085e-05 6.36585103e-05]
#  [ 2.48911399e-04 5.80102231e-05 1.45779433e-04 ..., -8.80914265e-20
#  5.75982404e-20 0.00000000e+00]]
# Save the fitted model.
joblib.dump(clf, 'kmeans.pkl')
# Load the saved model back.
clf = joblib.load('kmeans.pkl')
# Predict with the loaded model. predict() assigns each article to one of the
# stored centres; fit_predict() would re-fit from scratch and throw away the
# model that was just loaded.
predicted = clf.predict(weight)
# Print the cluster assigned to each article.
print(predicted)
# Sample output: [1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 1 0 ......]
# Score the clustering against the true labels (y_true first, then y_pred).
score = accuracy_score(labels, predicted)
# Cluster ids 0/1 are arbitrary, so a score near 0 or near 1 is equally good,
# while about 0.5 means random guessing; fold both cases together.
score = max(score, 1 - score)
print(score)
# Sample output: 0.95248868778280538

4568

被折叠的 条评论
为什么被折叠?



