Classifying and Clustering Articles

Using the "tech" and "entertainment" articles scraped earlier as an example.

The contents of 'articls.csv' look like this:
[image: sample rows of articls.csv, omitted]

Classification
import jieba
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

#Load the stop-word list
def get_stop_words():
    content = list()
    with open(r'stop.txt', encoding='utf-8') as f:
        for line in f:
            content.append(line.strip())
    return content
stop_hanzi = get_stop_words()
print(' '.join(stop_hanzi))
#'$ 0 1 2 3 4 5 6 7 8 9 ? _ “ ” 、 。 《 》 一 一些 一何 一切......'
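As an aside, the stop-word filtering can also be delegated to the vectorizer itself: CountVectorizer accepts a custom list through its stop_words parameter. A minimal sketch (not used in the pipeline below):

#Alternative: let the vectorizer remove stop words after tokenization
vectorizer = CountVectorizer(stop_words=stop_hanzi)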

#Clean the articles and convert them into a tf-idf matrix
article_path = r'articls.csv'
def get_TFIDF():
    labels = list()  # label list, kept for evaluation later
    corpus = list()  # corpus: one space-joined document per article
    # Read the corpus: one line per document
    with open(article_path, 'r', encoding='utf8') as f:
        for line in f:
            # split on the first comma only, in case the body contains commas
            label, content = line.strip().split(',', 1)
            content = [x for x in jieba.cut(content) if x not in stop_hanzi]
            # keep the article only if anything survived the filtering
            if content:
                corpus.append(' '.join(content))
                labels.append(label)
    labels = list(map(lambda x: 0 if x == 'news_tech' else 1, labels))

    # Convert the texts into a term-frequency matrix: element a[i][j] is the
    # frequency of term j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of each term
    transformer = TfidfTransformer()
    # The inner fit_transform builds the term-frequency matrix; the outer one
    # converts it to tf-idf (term frequency - inverse document frequency)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # All terms in the bag-of-words model
    # (in scikit-learn >= 1.0 this is get_feature_names_out())
    word = vectorizer.get_feature_names()
    # Extract the tf-idf matrix: element w[i][j] is the tf-idf weight of
    # term j in document i
    weight = tfidf.toarray()
    return weight, np.array(labels)

weight, labels = get_TFIDF()
print('weight.shape: {}, labels.shape: {}'.format(weight.shape, labels.shape))
# weight.shape: (884, 44870), labels.shape: (884,): 884 articles, 44870 extracted feature terms
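Side note: the CountVectorizer + TfidfTransformer pair inside get_TFIDF() can be collapsed into scikit-learn's single TfidfVectorizer. An equivalent sketch, assuming the same corpus list as built above:

from sklearn.feature_extraction.text import TfidfVectorizer

#One step: tokenize, count, and apply tf-idf weighting together
tfidf = TfidfVectorizer().fit_transform(corpus)
#tfidf is a sparse matrix of shape (n_docs, n_terms); .toarray() densifies it,
#which for 884 x 44870 floats is costly. Most sklearn estimators accept the
#sparse matrix directly.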

#Split into training and test sets at a 6:4 ratio
x_train, x_test, y_train, y_test=train_test_split(weight,labels,test_size=0.4, random_state=0)
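An optional refinement, since the two classes need not be balanced: passing stratify keeps the class ratio identical in both splits. A sketch (the accuracies reported below were obtained without it):

#Stratified variant of the same split
x_train, x_test, y_train, y_test = train_test_split(weight, labels, test_size=0.4, random_state=0, stratify=labels)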

#SVM classifiers with different kernels
kernels=['sigmoid','linear','rbf']
for kernel in kernels:
    clf = svm.SVC(kernel=kernel).fit(x_train, y_train)
    print('svm_{}_accuracy:{}'.format(kernel,clf.score(x_test, y_test)))

#svm_sigmoid_accuracy:0.576271186440678
#svm_linear_accuracy:0.9971751412429378
#svm_rbf_accuracy:0.576271186440678
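The linear kernel winning by a wide margin is typical for tf-idf features: the data is high-dimensional and sparse, so the classes are often close to linearly separable. For such data, LinearSVC (liblinear-based) usually trains much faster than SVC(kernel='linear'); a sketch:

from sklearn.svm import LinearSVC

#liblinear scales roughly linearly with the number of samples,
#unlike the kernelized SVC
clf = LinearSVC().fit(x_train, y_train)
print('LinearSVC_accuracy:{}'.format(clf.score(x_test, y_test)))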

#Decision tree classifier
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
print('{}:{}'.format('DecisionTreeClassifier', clf.score(x_test, y_test)))
#DecisionTreeClassifier:0.9293785310734464

#Random forest classifier
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0).fit(x_train, y_train)
print('{}:{}'.format('RandomForestClassifier', clf.score(x_test, y_test)))
#RandomForestClassifier:0.9717514124293786

#AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100).fit(x_train, y_train)
print('{}:{}'.format('AdaBoostClassifier', clf.score(x_test, y_test)))
#AdaBoostClassifier:0.9717514124293786

#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100,random_state=0).fit(x_train, y_train)
print('{}:{}'.format('GradientBoostingClassifier', clf.score(x_test, y_test)))
#GradientBoostingClassifier:0.9548022598870056

#Cross-validation
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear')
scores = cross_val_score(clf, weight, labels, cv=5)
scores
#array([ 0.98876404,  0.97740113,  0.97740113,  0.97727273,  0.98863636])
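Cross-validation results are commonly summarized as mean plus or minus standard deviation; for the scores above:

print('accuracy: {:.3f} (+/- {:.3f})'.format(scores.mean(), scores.std()))
#accuracy: 0.982 (+/- 0.006)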

#Hyper-parameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 5]}
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight,labels)

clf.best_params_ 
#{'C': 1, 'kernel': 'linear'}
clf.best_score_ 
#0.98190045248868774
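Because GridSearchCV defaults to refit=True, the best configuration is retrained on the full dataset after the search and exposed as best_estimator_; the per-combination scores live in cv_results_. A sketch:

#Mean cross-validated accuracy for every parameter combination tried
for params, mean in zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']):
    print(params, mean)
best_clf = clf.best_estimator_   #here SVC(C=1, kernel='linear'), refit on all data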

parameters = {'n_estimators': range(10, 61, 10)}
clf = GridSearchCV(estimator=RandomForestClassifier(min_samples_split=2, random_state=0), param_grid=parameters, scoring='accuracy', cv=5)
clf.fit(weight,labels)

clf.best_params_ 
#{'n_estimators': 40}
clf.best_score_ 
#0.98190045248868774

#xgboost, a Kaggle favorite
#Windows wheels: https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost

from xgboost import XGBClassifier

clf = XGBClassifier()
clf.fit(x_train, y_train)
print('{}_accuracy:{}'.format('xgboost',clf.score(x_test, y_test)))
#xgboost_accuracy:0.9745762711864406

#Tune XGBClassifier with GridSearchCV
parameters = {'learning_rate': [0.001, 0.01, 0.1, 0.4, 0.7]}
clf = GridSearchCV(estimator=XGBClassifier(), param_grid=parameters, scoring="neg_log_loss", n_jobs=-1, cv=5)
clf.fit(weight,labels)

clf.best_params_ 
#{'learning_rate': 0.1}
clf.best_score_ 
#0.97285067873303166
Clustering
from sklearn.cluster import KMeans

weight,labels=get_TFIDF()

#Choose 2 cluster centers
clf = KMeans(n_clusters=2)
# clf.fit(X) feeds the data to the clusterer
clf.fit(weight)

#Print the 2 cluster centers
print('cluster_center:')
print(clf.cluster_centers_)
#cluster_center:
#[[  2.65940140e-04   8.00510130e-05   7.36860595e-05 ...,   5.35179931e-05
#    3.78464085e-05   6.36585103e-05]
# [  2.48911399e-04   5.80102231e-05   1.45779433e-04 ...,  -8.80914265e-20
#    5.75982404e-20   0.00000000e+00]]
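Here n_clusters=2 is chosen because the corpus is known to contain two topics. When the number of clusters is unknown, the silhouette score is one common way to compare candidates. A sketch (with 44870 dense features this is slow, so sample_size limits the cost):

from sklearn.metrics import silhouette_score

#Higher silhouette (max 1.0) means tighter, better-separated clusters
for k in range(2, 6):
    km = KMeans(n_clusters=k, random_state=0).fit(weight)
    print(k, silhouette_score(weight, km.labels_, sample_size=500, random_state=0))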

#Save the model
joblib.dump(clf, 'kmeans.pkl')
#Load the saved model
clf = joblib.load('kmeans.pkl')
#Predict: predict() assigns samples to the already-fitted centroids
#(fit_predict() would re-run the clustering from scratch instead)
clf.predict(weight)

#Print the cluster assignments
print(clf.labels_)
#[1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 1 0 ......]

#Score the clustering result
score = accuracy_score(clf.labels_, labels)
#Because the cluster IDs 0/1 are assigned arbitrarily, a score near either 0 or 1
#is good; around 0.5 would be no better than guessing
score = max(score,1-score)
score
#0.95248868778280538
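The max(score, 1 - score) trick only works with two clusters. A permutation-invariant alternative that generalizes to any number of clusters is the adjusted Rand index:

from sklearn.metrics import adjusted_rand_score

#1.0 means perfect agreement with the true labels, ~0.0 means random assignment;
#the value does not depend on how the cluster IDs are numbered
print(adjusted_rand_score(labels, clf.labels_))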