sklearn学习--文本分类:多标签(多分类)应用

#!/usr/bin/env python
# coding=utf-8
import sys
import jieba
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import MySQLdb
import pandas as pd
import re
import numpy as np

def jieba_tokenizer(x):
    """Tokenize ``x`` with jieba in full mode.

    Used as the ``tokenizer`` callable of the ``CountVectorizer`` in the
    classification pipeline.

    :param x: text to segment.
    :returns: whatever ``jieba.cut(x, cut_all=True)`` returns.
    """
    tokens = jieba.cut(x, cut_all=True)
    return tokens
def partition(x):
    """Return ``x`` unchanged (identity function).

    :param x: any object.
    :returns: the very same object that was passed in.
    """
    return x
def filter_html(s):
    """Return ``s`` with every HTML/XML-style tag removed.

    A tag is any run that starts with ``<``, ends at the next ``>`` and has
    at least one character in between. Text outside of tags is kept as is,
    so a lone ``<`` without a closing ``>`` is left untouched.

    :param s: markup text to clean.
    :returns: ``s`` without the matched tags.
    """
    # ``[^>]`` already matches newlines, so tags spanning several lines are
    # removed as well; ``re.S`` is kept from the original pattern.
    tag_pattern = re.compile(r'<[^>]+>', re.S)
    return tag_pattern.sub('', s)
def gbk_utf8(s):
    """Re-encode a GBK byte string as UTF-8.

    Bytes that cannot be decoded as GBK are dropped silently (the
    ``"ignore"`` error handler) instead of raising ``UnicodeDecodeError``.

    :param s: byte string holding GBK-encoded text.
    :returns: the same text as a UTF-8 encoded byte string.
    """
    return s.decode('gbk', "ignore").encode('utf8')
# Connect to the MySQL database. The connection is opened with the GBK
# charset and use_unicode=False; fetched columns are re-encoded with
# gbk_utf8() where the rows are read.
conn = MySQLdb.connect(
    host='localhost',
    user='root',
    passwd='',
    db='mydb',
    port=3306,
    charset="gbk",
    use_unicode=False,
)
cursor = conn.cursor()
cursor.execute("SET NAMES GBK")


# Training samples
data_ret = pd.DataFrame()  # NOTE(review): never used in the visible script

# The ORDER BY clause used to reference the alias "a", which the FROM clause
# never defines (the table is aliased as "article"), so the query could not
# run. It now uses the alias that actually exists.
# NOTE(review): this query reads from "t_reprint" while the test query reads
# from "article" -- confirm which table is intended.
sql = "SELECT ID, title,classid, content FROM t_reprint article WHERE ID<1000 ORDER BY article.ID ASC LIMIT 0,1000"
cursor.execute(sql)

txt_ret = []    # cleaned article bodies
class_ret = []  # one list of labels per article
id_ret = []     # article IDs, parallel to the two lists above
for row in cursor.fetchall():
    # Columns follow the SELECT order: ID, title, classid, content.
    content = filter_html(gbk_utf8(row[3]))
    txt_ret.append(content)
    # classid is split on "," so one article may carry several labels.
    class_s = gbk_utf8(row[2])
    class_l = class_s.split(",")
    class_ret.append(class_l)
    id_ret.append(row[0])

X_train = txt_ret
Y_train = class_ret


# Turn the per-article label lists into a binary indicator matrix; the same
# binarizer is used later to map predictions back to labels.
mlb = MultiLabelBinarizer()
Y_train = mlb.fit_transform(Y_train)

# jieba token counts -> TF-IDF weighting -> one-vs-rest linear SVMs.
pipeline_steps = [
    ('counter', CountVectorizer(tokenizer=jieba_tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC())),
]
classifier = Pipeline(pipeline_steps)
classifier.fit(X_train, Y_train)


# NOTE(review): presumably the list of known class IDs; kept for reference
# only, nothing in the script reads it.
#target_names=['100','102','103','104','105','106','107','108','109','110','111','112','113','114','115','116','117','118','119','120','121','122','123','124','125','126','127','128','129','130','131', '132','133','134']
# Test data
test_txt_set = []  # cleaned article bodies to classify
test_id_ret = []   # article IDs, parallel to test_txt_set
sql = "SELECT ID, title,classid, content FROM article WHERE ID>1000 ORDER BY ID DESC LIMIT 10 "
cursor.execute(sql)

for row in cursor.fetchall():
    # Columns follow the SELECT order: ID, title, classid, content.
    test_txt_set.append(filter_html(gbk_utf8(row[3])))
    test_id_ret.append(row[0])
X_test = test_txt_set

# This was the last query of the script: release the database resources
# instead of leaving the cursor and connection open until interpreter exit.
cursor.close()
conn.close()


prediction = classifier.predict(X_test)

# Map the binary indicator rows back to tuples of the original labels.
result = mlb.inverse_transform(prediction)
# Show the results: one line per test article, labels separated by commas.
for i, label1 in enumerate(result):
    # Every label is followed by "," (including the last one), which keeps
    # the output format of the original loop.
    classstr = "".join(str(label2) + "," for label2 in label1)
    # A single parenthesized argument prints the same text whether ``print``
    # is a statement (Python 2) or a function.
    print("ID:" + str(test_id_ret[i]) + " =>class:" + classstr)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

walk walk

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值