mysql文本识别_sklearn学习--读取mysql数据源进行训练样本和预测文本分类

# coding=utf-8

import re

import pandas as pd

import string

import MySQLdb

import jieba

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.cross_validation import train_test_split

from sklearn.metrics import confusion_matrix

from sklearn import metrics

from sklearn.metrics import roc_curve, auc

from sklearn.svm import LinearSVC

# jieba word segmentation
def jieba_tokenizer(x):
    """Segment the text *x* with jieba in full (cut_all=True) mode.

    Returns jieba's lazy generator of tokens; callers join or iterate it.
    """
    return jieba.cut(x, cut_all=True)

def partition(x): return x

# Compiled once at import time; re.S lets a tag span line breaks.
# The original pattern r']+>' was scrape-corrupted; the intended
# HTML-tag-stripping pattern is r'<[^>]+>'.
_TAG_RE = re.compile(r'<[^>]+>', re.S)


def filter_html(s):
    """Return *s* with all HTML tags (``<...>`` spans) removed.

    Only markup is stripped; text content between tags is preserved.
    Entities (e.g. ``&amp;``) are left as-is.
    """
    return _TAG_RE.sub('', s)

# Connect to the local MySQL database holding the articles.
# NOTE(review): credentials are hard-coded (root, empty password) — move to
# config/environment before any non-local use.
conn=MySQLdb.connect(host='localhost',user='root',passwd='',db='article',port=3306,charset="utf8")

cursor =conn.cursor()

# Force the session character set to utf8 so Chinese text round-trips intact.
cursor.execute("SET NAMES utf8")

# Training samples: fetch labeled articles from MySQL in 1000-row pages.
# NOTE(review): page count (5) and page size (1000) are hard-coded; assumes
# the table has enough rows with id > 100 — confirm against the data.
frames = []
for page in range(0, 5):
    sql = ("SELECT a.id,a.title,a.classid,b.artcontent "
           "FROM article a,article_txt b "
           "WHERE a.id=b.aid AND b.artcontent IS NOT NULL AND a.id>100 "
           "ORDER BY a.id ASC LIMIT " + str(page * 1000) + ",1000")
    frames.append(pd.read_sql_query(sql, conn))

# Concatenate once instead of DataFrame.append in a loop: append is
# deprecated (removed in pandas 2.x) and quadratic in the number of pages.
data_ret = pd.concat(frames, ignore_index=True)

# Labels are the article class ids; partition() is an identity map,
# so Y_train mirrors the classid column.
Score = data_ret['classid']

# Strip HTML markup from every article body before tokenization.
data_ret['artcontent'] = [filter_html(msg) for msg in data_ret['artcontent']]

X_train = data_ret['artcontent']
Y_train = Score.map(partition)

# Build the space-separated token corpus expected by CountVectorizer.
corpus = [' '.join(jieba_tokenizer(txt)) for txt in X_train]

# Bag-of-words counts -> TF-IDF weights -> linear SVM classifier.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = LinearSVC().fit(X_train_tfidf, Y_train)

# clf (and count_vect / tfidf_transformer) could be persisted here,
# e.g. with joblib, so prediction can run without retraining.

# Test data: classify held-out articles (id < 50) with the trained model.
test_txt_data = pd.read_sql_query("SELECT a.id,a.title,a.classid,b.artcontent FROM article a,article_txt b WHERE a.id=b.aid AND b.artcontent IS NOT NULL AND a.id<50 ORDER BY a.id ASC", conn)

# Same preprocessing pipeline as training: strip HTML, then jieba-tokenize
# into space-separated strings.
X_test = [filter_html(msg) for msg in test_txt_data['artcontent']]
test_set = [' '.join(jieba_tokenizer(text)) for text in X_test]

# Reuse the *fitted* vectorizer/transformer (transform, not fit_transform),
# so test features live in the training feature space.
X_new_counts = count_vect.transform(test_set)
X_test_tfidf = tfidf_transformer.transform(X_new_counts)

# The original assigned `result = dict()` and immediately overwrote it;
# the dead assignment is removed.
result = clf.predict(X_test_tfidf)

for i, classid in enumerate(result):
    # Rows of test_txt_data are positionally aligned with `result`.
    print("ID:" + str(test_txt_data['id'][i]) + " -> classid:" + str(classid))

# Release database resources now that training and prediction are done.
cursor.close()

conn.close()

分享到:

18e900b8666ce6f233d25ec02f95ee59.png

72dd548719f0ace4d5f9bca64e1d7715.png

2016-08-15 15:45

浏览 1016

评论

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值