Python 机器学习——文本分类笔记

# 1. Data preparation
import pandas as pda
import numpy as npy

# Path of the CSV file to load. It is left empty in these notes and must be
# filled in before the script can run.
filename = ""
dataf = pda.read_csv(filename)
# Columns 1-3 are used as the feature matrix and column 0 as the label.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
x = dataf.iloc[:, 1:4].to_numpy()
# Select the label column as a 1-D array: scikit-learn estimators expect y
# with shape (n_samples,) and emit a DataConversionWarning for (n_samples, 1).
y = dataf.iloc[:, 0].to_numpy()

# 2. Feature scaling
from sklearn import preprocessing
# Normalization.
# NOTE(review): the original note described this as scaling values into 0-1,
# but per the scikit-learn documentation preprocessing.normalize rescales each
# sample (row) to unit norm (L2 by default). If per-feature 0-1 scaling was
# intended, preprocessing.minmax_scale is the matching call -- confirm intent.
nx=preprocessing.normalize(x)

# Standardization: per the scikit-learn documentation, each feature column is
# centred on its mean and divided by its standard deviation (not the variance),
# so the result is centred around 0 with unit variance.
sx=preprocessing.scale(x)
# Feature selection: fit a tree ensemble on the raw features so that the
# per-feature importance scores can be inspected afterwards.
from sklearn.ensemble import ExtraTreesClassifier

# fit() returns the fitted estimator, so construction and training are chained.
model = ExtraTreesClassifier().fit(x, y)
# To inspect the scores: print(model.feature_importances_)
# Common algorithms -- k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(x, y)
# Two new samples to classify. Every row needs exactly one value per training
# feature (x is built from three columns). The original literal
# [[800,3,50,1],[372,3.71,2]] had rows of length 4 and 3: recent NumPy versions
# reject such a ragged list, and it could not match the model's input anyway.
# NOTE(review): "3,50" is presumed to be a typo for 3.50 -- confirm the values.
x2 = npy.array([[800, 3.50, 1], [372, 3.71, 2]])
# To classify the new samples: print(model.predict(x2))

# Model evaluation
from sklearn import metrics

# Classification report: compare the true labels with the predictions of the
# most recently fitted `model`. Note that this scores the model on the same
# data it was trained on.
expected = y
# predict() needs the samples to classify; the original call passed none,
# which raises TypeError.
predicted = model.predict(x)
print(metrics.classification_report(expected, predicted))
# How to read the report, for one class (say label 1 of a 0/1 target):
#   a = number of samples whose true label is 1
#   b = number of samples predicted as 1
#   c = number of samples predicted as 1 whose true label is also 1 (hits)
#   precision = c / b
#   recall    = c / a
#   f1-score  = 2 * precision * recall / (precision + recall)
#   support   = number of true samples of the class (a)

# Confusion matrix
# To display it: print(metrics.confusion_matrix(expected, predicted))
# Naive Bayes (Gaussian variant)
from sklearn.naive_bayes import GaussianNB

# Build and train in one expression; fit() returns the estimator itself.
model = GaussianNB().fit(x, y)
# Predict on the training inputs.
predicted = model.predict(x)
# To display the predictions: print(predicted)

# Logistic regression
from sklearn.linear_model import LogisticRegression

# Default hyper-parameters; fit() returns the trained estimator.
model = LogisticRegression().fit(x, y)
# Predict on the training inputs.
predicted = model.predict(x)
# To display the predictions: print(predicted)

# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Default hyper-parameters; fit() returns the trained estimator.
model = DecisionTreeClassifier().fit(x, y)
# Predict on the training inputs.
predicted = model.predict(x)
# To display the predictions: print(predicted)
# Support vector machine
from sklearn.svm import SVC

# Default hyper-parameters; fit() returns the trained estimator.
model = SVC().fit(x, y)
# Predict on the training inputs.
predicted = model.predict(x)
# To display the predictions: print(predicted)
# English text classification
# Prepare the text data: three categories of the 20-newsgroups training split.
from sklearn.datasets import fetch_20newsgroups

categories = ['comp.graphics', 'alt.atheism', 'sci.med']
train_text = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=40,
)
# To look at a single document instead: print(train_text.data[0])
# The line below prints every training document.
print(train_text.data)

# Text feature extraction: per-document term counts.
# The class is CountVectorizer; the original "CountVectorize" does not exist
# and failed with ImportError.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
train_x_counts = count_vect.fit_transform(train_text.data)

# Term-frequency weighting. use_idf=False switches the IDF re-weighting off,
# so despite the transformer's name the features are TF, not TF-IDF.
from sklearn.feature_extraction.text import TfidfTransformer

tf_ts = TfidfTransformer(use_idf=False).fit(train_x_counts)
train_x_tf = tf_ts.transform(train_x_counts)

# Training
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(train_x_tf, train_text.target)

# Classify new documents. They must go through the same fitted vectorizer and
# transformer as the training data so the feature columns line up.
new_text = ["I like reading books", "computer development technology"]
new_x_counts = count_vect.transform(new_text)
new_x_tfidf = tf_ts.transform(new_x_counts)
rst = clf.predict(new_x_tfidf)
print(rst)

#中文文本分类
import os
import jieba
def loaddata(path, class1):
    """Load every file in a directory as one word-segmented document.

    Each regular file directly inside *path* is read as GBK-encoded text and
    segmented with ``jieba.cut``. Every token is written followed by two
    spaces, producing a whitespace-delimited string.

    Args:
        path: Directory holding the text files of a single class.
        class1: Label assigned to every document loaded from *path*.

    Returns:
        A ``(textdata, classall)`` tuple of two equal-length lists: the
        segmented documents and one copy of *class1* per document.

    Raises:
        OSError: If *path* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid GBK.
    """
    textdata = []
    classall = []
    for thisfile in os.listdir(path):
        fullpath = os.path.join(path, thisfile)
        # Sub-directories cannot be read as documents; skip them instead of
        # failing inside open().
        if not os.path.isfile(fullpath):
            continue
        # The context manager closes the handle; the original left it open.
        with open(fullpath, "r", encoding="gbk") as handle:
            data = handle.read()
        # Same output as the original loop (token + two spaces, including after
        # the last token) without repeated string concatenation.
        textdata.append("".join(item + "  " for item in jieba.cut(data)))
        classall.append(class1)
    return textdata, classall

# Load the two classes (labels 0 and 1). The directory paths are left empty in
# these notes and must be filled in before the script can run.
text1, class1 = loaddata("", 0)
text2, class2 = loaddata("", 1)
train_text = text1 + text2
classall = class1 + class2

# Term counts for the segmented Chinese documents.
# The class is CountVectorizer ("CountVectorize" does not exist); it is
# imported here so this section does not rely on an earlier one.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
# The original stored this matrix as train_X_counts while the lines below read
# train_x_counts, i.e. the stale matrix from the English section, whose rows do
# not correspond to classall. One consistent name is used now.
train_x_counts = count_vect.fit_transform(train_text)

# Term-frequency weighting (use_idf=False: TF only, no IDF re-weighting).
from sklearn.feature_extraction.text import TfidfTransformer

tf_ts = TfidfTransformer(use_idf=False).fit(train_x_counts)
train_x_tf = tf_ts.transform(train_x_counts)

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(train_x_tf, classall)

# New documents are given already segmented (space-separated), matching the
# whitespace-delimited format produced by loaddata.
new_text = ["房间 有鬼", "爱情"]
new_x_counts = count_vect.transform(new_text)
new_x_tfidf = tf_ts.transform(new_x_counts)
rst = clf.predict(new_x_tfidf)
print(rst)


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值