记录一下自然语言处理课程的作业。
一、爬取数据集
1.爬取差别比较明显的五个类别
import math
import os
import urllib.request
import re
from bs4 import BeautifulSoup
# Channel front pages to crawl; get_urls() picks one entry by index, so the
# index has to be switched by hand to crawl a different category.
# 0 sports, 1 entertainment, 2 education, 3 technology, 4 stocks
urllist = [
    'https://sports.163.com/',
    'https://ent.163.com/',
    'https://edu.163.com/',
    'https://tech.163.com/',
    'https://money.163.com/stock/',
]
def get_urls(category=4):
    """Collect the article links found on one channel front page.

    Args:
        category: Index into the module-level ``urllist`` selecting the
            channel to crawl. Defaults to 4, the index that used to be
            hard-coded here.

    Returns:
        A list of the ``href`` values of all ``<a>`` tags whose link contains
        ``https://www.163.com/dy/article/``, without duplicates and in the
        order in which they first appear on the page.
    """
    url = urllist[category]
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Dots are escaped so that they only match a literal "." in the host name.
    article_link = re.compile(r"https://www\.163\.com/dy/article/")
    links = soup.find_all(name="a", attrs={"href": article_link})
    # dict.fromkeys drops duplicates while keeping first-seen order in one
    # pass, replacing the quadratic set() + sort(key=list.index) combination.
    return list(dict.fromkeys(link.get("href") for link in links))
def get_article(urls, out_path="article/股票.txt", max_articles=10000):
    """Download articles and append their paragraphs to a text file.

    For every URL the page is fetched and decoded as UTF-8, all ``<p>`` tags
    whose ``id`` starts with ``0`` are extracted, and the text of each one is
    appended to ``out_path`` as a separate line.

    Args:
        urls: Iterable of article URLs.
        out_path: File the paragraphs are appended to (UTF-8). Defaults to
            the path that used to be hard-coded here.
        max_articles: Stop once this many articles have been saved. Defaults
            to 10000, the limit that used to be hard-coded here.

    Returns:
        None.
    """
    paragraph_id = re.compile("^0")
    saved = 0
    for url in urls:
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode("utf-8")
        except (OSError, ValueError) as err:
            # URLError/HTTPError and timeouts are OSError subclasses; a
            # malformed URL or a non-UTF-8 page raises ValueError. One bad
            # page should not abort the whole crawl, so report it and go on.
            print("跳过无法读取的页面 %s: %s" % (url, err))
            continue
        soup = BeautifulSoup(html, 'html.parser')
        paragraphs = soup.find_all(name="p", attrs={"id": paragraph_id})
        # The context manager closes the file even if a write fails.
        with open(out_path, "a+", encoding="utf-8") as out_file:
            out_file.writelines("%s\n" % p.get_text() for p in paragraphs)
        saved += 1
        if saved >= max_articles:
            break
if __name__ == "__main__":
    # Gather the article links of the selected channel, then save each article.
    urls = get_urls()
    get_article(urls)
    print('爬取完成')
2. 将txt文件转换为csv文件,方便查看数据共有多少条
import pandas as pd
# Convert the crawled text file to CSV. to_csv also writes the row index, so
# the number of collected lines can be read off the last row.
# The file is read line by line instead of with pd.read_csv(sep='\n'): newer
# pandas releases reject a newline separator, and the default header inference
# would consume the first line of text as the column name.
with open("article/股票.txt", encoding="utf-8") as src:
    paragraphs = [line.rstrip("\n") for line in src]
# Empty lines carry no text, so they are left out of the table.
temp = pd.DataFrame({"content": [p for p in paragraphs if p]})
temp.to_csv("article/股票.csv")
3.数据集展示
二、模型训练
1.读取数据,划分训练集和测试集
import jieba
import pandas as pd
import random
from sklearn.model_selection import train_test_split
def _load_paragraphs(csv_path, limit=500):
    """Return up to ``limit`` values of the ``content`` column of one CSV.

    Rows containing missing values are dropped before the first ``limit``
    rows are taken.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8').dropna()
    return frame.content.values.tolist()[:limit]


# Load the first 500 usable rows of every category.
sports = _load_paragraphs("./article/体育.csv")
ent = _load_paragraphs("./article/娱乐.csv")
edu = _load_paragraphs("./article/教育.csv")
tech = _load_paragraphs("./article/科技.csv")
money = _load_paragraphs("./article/股票.csv")

# Load the stop-word list (one word per line, quoting disabled).
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='utf-8')
# A set gives constant-time ``in`` checks; testing membership against the raw
# NumPy array would scan the whole list for every single token.
stopwords = set(stopwords['stopword'].values)
# Segment the text into words and attach the category label.
def preprocess_text(content_lines, sentences, category):
    """Segment each line with jieba and append the result to ``sentences``.

    Tokens of length 1 and tokens contained in the module-level ``stopwords``
    collection are removed; the remaining tokens are joined with single
    spaces.

    Args:
        content_lines: Iterable of raw text lines.
        sentences: List that is extended in place with
            ``(segmented_text, category)`` tuples.
        category: Label stored with every line of ``content_lines``.

    Returns:
        None. Lines that cannot be segmented are printed together with the
        error and skipped.
    """
    # Build the lookup once per call so each membership test is a hash lookup.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
        except Exception as e:
            # Best effort: the original "except Exception and e" raised a
            # NameError here instead of skipping the offending line.
            print(line, e)
            continue
        kept = [w for w in segs if len(w) > 1 and w not in stopword_set]
        sentences.append((" ".join(kept), category))
# Segment every category and tag each line with its label.
sentences = []
for _lines, _label in (
    (sports, '体育'),
    (ent, '娱乐'),
    (edu, '教育'),
    (tech, '科技'),
    (money, '股票'),
):
    preprocess_text(_lines, sentences, _label)

# Shuffle so the categories are mixed before splitting.
random.shuffle(sentences)

# Separate texts from labels, then split into training and test sets.
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=1234
)
print(sentences)
可以看到打印出的数据集格式:每条数据由“分词并去除停用词后的文本 + 类别标签”组成
2.贝叶斯分类器训练模型,joblib保存模型
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import joblib
# Turn the segmented texts into a TF-IDF weighted term matrix.
count_vec = TfidfVectorizer()
train_feature = count_vec.fit_transform(x_train)

# Train the multinomial naive Bayes classifier.
clf = MultinomialNB(alpha=0.01)
clf.fit(train_feature, y_train)

# Measure the accuracy on the held-out test set.
test_feature = count_vec.transform(x_test)
predict_labels = clf.predict(test_feature)
scorce = metrics.accuracy_score(y_test, predict_labels)
print(scorce)

# Persist the classifier and the fitted vectorizer for later use.
joblib.dump(clf, 'bayes.pkl')
joblib.dump(count_vec, 'cout_vec.pkl')
打印出模型在测试集上的准确率
三、成果展示
使用Python自带的tkinter库制作UI界面
import joblib
import jieba
import tkinter
# Create the tkinter (Tk) main window and set its title and geometry string.
window = tkinter.Tk()
window.title('新闻文章分类器')
window.geometry('800x300')
# Filled on first use so the pickle files are read from disk only once.
_loaded_models = {}


def _load_models():
    """Return ``(classifier, vectorizer)``, loading both files on first use."""
    if not _loaded_models:
        # NOTE: joblib.load unpickles arbitrary objects; only load files that
        # were produced by the training step of this project.
        model = joblib.load('bayes.pkl')
        vectorizer = joblib.load('cout_vec.pkl')
        # Store both together so a failed load never leaves a partial cache.
        _loaded_models.update(model=model, vectorizer=vectorizer)
    return _loaded_models['model'], _loaded_models['vectorizer']


def result():
    """Classify the text in the entry box and display the predicted label.

    Reads the article from the ``e1`` entry widget, segments it with jieba,
    vectorizes it with the saved vectorizer and writes the label predicted by
    the saved classifier into the ``t`` text widget. The label is also
    printed. Empty input only clears the output box.
    """
    article = e1.get().strip()
    # Remove the previous prediction before showing a new one.
    t.delete('1.0', 'end')
    if not article:
        # Nothing to classify; an empty text would still yield some label.
        return
    model, cout_vec = _load_models()
    s = ' '.join(jieba.cut(article))
    predict_feature = cout_vec.transform([s])
    predict_label = model.predict(predict_feature)[0]
    print(predict_label)
    t.insert('insert', predict_label)  # show the result in the text box
# Prompt label shown above the input box.
l1 = tkinter.Label(window, text='请输入新闻文章:')
l1.pack()
# Entry widget that result() reads the article text from.
e1 = tkinter.Entry(window, width=100)
e1.pack()
# Button that calls result() when clicked.
b1 = tkinter.Button(window, text="预测", command=result)
b1.pack()
# Text widget that result() writes the predicted label into.
t = tkinter.Text(window,
state='normal', # state is either 'disabled' or 'normal'; 'normal' is the default
width=15, height=2
)
t.pack()
# Start the tkinter event loop.
window.mainloop()
输入文章:体育类
输入文章:娱乐类