# 本人奥迷一枚,新学爬虫,爬取贴吧评论,并进行简单文本分析
# (An Ultraman fan, new to web scraping: crawl Tieba comments and run a simple text analysis.)
# 加载所需包 (load the required packages)
import xlwt
import csv
import codecs
import wordcloud
import jieba
import requests
import os
import re
import sklearn
import imageio
import csv
from bs4 import BeautifulSoup
import bs4
from wordcloud import WordCloud, STOPWORDS
from imageio import imread
from sklearn.feature_extraction.text import CountVectorizer
# 爬取并保存 (crawl and save)
# --- Crawl ----------------------------------------------------------------
# Scrape the Ultraman (奥特曼) Baidu Tieba listing pages and collect the
# Chinese text of every <div> node into `box`.
box = []      # accumulated Chinese snippets from every page
depth = 200   # crawl depth: number of listing pages (50 threads per page)

# Matches one CJK unified ideograph; compiled once instead of once per page.
_chinese = re.compile(r'[\u4e00-\u9fa5]')

for page in range(depth):
    # Tieba paginates with pn=0,50,100,...; kw is the URL-encoded forum name.
    url = ('https://tieba.baidu.com/f?kw=%E5%A5%A5%E7%89%B9%E6%9B%BC'
           '&ie=utf-8&pn=' + str(page * 50))
    r = requests.get(url, timeout=30)
    r.raise_for_status()   # fail fast on HTTP errors instead of parsing junk
    soup = BeautifulSoup(r.text, 'html.parser')

    for div in soup.find_all('div'):   # the comments all sit in <div> nodes
        text = div.string              # None when the div has child elements
        if not text:
            continue
        # Drop all whitespace (incl. \xa0), then keep only the Chinese
        # characters joined back together.
        cleaned = ''.join(_chinese.findall(''.join(text.split())))
        if cleaned:                    # skip snippets with no Chinese at all
            box.append(cleaned)
#保存文件
def text_save(filename, data):
    """Append each item of *data* to *filename*, terminated by '。'.

    Each item is stringified with any literal '[' and ']' characters
    removed, and a Chinese full stop is appended so the file can later be
    split back apart on '。'. The file is opened in append mode with the
    platform default encoding (the later read of this file also uses the
    default, so writer and reader agree).
    """
    # 'with' guarantees the handle is closed even if a write raises;
    # the original left the file open on error.
    with open(filename, 'a') as fh:
        for item in data:
            fh.write(str(item).replace('[', '').replace(']', '') + '。')
    print("保存文件成功")  # "file saved successfully"
# Persist the raw crawl, then re-read it and keep only "real" comments:
# longer than 5 characters, shorter than 370, and not the sign-in ("签到")
# or paid-membership ("超级会员") boilerplate that Tieba injects into pages.
# (The original header said "delete length < 4"; the code keeps len > 5 —
# preserved as written.)
text_save('C://Users//胡银洪//Desktop//picture//ultrman.txt', box)

# NOTE(review): the read uses the platform default encoding, matching how
# text_save wrote the file.
with open('C://Users//胡银洪//Desktop//picture//ultrman.txt') as f:
    contents = f.read()
print("contents变量的类型:", type(contents))

new_content = ''
for comment in contents.split('。'):
    if 5 < len(comment) < 370 and '签到' not in comment and '超级会员' not in comment:
        new_content = new_content + comment + '。'

# Save the filtered text; the later steps (jieba / word cloud) read
# new_content from memory, so this file is a by-product for inspection.
with open('C://Users//胡银洪//Desktop//picture//finish.txt', 'w',
          encoding='utf-8') as fh:
    fh.write(new_content)
# 绘制词云图并统计词频 (draw the word cloud and count word frequencies)
# --- Word cloud -----------------------------------------------------------
# Segment the filtered text with jieba (precise mode, cut_all=False) and
# render a word cloud shaped by the ult.png mask image.
contents_cut = jieba.cut(new_content, cut_all=False)  # precise, not full, mode
print("contents_cut变量的类型:", type(contents_cut))
# print("【全模式】:" + "/ ".join(contents_cut))  # full-mode preview (disabled)
contents_list = " ".join(contents_cut)  # WordCloud expects space-separated tokens
print("contents_list变量的类型:", type(contents_list))
#print(contents_list)

mask_image = imread('C://Users//胡银洪//Desktop//picture//ult.png', pilmode="RGB")
wc = WordCloud(
    collocations=False,        # count each token individually, no bigrams
    background_color="white",
    min_font_size=2,
    max_words=800,
    relative_scaling=0.2,
    # A CJK-capable font is required, otherwise Chinese renders as boxes.
    # (.ttf was changed to .ttc for this font.)
    font_path=r"C:\Users\胡银洪\Desktop\picture\msyh.ttc",
    width=800,
    height=400,
    random_state=70,
    mask=mask_image,
)
wc.generate(contents_list)
wc.to_file("C://Users//胡银洪//Desktop//picture//ciyun.png")
#plt.imshow(wc)
#plt.show()
# --- Word frequencies -----------------------------------------------------
# Count token frequencies over the single segmented document with
# CountVectorizer and dump (word, count) pairs to a CSV file.
cv = CountVectorizer()
contents_count = cv.fit_transform([contents_list])  # corpus of one document
try:
    # scikit-learn >= 1.0; get_feature_names() was removed in 1.2.
    vocabulary = cv.get_feature_names_out()
except AttributeError:
    vocabulary = cv.get_feature_names()  # fallback for old scikit-learn
# Frequency row for the single document, aligned with `vocabulary`.
frequencies = contents_count.toarray().tolist()[0]
contents_dict = dict(zip(vocabulary, frequencies))
# newline="" stops the csv module from emitting blank rows on Windows.
with open("C://Users//胡银洪//Desktop//picture//ultra_output.csv", 'w',
          newline="") as f:
    writer = csv.writer(f)
    for key, value in contents_dict.items():
        writer.writerow([key, value])
# 结果 (results)