BeautifulSoup爬取虎扑评论并进行词云分析

1 篇文章 0 订阅

### BeautifulSoup 爬取虎扑评论

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-10-24 20:36:15
# @Author  : awakeljw
# @Link    : http://blog.csdn.net/awakeljw/
# @Version : $Id$

import os
import re
from bs4 import BeautifulSoup
import urllib.request

# Scrape one Hupu BBS thread: collect thread links from the NBA front page,
# read the thread's page count, then save the cleaned comment text of every
# page to paper.txt.

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
HEADERS = {"User-Agent": user_agent}


def fetch(url):
    """Download *url* with a browser User-Agent and return the raw bytes."""
    req = urllib.request.Request(url, headers=HEADERS)
    return urllib.request.urlopen(req).read()


# --- Collect thread links from the forum list on the front page. ---
soup = BeautifulSoup(fetch('https://nba.hupu.com'), 'lxml')
forumlist = soup.find('div', class_="forumList")
link_store = []
text_store = []

# Navigation/vote links to skip.  NOTE: the trailing space in the last entry
# is intentional — it matches the href exactly as the site emits it.
skip_links = {'https://bbs.hupu.com/vote',
              'https://bbs.hupu.com/all-nba',
              'https://bbs.hupu.com/nba '}
for link in forumlist.find_all('a'):
    href = link.get('href')
    if href not in skip_links:
        link_store.append(href)
        text_store.append(link.text.strip())

# --- Read the thread's page count from the inline JS "pageCount:N,//". ---
number_test = 10  # index of the thread to scrape
html = fetch(link_store[number_test]).decode('utf-8')
pattern = re.compile(r'pageCount:(.*?),//')
number = int(''.join(re.findall(pattern, html)))
print(number)

# --- Build the URL of every page: page 1 is foo.html, page i is foo-i.html. ---
all_link = [link_store[number_test]]
base = link_store[number_test].replace('.html', '')
for i in range(2, number + 1):
    all_link.append('{0}-{1}.html'.format(base, i))

# Punctuation, pronouns and forum boilerplate ("sent from Hupu client" etc.)
# stripped character-by-character from each comment before saving.
delete = u':,。?!.@发自虎扑Android客户赞赏我他们个让有就虎扑不在的和了也是你说了他发表楼都你发自会引用客户端iPhoneAndroid而但去被又人就是啊很能上不会当然所以\n'
strip_table = str.maketrans('', '', delete)  # one C-level pass per comment

# Open the output file once (the original re-opened it per page and only
# closed the last handle); `with` guarantees it is flushed and closed.
with open('paper.txt', 'a') as outfile:
    for url in all_link:
        print(url)
        soup = BeautifulSoup(fetch(url), 'lxml')
        for cell in soup.find_all('td'):
            # Collapse all whitespace, then drop the stop characters.
            cleaned = ''.join(cell.text.split()).translate(strip_table)
            outfile.write(cleaned)
            outfile.write(os.linesep)

### 词云分析

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-10-18 17:52:25
# @Author  : awakeljw (liujw15@mails.tsinghua.edu.cn)
# @Link    : http://blog.csdn.net/awakeljw/
# @Version : $Id$


from wordcloud import WordCloud
import jieba
import PIL
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import matplotlib
from pylab import mpl
# Use the SentyTang font so Chinese axis labels and titles render instead of
# falling back to missing-glyph boxes.
mpl.rcParams['font.sans-serif'] = ['SentyTang'] # default font for CJK text
mpl.rcParams['axes.unicode_minus'] = False  # keep '-' renderable with a CJK font
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)
def wordcloudplot(txt):
    """Render *txt* as a word cloud shaped by a mask image.

    The cloud is drawn with the SentyTang font, masked by 246.jpg, written
    to output.jpg, and finally displayed in a matplotlib window.
    """
    font_file = 'I:/tensorflow/ciyun/SentyTang.ttf'
    mask_image = np.array(PIL.Image.open('I:/tensorflow/ciyun/246.jpg'))
    cloud = WordCloud(
        font_path=font_file,
        background_color="white",
        margin=5,
        width=1800,
        height=800,
        mask=mask_image,
        max_words=2000,
        max_font_size=60,
        random_state=42,
    ).generate(txt)
    cloud.to_file('I:/tensorflow/ciyun/output.jpg')
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()

rem = ':,。?!.@我他们一个让有就虎扑不在的和了也是你说了他发表楼都你发自会\n引用客户端iPhoneAndroid而但去被又人就是啊很能上不会当然所以'
def main():
    """Tokenize the scraped comments and visualize them.

    Produces three artifacts: a horizontal bar chart of the top keyword
    frequencies (tongji.png), a polar "pie" chart of the same data
    (pie_polar.png), and a word cloud (via wordcloudplot).
    """
    # Close the input file deterministically (the original leaked the handle).
    with open(r'I:\tensorflow\ciyun\paper1.txt', 'r', encoding='gb18030') as fh:
        text = fh.read()
    words = list(jieba.cut(text))

    # Top-60 tokens by frequency, minus anything on the `rem` stop list.
    counts = dict(Counter(words).most_common(60))
    for token in list(counts.keys()):  # iterate a copy: we pop while looping
        if token in rem:
            counts.pop(token)
    print(counts)

    labels = list(counts.keys())
    y = list(counts.values())
    idx = np.arange(len(y))

    # --- Horizontal bar chart of keyword frequencies. ---
    plt.barh(idx, y)
    plt.yticks(idx + 0.4, labels)
    plt.xlabel('出现次数', fontsize=20, labelpad=5)
    plt.ylabel('关键词', fontsize=20, labelpad=5)
    plt.title('涡流发生器对激波串振荡的控制', fontsize=25)
    plt.savefig('I:/tensorflow/ciyun/tongji.png')

    # --- Polar "pie" chart of the same frequencies. ---
    # Start a fresh figure: the original created the polar axes inside the
    # bar-chart figure, corrupting both plots.
    plt.figure()
    N = len(counts)
    theta = np.arange(0.0, 2 * np.pi, 2 * np.pi / N)
    width = np.pi / 6
    ax = plt.subplot(111, projection='polar')
    bars = ax.bar(theta, y, width=width, bottom=0.0)
    plt.xticks(theta + np.pi / 12, labels)
    for r, bar in zip(y, bars):
        bar.set_facecolor(plt.cm.viridis(r / 10.))
        bar.set_alpha(0.5)
    plt.savefig('I:/tensorflow/ciyun/pie_polar.png')
    plt.show()

    # --- Word cloud over all tokens longer than one character. ---
    long_words = [w for w in words if len(w) > 1]
    wordcloudplot(' '.join(long_words))

if __name__=='__main__':
    main()

这里写图片描述
这里写图片描述
祝愿书豪早日康复。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值