###BeautifulSoup爬取虎扑评论
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-10-24 20:36:15
# @Author : awakeljw
# @Link : http://blog.csdn.net/awakeljw/
# @Version : $Id$
import os
import re
from bs4 import BeautifulSoup
import urllib.request
# Download the Hupu NBA landing page, presenting a desktop Chrome User-Agent.
url = "https://nba.hupu.com"
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/61.0.3163.100 Safari/537.36"
)
HEADERS = {"User-Agent": user_agent}
req = urllib.request.Request(url, headers=HEADERS)
page = urllib.request.urlopen(req).read()

# Only anchors inside the <div class="forumList"> element are collected.
soup = BeautifulSoup(page, "lxml")
forumlist = soup.find("div", class_="forumList")

# hrefs that are skipped while collecting links.
# NOTE(review): the last entry ends with a space, so it only matches an href
# carrying that same trailing space, and the "vote" entry is listed twice --
# confirm whether either is intentional.
delete = [
    "https://bbs.hupu.com/vote",
    "https://bbs.hupu.com/all-nba",
    "https://bbs.hupu.com/vote",
    "https://bbs.hupu.com/nba ",
]

# Parallel lists: link_store[i] is the href and text_store[i] the stripped
# anchor text of the same <a> element.
link_store = []
text_store = []
for link in forumlist.find_all(["a"]):
    href = link.get("href")
    if href in delete:
        continue
    link_store.append(href)
    text_store.append(link.text.strip())
# Index into link_store of the thread whose comments are scraped.
number_test = 10
url = link_store[number_test]
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
HEADERS = {"User-Agent": user_agent}
req = urllib.request.Request(url, headers=HEADERS)
page = urllib.request.urlopen(req).read()

# The number of pages is read from a "pageCount:<n>,//" fragment embedded in
# the page source.
pattern = re.compile(r'pageCount:(.*?),//')
# Only the digits of the page count are needed, so a stray undecodable byte
# elsewhere in the page must not abort the run.
html = page.decode('utf-8', errors='replace')
# Use the first occurrence only: joining every match would concatenate the
# digits of repeated fragments (e.g. "10" + "10" -> 1010).
match = pattern.search(html)
# Assumes a page without the fragment is a single-page thread -- TODO confirm.
# A non-numeric capture still raises ValueError from int().
number = int(match.group(1)) if match else 1
print(number)

# Page 1 is the thread URL itself; page i (i >= 2) is "<url minus .html>-<i>.html".
all_link = [url]
all_link.extend(
    url.replace('.html', '') + '-' + str(i) + '.html'
    for i in range(2, number + 1)
)
# Characters removed from every scraped table cell (punctuation plus the
# individual characters of common filler words and client signatures).
delete = u':,。?!.@发自虎扑Android客户赞赏我他们个让有就虎扑不在的和了也是你说了他发表楼都你发自会引用客户端iPhoneAndroid而但去被又人就是啊很能上不会当然所以\n'
# Build the deletion table once so each cell is cleaned in a single pass
# instead of one str.replace() call per character.
delete_table = str.maketrans('', '', delete)

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
HEADERS = {"User-Agent": user_agent}

# Open the output once for the whole crawl; the context manager closes it even
# if a request fails part-way through.
# gb18030 can encode every Unicode character, so a write no longer fails on
# text the platform default codec cannot represent (the likely reason for the
# former bare "except: break"). The analysis code later in this file reads its
# corpus as gb18030; presumably that corpus is derived from this output --
# confirm.
with open('paper.txt', 'a', encoding='gb18030', errors='replace') as file:
    for url in all_link:
        print(url)
        req = urllib.request.Request(url, headers=HEADERS)
        page = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(page, 'lxml')
        for t in soup.find_all('td'):
            # Collapse all whitespace, then drop the unwanted characters.
            temp = ''.join(t.text.split()).translate(delete_table)
            # Text-mode files translate '\n' to the platform line ending;
            # writing os.linesep here would produce '\r\r\n' on Windows.
            file.write(temp + '\n')
###词云分析
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2017-10-18 17:52:25
# @Author : awakeljw (liujw15@mails.tsinghua.edu.cn)
# @Link : http://blog.csdn.net/awakeljw/
# @Version : $Id$
from wordcloud import WordCloud
import jieba
import PIL
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import matplotlib
from pylab import mpl
# Plot defaults: use the SentyTang typeface as the default sans-serif font
# (so the Chinese labels can be drawn) and turn off the Unicode minus sign.
mpl.rcParams.update({
    'font.sans-serif': ['SentyTang'],
    'axes.unicode_minus': False,
})
# Same tick-label size on both axes.
for tick_group in ('xtick', 'ytick'):
    matplotlib.rc(tick_group, labelsize=14)
def wordcloudplot(txt,
                  font_path='I:/tensorflow/ciyun/SentyTang.ttf',
                  mask_path='I:/tensorflow/ciyun/246.jpg',
                  output_path='I:/tensorflow/ciyun/output.jpg'):
    """Render a word cloud for *txt*, save it to disk and display it.

    Args:
        txt: Text to visualise; it is passed unchanged to
            ``WordCloud.generate``.
        font_path: Font file used to draw the words.
        mask_path: Image whose pixel array is handed to ``WordCloud`` as the
            ``mask`` argument.
        output_path: Destination file for the rendered cloud.

    The defaults reproduce the paths that were previously hard-coded, so
    existing ``wordcloudplot(txt)`` calls behave as before.
    """
    # ``import PIL`` alone does not load the ``PIL.Image`` submodule; the
    # attribute only exists if some other package imported it first. Import it
    # explicitly so the function does not depend on that side effect.
    from PIL import Image

    # Load the mask into an array and release the image file immediately.
    with Image.open(mask_path) as mask_image:
        alice_mask = np.array(mask_image)

    wordcloud = WordCloud(
        font_path=font_path,
        background_color="white",
        margin=5,
        width=1800,
        height=800,
        mask=alice_mask,
        max_words=2000,
        max_font_size=60,
        random_state=42,  # fixed seed so repeated runs give the same layout
    )
    wordcloud = wordcloud.generate(txt)
    wordcloud.to_file(output_path)

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
# Stop-word material used by main(): a token is dropped from the frequency
# table when it occurs as a substring of this string (the test is `token in rem`).
rem = ':,。?!.@我他们一个让有就虎扑不在的和了也是你说了他发表楼都你发自会\n引用客户端iPhoneAndroid而但去被又人就是啊很能上不会当然所以'
def main():
    """Segment the scraped text, chart the most frequent tokens, draw a word cloud.

    Steps:
      1. Read the corpus (gb18030) and segment it with jieba.
      2. Take the 60 most common tokens and drop those found in ``rem``.
      3. Save a horizontal bar chart and a polar bar chart of the counts.
      4. Build a word cloud from every token longer than one character.
    """
    # Close the corpus file as soon as it has been read.
    with open(r'I:\tensorflow\ciyun\paper1.txt', 'r', encoding='gb18030') as f:
        text = f.read()
    words = list(jieba.cut(text))

    tongji = Counter(words).most_common(60)
    # `key in rem` is a substring test against the stop-word string.
    d = {key: value for key, value in tongji if key not in rem}
    print(d)

    label = list(d.keys())
    y = list(d.values())
    idx = np.arange(len(y))

    # Horizontal bar chart. The bars are anchored at their edge explicitly so
    # the tick offset of 0.4 (half the default bar height of 0.8) lands on the
    # bar centre regardless of the library's default alignment.
    plt.barh(idx, y, align='edge')
    plt.yticks(idx + 0.4, label)
    plt.xlabel('出现次数', fontsize=20, labelpad=5)
    plt.ylabel('关键词', fontsize=20, labelpad=5)
    plt.title('涡流发生器对激波串振荡的控制', fontsize=25)
    plt.savefig('I:/tensorflow/ciyun/tongji.png')

    # Start from an empty figure so the bar chart saved above is not left
    # underneath the polar chart.
    plt.clf()

    # Bar chart on a polar axis.
    N = len(d)
    # linspace yields exactly N angles (a float-step arange can yield N + 1)
    # and does not divide by zero when no token survived the filter.
    theta = np.linspace(0.0, 2 * np.pi, N, endpoint=False)
    radii = y
    width = np.pi / 6
    ax = plt.subplot(111, projection='polar')
    # Edge-anchored for the same reason as above: the tick offset of pi/12 is
    # half the bar width.
    bars = ax.bar(theta, radii, width=width, bottom=0.0, align='edge')
    plt.xticks(theta + np.pi / 12, label)
    for r, bar in zip(radii, bars):
        bar.set_facecolor(plt.cm.viridis(r / 10.))
        bar.set_alpha(0.5)
    plt.savefig('I:/tensorflow/ciyun/pie_polar.png')
    plt.show()

    # Single-character tokens are left out of the word cloud.
    txt = ' '.join(word for word in words if len(word) > 1)
    wordcloudplot(txt)
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
# 祝愿书豪早日康复。 (Wishing Shuhao a speedy recovery.)