爬虫报错: 'raise HTTPError(req.full_url, code, msg, hdrs, fp)' HTTPError
报错原因:网站存在反爬虫机制,爬虫未进行UA封装,拒绝了爬虫请求。
解决方法:将原先的urlopen('https://movie.douban.com/nowplaying/ankang/')进行UA封装,
替换为如下语句:
request = urllib.request.Request(url='https://movie.douban.com/nowplaying/ankang/', headers=headers)
response = urllib.request.urlopen(request)
html_data = response.read().decode('utf-8')
原代码:
# -- Setup: silence warnings, import libraries, configure matplotlib defaults --
import warnings

warnings.filterwarnings("ignore")

import re
import urllib.request  # explicit submodule import: bare `import urllib` does not guarantee urllib.request is available

import jieba  # Chinese word segmentation
import jieba.analyse
import numpy  # numerical computing
import matplotlib
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, STOPWORDS  # word-cloud rendering

# Default figure size (width, height) in inches for all plots.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
# -- Parse Douban's "now playing" page --
def getNowPlayingMovie_list():
    """Fetch Douban's now-playing page (Ankang) and return a list of
    {'id': subject_id, 'name': title} dicts, one per movie.

    Sends a browser User-Agent because the site rejects unidentified
    clients with an HTTPError.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    request = urllib.request.Request(url='https://movie.douban.com/nowplaying/ankang/', headers=headers)
    # `with` ensures the HTTP response is closed even if decoding fails.
    with urllib.request.urlopen(request) as response:
        html_data = response.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']
        # The <img> alt attribute carries the movie title.
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
# -- Fetch one page of short comments for a movie --
def getCommentsById(movieId, pageNum):
    """Fetch page `pageNum` (1-based) of short comments for movie `movieId`.

    Returns a list of comment strings; returns False when pageNum <= 0
    (kept for backward compatibility with existing callers).
    """
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20  # Douban paginates comments 20 per page
    else:
        return False
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    # Bug fix: the original called `request.urlopen(requrl)` — `request` was
    # undefined at module scope, and the bare URL carried no User-Agent, so
    # Douban answered with HTTPError. Wrap the URL in a UA-carrying Request.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    req = urllib.request.Request(url=requrl, headers=headers)
    with urllib.request.urlopen(req) as resp:
        html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comment_div_lits = soup.find_all('div', class_='comment')
    for item in comment_div_lits:
        # Each comment <div> holds its text in the first <p>'s <span>.
        p = item.find_all('p')[0]
        span = p.find('span')
        if span.string is not None:
            eachCommentList.append(span.string)
    return eachCommentList
def main():
    """Crawl the first 10 comment pages of the first now-playing movie,
    extract Chinese keywords via TextRank, filter stop words, and display
    the result as a word cloud."""
    # Collect the first 10 pages of comments for the first listed movie.
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print(NowPlayingMovie_list)  # e.g. [{'id': '27605698', 'name': '西虹市首富'}, ...]
    for i in range(10):  # first 10 pages
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)  # first movie only
        commentList.append(commentList_temp)
    # Flatten the per-page lists into a single string.
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()
    # Keep only CJK characters, dropping punctuation and Latin noise.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    # TextRank keyword extraction: top 50 keywords with weights.
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    print("删除停用词前", keywords)  # e.g. {'演员': 0.18, '大片': 0.28}
    # Stop-word set: wordcloud's built-ins plus a local Chinese stop-word file.
    stopwords = set(STOPWORDS)
    # Bug fix: the original never closed the file; `with` guarantees it.
    with open('./StopWords.txt', encoding="utf8") as f:
        while True:
            word = f.readline()
            if word == "":
                break
            stopwords.add(word[:-1])  # strip the trailing newline
    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print("\n删除停用词后", keywords)
    # Render the keyword weights as a word cloud.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80, stopwords=stopwords)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    plt.imshow(myword)  # show the word-cloud image
    plt.axis("off")
    plt.show()
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    main()