Crawler error: 'raise HTTPError(req.full_url, code, msg, hdrs, fp)' HTTPError

Cause: the site has an anti-scraping mechanism. The crawler did not wrap its request with a User-Agent (UA) header, so the server rejected the request.
Fix: wrap the original urlopen('https://movie.douban.com/nowplaying/ankang/') call with a UA header,
replacing it with the following statements:
request = urllib.request.Request(url='https://movie.douban.com/nowplaying/ankang/', headers=headers)
response = urllib.request.urlopen(request)
html_data = response.read().decode('utf-8')
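
To confirm the diagnosis, a minimal sketch (reusing the URL and UA string from the code below) shows the bare request being rejected while the header-carrying one returns normally:

import urllib.request
import urllib.error

url = 'https://movie.douban.com/nowplaying/ankang/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

# Without a User-Agent, douban rejects the request and urlopen raises HTTPError.
try:
    urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
    print('bare request rejected:', e.code, e.reason)

# The same URL fetched through a Request that carries the header succeeds.
request = urllib.request.Request(url=url, headers=headers)
print('with UA header:', urllib.request.urlopen(request).getcode())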

Original code:
import warnings
warnings.filterwarnings("ignore")
import jieba            # Chinese word segmentation
import jieba.analyse
import numpy            # numerical computing
import re
import matplotlib.pyplot as plt
import urllib.request   # import the request submodule explicitly
from bs4 import BeautifulSoup as bs
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud, STOPWORDS   # word-cloud generation

# Shared User-Agent header; douban rejects requests that do not carry one
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}

# Parse the "now playing" page and return a list of {'id', 'name'} dicts
def getNowPlayingMovie_list():
    request = urllib.request.Request(url='https://movie.douban.com/nowplaying/ankang/', headers=headers)
    response = urllib.request.urlopen(request)
    html_data = response.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)   # one entry per movie
    return nowplaying_list

# Fetch one page of short comments; arguments are the movie id and the page number
def getCommentsById(movieId, pageNum):
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20   # douban serves 20 comments per page
    else:
        return False
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    req = urllib.request.Request(url=requrl, headers=headers)   # the comments pages need the UA header as well
    resp = urllib.request.urlopen(req)
    html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comment_div_list = soup.find_all('div', class_='comment')
    for item in comment_div_list:
        p = item.find_all('p')[0]
        span = p.find('span')
        if span.string is not None:
            eachCommentList.append(span.string)

    return eachCommentList

def main():
    # Collect the first 10 pages of comments for the first movie in the list
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print(NowPlayingMovie_list)   # e.g. [{'id': '27605698', 'name': '西虹市首富'}, {'id': '25882296', 'name': '狄仁杰之四大天王'}]
    for i in range(10):   # first 10 pages
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)   # pick the first movie
        commentList.append(commentList_temp)

    # Flatten the per-page comment lists into one string
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()

    # Keep only Chinese characters, which also drops all punctuation
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)

    # Extract weighted keywords with jieba's TextRank algorithm
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    print("before stopword removal", keywords)   # e.g. {'演员': 0.18290354231824632, '大片': 0.2876433001472282}

    # Stopword set: wordcloud's built-in English list plus a local Chinese list
    stopwords = set(STOPWORDS)
    with open('./StopWords.txt', encoding="utf8") as f:
        for word in f:
            stopwords.add(word.strip())

    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print("\nafter stopword removal", keywords)

    # Render the keywords as a word cloud
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80, stopwords=stopwords)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    plt.imshow(myword)   # show the word-cloud image
    plt.axis("off")
    plt.show()

# entry point
main()
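
Since every request to douban needs the same header, an alternative to building a Request object at each call site is to install a global opener once. This is a sketch of the same UA fix, not part of the original post:

import urllib.request

# Install a global opener whose headers ride along with every urlopen() call.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36')]
urllib.request.install_opener(opener)

# Plain urlopen() calls now carry the User-Agent automatically.
html = urllib.request.urlopen('https://movie.douban.com/nowplaying/ankang/').read().decode('utf-8')

Note that running the script also requires a local StopWords.txt (one stopword per line) and the simhei.ttf font, without which WordCloud cannot draw Chinese glyphs.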
