爬虫报错: 'raise HTTPError(req.full_url, code, msg, hdrs, fp)' HTTPError
报错原因:网站存在反爬虫机制,爬虫未进行UA封装,拒绝了爬虫请求。
解决方法:将原先的urlopen('https://movie.douban.com/nowplaying/ankang/')进行UA封装,
替换为如下语句:
request = urllib.request.Request(url='https://movie.douban.com/nowplaying/ankang/', headers=headers)
response = urllib.request.urlopen(request)
html_data = response.read().decode('utf-8')
原代码:
# -- Setup: silence warnings, import libraries, configure matplotlib defaults --
import warnings

warnings.filterwarnings("ignore")

import re
import urllib.request  # explicit submodule import: bare `import urllib` does not guarantee urllib.request is available

import jieba  # Chinese word segmentation
import jieba.analyse
import numpy  # numerical computing
import matplotlib
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, STOPWORDS  # word-cloud rendering

# Default figure size (width, height) in inches for all plots.
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
# -- Parse Douban's "now playing" page --
def getNowPlayingMovie_list():
    """Fetch Douban's now-playing page (Ankang) and return a list of
    {'id': subject_id, 'name': title} dicts, one per movie.

    Sends a browser User-Agent because the site rejects unidentified
    clients with an HTTPError.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    request = urllib.request.Request(url='https://movie.douban.com/nowplaying/ankang/', headers=headers)
    # `with` ensures the HTTP response is closed even if decoding fails.
    with urllib.request.urlopen(request) as response:
        html_data = response.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    nowplaying_movie = soup.find_all('div', id='nowplaying')
    nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
    nowplaying_list = []
    for item in nowplaying_movie_list:
        nowplaying_dict = {}
        nowplaying_dict['id'] = item['data-subject']
        # The <img> alt attribute carries the movie title.
        for tag_img_item in item.find_all('img'):
            nowplaying_dict['name'] = tag_img_item['alt']
        nowplaying_list.append(nowplaying_dict)
    return nowplaying_list
# -- Fetch one page of short comments for a movie --
def getCommentsById(movieId, pageNum):
    """Fetch page `pageNum` (1-based) of short comments for movie `movieId`.

    Returns a list of comment strings; returns False when pageNum <= 0
    (kept for backward compatibility with existing callers).
    """
    eachCommentList = []
    if pageNum > 0:
        start = (pageNum - 1) * 20  # Douban paginates comments 20 per page
    else:
        return False
    requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' + '?' + 'start=' + str(start) + '&limit=20'
    print(requrl)
    # Bug fix: the original called `request.urlopen(requrl)` — `request` was
    # undefined at module scope, and the bare URL carried no User-Agent, so
    # Douban answered with HTTPError. Wrap the URL in a UA-carrying Request.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    req = urllib.request.Request(url=requrl, headers=headers)
    with urllib.request.urlopen(req) as resp:
        html_data = resp.read().decode('utf-8')
    soup = bs(html_data, 'html.parser')
    comment_div_lits = soup.find_all('div', class_='comment')
    for item in comment_div_lits:
        # Each comment <div> holds its text in the first <p>'s <span>.
        p = item.find_all('p')[0]
        span = p.find('span')
        if span.string is not None:
            eachCommentList.append(span.string)
    return eachCommentList
def main():
    """Crawl the first 10 comment pages of the first now-playing movie,
    extract Chinese keywords via TextRank, filter stop words, and display
    the result as a word cloud."""
    # Collect the first 10 pages of comments for the first listed movie.
    commentList = []
    NowPlayingMovie_list = getNowPlayingMovie_list()
    print(NowPlayingMovie_list)  # e.g. [{'id': '27605698', 'name': '西虹市首富'}, ...]
    for i in range(10):  # first 10 pages
        num = i + 1
        commentList_temp = getCommentsById(NowPlayingMovie_list[0]['id'], num)  # first movie only
        commentList.append(commentList_temp)
    # Flatten the per-page lists into a single string.
    comments = ''
    for k in range(len(commentList)):
        comments = comments + (str(commentList[k])).strip()
    # Keep only CJK characters, dropping punctuation and Latin noise.
    pattern = re.compile(r'[\u4e00-\u9fa5]+')
    filterdata = re.findall(pattern, comments)
    cleaned_comments = ''.join(filterdata)
    # TextRank keyword extraction: top 50 keywords with weights.
    result = jieba.analyse.textrank(cleaned_comments, topK=50, withWeight=True)
    keywords = dict()
    for i in result:
        keywords[i[0]] = i[1]
    print("删除停用词前", keywords)  # e.g. {'演员': 0.18, '大片': 0.28}
    # Stop-word set: wordcloud's built-ins plus a local Chinese stop-word file.
    stopwords = set(STOPWORDS)
    # Bug fix: the original never closed the file; `with` guarantees it.
    with open('./StopWords.txt', encoding="utf8") as f:
        while True:
            word = f.readline()
            if word == "":
                break
            stopwords.add(word[:-1])  # strip the trailing newline
    keywords = {x: keywords[x] for x in keywords if x not in stopwords}
    print("\n删除停用词后", keywords)
    # Render the keyword weights as a word cloud.
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white",
                          max_font_size=80, stopwords=stopwords)
    word_frequence = keywords
    myword = wordcloud.fit_words(word_frequence)
    plt.imshow(myword)  # show the word-cloud image
    plt.axis("off")
    plt.show()
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    main()