Python 爬虫:豆瓣 Top250(附源码与详细注释)

import requests
from bs4 import BeautifulSoup #导入BeautifulSoup库
# Douban applies anti-crawler measures, so every request carries a
# browser-like User-Agent header to look like an ordinary browser visit.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
}

# The Top 250 pages are addressed by a ``start`` offset that grows by 25 per
# page, so step through 0, 25, ..., 225 and rebuild the URL each time.
for start in range(0, 250, 25):
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        f"https://movie.douban.com/top250?start={start}",
        headers=headers,
        timeout=10,
    )
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently printing nothing.
    response.raise_for_status()

    # Parse the page with the built-in "html.parser" backend.
    soup = BeautifulSoup(response.text, "html.parser")

    # Every <span class="title"> on the page is a candidate title.
    for title_span in soup.find_all("span", attrs={"class": "title"}):
        title = title_span.string
        # ``.string`` is None when the span holds nested markup rather than a
        # single text node; skip those instead of raising TypeError below.
        if title is None:
            continue
        # Only print titles without "/" (presumably the spans containing "/"
        # hold alternate-language titles -- confirm against the page markup).
        if "/" not in title:
            print(title)
  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
你好!以下是Python爬虫豆瓣Top250电影的详细代码:

```python
import requests
from bs4 import BeautifulSoup

header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

def get_movie_info(url):
    html = requests.get(url, headers=header).text
    soup = BeautifulSoup(html, 'html.parser')
    # 获取电影名称
    movie_name = soup.find('span', attrs={'property': 'v:itemreviewed'}).get_text()
    # 获取电影评分
    movie_score = soup.find('strong', class_='ll rating_num').get_text()
    # 获取电影导演、演员列表
    credits = soup.find('div', class_='subject clearfix').find_all('a')
    directors = []
    actors = []
    for credit in credits:
        if 'rel' in credit.attrs and credit.attrs['rel'][0] == 'v:directedBy':
            directors.append(credit.get_text())
        elif 'rel' in credit.attrs and credit.attrs['rel'][0] == 'v:starring':
            actors.append(credit.get_text())
    # 获取电影简介
    movie_intro = soup.find('div', class_='indent', id='link-report').span.get_text().strip().replace('\n', '')
    # 获取电影其他信息
    movie_infos = soup.find('div', id='info').find_all('span')
    movie_country = ''
    movie_language = ''
    movie_runtime = ''
    for movie_info in movie_infos:
        if '制片国家/地区:' in movie_info.get_text():
            movie_country = movie_info.next_sibling.strip().replace('\n', '')
        elif '语言:' in movie_info.get_text():
            movie_language = movie_info.next_sibling.strip().replace('\n', '')
        elif '上映日期:' in movie_info.get_text():
            movie_runtime = movie_info.next_sibling.strip().replace('\n', '')
    # 输出电影信息
    print('电影名称:', movie_name)
    print('电影评分:', movie_score)
    print('导演:', directors)
    print('演员:', actors)
    print('制片国家/地区:', movie_country)
    print('语言:', movie_language)
    print('上映日期:', movie_runtime)
    print('电影简介:', movie_intro)
    print('\n')

def main():
    start = 0
    urls = ['https://movie.douban.com/top250?start=' + str(start) for start in range(0, 250, 25)]
    for url in urls:
        html = requests.get(url, headers=header).text
        soup = BeautifulSoup(html, 'html.parser')
        movie_hrefs = soup.find_all('div', class_='hd')
        for movie_href in movie_hrefs:
            movie_url = movie_href.a['href']
            get_movie_info(movie_url)

if __name__ == "__main__":
    main()
```

请注意,这只是简单的代码示例,如果您要爬取豆瓣Top250电影,请了解相关法律法规,遵守爬虫道德规范,尊重网站规则。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值