豆瓣电影排行榜爬虫-CSDN博客

本文链接：https://blog.csdn.net/weixin_45938063/article/details/138316267

豆瓣排行榜爬虫

利用正则表达式匹配目标字符来对数据进行爬取

# @FileName  :06利用re爬取豆瓣top250.py
# @Time      :2024/4/29 9:19
import random
import re
import requests
import csv
# Douban Top 250 scraper: downloads every ranking page, extracts the movie
# fields with a single regular expression and appends them as rows to a CSV.

url = "https://movie.douban.com/top250?start="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1660.57"
}
proxies = [
    {'http': '42.63.65.37:80'},
    {'http': '42.63.65.13:80'},
    {'http': '42.63.65.15:80'},
    {'http': '42.63.65.7:80'},
    {'http': '42.63.65.8:80'},
    {'http': '42.63.65.9:80'},
    {'http': '39.173.106.248:80'},
    {'http': '39.173.106.249:80'},
]

# NOTE(review): every entry maps only the "http" scheme while `url` is https;
# requests picks the proxy by URL scheme, so these proxies are presumably not
# applied to the actual requests -- confirm before relying on them.
proxy = random.choice(proxies)

# Compiled once at import time instead of once per page.
# re.S lets "." match newlines, which is required because one movie entry
# spans many lines of HTML.
moviestitle = re.compile(
    r'<div class="item">.*?<span class="title">(?P<moviename>.*?)</span>.*?<p class="">.*?导演:(?P<editor>.*?)&nbsp;.*?主演:(?P<actor>.*?)<br>.*?(?P<year>.*?)&nbsp;.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?<span>(?P<numbersofevaluators>.*?)</span>',
    re.S)

CSV_PATH = '../others/doubantop250.csv'
PAGE_SIZE = 25
LAST_PAGE_START = 225
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the script


def parse_movies(content):
    """Extract one row per movie from the HTML of a ranking page.

    Args:
        content: HTML text of a single Top 250 page.

    Returns:
        A list of rows ``[name, director, actors, year, score, rating_count]``.
        Every value is the raw text captured by ``moviestitle``; only the
        year is stripped, because its capture starts with the newline and
        indentation that follow ``<br>``. An input without matches yields
        an empty list.
    """
    rows = []
    for item in moviestitle.finditer(content):
        rows.append([
            item.group("moviename"),
            item.group("editor"),
            item.group("actor"),
            item.group("year").strip(),
            item.group("score"),
            item.group("numbersofevaluators"),
        ])
    return rows


def fetch_page(start):
    """Download the ranking page whose first entry has offset ``start``.

    Args:
        start: Value appended to ``url`` as the ``start`` query parameter.

    Returns:
        The decoded body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (for example an anti-crawler block page), so that an error page
            is not silently parsed into zero rows.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url=url + str(start), headers=headers,
                            proxies=proxy, timeout=REQUEST_TIMEOUT)
    try:
        response.raise_for_status()
        return response.text
    finally:
        # Close every response, not only the last one of the run.
        response.close()


def main():
    """Scrape all pages and append the extracted rows to ``CSV_PATH``."""
    # The context manager closes the file even when a request or the parsing
    # fails part-way; rows written so far are kept.
    with open(CSV_PATH, 'a+', encoding='utf-8', newline='') as fp:
        writer = csv.writer(fp)
        for start in range(0, LAST_PAGE_START + 1, PAGE_SIZE):
            for movieinfo in parse_movies(fetch_page(start)):
                print(*movieinfo)
                writer.writerow(movieinfo)


if __name__ == "__main__":
    main()