day17作业

day17作业

爬取豆瓣电影 Top250 的影片信息并保存为 CSV 文件

import csv
import os
import re

import requests
from bs4 import BeautifulSoup

# Scrape the Douban "top250" movie list pages and write one CSV row per movie.

# Browser-like User-Agent header sent with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
PAGE_COUNT = 10        # number of list pages to fetch
PAGE_SIZE = 25         # step of the `start` query parameter between pages
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled request blocks forever
OUTPUT_PATH = 'files/data.csv'
FIELDNAMES = ['电影名', '评分', '评分情况', '影评', '播出年份', '拍摄国家', '影片类型']

# Patterns applied to the serialized first <p> of each '.info>.bd' element.
# Compiled once here instead of on every loop iteration.
YEAR_RE = re.compile(r'<br/>\s*(\d{4}).*?/.*?/')
COUNTRY_RE = re.compile(r'<br/>\s*\d{4}.*?/\xa0(.*?)\xa0/')
CLASSES_RE = re.compile(r'<br/>\s*\d{4}.*?/\xa0.*?\xa0/\xa0(.*?)\s*</p>')


def first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def parse_info(html):
    """Extract ``(year, country, classes)`` from one info paragraph's HTML.

    Each field falls back to '' when its pattern does not match, so every
    movie always contributes exactly one value per column and a single
    unparseable entry cannot shift the values of the movies after it.
    """
    return (
        first_group(YEAR_RE, html),
        first_group(COUNTRY_RE, html),
        first_group(CLASSES_RE, html),
    )


def scrape_page(page_index):
    """Fetch list page *page_index* (0-based) and return its movie rows.

    Returns a list of dicts keyed by ``FIELDNAMES``.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            non-2xx HTTP status.
    """
    response = requests.get(
        f'https://movie.douban.com/top250?start={page_index * PAGE_SIZE}&filter=',
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an error page instead of silently parsing zero movies.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    names = soup.select('img')
    # The last <img> on the page is discarded, as the original script did --
    # presumably it is not a movie poster; TODO confirm against the page.
    if names:
        names.pop()
    print(len(names))

    score = soup.select('.rating_num')
    print(len(score))

    people = soup.select('div.star>span')
    print(len(people))
    # Every 4th span (offsets 3, 7, ...) is used as the rating-count text.
    # Slicing cannot raise IndexError when the span count is not a multiple of 4.
    evaluation = people[3::4]

    # Exactly one quote per movie: the text of '.inq', or '' when absent.
    quotes = []
    for body in soup.select('.info>.bd'):
        inq = body.select_one('.inq')
        quotes.append(inq.get_text() if inq is not None else '')

    info = [parse_info(str(p)) for p in soup.select('.info>.bd>p:first-child')]

    # zip() stops at the shortest input, matching the original multi-iterable map().
    return [
        {
            '电影名': img.attrs.get('alt', ''),
            '评分': rating.text,
            '影评': quote,
            '评分情况': votes.text,
            '播出年份': year,
            '拍摄国家': country,
            '影片类型': classes,
        }
        for img, rating, quote, votes, (year, country, classes)
        in zip(names, score, quotes, evaluation, info)
    ]


def main():
    """Scrape every list page and write all rows to ``OUTPUT_PATH`` as CSV."""
    movies = []
    for page_index in range(PAGE_COUNT):
        movies += scrape_page(page_index)

    # Create the output directory so the scraped data is not lost to a
    # FileNotFoundError at the very end of the run.
    directory = os.path.dirname(OUTPUT_PATH)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # The context manager guarantees the file is flushed and closed.
    with open(OUTPUT_PATH, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, FIELDNAMES)
        writer.writeheader()
        writer.writerows(movies)


if __name__ == '__main__':
    main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值