# Day 17 homework
# Scrape movie information from a movie website (Douban Top250)
import csv
import os
import re

import requests
from bs4 import BeautifulSoup
# Scrape all 10 pages (25 movies each) of the Douban Top250 list and
# write the collected records to files/data.csv.

# Browser UA header — Douban rejects requests that use the default
# python-requests User-Agent. Built once, reused for every page.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

# Pre-compiled patterns for the "<year> / <country> / <genres>" info line
# (compiled once instead of on every movie of every page).
YEAR_RE = re.compile(r'<br/>\s*(\d{4}).*?/.*?/')
COUNTRY_RE = re.compile(r'<br/>\s*\d{4}.*?/\xa0(.*?)\xa0/')
GENRES_RE = re.compile(r'<br/>\s*\d{4}.*?/\xa0.*?\xa0/\xa0(.*?)\s*</p>')


def _parse_page(html):
    """Parse one Top250 listing page's HTML into a list of movie dicts.

    Returned keys: 电影名, 评分, 影评, 评分情况, 播出年份, 拍摄国家, 影片类型.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Movie posters carry the title in their alt attribute; the last <img>
    # on the page is not a movie poster, so drop it.
    names = soup.select('img')
    names.pop()
    print(len(names))

    scores = soup.select('.rating_num')
    print(len(scores))

    # div.star contains 4 spans per movie; the 4th (index 3) holds the
    # "N people rated" text.
    star_spans = soup.select('div.star>span')
    print(len(star_spans))
    evaluations = [star_spans[i + 3] for i in range(0, len(star_spans), 4)]

    # One-line quote (.inq) is missing for some movies — use '' there.
    quotes = []
    for bd in soup.select('.info>.bd'):
        inq = bd.select_one('.inq')
        quotes.append(inq.text if inq else '')

    years, countries, genres = [], [], []
    for info in soup.select('.info>.bd>p:first-child'):
        markup = str(info)
        years += YEAR_RE.findall(markup)
        countries += COUNTRY_RE.findall(markup)
        genres += GENRES_RE.findall(markup)

    # zip truncates to the shortest sequence, matching the original
    # multi-iterable map() behavior.
    return [
        {'电影名': img.attrs['alt'], '评分': sc.text, '影评': quote,
         '评分情况': ev.text, '播出年份': year, '拍摄国家': country,
         '影片类型': genre}
        for img, sc, quote, ev, year, country, genre
        in zip(names, scores, quotes, evaluations, years, countries, genres)
    ]


def main():
    """Fetch all 10 pages and write the combined records to CSV."""
    rows = []
    for page in range(10):
        response = requests.get(
            f'https://movie.douban.com/top250?start={page * 25}&filter=',
            headers=HEADERS,
        )
        rows += _parse_page(response.text)

    # Original crashed with FileNotFoundError when files/ did not exist.
    os.makedirs('files', exist_ok=True)
    # `with` guarantees the file is flushed and closed (original leaked it).
    with open('files/data.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(
            f, ['电影名', '评分', '评分情况', '影评', '播出年份', '拍摄国家', '影片类型'])
        writer.writeheader()
        writer.writerows(rows)


if __name__ == '__main__':
    main()