# day17作业评讲 (Day 17 homework review)
import requests
from bs4 import BeautifulSoup
import csv
from re import findall
# 多数时候数据不理想,则需要清洗
# 获取一页
def get_one_page(page):
    """Fetch one page of the Douban Top250 list and write each film to csv.

    :param page: the `start` offset of the page to fetch (0, 25, 50, ... 225).

    Writes one row per film via the module-level csv writer `w`
    (created in the `__main__` block): [name, score, comment count,
    year, country, genre, description].
    """
    # 1. fetch the page (fix: original had placeholder `url = ...`)
    url = 'https://movie.douban.com/top250?start=' + str(page)
    headers = {
        # Douban rejects requests without a browser-like User-Agent
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    response = requests.get(url, headers=headers)
    html = response.text
    # 2. parse the data.
    # Pattern: when the target info shares a repeated structure, select the
    # repeating element once, then drill into each with select/select_one.
    soup = BeautifulSoup(html, 'lxml')
    # one '.item' div per film
    all_film_box = soup.select('.item')
    for div in all_film_box:
        # film name
        name = div.select_one('.title').text
        # rating
        score = float(div.select_one('.rating_num').text)
        # comment count: last span under '.star', text like '123456人评价';
        # slice off the 3-char suffix to keep only the digits.
        # (':nth-child(4)' only works when the children are direct '>' descendants)
        comment_num = int(div.select('.star>span')[-1].text[:-3])
        # short description: '.inq' is missing for some films on later pages,
        # so guard against select_one returning None (fix for AttributeError)
        inq_tag = div.select_one('.inq')
        describe = inq_tag.text if inq_tag is not None else ''
        # year / country / genre: last non-empty line of the '.bd>p' text,
        # e.g. '1994 / 美国 / 犯罪 剧情'
        message = div.select_one('.bd>p').text
        info = message.strip().split('\n')[-1].strip()
        # split on '/' and strip surrounding whitespace from each field
        result = [x.strip() for x in info.split('/')]
        time = result[0]
        country = result[1]
        film_type = result[-1]
        # write the row to the csv file
        w.writerow([name, score, comment_num, time, country, film_type, describe])
        print('写入成功!')
if __name__ == '__main__':
    # 1. create the csv writer.
    # `with` guarantees the file is flushed and closed (the original leaked
    # the handle by passing open() directly to csv.writer).
    with open('files/films.csv', 'w', encoding='utf-8', newline='') as f:
        w = csv.writer(f)
        # header row
        w.writerow(['电影名称', '评分', '评论人数', '上映时间', '国家', '类型', '描述'])
        # 2. fetch the data: ten pages, page N starts at N*25
        # (page 2 -> start=25, page 10 -> start=225)
        for start in range(0, 226, 25):
            get_one_page(start)
            # NOTE(review): this break limits the run to the first page only
            # (debug leftover?) — remove it to actually fetch all ten pages.
            break