# day17作业评讲 (Day 17 homework review)
import requests
from bs4 import BeautifulSoup
import csv
from re import findall
# 多数时候数据不理想,则需要清洗
# 获取一页
def get_one_page(page):
    """Fetch one page of the Douban Top250 list and write each film to csv.

    :param page: the `start` offset of the page to fetch (0, 25, 50, ... 225).

    Writes one row per film via the module-level csv writer `w`
    (created in the `__main__` block): [name, score, comment count,
    year, country, genre, description].
    """
    # 1. fetch the page (fix: original had placeholder `url = ...`)
    url = 'https://movie.douban.com/top250?start=' + str(page)
    headers = {
        # Douban rejects requests without a browser-like User-Agent
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    response = requests.get(url, headers=headers)
    html = response.text
    # 2. parse the data.
    # Pattern: when the target info shares a repeated structure, select the
    # repeating element once, then drill into each with select/select_one.
    soup = BeautifulSoup(html, 'lxml')
    # one '.item' div per film
    all_film_box = soup.select('.item')
    for div in all_film_box:
        # film name
        name = div.select_one('.title').text
        # rating
        score = float(div.select_one('.rating_num').text)
        # comment count: last span under '.star', text like '123456人评价';
        # slice off the 3-char suffix to keep only the digits.
        # (':nth-child(4)' only works when the children are direct '>' descendants)
        comment_num = int(div.select('.star>span')[-1].text[:-3])
        # short description: '.inq' is missing for some films on later pages,
        # so guard against select_one returning None (fix for AttributeError)
        inq_tag = div.select_one('.inq')
        describe = inq_tag.text if inq_tag is not None else ''
        # year / country / genre: last non-empty line of the '.bd>p' text,
        # e.g. '1994 / 美国 / 犯罪 剧情'
        message = div.select_one('.bd>p').text
        info = message.strip().split('\n')[-1].strip()
        # split on '/' and strip surrounding whitespace from each field
        result = [x.strip() for x in info.split('/')]
        time = result[0]
        country = result[1]
        film_type = result[-1]
        # write the row to the csv file
        w.writerow([name, score, comment_num, time, country, film_type, describe])
        print('写入成功!')
if __name__ == '__main__':
    # 1. create the csv writer.
    # `with` guarantees the file is flushed and closed (the original leaked
    # the handle by passing open() directly to csv.writer).
    with open('files/films.csv', 'w', encoding='utf-8', newline='') as f:
        w = csv.writer(f)
        # header row
        w.writerow(['电影名称', '评分', '评论人数', '上映时间', '国家', '类型', '描述'])
        # 2. fetch the data: ten pages, page N starts at N*25
        # (page 2 -> start=25, page 10 -> start=225)
        for start in range(0, 226, 25):
            get_one_page(start)
            # NOTE(review): this break limits the run to the first page only
            # (debug leftover?) — remove it to actually fetch all ten pages.
            break