# 本文仅供学习使用及参考。(This file is for study and reference only.)
# 每行代码均有注释 (Every line of code is commented.)
# -*- coding: utf-8 -*-
# @Author : 归燕
# @FileName: 02-豆瓣电影TOP250.py
# @Time : 2023/12/12 18:07
import csv
import random
import time
import requests
import logging
from lxml import etree
# --- Logging configuration: append INFO-and-above records to a local log file ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s-%(levelname)s:%(message)s',
    filename='02-test.log',
    filemode='a',
)

# Request headers: a desktop-Chrome User-Agent so Douban serves the normal page.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

# Paging URL template: `start` is the zero-based offset of the first movie on the page.
base_url = "https://movie.douban.com/top250?start={start}&filter="
# 请求数据 (fetch one listing page)
def scerpe_api(url):
    """Fetch *url* and return the page parsed as an lxml element tree.

    Returns None when the request raises or the server answers with a
    non-200 status; callers must handle the None case.
    NOTE(review): the function name keeps its original (misspelled)
    form so existing callers are unaffected.
    """
    logging.info("scraping %s", url)
    try:
        # timeout so a stalled connection cannot hang the crawl forever
        response = requests.get(url=url, headers=header, timeout=10)
    except requests.RequestException:
        logging.info("未响应网址:%s", url, exc_info=True)
        return None
    # Polite random delay between requests to avoid hammering the site.
    time.sleep(random.randint(1, 10))
    if response.status_code == 200:
        response.encoding = 'utf-8'
        return etree.HTML(response.text)
    logging.error('网站请求状态码为:%s ;请求网址为:%s', response.status_code, url)
    return None
# 解析网页 TOP250 (parse one page of the TOP250 listing and persist it)
def TOP_250(element):
    """Extract movie fields from one parsed listing page and save them to CSV.

    *element* is the lxml root returned by ``scerpe_api``; a None element
    (failed request) is skipped instead of crashing with AttributeError.
    """
    if element is None:
        # scerpe_api returns None on any request failure — nothing to parse.
        return
    # 电影名称 — first <span class="title"> under each movie link.
    title = element.xpath('//a/span[@class="title"][1]/text()')
    # 电影评分 — second span inside the star block.
    score = element.xpath('//div[@class="star"]/span[2]/text()')
    # 电影标签 — info paragraph text; yields two text nodes per movie.
    tag = element.xpath('//div[@class="bd"]/p[1]/text()')
    # 保存数据
    save_data(title=title, score=score, tag=tag)
# 保存数据 (append one page of rows to the CSV file)
def save_data(title, score, tag):
    """Append one CSV row per movie to ``02-TOP250.csv``.

    *title* and *score* are parallel lists (one entry per movie); *tag*
    holds two text nodes per movie, the info line being the second of
    each pair — hence the ``2*i + 1`` index.
    """
    path = "02-TOP250.csv"
    # newline='' is required by the csv module: without it every row is
    # followed by a blank line on Windows. Plain "a" (not "a+") — the
    # file is only ever written here, never read back.
    with open(path, "a", encoding='utf-8', newline='') as f:
        csv_write = csv.writer(f)
        # 循环保存数据 — one row per movie, spaces stripped from fields.
        for i in range(len(title)):
            title_ = str(title[i]).replace(" ", "")
            tag_ = str(tag[2 * i + 1]).replace(" ", "")
            score_ = str(score[i]).replace(" ", "")
            csv_write.writerow([title_, tag_, score_])
# 单次保存25条 — Douban shows 25 movies per listing page.
LIMIT = 25

# 主函数
def main():
    """Crawl all 10 pages (250 movies) of the Douban TOP250 list."""
    for page in range(0, 10):
        # 构建url — page offsets 0, 25, 50, ..., 225.
        url = base_url.format(start=LIMIT * page)
        # 请求数据
        html = scerpe_api(url)
        # 解析及保存数据 — skip pages whose request failed (html is None)
        # instead of letting one bad request crash the whole crawl.
        if html is not None:
            TOP_250(html)

if __name__ == '__main__':
    main()