import csv
import os
import time

import requests
from lxml import etree
# Browser-like User-Agent header sent with every request so the scrape
# target treats us as a normal browser rather than a bare script.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.0.10191 SLBChan/105"
}
# CSV header: movie name, origin, runtime, release date, cover URL, synopsis.
# FIX: the original header listed only 5 columns while every data row writes
# 6 fields; "简介" (synopsis) is appended so header and rows line up.
move_title = ["电影名", "出产地", "电影时长", "上映时间", "封面网址", "简介"]

# FIX: make sure the cover-image output directory exists before downloading;
# the original assumed ./dyimg was already present and crashed otherwise.
os.makedirs("dyimg", exist_ok=True)

with open("moves.csv", "w", encoding="utf_8", newline="") as f:
    w = csv.writer(f)
    w.writerow(move_title)
    for i in range(1, 3):  # listing pages 1 and 2
        url = f"https://ssr3.scrape.center/page/{i}"
        # ssr3 is protected by HTTP Basic auth; (admin, admin) is the demo
        # site's documented username/password pair.
        response = requests.get(url, headers=headers, auth=("admin", "admin"), timeout=10)
        time.sleep(2)  # be polite to the server between page fetches
        # .decode() turns the raw bytes into a str before parsing.
        html = etree.HTML(response.content.decode())
        # One <div> per movie card on the listing page.
        div_list = html.xpath('//div[@class="el-col el-col-18 el-col-offset-3"]/div')
        for li in div_list:
            name = li.xpath('.//div/div/div[2]/a/h2/text()')[0]
            addr = li.xpath('.//div[@class="el-row"]/div[2]/div[2]/span[1]/text()')[0]
            time_long = li.xpath('.//div[@class="el-row"]/div[2]/div[2]/span[3]/text()')[0]
            # Release date is sometimes absent; query once and fall back to
            # None (the original evaluated the same xpath twice).
            release = li.xpath('.//div[@class="el-row"]/div[2]/div[3]/span/text()')
            time_agen = release[0] if release else None
            img_url = li.xpath('.//div[@class="el-row"]/div[1]/a/img/@src')[0]
            # Download the cover image. Use a dedicated handle so the CSV
            # file object `f` is not shadowed (the original rebound `f`).
            img_resp = requests.get(img_url, headers=headers, timeout=10)
            filepath = os.path.join("dyimg", name + ".png")
            with open(filepath, "wb") as img_file:
                img_file.write(img_resp.content)
            # Detail page (the ssr1 variant needs no auth) for the synopsis.
            detail_url = 'https://ssr1.scrape.center' + li.xpath('./div/div/div[1]/a/@href')[0]
            print(detail_url)
            detail_resp = requests.get(detail_url, headers=headers, timeout=10)
            # Parse into a separate tree; the original reused `html`, which
            # clobbered the listing-page tree mid-loop.
            detail_html = etree.HTML(detail_resp.content.decode())
            datas = detail_html.xpath('//div[@class="el-card__body"]/div/div[2]/div[4]/p/text()')[0]
            # Persist one row per movie: listing fields + synopsis.
            move_info = [name, addr, time_long, time_agen, img_url, datas]
            w.writerow(move_info)
            print("爬取成功")
# 2.requests 用for循环爬取“下一页”和“详情页”的内容并保存图片和文字(csv文件)
# 于 2023-12-27 22:05:15 首次发布