一、要求
获取电影天堂"最新电影下载"页面(https://www.dy2018.com/html/gndy/dyzz/index.html)下的相关数据;
存储为 csv 格式的表格文件
二、可能遇到的bug和解决
使用正则表达式匹配时,遇到标题中没有"《》"的情况
请求链接返回页面有误
部分电影详情页面没有评分导致返回错误
三种小错误都是通过条件判断解决
三、参考代码
import os
import re
from datetime import datetime

import requests
from lxml import etree
# Build the output file name from the current local time as YYYYMMDDHHMMSS
# (14 digits). The scraping loop further down reopens the same file in append
# mode through `file_name`.
file_name = datetime.now().strftime("%Y%m%d%H%M%S")
# Create the output directory on first run so open() does not fail with
# FileNotFoundError.
os.makedirs("./文件", exist_ok=True)
# CSV header row: index, movie, score, magnet link.
data = "序号,电影,评分,磁力链接\n"
# Per the original author, GB18030 is the default encoding of Excel on their
# machine. The context manager closes the file even if the write raises.
with open(f"./文件/{file_name}.csv", mode="w", encoding="GB18030") as fp:
    fp.write(data)
# Fetch the first listing page; the code below reads its <select> options to
# discover every listing page.
url = "https://www.dy2018.com/html/gndy/dyzz/index.html"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Use the encoding detected from the response body when decoding the text.
response.encoding = response.apparent_encoding
content = response.text
# Running count of movies seen; incremented once per movie in the loop below.
movie_num = 0
if content:
root = etree.HTML(content)
pages = root.xpath("//select/option/@value")
# print(pages)
for page in pages:
new_url = "https://www.dy2018.com" + page
content = requests.get(new_url)
content.encoding = content.apparent_encoding
content = content.text
if content:
fp = open(f"./文件/{file_name}.csv", mode="a", encoding="GB18030")
root = etree.HTML(content)
get_moive = root.xpath("//b/a/@title") # 获取电影简介
for name, url in zip(get_moive, root.xpath("//b/a/@href")):
movie_num += 1
print(name)
score_url = "https://www.dy2018.com" + url # 电影详情页面
score_content = requests.get(score_url)
score_content.encoding = score_content.apparent_encoding
score_content = score_content.text
# print(score_url)
movie_name = ""
score_root = etree.HTML(score_content)
if score_root.xpath("//strong/text()"): # 获取评分
movie_score = score_root.xpath("//strong/text()")
else:
movie_score = [","]
tmp_download_url = score_root.xpath("//td[@bgc