import requests
import re
import os
from lxml import etree
# Douban page listing the full cast and crew of "Avengers: Endgame".
URL = "https://movie.douban.com/subject/26100958/celebrities"
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10
# XPath of the page heading; its text names the output folder.
PATH_NAME = '//div[@id="wrapper"]/div[@id="content"]/h1/text()'
# XPath of the <a> nodes in the second "list-wrapper" section.
# A multi-valued class attribute must be matched in full, e.g.
# "celebrities-list __multiline" cannot be matched by only one of the two.
PATH_TITLE = (
    '//div[@class="list-wrapper"][2]'
    '/ul[@class="celebrities-list __multiline"]/li/a'
)
# An https://img... URL ending in a known image extension.  The character
# class stops the match at whitespace, quotes and parentheses so it cannot
# run past the URL into the rest of the style attribute.
IMG_URL_RE = re.compile(
    r'https://img[^\s()"\']+\.(?:jpe?g|png|gif|webp)', re.IGNORECASE
)
# Characters that are path separators or invalid in Windows file names.
UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_image_url(style):
    """Return the first image URL found in *style*, or None.

    *style* is the value of an inline ``style`` attribute; it may be None
    when the attribute is missing.
    """
    if not style:
        return None
    match = IMG_URL_RE.search(style)
    return match.group() if match else None


def safe_name(name):
    """Return *name* made usable as a single file or directory name.

    Path separators and other reserved characters are replaced with ``_``.
    A missing or empty name becomes ``'unknown'``.
    """
    cleaned = UNSAFE_NAME_RE.sub('_', name or '').strip()
    return cleaned or 'unknown'


def main():
    """Download every image referenced by the matched entries of URL.

    Images are written to ``<cwd>/imgs/<page heading>/<title><ext>``.
    Exits with status 1 when the listing page cannot be fetched or has no
    heading; a failure on an individual image is reported and skipped.
    """
    try:
        res = requests.get(URL, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        res = None
    if res is None or res.status_code != requests.codes.ok:
        print('演职员地址请求失败!')
        raise SystemExit(1)
    # Force UTF-8 decoding of the response body.
    res.encoding = 'utf-8'

    # Build the HTML tree.
    tree = etree.HTML(res.text)
    headings = tree.xpath(PATH_NAME)
    if not headings:
        print('未找到页面标题!')
        raise SystemExit(1)

    # Create the image folder.
    folder = os.path.join(os.getcwd(), 'imgs', safe_name(headings[0]))
    if not os.path.exists(folder):
        print('创建文件夹')
        os.makedirs(folder)

    for link in tree.xpath(PATH_TITLE):
        title = link.get('title') or 'unknown'
        # A relative XPath searches only this entry's subtree, so the node
        # does not have to be serialised and re-parsed.
        divs = link.xpath('.//div')
        img_url = extract_image_url(divs[0].get('style') if divs else None)
        if img_url is None:
            print('文件地址未查询到' + title)
            continue

        # Fetch before opening the target so a failed download does not
        # leave an empty file behind.
        try:
            img = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            img.raise_for_status()
        except requests.RequestException:
            print('下载失败' + title)
            continue

        extension = os.path.splitext(img_url)[1]
        target = os.path.join(folder, safe_name(title) + extension)
        with open(target, 'wb') as f:
            f.write(img.content)
        print('下载成功' + title)


if __name__ == "__main__":
    main()