python爬虫（实例1）:requests+xpath 爬取并下载豆瓣电影的演员照片

最新推荐文章于 2022-10-01 21:52:24 发布

老毒毒毒毒

最新推荐文章于 2022-10-01 21:52:24 发布

阅读量1k

点赞数

分类专栏： python爬虫 文章标签： python爬虫

本文链接：https://blog.csdn.net/weixin_38564268/article/details/93002471

版权

python爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

import requests
import re
import os
from lxml import etree


# Target page: Douban's full cast & crew list for "Avengers: Endgame".
url = "https://movie.douban.com/subject/26100958/celebrities"

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection; a network-level error is reported the same way as a
# non-200 response instead of surfacing as a raw traceback.
# NOTE(review): if the site answers with a non-200 status for the default
# python-requests User-Agent, pass a browser-like `headers=` here -- confirm
# against the live site.
try:
    res = requests.get(url, timeout=10)
except requests.RequestException:
    res = None
if res is None or res.status_code != requests.codes.ok:
    print('演职员地址请求失败！')
    # SystemExit works without the interactive `site` helper that provides
    # exit(), and the non-zero status tells the caller the run failed.
    raise SystemExit(1)

# Force UTF-8 decoding of the response body.
res.encoding = 'utf-8'

# Debug aid: uncomment to dump the fetched HTML to disk.
# with open('one.html', 'w') as f:
#     f.write(res.text)

# Build the HTML tree and locate the page heading, which names the output
# directory.
tree = etree.HTML(res.text)
path_name = '//div[@id="wrapper"]/div[@id="content"]/h1/text()'

page_titles = tree.xpath(path_name)
if not page_titles:
    # Without the heading there is no directory name to use; stop cleanly
    # rather than failing with an IndexError.
    print('页面标题未找到！')
    raise SystemExit(1)

# Photo directory: <cwd>/imgs/<page heading>/ . The trailing empty component
# keeps a trailing path separator, so `folder + name` still yields a path
# inside the directory. Surrounding whitespace is stripped from the heading
# so it does not end up in the directory name.
folder = os.path.join(os.getcwd(), 'imgs', page_titles[0].strip(), '')
if not os.path.exists(folder):
    print('创建文件夹')
    # exist_ok guards against the directory appearing between the check and
    # the creation.
    os.makedirs(folder, exist_ok=True)

# XPath of the <a> elements whose `title` attribute holds each person's name.
# An exact @class comparison must spell out the whole attribute value (both
# class names, including the double space between them), not just one class.
path_title = '//div[@class="list-wrapper"][2]/ul[@class="celebrities-list  __multiline"]/li/a'

# Collect the name/anchor nodes.
node_title = tree.xpath(path_title)

# Pattern for the image URL embedded in the inline `style` attribute.
# The extension is a real alternation captured in group 1 (a bracketed
# "[jpg|png]" would be a character class), and the URL body stops at
# whitespace, quotes or the closing parenthesis of `url(...)`.
re_str = r'https://img[^\s)\'"]*?(\.(?:jpe?g|png|webp))'
img_pattern = re.compile(re_str)

for title_node in node_title:
    title = title_node.get('title')
    if not title:
        # No name means no usable file name; skip instead of crashing on
        # string concatenation with None.
        print('演员姓名未找到，已跳过')
        continue

    # Query the <div> descendants directly with a relative XPath; there is no
    # need to serialise the node and re-parse it as a new document.
    node_url = title_node.xpath('.//div')
    if not node_url:
        continue

    # The avatar URL lives in the first <div>'s inline style; the attribute
    # may be absent, in which case there is nothing to match.
    line = node_url[0].get('style') or ''
    matchObj = img_pattern.search(line)
    if not matchObj:
        print('文件地址未查询到'+title)
        continue

    img_url = matchObj.group()
    extension = matchObj.group(1)

    # Download first and only then open the output file, so a failed request
    # does not leave an empty file behind.
    try:
        img_res = requests.get(img_url, timeout=10)
    except requests.RequestException:
        print('下载失败'+title)
        continue
    if img_res.status_code != requests.codes.ok:
        print('下载失败'+title)
        continue

    # Path separators inside a name would otherwise be read as directories.
    file_name = title.replace('/', '_').replace(os.sep, '_') + extension
    with open(os.path.join(folder, file_name), 'wb') as f:
        f.write(img_res.content)
    print('下载成功'+title)

老毒毒毒毒

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python爬虫（实例1）:requests+xpath 爬取并下载豆瓣电影的演员照片

import requestsimport reimport osfrom lxml import etree# 地址豆瓣复仇者联盟4：终局之战的全部演职员url = "https://movie.douban.com/subject/26100958/celebrities"# 请求res = requests.get(url)if res.status_code !...
复制链接

扫一扫