python 爬虫小姐姐

南下狩猎的小花猫

于 2022-08-25 16:47:29 发布

阅读量3.6k

点赞数 1

分类专栏： python 文章标签： python 爬虫 开发语言

本文链接：https://blog.csdn.net/HSJ0170/article/details/126527999

版权

python 专栏收录该内容

29 篇文章 0 订阅

订阅专栏

声明：本代码仅做学习python爬虫研究之用，请勿用于不正当用途。

运行：全局搜索 'F:/python_study/python/Pictures/'，将其替换为自己的文件目录，然后直接运行即可。

# 目标网址：https://www.xiurenb.cc
# https://blog.csdn.net/Primordial_Shen/article/details/126292214
# 唐安琪 
# 周于希 
# 朱可儿 
# 杨晨晨 
# 芝芝 

# 徐莉芝 
# 林星阑 
# 利世 
# 鱼子酱 
# 就是阿朱啊 
# 王馨瑶 
# 陆萱萱 
# 熊小诺 
# 王雨纯 
# 梦心玥 
# 豆瓣酱 
# 江真真 
# 小肥莹 
# 安然 
# 是小逗逗 
# 小果冻儿 
# 露露 
# 韩好甜 
# 吴雪瑶 
# 萌奈子 
# 小波多 
# 沈佳熹 
# 糯美子 
# 梦乃 
# 白甜 
# 夏沫沫 
# 果儿 
# 冯木木 
# 尤妮丝 
# 小海臀 
# 阿姣 
# 波巧酱 
# 周妍希 

# 导入库
import time, os, requests
from lxml.html import etree
from urllib import parse

# Request headers: a desktop Chrome/Edge User-Agent string on Windows.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
    ),
}

# Working lists used by the download loop further down; they start out empty.
img_list, url_list, page_list = [], [], []

# Prompt for the model's name and percent-encode it for the query string.
human_unencode = input('输入写真姓名（回车确认[会下载该小姐姐全部的图片]，如输入：杨晨晨）：')
human_encode = parse.quote(human_unencode)

# Title-search URL for the encoded name.
url_human = f'https://www.xiurenb.cc/plus/search/index.asp?keyword={human_encode}&searchtype=title'

# Fetch the first result page. The page count is taken to be the number of
# <a> elements matched by the XPath below.
# NOTE(review): presumably these are the pagination links; if the site renders
# no such links for a single-page result this yields 0 -- confirm on the site.
res_first = requests.get(url=url_human, headers=headers)
tree_first = etree.HTML(res_first.text)
pager_links = tree_first.xpath('/html/body/div[3]/div[1]/div/div/ul/div[3]/div/div[2]/a')
Num_first = len(pager_links)
print(f'{human_unencode}，总页数:{Num_first}')

# Walk the search-result pages for the chosen model and download every album
# found on them. Reads `headers`, `url_human`, `human_unencode` and `Num_first`
# defined earlier in the script.

# Root directory for downloads; change this to your own directory.
SAVE_ROOT = 'F:/python_study/python/Pictures/'
# Site origin that the relative album and image links are appended to.
BASE_URL = 'https://www.xiurenb.cc'
# First result page to process (1-based). The original script skipped every
# page below 4 because its author had already downloaded them; that value is
# kept here. Set it to 1 to start from the first page.
START_PAGE = 4


def _fetch_tree(page_url):
    """Download an album page and return its parsed lxml HTML tree."""
    response = requests.get(url=page_url, headers=headers)
    # The original forced UTF-8 when reading the album title; do the same for
    # every album page so titles and links are decoded consistently.
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


for index in range(max(START_PAGE, 1), int(Num_first) + 1):
    print(f'{human_unencode}，开始下载第：{index}页数据......')
    # Send the same headers as every other request (the original omitted them
    # for this one call only).
    res_human = requests.get(url_human + '&p=' + str(index), headers=headers)
    tree_human = etree.HTML(res_human.text)
    # Relative links to the albums listed on this result page.
    page_list = tree_human.xpath('/html/body/div[3]/div[1]/div/div/ul/div[3]/div/div[1]/div/div[1]/h2/a/@href')

    for Page_Num in page_list:
        url = BASE_URL + str(Page_Num)
        # Fetch the album's first page once and reuse it for the title, the
        # page count and the first batch of image links.
        first_tree = _fetch_tree(url)

        titles = first_tree.xpath('/html/body/div[3]/div/div/div[1]/h1//text()')
        if not titles:
            # Without a title there is no directory name to save under.
            print(f'无法获取标题，跳过：{url}')
            continue
        # Drop the first 11 characters of the heading, as the original did.
        # NOTE(review): presumably a fixed-width prefix of the title -- confirm.
        path_name = titles[0][11:]
        print(path_name)

        the_path_name = SAVE_ROOT + human_unencode + '/' + path_name
        # Check before crawling the album so that an already-downloaded album
        # costs one request instead of one request per album page.
        # NOTE: the directory is created before the images are written, so an
        # interrupted run leaves a partial album that later runs will skip.
        if os.path.exists(the_path_name):
            print('已存在的期数，跳过>>>')
            continue
        # makedirs also creates the per-model directory and any missing
        # parents; os.mkdir raised when SAVE_ROOT itself did not exist.
        os.makedirs(the_path_name)

        # Follow-up pages are named "<album>_<i>.html" for i in 1 .. Num - 3,
        # where Num is the number of links matched by the XPath below.
        Num = len(first_tree.xpath('/html/body/div[3]/div/div/div[4]/div/div/a'))
        url_list = [url] + [url[:-5] + '_' + str(i) + '.html' for i in range(1, Num - 2)]

        # Collect the image links from every page of the album.
        img_list = list(first_tree.xpath('/html/body/div[3]/div/div/div[5]/p/img/@src'))
        for url_img in url_list[1:]:
            time.sleep(0.1)
            img_list.extend(_fetch_tree(url_img).xpath('/html/body/div[3]/div/div/div[5]/p/img/@src'))

        # Download and save each image; the progress counter is 1-based so the
        # last image reports N/N.
        total = len(img_list)
        for num, j in enumerate(img_list, start=1):
            img_url = BASE_URL + j
            img_data = requests.get(url=img_url, headers=headers).content
            img_name = img_url.split('/')[-1]
            print(f'正在下载图片:{img_name}/{num}/{total}')
            with open(the_path_name + '/' + img_name, 'wb') as f:
                f.write(img_data)
            time.sleep(0.1)

# Final completion message.
print(f'{human_unencode}，全部下载完成!')