python爬取壁纸程序(http://www.jj20.com/)

之前写过美女一栏的爬虫,后来朋友们反映,除了美女就没有其他的了,今天特意整合了几个栏目供选择
新增加了图片小于1920x1080的不要
在这里插入图片描述
在这里插入图片描述
程序已打包上传
全部代码如下

import requests
import time
import os
from multiprocessing import Pool, cpu_count, current_process, Process, freeze_support
from lxml import etree
import parsel
import re
from PIL import Image
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
import threading

# Per-request timeout (seconds) used by every requests.get call below.
timeout = 10
# Root directory where downloaded wallpapers are saved; a category
# sub-folder is appended in __main__.
DIR_PATH = r"D:\meizi"
# Default request headers for www.jj20.com listing/detail pages.
# NOTE(review): the Cookie value is a captured session and will likely
# expire — confirm the site still serves pages without a fresh cookie.
header = \
    {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "td_cookie=2464237204; td_cookie=2279767161; td_cookie=2279297450; __yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; td_cookie=2462230420; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459,1623919239; CNZZDATA1278657954=1933864588-1623376693-%7C1623973822; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623978864",
        "Host": "www.jj20.com",
        # "Referer": "http://www.jj20.com/bz/hhzw/",
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36"
    }

# 获取每一页的地址


def get_page_num():
    """Fetch the category index page and return the relative URLs of all
    pagination pages, scraped from the "sldd" page-select dropdown.

    Relies on module-level globals ``base_url`` and ``fpath`` being set
    before the call (done in ``__main__``).

    Returns:
        list[str]: relative listing-page URLs; [] when the request or
        parsing fails (the original returned None, which crashed the
        caller that iterates over the result).
    """
    try:
        resp = requests.get(url=base_url + fpath, timeout=timeout,
                            headers=header)
        selector = parsel.Selector(resp.text)
        # Each <option value="..."> holds one listing page's relative URL.
        return selector.xpath(
            '//span[@class="ea"]/select[@name="sldd"]/option/@value').extract()
    except Exception as e:
        print("get_page_num", e)
        return []

# 组装每一页上每个item的地址


def get_page_detail(page_urls):
    """Collect the detail-page URL of every wallpaper on every listing page.

    Each gallery item links to ``<id>.html``; the remaining pictures of the
    set live at ``<id>_2.html`` .. ``<id>_<count>.html``, where the count is
    the first number found in the <li> text next to the item.

    NOTE(review): the base page (picture 1, no ``_N`` suffix) is never
    appended, so the first image of each set is skipped — confirm this is
    intentional (``get_img_orangin_url`` does have a branch for such URLs).

    Args:
        page_urls: relative listing-page URLs (see ``get_page_num``).
            (Was named ``list``, shadowing the builtin.)

    Returns:
        list[str]: relative detail-page URLs collected so far; on error the
        partial result is returned instead of None.
    """
    page_list = []
    try:
        for page in page_urls:
            r = requests.get(url=base_url + fpath + page, timeout=timeout,
                             headers=header)
            html = parsel.Selector(r.text)
            url_list = html.xpath(
                '//ul[@class="picbz"]/li/a[@target="_blank"]/@href').extract()
            nums = html.xpath('//ul[@class="picbz"]/li/text()').extract()
            # Keep the first number found in each text fragment: the
            # picture count of the corresponding gallery item.
            totals = [re.findall(r"\d+\.?\d*", text)[0]
                      for text in nums if re.findall(r"\d+\.?\d*", text)]
            for ind, href in enumerate(url_list):
                item_id = re.findall(r"\d+", href)[0]
                # Pictures 2..count; identical to the original
                # range(1, total) with i3+1.
                for page_no in range(2, int(totals[ind]) + 1):
                    page_list.append(
                        href.replace(item_id, f'{item_id}_{page_no}'))
    except Exception as e:
        print("get_page_detail", e)
    return page_list


# 获取原图地址
def get_img_orangin_url(url, index):
    try:
        headerx = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "td_cookie=2279820233; td_cookie=2279767161; td_cookie=2279297450; __yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; yjs_js_security_passport=b6f33999198afafd68edc422416189e2ceed9b0c_1623919246_js; td_cookie=2462230420; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459,1623919239; CNZZDATA1278657954=1933864588-1623376693-%7C1623919362; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623919825",
            "Host": "www.jj20.com",
            "Pragma": "no-cache",
            "Referer": "http://www.jj20.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36"
        }
        r = requests.get(url=url, timeout=10, headers=headerx)
        r.encoding = r.apparent_encoding
        html = parsel.Selector(r.text)
        file_name = html.xpath('/html/body/div[3]/h1/span/text()').get()
        img_url = html.xpath('/html/body/script[1]').get()
        pattern = re.compile("'(.*)'")
        img_url = pattern.findall(img_url)[0]
        img_url = 'http://pic.jj20.com'+img_url
        file_name = re.sub('\(.*\)', '', file_name)
        if '_' in url:
            file_name = file_name+img_url[img_url.index('-')+1:]
        else:
            file_name = f'{file_name}-1.jpg'
        return {
            "name": file_name,
            "img_url": img_url
        }
    except Exception as e:
        print(e, 'eeeeeeeee')
# 保存图片


def save_img(img_url, file_name):
    """Download ``img_url`` and save it under DIR_PATH as ``file_name``.

    Images smaller than 1920x1080 are discarded.  Errors are logged and
    swallowed so a single bad image does not abort the crawl.
    """
    try:
        # exist_ok avoids a race when several threads create the directory
        # at the same time (the original exists()/makedirs() pair could
        # raise FileExistsError under concurrency).
        os.makedirs(DIR_PATH, exist_ok=True)
        img_header = {
            "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "Cache-Control": "no-cache",
            "Cookie": "__yjs_duid=1_6ab2ea97fb9890674b30afac7438c78d1623378472065; UM_distinctid=179f8e5766edd-01960ab8d9a8cc-68141f7b-1fa400-179f8e5766f714; Hm_lvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623378459; Hm_lpvt_d9f1c8630a7aa5c7ce2a72d4b564c044=1623734936",
            "Host": "pic.jj20.com",
            "Pragma": "no-cache",
            "Referer": "http://cj.jj20.com/",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
        }
        response = requests.get(img_url, headers=img_header, timeout=timeout)
        image = Image.open(BytesIO(response.content))
        if image.width < 1920 or image.height < 1080:
            print("图片宽高不合适已舍弃...")
        else:
            # Save with an absolute path instead of os.chdir(): chdir is a
            # process-wide side effect and is not thread-safe under the
            # ThreadPoolExecutor used by this script.
            image.save(os.path.join(DIR_PATH, file_name))
    except Exception as e:
        print("save_img", e)


def callback(s):
    """Save the image described by ``s`` unless it already exists on disk.

    ``s`` is the dict produced by ``get_img_orangin_url`` (keys ``img_url``
    and ``name``); any error is logged and swallowed.
    """
    try:
        file_name = s.get("name")
        # Skip the download when the target file is already present.
        if os.path.exists(os.path.join(DIR_PATH, file_name)):
            print("图片已存在...")
            return
        save_img(s.get("img_url"), file_name)
    except Exception as e:
        print(e, "callback")


def get_img(page_list):
    """Sequentially resolve and download every image in ``page_list``.

    Args:
        page_list: relative detail-page URLs from ``get_page_detail``.
    """
    # Compute the total locally.  The original read the global ``l``,
    # which __main__ assigned only AFTER submitting this task to the
    # thread pool -- a race that could raise NameError (silently
    # swallowed below) on the first iterations.
    total = len(page_list)
    try:
        for index, page in enumerate(page_list):
            print(f'共有{total}张图片,正在爬取第{index+1}张')
            info = get_img_orangin_url(base_url + page, index + 1)
            callback(info)
    except Exception as e:
        print("get_img", e)


if __name__ == '__main__':
    freeze_support()
    # Menu of crawlable categories: choice -> display name + URL path.
    type_obj = {
        "1": {
            "name": "美女模特",
            "val": "/bz/nxxz/nxmt/"
        },
        "2": {
            "name": "花卉",
            "val": "/bz/hhzw/"
        },
        "3": {
            "name": "动漫",
            "val": "/bz/ktmh/"
        },
        "4": {
            "name": "风景",
            "val": "/bz/zrfg/"
        },

    }
    a = input("请输入你要爬取的内容:(1:美女模特 2:花卉3:动漫4:风景)")
    if a in type_obj:
        fpath = type_obj[a].get("val")
        DIR_PATH = os.path.join(DIR_PATH, type_obj[a].get("name"))
    else:
        # The original only printed and then fell through, crashing later
        # with NameError on ``fpath``; stop cleanly instead.
        print("输入不合法")
        raise SystemExit(1)
    st = time.time()
    print("程序正在执行,请稍后...")
    base_url = 'http://www.jj20.com'
    page_urls = get_page_num()
    # 创建一个包含4条线程的线程池; the with-block shuts it down cleanly.
    with ThreadPoolExecutor(4) as tpool:
        page_list = tpool.submit(get_page_detail, page_urls).result()
        # Set the global ``l`` BEFORE submitting get_img, which reads it
        # (the original assigned it after submit -- a race).
        l = len(page_list)
        # Wait for the downloads so the elapsed time below is meaningful
        # (the original printed it while the task was still running).
        tpool.submit(get_img, page_list).result()
    et = time.time()
    print(f'用时{et-st}秒...')
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值