Python爬虫:利用 Python 爬取 ZOL 桌面壁纸大图

Python爬虫 利用python爬取ZOL桌面壁纸大图

利用 multiprocessing Pool 的 pool.map 多进程并行,高效爬取大图片

python代码

# coding=gb2312

import requests

from lxml import etree

from multiprocessing import Pool

import os

from time import sleep

import random

class Down_pic():
    """Scrape desktop wallpapers from desk.zol.com.cn.

    Pipeline: top-level category -> sub-category -> picture list page ->
    full-size image URL.  The collected image URLs are downloaded in
    parallel with a multiprocessing Pool (a *process* pool, despite the
    original article calling it a thread pool) and written to disk under
    <category>/<sub-category>/ directories.
    """

    def __init__(self):
        # Browser-like request headers.  NOTE(review): Host is pinned to
        # desk.zol.com.cn, so these headers are suitable only for pages on
        # that host, not for the final image download (which may be served
        # from a different host).
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "desk.zol.com.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        self.main_url = 'http://desk.zol.com.cn'  # site root
        self.dic = {}             # category name -> list of sub-category names
        self.count = 0            # total number of image URLs collected
        self.k = 0                # slice offset into the downloaded-data list
        self.type = 5             # number of top-level categories to scrape
        self.small_type_num = 4   # number of sub-categories per category
        self.tupian = 3           # images per sub-category (at most 3)

    def get_tree(self, htlm):
        """Parse an HTML string into an lxml element tree.

        (Parameter name keeps the original's 'htlm' spelling so any
        keyword callers keep working.)
        """
        tree = etree.HTML(htlm)
        return tree

    def get_type(self):
        """Yield (name, url) for the first self.type top-level categories."""
        main_page = requests.get(self.main_url, headers=self.headers).text
        tree = self.get_tree(main_page)
        a_list = tree.xpath('//*[@id="main"]/dl[1]/dd/a')
        # Drop the leading "all" pseudo-category; guard against an empty
        # result (e.g. page layout changed) instead of crashing on pop(0).
        if a_list:
            a_list.pop(0)
        for a in a_list[:self.type]:
            type_name = a.xpath('./text()')[0]
            type_url = self.main_url + a.xpath('./@href')[0]
            yield type_name, type_url

    def get_small_type(self):
        """For each category, record its sub-category names and yield the
        list of sub-category page URLs."""
        for type_name, type_url in self.get_type():
            # Headers added for consistency with get_type (same host).
            small_page = requests.get(type_url, headers=self.headers)
            small_page.encoding = 'gb2312'  # ZOL pages are GB2312-encoded
            tree = self.get_tree(small_page.text)
            small_name_list = tree.xpath('//a[@class="pic"]/span/@title')[0:self.small_type_num]
            small_url_list = tree.xpath('//a[@class="pic"]/@href')[0:self.small_type_num]
            self.dic[type_name] = small_name_list
            yield small_url_list

    def get_pic_list(self):
        """Yield the detail-page URL of up to self.tupian pictures per
        sub-category."""
        for pic_page_url in self.get_small_type():
            for pic_url in pic_page_url:
                url = self.main_url + pic_url
                pic_page = requests.get(url=url, headers=self.headers).text
                # Renamed from 'etree' to avoid shadowing the lxml module.
                tree = self.get_tree(pic_page)
                pic_list_url = tree.xpath('//*[@id="showImg"]/li/a/@href')
                for pic_url in pic_list_url[:self.tupian]:
                    yield self.main_url + pic_url

    def get_size(self):
        """Yield the URL of the size-selection page (preferred resolution,
        usually 2880*1800) or, failing that, the preview image URL."""
        for pic_url in self.get_pic_list():
            pic_page = requests.get(pic_url, headers=self.headers).text
            tree = self.get_tree(pic_page)
            try:
                data_url = self.main_url + tree.xpath('//*[@id="tagfbl"]/a[2]/@href')[0]
            except Exception:
                # No size list on this page: fall back to the inline image.
                data_url = tree.xpath('//*[@id="bigImg"]/@src')[0]
            yield data_url

    def get_data(self):
        """Yield the final downloadable image URL for every picture and
        count them in self.count."""
        for url in self.get_size():
            # No pinned headers here: the fallback URLs from get_size may
            # point at an image host other than desk.zol.com.cn.
            data_page = requests.get(url).text
            tree = self.get_tree(data_page)
            try:
                pic_data_url = tree.xpath('/html/body/img[1]/@src')[0]
            except Exception:
                # 'url' was already a direct image URL.
                pic_data_url = url
            self.count += 1
            yield pic_data_url
        self.num = self.count  # snapshot used by the (disabled) progress print

    def ppp(self):
        """Download every image with a process pool and save the results to
        <category>/<sub-category>/<name><rand>.jpg."""
        print('开启线程')
        pool = Pool(5)
        datas = pool.map(self.download, [url for url in self.get_data()])
        pool.close()
        pool.join()
        for type_name in self.dic:
            for small_name in self.dic[type_name]:
                path = type_name + '/' + small_name
                # Strip a trailing '?...' suffix only when present.  The
                # original 'path[:path.find("?")]' silently chopped the last
                # character of every path without a '?', since find() -> -1.
                q = path.find('?')
                if q != -1:
                    path = path[:q]
                if not os.path.exists(path):
                    os.makedirs(path)
                # NOTE(review): this slicing assumes exactly self.tupian
                # images per sub-category; pages with fewer pictures will
                # shift later slices — kept as in the original design.
                for data in datas[self.k:self.k + self.tupian]:
                    name = small_name + str(random.randint(1, 1000))  # file name
                    pa = path + '/' + name + '.jpg'
                    with open(pa, 'wb') as f:
                        f.write(data)
                self.k += self.tupian
        print('共下载:{}图片'.format(self.count))

    def download(self, url):
        """Fetch one image and return its raw bytes (runs in pool workers)."""
        # print('\r当前下载进度:{}%'.format((1 - self.num / self.count) * 100), end='')
        data = requests.get(url=url).content
        sleep(1)  # throttle so the site isn't hammered by 5 parallel workers
        return data

if __name__ == '__main__':
    # Entry point: build the scraper and run the whole download pipeline.
    spider = Down_pic()
    spider.ppp()

效果预览

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值