A Python experiment: batch-downloading wallpapers with requests

Since I have only just started with web scraping and am not yet sure about the legal requirements, the analysis here is deliberately kept simple. I will only sketch the general approach rather than dig into details.

1. First import the modules we need. requests is essential for fetching web pages, re is the regular-expression module (the only extraction method I know so far), threading creates multiple threads to speed up the downloads, random is used to pick a free proxy at random, and time is used to sleep between requests so we do not get rate-limited for hitting the server too fast, which also keeps the load on the server reasonable.

import re
import requests
import threading
import random
import time

2. To batch-download images we first need the URLs of the pages that contain them, so define a function that builds the list of page URLs. The pages follow an obvious numbering pattern; I will not show the real URL here to stay on the safe side.

def url_nums(num):
    url_list = [f'https://***********.com/2/index_{i}.shtml' for i in range(2, num + 1)]
    return url_list
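
A quick check of what this returns (with the domain masked, as above):

# url_nums(4) builds pages 2 through 4 of the listing
pages = url_nums(4)
print(pages)
# ['https://***********.com/2/index_2.shtml',
#  'https://***********.com/2/index_3.shtml',
#  'https://***********.com/2/index_4.shtml']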

3. Once we have the page URLs, we fetch each page's HTML and use the re module to cut out just the part of the source we need, which makes extracting the image URLs later much easier:

def html_code(url, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    resp.encoding = 'utf-8'
    html = resp.text
    pattern = r'<html lang="en">(.*?)<nav class="paging">'
    string = html
    html_needing = re.findall(pattern, string, re.S)[0]
    return html_needing
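
A slightly more defensive variant of the same function, purely as a sketch: adding a timeout and a status check keeps a dead proxy from hanging the thread or silently handing us an error page. The regex and return value stay the same.

def html_code(url, headers, proxies):
    # timeout stops a dead proxy from blocking the thread forever
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    resp.raise_for_status()              # surface 4xx/5xx instead of parsing an error page
    resp.encoding = 'utf-8'
    pattern = r'<html lang="en">(.*?)<nav class="paging">'
    matches = re.findall(pattern, resp.text, re.S)
    if not matches:                      # layout changed or the request was blocked
        raise ValueError(f'no wallpaper block found in {url}')
    return matches[0]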

4. With that slice of the source in hand, we extract further, mainly the image URLs and the image names. Keep the match scope narrow so we do not pick up unrelated images:

def img_name_url(html_needing):
    string = html_needing
    pattern1 = '<img data-src="(.*?)" alt=".*?">'
    pattern2 = '<img data-src=".*?" alt="(.*?)">'
    img_url_lists = re.findall(pattern1, string, re.S)
    name_lists = re.findall(pattern2, string, re.S)
    name_lists = [i.replace('/', '20') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '101') for i in name_lists]
    return img_url_lists, name_lists


def button_name_url(html_needing):
    string = html_needing
    pattern1 = r'<button data-src="(.*?)" data-behaviour="WALLPAPER_DOWNLOAD" data-name=".*?【.*?】"'
    pattern2 = r'<button data-src=".*?" data-behaviour="WALLPAPER_DOWNLOAD" data-name="(.*?)【.*?】"'
    img_url_lists = re.findall(pattern1, string, re.S)
    name_lists = re.findall(pattern2, string, re.S)
    name_lists = [i.replace('/', '1') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '10') for i in name_lists]
    return img_url_lists, name_lists

While downloading, the names turned out to contain spaces, slashes and other problematic characters, so I just substituted them all at once and gave up on keeping the proper names. Two functions are defined because one handles the images shown on the page, while the other handles the images offered by the page's download button.
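
If you would rather keep the names readable instead of swapping characters for digits, a small sanitizer like the sketch below (standard library only, re is already imported above, untested against this site) strips the characters Windows forbids in file names and collapses whitespace; it could replace the chained replace() calls in both functions.

def safe_name(name, max_len=30):
    # drop characters Windows does not allow in file names
    name = re.sub(r'[\\/:*?"<>|]', '', name)
    # collapse runs of whitespace into single underscores
    name = re.sub(r'\s+', '_', name).strip('_')
    return name[:max_len] or 'unnamed'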

5. Once we have an image URL we need to download it, so define a function just for downloading:

def download(url, name, style, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    path = f'C:/Users/13255/Desktop/{style}/{name}.jpg'
    path = path.replace(' ', '88')
    with open(path, 'wb') as fh:
        print(f'正在下载:{path}')
        fh.write(resp.content)
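
As a hedged improvement (not what I actually ran), the variant below streams the file to disk, creates the target folder if it is missing, and skips an image on network errors instead of letting one bad proxy crash the whole thread. It assumes an extra import of os at the top.

import os

def download(url, name, style, headers, proxies):
    folder = f'C:/Users/13255/Desktop/{style}'
    os.makedirs(folder, exist_ok=True)            # create the folder on first use
    path = f'{folder}/{name}.jpg'.replace(' ', '88')
    try:
        resp = requests.get(url, headers=headers, proxies=proxies,
                            timeout=15, stream=True)
        resp.raise_for_status()
    except requests.RequestException as e:        # bad proxy, timeout, 4xx/5xx ...
        print(f'skip {url}: {e}')
        return
    print(f'正在下载:{path}')
    with open(path, 'wb') as fh:
        for chunk in resp.iter_content(chunk_size=8192):
            fh.write(chunk)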

6. To use multiple threads, we define a couple more functions to tie the ones above together:

def begin_spider1(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = img_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,   # requests expects lowercase scheme keys here
            'https': proxyn
        }
        download(x, y, '标清图片', headers=header, proxies=proxyx)


def begin_spider2(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = button_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,   # use the proxy pair picked just above, not the globals
            'https': proxyn
        }
        download(x, y, '高清图片', headers=header, proxies=proxyx)

7. To cope with possible light anti-scraping measures, I added a headers dict and proxies:

header = {
    '*****': '**********'
}
proxy_list_origin = ['*********', '**********', '**********',  '**********',  '**********']
proxy_list_http = [f'http://{i}:9999' for i in proxy_list_origin]
proxy_list_https = [f'https://{i}:9999' for i in proxy_list_origin]

I used free proxies, which have probably expired by now, so I will not publish them.
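
I will keep my actual header masked, but for context, what usually goes in a headers dict like this is simply a browser User-Agent, for example (illustrative value only, not the one I used):

header = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/120.0 Safari/537.36')   # illustrative example value
}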

8. With the preparation done, loop over the collected URLs and create a batch of thread objects to process them:

if __name__ == '__main__':
    url_list = url_nums(200)
    begin_spider1_thread_list = []
    begin_spider2_thread_list = []
    for i in url_list:
        proxy1 = random.choice(proxy_list_http)
        proxy2 = proxy_list_https[proxy_list_http.index(proxy1)]
        proxy = {
            'http': proxy1,   # requests expects lowercase scheme keys here
            'https': proxy2
        }
        begin_spider1_thread = threading.Thread(target=begin_spider1, args=(i, header, proxy))
        begin_spider1_thread_list.append(begin_spider1_thread)
        begin_spider2_thread = threading.Thread(target=begin_spider2, args=(i, header, proxy))
        begin_spider2_thread_list.append(begin_spider2_thread)

    for i in begin_spider1_thread_list:
        time.sleep(1)
        i.start()
    for i in begin_spider2_thread_list:
        time.sleep(1)
        i.start()
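
The loops above start one thread per page, which for 200 pages means close to 400 live threads. As a hedged alternative sketch, a ThreadPoolExecutor from the standard library caps the concurrency while keeping the same per-page logic, and it waits for everything to finish when the with block exits:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    url_list = url_nums(200)
    with ThreadPoolExecutor(max_workers=8) as pool:   # at most 8 pages in flight
        for i in url_list:
            proxy1 = random.choice(proxy_list_http)
            proxy2 = proxy_list_https[proxy_list_http.index(proxy1)]
            proxy = {'http': proxy1, 'https': proxy2}
            pool.submit(begin_spider1, i, header, proxy)
            pool.submit(begin_spider2, i, header, proxy)
            time.sleep(1)                              # stay gentle with the server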

9. The complete code:

import re
import requests
import threading
import random
import time

header = {
    '*****': '***************'
}
proxy_list_origin = ['**********', '**********', '**********', '**********', '**********']
proxy_list_http = [f'http://{i}:9999' for i in proxy_list_origin]
proxy_list_https = [f'https://{i}:9999' for i in proxy_list_origin]


def url_nums(num):
    url_list = [f'https://************/2/index_{i}.shtml' for i in range(2, num + 1)]
    return url_list


def html_code(url, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    resp.encoding = 'utf-8'
    html = resp.text
    pattern = r'<html lang="en">(.*?)<nav class="paging">'
    string = html
    html_needing = re.findall(pattern, string, re.S)[0]
    return html_needing


def img_name_url(html_needing):
    string = html_needing
    pattern1 = '<img data-src="(.*?)" alt=".*?">'
    pattern2 = '<img data-src=".*?" alt="(.*?)">'
    img_url_lists = re.findall(pattern1, string, re.S)
    name_lists = re.findall(pattern2, string, re.S)
    name_lists = [i.replace('/', '20') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '101') for i in name_lists]
    return img_url_lists, name_lists


def button_name_url(html_needing):
    string = html_needing
    pattern1 = r'<button data-src="(.*?)" data-behaviour="WALLPAPER_DOWNLOAD" data-name=".*?【.*?】"'
    pattern2 = r'<button data-src=".*?" data-behaviour="WALLPAPER_DOWNLOAD" data-name="(.*?)【.*?】"'
    img_url_lists = re.findall(pattern1, string, re.S)
    name_lists = re.findall(pattern2, string, re.S)
    name_lists = [i.replace('/', '1') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '10') for i in name_lists]
    return img_url_lists, name_lists


def download(url, name, style, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    path = f'C:/Users/user/Desktop/{style}/{name}.jpg'
    path = path.replace(' ', '88')
    with open(path, 'wb') as fh:
        print(f'正在下载:{path}')
        fh.write(resp.content)


def begin_spider1(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = img_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,   # requests expects lowercase scheme keys here
            'https': proxyn
        }
        download(x, y, '标清图片', headers=header, proxies=proxyx)


def begin_spider2(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = button_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,   # use the proxy pair picked just above, not the globals
            'https': proxyn
        }
        download(x, y, '高清图片', headers=header, proxies=proxyx)


if __name__ == '__main__':
    url_list = url_nums(200)
    begin_spider1_thread_list = []
    begin_spider2_thread_list = []
    for i in url_list:
        proxy1 = random.choice(proxy_list_http)
        proxy2 = proxy_list_https[proxy_list_http.index(proxy1)]
        proxy = {
            'http': proxy1,   # requests expects lowercase scheme keys here
            'https': proxy2
        }
        begin_spider1_thread = threading.Thread(target=begin_spider1, args=(i, header, proxy))
        begin_spider1_thread_list.append(begin_spider1_thread)
        begin_spider2_thread = threading.Thread(target=begin_spider2, args=(i, header, proxy))
        begin_spider2_thread_list.append(begin_spider2_thread)

    for i in begin_spider1_thread_list:
        time.sleep(1)
        i.start()
    for i in begin_spider2_thread_list:
        time.sleep(1)
        i.start()

10. The result:

11. For reference only. Running a crawler makes me nervous, and being new to this I am still learning the rules, so please bear with me -_-  -_-
