Since I have only just started learning web scraping and am not yet familiar with the legal requirements, the analysis below is deliberately kept brief: I only sketch the general approach rather than going into depth. Apologies in advance.
1. First, import the required modules. requests is essential for fetching pages; re is the regex module (regex is the only extraction technique I know so far, sorry); threading creates multiple threads to speed up the downloads; random is used to pick a free proxy at random; and time is used for sleeping, both so the server does not block us for requesting too fast and to put less load on it.
import re
import requests
import threading
import random
import time
2. To batch-download images, we first need the URLs of the pages that contain them, so define a function that generates the page URLs in bulk. Work out the site's pagination pattern first; I am worried about legal issues, so the actual URL is masked.
def url_nums(num):
    url_list = [f'https://***********.com/2/index_{i}.shtml' for i in range(2, num + 1)]
    return url_list
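As a quick sanity check (illustrative only, since the domain is masked), url_nums(4) yields the URLs for pages 2 through 4:

print(url_nums(4))
# ['https://***********.com/2/index_2.shtml',
#  'https://***********.com/2/index_3.shtml',
#  'https://***********.com/2/index_4.shtml']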
3. Once we have the page URLs, fetch each page's HTML and use the re module to cut out just the part of the source we need, which makes extracting the image URLs later much easier.
def html_code(url, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    resp.encoding = 'utf-8'
    html = resp.text
    # keep only the block between the opening tag and the paging nav,
    # which is where all the image markup lives
    pattern = r'<html lang="en">(.*?)<nav class="paging">'
    html_needing = re.findall(pattern, html, re.S)[0]
    return html_needing
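One caveat: re.findall(...)[0] raises an IndexError whenever the page does not match the pattern (for example, when a dead proxy returns an error page). A minimal defensive sketch (html_code_safe is my own hypothetical name, not part of the original script) uses re.search and checks for None:

def html_code_safe(url, headers, proxies):
    # assumption: a 10-second timeout is acceptable; tune as needed
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    resp.encoding = 'utf-8'
    match = re.search(r'<html lang="en">(.*?)<nav class="paging">', resp.text, re.S)
    if match is None:
        return ''  # the caller gets an empty string instead of a crash
    return match.group(1)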
4. With the trimmed source in hand, we extract further: mainly the image URLs and the image names. Keep the patterns as narrow as possible so we do not download unrelated images.
def img_name_url(html_needing):
    pattern1 = '<img data-src="(.*?)" alt=".*?">'
    pattern2 = '<img data-src=".*?" alt="(.*?)">'
    img_url_lists = re.findall(pattern1, html_needing, re.S)
    name_lists = re.findall(pattern2, html_needing, re.S)
    # crude cleanup: swap problem characters for digits and truncate to 7 chars
    name_lists = [i.replace('/', '20') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '101') for i in name_lists]
    return img_url_lists, name_lists
def button_name_url(html_needing):
    pattern1 = r'<button data-src="(.*?)" data-behaviour="WALLPAPER_DOWNLOAD" data-name=".*?【.*?】"'
    pattern2 = r'<button data-src=".*?" data-behaviour="WALLPAPER_DOWNLOAD" data-name="(.*?)【.*?】"'
    img_url_lists = re.findall(pattern1, html_needing, re.S)
    name_lists = re.findall(pattern2, html_needing, re.S)
    # same crude cleanup as above, with different digit substitutions
    name_lists = [i.replace('/', '1') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '10') for i in name_lists]
    return img_url_lists, name_lists
The names I got while downloading were full of problems (spaces, slashes, and so on), so I just swapped the offending characters out in one go and gave up on keeping proper names; sorry about that. There are two functions because one extracts the standard-definition images displayed on the page and the other extracts the images behind the download button. A cleaner way to sanitize names is sketched below.
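If you would rather keep readable names, a single re.sub over the characters Windows forbids in file names usually does the job. This is only a sketch; sanitize_name is a hypothetical helper that is not part of the original script:

def sanitize_name(name, max_len=50):
    # assumption: '_' is an acceptable stand-in for forbidden characters
    name = re.sub(r'[\\/:*?"<>|\s]+', '_', name)  # characters Windows rejects, plus whitespace
    return name[:max_len].strip('_')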
5. Once we have an image URL we need to download it, so define a function dedicated to downloading:
def download(url, name, style, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    path = f'C:/Users/13255/Desktop/{style}/{name}.jpg'
    path = path.replace(' ', '88')  # spaces in names also caused trouble
    with open(path, 'wb') as fh:
        print(f'Downloading: {path}')
        fh.write(resp.content)
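Two fragile spots here: open() fails if the target folder does not exist, and a dead free proxy can make requests.get hang for a long time. A slightly hardened variant, purely a sketch (download_safe, the timeout value, and the error handling are my assumptions, not the original code):

import os

def download_safe(url, name, style, headers, proxies):
    folder = f'C:/Users/13255/Desktop/{style}'
    os.makedirs(folder, exist_ok=True)  # create the folder on first use
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    except requests.RequestException as e:
        print(f'skipping {url}: {e}')  # dead proxy, timeout, DNS failure, ...
        return
    if resp.status_code == 200:
        with open(f'{folder}/{name}.jpg', 'wb') as fh:
            fh.write(resp.content)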
6. To use multithreading, we define a couple more functions that tie the functions above together.
def begin_spider1(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = img_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        # pick a fresh proxy for each image; requests expects lowercase scheme keys
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,
            'https': proxyn
        }
        download(x, y, '标清图片', headers=headers, proxies=proxyx)  # '标清图片' = standard-definition folder
def begin_spider2(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = button_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,
            'https': proxyn
        }
        download(x, y, '高清图片', headers=headers, proxies=proxyx)  # '高清图片' = high-definition folder
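Both spider functions repeat the same proxy-picking dance; a small helper (pick_proxy is my own hypothetical refactor, not in the original) would keep it in one place:

def pick_proxy():
    # choose an http proxy and its matching https twin from the global lists
    p = random.choice(proxy_list_http)
    return {
        'http': p,
        'https': proxy_list_https[proxy_list_http.index(p)],
    }

With that, both loops reduce to download(x, y, ..., headers=headers, proxies=pick_proxy()).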
7. To cope with whatever light anti-scraping measures the site might have, I added a headers dict and proxies:
header = {
    '*****': '**********'
}
proxy_list_origin = ['*********', '**********', '**********', '**********', '**********']
proxy_list_http = [f'http://{i}:9999' for i in proxy_list_origin]
proxy_list_https = [f'https://{i}:9999' for i in proxy_list_origin]
I used free proxies, which have probably expired by now, so I would rather not publish them. A quick way to filter out dead ones is sketched below.
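Free proxies die quickly, so it can pay to screen the list once at startup. A rough liveness check, only a sketch (alive_proxies is hypothetical, and http://httpbin.org/ip is just an arbitrary test endpoint):

def alive_proxies(candidates):
    good = []
    for p in candidates:
        try:
            # any cheap page works; 5 seconds is an arbitrary cutoff
            requests.get('http://httpbin.org/ip', proxies={'http': p}, timeout=5)
            good.append(p)
        except requests.RequestException:
            pass
    return good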
8. With the preparation done, loop over the URLs we generated and create a batch of thread objects to process them:
if __name__ == '__main__':
    url_list = url_nums(200)
    begin_spider1_thread_list = []
    begin_spider2_thread_list = []
    for i in url_list:
        proxy1 = random.choice(proxy_list_http)
        proxy2 = proxy_list_https[proxy_list_http.index(proxy1)]
        proxy = {
            'http': proxy1,
            'https': proxy2
        }
        begin_spider1_thread = threading.Thread(target=begin_spider1, args=(i, header, proxy))
        begin_spider1_thread_list.append(begin_spider1_thread)
        begin_spider2_thread = threading.Thread(target=begin_spider2, args=(i, header, proxy))
        begin_spider2_thread_list.append(begin_spider2_thread)
    for i in begin_spider1_thread_list:
        time.sleep(1)  # stagger thread starts to go easy on the server
        i.start()
    for i in begin_spider2_thread_list:
        time.sleep(1)
        i.start()
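Starting 200 threads per spider works, but if you want to cap how many pages are in flight at once, concurrent.futures.ThreadPoolExecutor is a common alternative. This is a sketch of the idea, not the original code (it reuses the hypothetical pick_proxy helper from step 6):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=8) as pool:  # at most 8 pages at a time
    for u in url_nums(200):
        pool.submit(begin_spider1, u, header, pick_proxy())
        pool.submit(begin_spider2, u, header, pick_proxy())
# the with-block waits for every submitted task to finish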
9. The complete code:
import re
import requests
import threading
import random
import time
header = {
    '*****': '***************'
}
proxy_list_origin = ['**********', '**********', '**********', '**********', '**********']
proxy_list_http = [f'http://{i}:9999' for i in proxy_list_origin]
proxy_list_https = [f'https://{i}:9999' for i in proxy_list_origin]

def url_nums(num):
    url_list = [f'https://************/2/index_{i}.shtml' for i in range(2, num + 1)]
    return url_list

def html_code(url, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    resp.encoding = 'utf-8'
    html = resp.text
    pattern = r'<html lang="en">(.*?)<nav class="paging">'
    html_needing = re.findall(pattern, html, re.S)[0]
    return html_needing

def img_name_url(html_needing):
    pattern1 = '<img data-src="(.*?)" alt=".*?">'
    pattern2 = '<img data-src=".*?" alt="(.*?)">'
    img_url_lists = re.findall(pattern1, html_needing, re.S)
    name_lists = re.findall(pattern2, html_needing, re.S)
    name_lists = [i.replace('/', '20') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '101') for i in name_lists]
    return img_url_lists, name_lists

def button_name_url(html_needing):
    pattern1 = r'<button data-src="(.*?)" data-behaviour="WALLPAPER_DOWNLOAD" data-name=".*?【.*?】"'
    pattern2 = r'<button data-src=".*?" data-behaviour="WALLPAPER_DOWNLOAD" data-name="(.*?)【.*?】"'
    img_url_lists = re.findall(pattern1, html_needing, re.S)
    name_lists = re.findall(pattern2, html_needing, re.S)
    name_lists = [i.replace('/', '1') for i in name_lists]
    name_lists = [i[0:7] for i in name_lists]
    name_lists = [i.replace('"', '10') for i in name_lists]
    return img_url_lists, name_lists

def download(url, name, style, headers, proxies):
    resp = requests.get(url, headers=headers, proxies=proxies)
    path = f'C:/Users/user/Desktop/{style}/{name}.jpg'
    path = path.replace(' ', '88')
    with open(path, 'wb') as fh:
        print(f'Downloading: {path}')
        fh.write(resp.content)

def begin_spider1(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = img_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,
            'https': proxyn
        }
        download(x, y, '标清图片', headers=headers, proxies=proxyx)

def begin_spider2(url, headers, proxies):
    html_needing = html_code(url, headers=headers, proxies=proxies)
    img_list, name_list = button_name_url(html_needing)
    for x, y in zip(img_list, name_list):
        proxym = random.choice(proxy_list_http)
        proxyn = proxy_list_https[proxy_list_http.index(proxym)]
        proxyx = {
            'http': proxym,
            'https': proxyn
        }
        download(x, y, '高清图片', headers=headers, proxies=proxyx)

if __name__ == '__main__':
    url_list = url_nums(200)
    begin_spider1_thread_list = []
    begin_spider2_thread_list = []
    for i in url_list:
        proxy1 = random.choice(proxy_list_http)
        proxy2 = proxy_list_https[proxy_list_http.index(proxy1)]
        proxy = {
            'http': proxy1,
            'https': proxy2
        }
        begin_spider1_thread = threading.Thread(target=begin_spider1, args=(i, header, proxy))
        begin_spider1_thread_list.append(begin_spider1_thread)
        begin_spider2_thread = threading.Thread(target=begin_spider2, args=(i, header, proxy))
        begin_spider2_thread_list.append(begin_spider2_thread)
    for i in begin_spider1_thread_list:
        time.sleep(1)
        i.start()
    for i in begin_spider2_thread_list:
        time.sleep(1)
        i.start()
10. The result looks like this:
11. For reference only. Using a scraper still makes me nervous; I am just starting out and do not know all the rules yet, so please bear with me -_- -_-