各位小伙伴们,大家好呀!今天教大家如何用 Python 多线程爬取高清壁纸。
本人比较喜欢收集壁纸,发现娟娟壁纸动漫分类下的壁纸我都很喜欢,于是写了个爬虫。后来发现整个网站的网页结构基本一致,于是加了点代码,把整个网站的高清壁纸都爬下来了。
import requests
import threading
from lxml import etree
import re
import os
# Root directory for downloaded albums. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognized escape sequences
# (which newer Python versions warn about); the resulting value is unchanged.
DOWN_PATH = r'G:\爬虫下载\娟娟壁纸'
# Request headers passed to every requests.get call in this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
}
# Download helper
def download(url, name, i):
    """Fetch one image and store it as ``<i>.jpg`` in the album folder.

    The album folder is the sub-directory ``name`` of ``DOWN_PATH`` and is
    created on demand.

    Args:
        url: Address of the image to fetch.
        name: Album name, used as the folder name.
        i: Index of the picture within the album, used as the file name.
    """
    path = os.path.join(DOWN_PATH, name)
    # exist_ok=True replaces the exists()/makedirs() pair: this function is
    # called from several threads, and the check-then-create sequence could
    # raise FileExistsError when two threads reach it at the same time.
    os.makedirs(path, exist_ok=True)
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        # Do not save an error page under a .jpg name.
        print("下载失败{}{}.jpg(状态码{})".format(name, i, r.status_code))
        return
    # No lock is taken around the write.
    with open(os.path.join(path, '{}.jpg'.format(i)), 'wb') as f:
        f.write(r.content)
    print("下载完成{}{}.jpg".format(name, i))
# Album detail page logic
def detail(link):
    """Download every picture listed on one album detail page.

    The page is requested and decoded as GBK. The album name is taken from
    the title text in front of the ``(current/total)`` counter, and each
    picture found under ``ul#showImg`` is passed to ``download`` with a
    1-based index. Pages whose title or counter cannot be found are skipped
    with a message instead of raising.

    Args:
        link: URL of the album detail page.
    """
    # Request the detail page
    r = requests.get(link, headers=headers)
    r.encoding = 'gbk'
    # Parse the HTML
    tree = etree.HTML(r.text)
    # Title text holding the album name and the picture counter
    titles = tree.xpath('//div[@class="wzfz tu-tit fix"]/h1/span/text()')
    if not titles:
        print("跳过{}:未找到套图标题".format(link))
        return
    # Raw string avoids invalid-escape warnings; \d+ accepts counters with
    # more than one digit. Only the name is captured because the total count
    # was never used.
    matched = re.search(r'(.*?)\(\d+/\d+', titles[0])
    if matched is None:
        print("跳过{}:无法解析套图名称".format(link))
        return
    name = matched.group(1)
    # The first picture is a bare <img>; the remaining ones are wrapped in
    # <a>. Slicing with [:1] keeps the first picture in front without
    # failing when that node is absent.
    first_pic = tree.xpath('//ul[@id="showImg"]/li/img/@src')[:1]
    pic_list = first_pic + tree.xpath('//ul[@id="showImg"]/li/a/img/@src')
    for i, pic_url in enumerate(pic_list, 1):
        download(pic_url, name, i)
# Listing page logic
def main(base_url):
    """Visit every album linked from one listing page.

    Args:
        base_url: URL of the listing page to request.
    """
    # Request the listing page and decode it as GBK
    response = requests.get(base_url, headers=headers)
    response.encoding = 'gbk'
    # Parse the HTML and collect the site-relative album links
    page = etree.HTML(response.text)
    album_hrefs = page.xpath('//ul[@class="picbz"]/li/a[1]/@href')
    for href in album_hrefs:
        # The hrefs are prefixed with the site origin before being followed
        detail('http://www.jj20.com' + href)
def run(start, over):
    """Process listing pages numbered from ``start`` up to, not including, ``over``.

    Args:
        start: First page number to process.
        over: Page number at which to stop (exclusive).
    """
    # 39 pages in total
    page_urls = (
        'http://www.jj20.com/bz/ktmh/list_16_{}.html'.format(page_no)
        for page_no in range(start, over)
    )
    for page_url in page_urls:
        main(page_url)
# Program entry point
if __name__ == '__main__':
    # One (start, over) page range per worker thread
    page_ranges = [(1, 11), (11, 21), (21, 31), (31, 40)]
    # Create all threads first
    workers = [
        threading.Thread(target=run, args=bounds)
        for bounds in page_ranges
    ]
    # Then start them in order
    for worker in workers:
        worker.start()
总结:以上示例用多个线程执行同一个函数,每个线程负责不同的页码范围;下一节将会引入队列来解决多线程的问题。
好了,以上就是本文的全部内容,希望对大家的学习有所帮助