Folks, I'm too burned out to study any more, so I built an async crawler instead.
It should be a bit faster than the earlier synchronous crawler, and this time it can crawl every category on the site. Feels great!
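The core pattern is simple (here's a minimal sketch; the fetch/main names are mine, not part of the script below): instead of waiting for each response before sending the next request, aiohttp plus asyncio let the event loop overlap all the network waits.

import asyncio
import aiohttp

async def fetch(session, url):
    # the coroutine yields to the event loop while its request is in flight
    async with session.get(url) as resp:
        return await resp.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        # gather() schedules every fetch at once and returns the results in order
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# e.g. pages = asyncio.get_event_loop().run_until_complete(main(['http://mzsock.net']))

Now the real thing: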
import requests
import time
import aiohttp
import asyncio
from lxml import etree
import re, os
from fake_useragent import UserAgent
qaz = '爬爬爬'
if not os.path.exists('./%s' % qaz):
    os.mkdir('./%s' % qaz)
headers = {
    "User-Agent": str(UserAgent().random)  # random browser UA to disguise the request
}  # request headers
url = "http://mzsock.net"
res = requests.get(url=url, headers=headers).text
tree = etree.HTML(res)
li_list = tree.xpath('/html/body/header/div[2]/nav/ul/li')
http_class_list = []
for li in li_list:
    li = li.xpath('./a/@href')[0]
    if 'http://mzsock.net/' in li:
        http_class_list.append(li)
    else:
        continue
oso = {
    'mv': 'mianwa',
    'cy': 'chuanwa',
    'cwzp': 'zipai',
    'lz': 'lzu',
    'sw': 'swa',
    'fbx': 'fanbuxie'
}
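# oso maps each category slug in the URL path (e.g. 'mv') to the local folder name its images are saved under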
# For comparison, the synchronous version took about 1.7 s:
# start = time.time()
# for http in http_son:
#     rrr = requests.get(url=http, headers=headers).text
#     treee = etree.HTML(rrr)
#     lili = treee.xpath('/html/body/section/div[1]/ul/li')
#     for a in lili:
#         a = a.xpath('./div[1]/a/@href')[0]
#         print(a)
# print(time.time()-start)
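# the async version below makes the same kind of requests, but the event loop overlaps the waits instead of serializing them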
start = time.time()
class_http_list = []
async def qwe(url):  # collect the paginated URLs of every category
    async with aiohttp.ClientSession() as session:
        # placeholder credentials; proxy_auth only takes effect when a proxy= argument is passed as well
        proxy_auth = aiohttp.BasicAuth('user', 'pass')
        async with session.get(url, headers=headers, proxy_auth=proxy_auth) as response:
            page_text = await response.text()
            son_tree = etree.HTML(page_text)
            page = son_tree.xpath('/html/body/div[1]/div[1]/span[2]/em/text()')[0]  # item count shown on the page
            end_page = int(page) // 20 + 1
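            # assuming 20 items per page (which the // 20 implies), this yields the number of pages;
            # the +1 covers a partial last page but overshoots by one when the count divides evenly by 20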
            # this truncation is optional:
            # a = url.split('/')[3]
            # url = 'http://mzsock.net/' + a
            for i in range(1, end_page + 1):
                if i > 1:
                    url1 = url + 'page/' + str(i)
                    # print(url1)
                else:
                    url1 = url
                    # print(url1)
                if 'http://' in url1:
                    class_http_list.append(url1)
                else:
                    continue
tasks = []
for url in http_class_list:  # create the tasks
    c = qwe(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)
loop = asyncio.get_event_loop()  # get the event loop
loop.run_until_complete(asyncio.wait(tasks))  # run the tasks on the loop until they all finish
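# run_until_complete blocks here, so class_http_list is fully populated before the next stage begins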
# print(class_http_list)
# class_http_list now holds every category page URL; next, collect each gallery's own URL
http_son_list = []
pin_name_list = []
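# note: pin_name_list is declared but never filled; the titles come back from the xpath below but aren't stored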
async def ewq(url):  # collect each gallery's URL
    async with aiohttp.ClientSession() as session:
        proxy_auth = aiohttp.BasicAuth('user', 'pass')
        async with session.get(url, headers=headers, proxy_auth=proxy_auth) as response:
            page_text = await response.text()
            son_tree = etree.HTML(page_text)
            li_son_list = son_tree.xpath('/html/body/section/div[1]/ul/li')  # the gallery entries
            for li_son in li_son_list:
                li_son = li_son.xpath('./div[1]/a/@href|./h3/a/@title')
                if 'http://' in li_son[0]:
                    http_son_list.append(li_son[0])
                else:
                    continue
                # print(li_son[0])
tasks_1 = []
for url in class_http_list:  # create the tasks
    d = ewq(url)
    task1 = asyncio.ensure_future(d)
    tasks_1.append(task1)
loop1 = asyncio.get_event_loop()  # get the event loop
loop1.run_until_complete(asyncio.wait(tasks_1))  # run the tasks until they all finish
# drop the duplicate links
http_son_list = list(set(http_son_list))
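# set() also scrambles the order, which is fine here since every URL gets fetched regardless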
# print(http_son_list, '\n', len(http_son_list))  # 253 galleries in total
# http_son_list now holds every model's gallery URL
# request each gallery page and download its images
pic_pic = []
pic_page_list = []
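# pic_pic and pic_page_list are likewise unused leftovers; the images are written straight to disk instead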
async def wqe(url, semaphore):  # download every image in one gallery
    os_name = url.split('/')[3]
    # print(os_name)
    if not os.path.exists('./%s/%s' % (qaz, oso[os_name])):
        os.makedirs('./%s/%s' % (qaz, oso[os_name]))
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            proxy_auth = aiohttp.BasicAuth('user', 'pass')
            async with session.get(url, headers=headers, proxy_auth=proxy_auth) as response:
                page_text = await response.text()
                # print(response.status)  # check the status code; anything other than 200 means we didn't get in
                son_tree = etree.HTML(page_text)
                # pic_name = son_tree.xpath('/html/body/section/div/div[3]/p[1]/a/img/@alt')
                pic_name = son_tree.xpath('/html/body/section/div/h1/text()')[0].split('(')[0]
                pic_page = son_tree.xpath('/html/body/section/div/h1/span/text()')[0].split('/')[-1]
                # print(response.status)
                # print(pic_name, pic_page, url)
                ex = r'(.*?)\.html'
                url = re.findall(ex, url, re.S)[0]
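                # the gallery's pages follow the pattern <base>.html, <base>_2.html, <base>_3.html, ...
                # so strip the '.html' suffix here and rebuild each page URL in the loop below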
                for i in range(1, int(pic_page) + 1):
                    if i > 1:
                        url2 = url + '_' + str(i) + '.html'
                        # print(url2)
                    else:
                        url2 = url + '.html'
                        # print(url2)
                    # async with aiohttp.ClientSession() as session:
                    #     proxy_auth = aiohttp.BasicAuth('user', 'pass')
                    response1 = await session.get(url2, headers=headers, proxy_auth=proxy_auth)
                    page_text1 = await response1.text()
                    son_tree1 = etree.HTML(page_text1)
                    pic_list1 = son_tree1.xpath('//p[@class="img_jz"]')
                    # print(url2, son_tree1, pic_list1)
                    we = 0
                    for pic in pic_list1:
                        pic = pic.xpath('./a/img/@src')[0]
                        we += 1
                        if 'http://' in pic:
                            # pic_pic.append(pic)
                            # text() returns str, read() returns bytes, json() parses JSON
                            response2 = await session.get(pic, headers=headers, proxy_auth=proxy_auth)
                            img_bytes = await response2.read()
                            imgpath = './%s/%s/' % (qaz, oso[os_name]) + str(pic_name) + str(i) + '.' + str(we) + '.jpg'
                            with open(imgpath, 'wb') as fp:
                                fp.write(img_bytes)
                            print(imgpath, 'downloaded')
                        else:
                            continue
tasks_2 = []
sem = asyncio.Semaphore(20)  # cap the concurrency; don't set this too high or the site will block you
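# each wqe() call awaits the semaphore before opening its connections, so at most 20 galleries are in flight at once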
# text = []
# text.append(http_son_list[0])
# text.append(http_son_list[1])
for url in http_son_list:  # create the tasks
    e = wqe(url, sem)
    task2 = asyncio.ensure_future(e)
    tasks_2.append(task2)
loop2 = asyncio.get_event_loop()  # get the event loop
loop2.run_until_complete(asyncio.wait(tasks_2))  # run the tasks until they all finish
print(time.time() - start)