# Python Crawler Diary 9: comparing three scraping approaches (plain requests,
# multiprocessing Pool, asyncio + aiohttp) on the Wandoujia Design Award pages.

__author__ = '布咯咯_rieuse'

import asyncio

import random

import time

import aiohttp

import pymongo

import requests

import multiprocessing

from bs4 import BeautifulSoup

# Shared setup used by all three scraping methods below.

# MongoDB connection (localhost, default port); documents go to wandoujia.info.
# NOTE(review): MongoClient connects lazily, so this line succeeds even with no
# server running — connection failures surface on the first insert.
clients = pymongo.MongoClient('localhost')
db = clients["wandoujia"]
col = db["info"]

# The 45 paginated listing URLs of the Wandoujia Design Award.
urls = ['http://www.wandoujia.com/award?page={}'.format(num) for num in range(1, 46)]

# Pool of desktop-browser User-Agent strings to pick from, to look less like a bot.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

# Request headers shared by method_1 and method_2 (method_3 does not send them).
# NOTE(review): random.choice runs once at import time, so one User-Agent is
# reused for the entire run — move the choice into the request loop if a
# per-request rotation is actually intended.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'www.wandoujia.com',
    'User-Agent': random.choice(UA_LIST)
}

# Hard-coded HTTP proxy used by method_1 and method_2.
# NOTE(review): likely long dead — verify before relying on it.
proxies = {
    'http': 'http://123.206.6.17:3128',
    'https': 'http://123.206.6.17:3128'
}

# Method 1: plain sequential requests.
def method_1():
    """Scrape all 45 award pages one after another with requests.

    For every app entry found on a page, insert a document holding the
    award title, app name and the two image URLs into the MongoDB
    collection ``col``, then print the total elapsed time.

    Side effects: network requests (via the module-level ``proxies``),
    MongoDB writes, console output. Returns None.
    """
    start = time.time()
    for url in urls:
        html = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(html, 'lxml')
        # These four node lists line up one-to-one per app entry on the page.
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                # Lazily-loaded images keep the real URL in `data-original`.
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            # Collection.insert() was deprecated in pymongo 3.0; insert_one()
            # is the supported single-document API with the same effect here.
            col.insert_one(content)
            print('成功插入一组数据' + str(content))
    print('一共用时:' + str(time.time() - start))

# if __name__ == '__main__':

# method_1()

# Method 2: requests + multiprocessing Pool.
# Designed to be fanned out over `urls` with multiprocessing.Pool.map
# (see the commented-out driver kept below this definition).
def method_2(url):
    """Scrape one award page and insert each app entry into MongoDB.

    Parameters:
        url: a single award listing page URL (one element of ``urls``).

    Side effects: one HTTP request (via the module-level ``proxies``),
    MongoDB writes, console output. Returns None.
    """
    html = requests.get(url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    # These four node lists line up one-to-one per app entry on the page.
    title = soup.find_all(class_='title')
    app_title = soup.find_all(class_='app-title')
    item_cover = soup.find_all(class_='item-cover')
    icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
    for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
        content = {
            'title': title_i.get_text(),
            'app_title': app_title_i.get_text(),
            # Lazily-loaded images keep the real URL in `data-original`.
            'item_cover': item_cover_i['data-original'],
            'icon_cover': icon_cover_i['data-original']
        }
        # time.sleep(1)  # optional throttle, kept from the original
        # Collection.insert() was deprecated in pymongo 3.0; insert_one()
        # is the supported single-document API with the same effect here.
        col.insert_one(content)
        print('成功插入一组数据' + str(content))

# if __name__ == '__main__':

# start = time.time()

# pool = multiprocessing.Pool(4)

# pool.map(method_2, urls)

# pool.close()

# pool.join()

# print('一共用时:' + str(time.time() - start))

# Method 3: asyncio + aiohttp — the async I/O modules added after Python 3.4
# (the async/await syntax used here needs 3.5+).
def method_3():
    """Scrape all pages concurrently with asyncio + aiohttp and time the run.

    Schedules one ``parser`` coroutine per URL, runs them together on the
    event loop, and prints the elapsed time. Side effects: network requests
    (no custom headers/proxy, unlike methods 1 and 2), MongoDB writes,
    console output. Returns None.
    """

    async def get_url(url):
        """Fetch one page and return its decoded body text."""
        # `async def` declares a coroutine function; calling it returns a
        # coroutine object that the event loop drives.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                # `await` suspends this coroutine until the async I/O is done.
                response = await html.text(encoding="utf-8")
                return response

    async def parser(url):
        """Fetch, parse, and store every app entry of one page."""
        html = await get_url(url)
        # NOTE(review): BeautifulSoup parsing and the pymongo insert below are
        # blocking calls running on the event loop — acceptable for 45 pages,
        # but they serialize the CPU-bound part of the work.
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                # Lazily-loaded images keep the real URL in `data-original`.
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            # Collection.insert() was deprecated in pymongo 3.0; insert_one()
            # is the supported single-document API with the same effect here.
            col.insert_one(content)
            print('成功插入一组数据' + str(content))

    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [parser(url) for url in urls]
    # gather() runs all page coroutines concurrently on one event loop.
    loop.run_until_complete(asyncio.gather(*tasks))
    print(time.time() - start)

if __name__ == '__main__':
    # Only the asyncio variant runs by default; the drivers for methods 1
    # and 2 are kept commented out above.
    method_3()
