缘来
今天下班前,老板让我帮忙爬一个数据,简单看了下,需要登录,看起来应该不难。回到家,注册一个账号,复制url
,然后用postman转代码,简单暴力,直接撸。
这里说下postman的一个BUG,发送请求不会获得任何数据,如果你遇到,建议升级postman为最新版本
代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import pymongo
import requests
import traceback
from concurrent import futures
from urllib.request import urljoin
from scrapy.selector import Selector
logging.basicConfig(level=logging.INFO)

# MongoDB on localhost with the default port; database 'table', collection 'collection'.
client = pymongo.MongoClient()
coll = client['table']['collection']
# Unique index on 'url' deduplicates documents (save_mongo swallows the
# resulting DuplicateKeyError); run once before crawling — see the note at
# the end of the post.
# coll.create_index('url', unique=True)

# List-page URL template: fetch() calls url.format(page), so the real value
# must contain a '{}' placeholder for the page number.
url = "your urls"

# Browser-like request headers so the site serves normal HTML.
headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
}
def fetch(page, proxy=None):
    """Fetch one list page and store every detail record it links to.

    :param page: page number (string) substituted into the module-level
        ``url`` template via ``url.format(page)``.
    :param proxy: optional proxy URL used for both http and https.
        The original body referenced an undefined name ``proxy``, which
        raised NameError on every call and was silently swallowed by the
        broad except below — the crawl never actually ran.
    """
    try:
        # requests accepts proxies=None, meaning a direct connection.
        proxies = {'http': proxy, 'https': proxy} if proxy else None
        response = requests.get(url.format(page), headers=headers,
                                timeout=20, proxies=proxies)
        if response.status_code == 200:
            s = Selector(text=response.text)
            for row in s.xpath('//*[@id="search-results"]/tbody/tr'):
                href = row.xpath('td[4]/a/@href').extract_first()
                if not href:
                    # Row without a detail link — nothing to follow.
                    continue
                detail_url = urljoin(url, href)
                data = get_detail(detail_url)
                if data is None:
                    # Detail fetch failed; the original crashed here with
                    # TypeError on data['url'].
                    continue
                logging.info('success save data {} '.format(data['url']))
                save_mongo(data)
    except Exception:
        # Last-resort guard so one bad page doesn't kill the worker thread.
        logging.error(traceback.format_exc())
def get_detail(detail_url, proxy=None):
    """Fetch a molecule detail page and extract its fields.

    :param detail_url: absolute URL of the detail page.
    :param proxy: optional proxy URL (the original referenced an undefined
        ``proxy`` name — see fetch()).
    :return: dict with keys url / SMILES / img / formula / Mass, or None
        when the request fails or returns a non-200 status.
    """
    try:
        proxies = {'http': proxy, 'https': proxy} if proxy else None
        response = requests.get(detail_url, headers=headers,
                                timeout=20, proxies=proxies)
        if response.status_code != 200:
            return None
        s = Selector(text=response.text)
        img = s.xpath('//*[@id="molecule-image"]/img/@src').extract_first()
        return {
            'url': detail_url,
            # Original key was 'SMILES ' with a trailing space — normalized.
            'SMILES': s.xpath('//*[@id="smiles"]/text()').extract_first(),
            # urljoin raises on None, so guard a missing image source.
            'img': urljoin(detail_url, img) if img else None,
            'formula': s.xpath('//*[@id="name-structure"]/tbody/tr[2]/td[2]/text()').extract_first(),
            'Mass': s.xpath('//*[@id="name-structure"]/tbody/tr[3]/td[2]/text()').extract_first(),
        }
    except Exception:
        logging.error(traceback.format_exc())
        return None
def save_mongo(data):
    """Insert one document, silently skipping duplicates.

    Relies on the unique index on 'url' (see the commented-out
    create_index above) to raise DuplicateKeyError for repeats.
    Uses insert_one: Collection.insert was deprecated in pymongo 3.0
    and removed in 4.0.
    """
    try:
        coll.insert_one(data)
    except pymongo.errors.DuplicateKeyError:
        # Already stored — dedup by design, not an error.
        pass
if __name__ == '__main__':
    # Sequential alternative, handy for debugging:
    # for i in range(1, 11):
    #     fetch(str(i))
    # Thread pool suits this I/O-bound crawl (threads overlap network waits).
    with futures.ThreadPoolExecutor(max_workers=50) as executor:
        to_do = [executor.submit(fetch, str(i)) for i in range(1, 51)]
        # Original collected the futures but never inspected them; wait on
        # each so any error not caught inside fetch() is surfaced here.
        for future in futures.as_completed(to_do):
            future.result()
代码相当简单,fetch
函数用来抓取列表页,get_detail
函数抓取详情页,save_mongo
保存数据库,需要说明下的就是最后使用的多线程了,这里使用的是 futures
,并不想说很多大道理,来看看文档
提高速度
concurrent.futures 是python3新增加的一个库,用于并发处理,提供了多线程和多进程的并发功能
线程池
import concurrent.futures
import urllib.request
# Pages to fetch concurrently (example from the official concurrent.futures
# docs). The last entry is a made-up domain, so it exercises the exception
# branch in the driver loop below.
URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']
# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    """Fetch *url* and return the raw response body as bytes.

    Network failures propagate (urllib.error.URLError and friends);
    the driver below surfaces them through Future.result().
    """
    with urllib.request.urlopen(url, timeout=timeout) as conn:
        return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    # as_completed yields futures in completion order, not submission order.
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            # result() re-raises any exception raised inside load_url.
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
进程池
import concurrent.futures
import math
# Large candidates for primality testing — CPU-bound trial division,
# hence the process pool below.
# NOTE(review): 112272535095293 appears twice in this list.
PRIMES = [
    112272535095293,
    112582705942171,
    112272535095293,
    115280095190773,
    115797848077099,
    1099726899285419]
def is_prime(n):
    """Return True if *n* is prime, by trial division over odd factors.

    Fixes two edge-case bugs in the original: it returned False for 2
    (since 2 % 2 == 0) and True for n < 2 (the loop never ran).
    Uses math.isqrt for an exact integer square root instead of
    floor(sqrt(n)), which can be off for very large ints.
    """
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # Only odd divisors up to and including isqrt(n) need checking.
    for i in range(3, math.isqrt(n) + 1, 2):
        if n % i == 0:
            return False
    return True
def main():
    """Run is_prime over PRIMES in a process pool and print each verdict."""
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # map preserves input order, so verdicts line up with PRIMES.
        verdicts = executor.map(is_prime, PRIMES)
        for candidate, verdict in zip(PRIMES, verdicts):
            print('%d is prime: %s' % (candidate, verdict))


if __name__ == '__main__':
    main()
关于入库方面,建议是增加唯一索引, coll.create_index('url', unique=True)
,一个是去重,一个是提高查询速度。