**多进程、多线程、异步爬虫
忽略爬虫的具体规则、策略、cookie 登录等，专注于高性能、高并发。**
初步
爬取煎蛋图片,存图片链接到mongodb
#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
import json
import functools
import requests
import urllib.request
from urllib.request import FancyURLopener
import urllib.parse
import urllib.error
from lxml import etree
import time
from pymongo import MongoClient
from concurrent.futures import ProcessPoolExecutor, as_completed,ThreadPoolExecutor
import asyncio
import aiohttp
# Default browser-like request headers used when fetching listing pages.
TsHeader = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# Local directory where downloaded images are saved (Windows-style relative path).
PATH = r'..\Spider_PoolExecutor'
# Bytes per chunk when streaming image downloads (see _chunk_size in JDBase).
CHUNK_SIZE = 1024
# Process-pool worker counts: page crawling vs. image saving.
P_MAX_WORKERS_PAGE = 3
P_MAX_WORKERS_IMGSAVE = 10
# Thread-pool worker counts: page crawling vs. image saving.
T_MAX_WORKERS_PAGE = 3
T_MAX_WORKERS_IMGSAVE = 10
# NOTE(review): presumably sleep/wait intervals in seconds for lock- and
# event-based coordination — confirm at their use sites (not visible here).
LOCK_TIME = 0.2
EVENT_TIME = 0.2
# NOTE(review): looks like the concurrency limit for an asyncio.Semaphore —
# confirm where the semaphore is created (not visible in this chunk).
SEMA_NUM = 3
class DataBase:
    """Thin wrapper holding a MongoDB connection to the ``jandan`` database.

    Fixes: the original kept the ``MongoClient`` in a local variable, leaking
    the connection handle (it could never be closed or reused).  The client is
    now stored on the instance, and the address is parameterized with defaults
    matching the original hard-coded values, so ``DataBase()`` behaves exactly
    as before.
    """

    def __init__(self, host='127.0.0.1', port=27017):
        # Keep the client on the instance so callers can close/reuse it.
        self._client = MongoClient(host, port)
        # Database handle consumed by JDBase; no network I/O happens until a
        # real operation is issued against it.
        self._db = self._client['jandan']
class JDBase():
def __init__(self):
self._chunk_size = CHUNK_SIZE
try:
db = DataBase()
self._db = db._db
except:
print('数据库连接失败')
self._db = None
def insert_header(self,header, img_url, call_url):
referer = call_url
if call_url.find('#') > 0:
referer = call_url[:len(call_url) - 9]
host = img_url[7:21]
header.addheader("Host", host)
header.addheader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
if img_url.find('.gif') > 0:
header.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
else:
header.addheader("Accept", "*/*")
header.addheader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
header.addheader("Accept-Encoding", "gzip,deflate")
header.addheader("Referer", referer)
header.addheader("Connection", "keep-alive")
header.addheader("Upgrade-Insecure-Requests", "1")
return header
def