多进程 多线程 异步 爬虫(1)

**多进程 多线程 异步 爬虫
忽略爬虫的具体规则、策略、cookie 登录等细节,专注于高性能、高并发。**

初步

爬取煎蛋网图片,并将图片链接存入 MongoDB。

#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
import json
import functools
import requests
import urllib.request
from urllib.request import FancyURLopener
import urllib.parse
import urllib.error
from lxml import etree
import time
from pymongo import MongoClient

from concurrent.futures import ProcessPoolExecutor, as_completed,ThreadPoolExecutor
import asyncio
import aiohttp

# HTTP header mapping: a browser-style Accept value plus a desktop Chrome 58
# (Windows 10) User-Agent string. Where it is sent is outside the visible code.
TsHeader = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

# Relative, backslash-separated directory path (raw string so the backslash is
# literal). Presumably the image download directory -- TODO confirm with users.
PATH = r'..\Spider_PoolExecutor'
# Copied into JDBase._chunk_size; presumably the number of bytes handled per
# read/write while streaming a download -- TODO confirm.
CHUNK_SIZE = 1024

# NOTE(review): the "P_" prefix presumably means process-pool worker counts
# (page fetching / image saving) -- verify against the executor setup.
P_MAX_WORKERS_PAGE = 3
P_MAX_WORKERS_IMGSAVE = 10

# NOTE(review): the "T_" prefix presumably means thread-pool worker counts
# (page fetching / image saving) -- verify against the executor setup.
T_MAX_WORKERS_PAGE = 3
T_MAX_WORKERS_IMGSAVE = 10

# NOTE(review): presumably durations in seconds used with a lock and an event;
# their use is not visible here -- confirm.
LOCK_TIME = 0.2
EVENT_TIME = 0.2

# NOTE(review): presumably the semaphore count limiting concurrent tasks -- confirm.
SEMA_NUM = 3

class DataBase():
    """Thin holder for a MongoDB database handle.

    Attributes are documented here rather than inline:
        _client: the ``MongoClient`` instance, kept so that the connection
            can be closed by whoever owns this object.
        _db: the database handle obtained from the client.
    """

    def __init__(self, host='127.0.0.1', port=27017, db_name='jandan'):
        """Create the client and select the database.

        Args:
            host: MongoDB server host; defaults to the local machine.
            port: MongoDB server port; defaults to 27017.
            db_name: name of the database to select; defaults to 'jandan'.

        Calling ``DataBase()`` with no arguments behaves as before.
        """
        # Keep a reference to the client instead of discarding it, so the
        # connection can be closed explicitly later.
        self._client = MongoClient(host, port)
        self._db = self._client[db_name]

class JDBase():
    def __init__(self):
        """Set the chunk size and attach a MongoDB database handle.

        ``self._chunk_size`` is taken from the module-level ``CHUNK_SIZE``.
        ``self._db`` is the ``_db`` attribute of a new ``DataBase``; if
        creating it raises, a message is printed and ``self._db`` is ``None``.
        """
        self._chunk_size = CHUNK_SIZE
        try:
            self._db = DataBase()._db
        except Exception:
            # Catch Exception rather than everything, so that Ctrl-C
            # (KeyboardInterrupt) and SystemExit still propagate.
            # NOTE(review): recent PyMongo versions connect lazily, so an
            # unreachable server may not raise here -- confirm whether an
            # explicit ping is wanted.
            print('数据库连接失败')
            self._db = None

    def insert_header(self,header, img_url, call_url):
        """Add the request headers for an image download to *header*.

        Args:
            header: object exposing ``addheader(name, value)``
                (presumably a ``FancyURLopener`` -- confirm against callers).
            img_url: URL of the image; supplies the ``Host`` value and
                selects the ``Accept`` value ('.gif' URLs get an HTML-style
                Accept, everything else ``*/*``).
            call_url: URL of the referring page; sent as ``Referer`` with
                any ``#fragment`` removed.

        Returns:
            The same *header* object, after the headers have been added.
        """
        # Cut at the '#' itself instead of assuming the fragment is exactly
        # 9 characters long (which only held for '#comments').
        referer = call_url
        hash_pos = call_url.find('#')
        if hash_pos > 0:
            referer = call_url[:hash_pos]

        # Parse the host rather than slicing fixed offsets: the slice was
        # only right for 'http://' followed by a 14-character host. Fall
        # back to the slice when the URL has no network location.
        host = urllib.parse.urlsplit(img_url).netloc or img_url[7:21]

        if '.gif' in img_url:
            accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        else:
            accept = "*/*"

        # Headers are added in a fixed order.
        ordered_headers = (
            ("Host", host),
            ("User-Agent",
             "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"),
            ("Accept", accept),
            ("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"),
            ("Accept-Encoding", "gzip,deflate"),
            ("Referer", referer),
            ("Connection", "keep-alive"),
            ("Upgrade-Insecure-Requests", "1"),
        )
        for name, value in ordered_headers:
            header.addheader(name, value)
        return header

    def 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值