爬虫怕封IP,程序员教你打造个人IP池。

搭建一套稳定的代理池服务,为上千个爬虫提供有效的代理,保证各个爬虫拿到的都是对应网站有效的代理IP,从而保证爬虫快速稳定地运行。无私分享全套Python爬虫干货,如果你也想学习Python,@ 私信小编获取。

如何保证代理质量?

可以肯定,免费的代理IP大部分都是不能用的,不然别人为什么还提供付费的。所以采集回来的代理IP不能直接使用,可以写检测程序,不断地用这些代理去访问一个稳定的网站,看是否可以正常使用。

获取模块

import requests
import chardet
import traceback
from lxml import etree
class Downloader(object):
    """Fetch a proxy-listing page and extract ``ip:port`` entries from it.

    Args:
        timeout: Seconds to wait for the listing site before giving up.
            Without a timeout a stalled server would block the crawler
            indefinitely.
    """

    def __init__(self, timeout=10):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        self.timeout = timeout

    def download(self, url):
        """Download ``url`` and return the proxies parsed from it.

        Args:
            url: Address of a proxy-listing page.

        Returns:
            The list produced by :meth:`xpath_parse`, or ``None`` when the
            request fails or the server answers with a non-200 status.
            Failures are printed together with a traceback, never raised.
        """
        print('正在下载页面:{}'.format(url))
        try:
            resp = requests.get(url, headers=self.headers, timeout=self.timeout)
            if resp.status_code != 200:
                raise ConnectionError(
                    'unexpected HTTP status {}'.format(resp.status_code))
            # Let chardet decide the encoding from the raw bytes instead of
            # relying on the HTTP headers.
            resp.encoding = chardet.detect(resp.content)['encoding']
            return self.xpath_parse(resp.text)
        except Exception:
            print('下载页面出错:{}'.format(url))
            traceback.print_exc()
            return None

    def xpath_parse(self, resp):
        """Extract proxies from the HTML text of a listing page.

        Args:
            resp: HTML source of the page as a string.

        Returns:
            A list of ``{'proxy': 'ip:port'}`` dicts, one per table row that
            has both an IP cell and a port cell, or ``None`` if the document
            cannot be parsed at all (the error is printed).
        """
        try:
            page = etree.HTML(resp)
            rows = page.xpath('//div[@id="list"]/table/tbody/tr')
            proxy_list = []
            for row in rows:
                ip_cells = row.xpath('./td[1]/text()')
                port_cells = row.xpath('./td[2]/text()')
                # Skip malformed rows instead of letting one IndexError
                # throw away every proxy collected so far.
                if not ip_cells or not port_cells:
                    continue
                ip = ip_cells[0].strip()
                port = port_cells[0].strip()
                if not ip or not port:
                    continue
                proxy_list.append({'proxy': ip + ':' + port})
            return proxy_list
        except Exception:
            print('解析IP地址出错')
            traceback.print_exc()
            return None
if __name__ == '__main__':
    # Manual smoke run: fetch the first listing page and show what was parsed.
    downloader = Downloader()
    parsed = downloader.download('https://www.kuaidaili.com/free/inha/1/')
    print(parsed)

存储模块

import pymongo
from pymongo.errors import DuplicateKeyError
class MongoDB(object):
    """Store for proxies, kept in the ``proxies`` collection of the
    ``proxypool3`` database.

    Each document has a ``proxy`` field (``'ip:port'``) covered by a unique
    index; ``get`` sorts on a ``delay`` field.
    """

    def __init__(self):
        # MongoClient() with no arguments uses the driver's default
        # connection settings.
        self.client = pymongo.MongoClient()
        self.db = self.client['proxypool3']
        self.proxies = self.db['proxies']
        # The original also called create_index() with no keys, which
        # raises TypeError, and used the legacy ensure_index(). A single
        # create_index() with the key is the supported form.
        self.proxies.create_index('proxy', unique=True)

    def insert(self, proxy):
        """Insert one proxy document; an already-stored proxy is ignored.

        Args:
            proxy: Dict with at least a ``'proxy'`` key.
        """
        try:
            self.proxies.insert_one(proxy)
            print('插入成功:{}'.format(proxy))
        except DuplicateKeyError:
            # Deliberate: the unique index rejects a proxy that is already
            # stored, and that is not an error for the caller.
            pass

    def delete(self, conditions):
        """Delete every document matching ``conditions``."""
        self.proxies.delete_many(conditions)
        print('删除成功:{}'.format(conditions))

    def update(self, conditions, values):
        """Set the fields in ``values`` on the first document matching
        ``conditions``.
        """
        self.proxies.update_one(conditions, {"$set": values})
        print('更新成功:{},{}'.format(conditions,values))

    def get(self, count, conditions=None):
        """Return up to ``count`` proxies ordered by ascending ``delay``.

        Args:
            count: Maximum number of documents; converted with ``int()``.
            conditions: Optional query filter; a falsy value matches all.

        Returns:
            A list of the matching documents.
        """
        conditions = conditions if conditions else {}
        count = int(count)
        cursor = self.proxies.find(conditions, limit=count)
        return list(cursor.sort('delay', pymongo.ASCENDING))

    def get_count(self):
        """Return the number of documents in the collection."""
        return self.proxies.count_documents({})
if __name__ == '__main__':
    # Manual smoke run: print the three lowest-delay proxies currently stored.
    print(MongoDB().get(3))

检测模块

import requests
import time
import traceback
from requests.exceptions import ProxyError, ConnectionError
from db.mongo_db import MongoDB
from multiprocessing.pool import ThreadPool
def valid_many(proxy_list, method, pool_size=16):
    """Validate many proxies concurrently and wait until all are done.

    Each proxy is handed to ``valid_one`` on a worker thread.

    Args:
        proxy_list: Iterable of ``{'proxy': 'ip:port'}`` dicts.
        method: Passed through to ``valid_one`` (``'insert'`` or ``'check'``).
        pool_size: Upper bound on the number of worker threads.
    """
    proxy_list = list(proxy_list)
    if not proxy_list:
        # Nothing to validate, so do not spin up worker threads at all.
        return
    pool = ThreadPool(min(pool_size, len(proxy_list)))
    try:
        for proxy in proxy_list:
            pool.apply_async(valid_one, args=(proxy, method))
    finally:
        # Always release the workers, even if scheduling a task failed.
        pool.close()
        pool.join()
def valid_one(proxy, method, url='https://www.baidu.com', db=None):
    """Probe one proxy by fetching ``url`` through it and record the outcome.

    On an HTTP 200 response the measured delay is stored in
    ``proxy['delay']``; with ``method == 'insert'`` the proxy is inserted,
    with ``method == 'check'`` its stored delay is updated. On a non-200
    response, a proxy/connection error or a timeout, the proxy is deleted
    when ``method == 'check'`` and otherwise left alone.

    Args:
        proxy: Dict with a ``'proxy'`` key holding ``'ip:port'``; mutated
            in place on success.
        method: ``'insert'`` for newly crawled proxies, ``'check'`` for
            proxies that are already stored.
        url: Page requested through the proxy.
        db: Optional store exposing ``insert``/``update``/``delete``. When
            omitted a ``MongoDB()`` is created on demand, as before.

    Any other exception is printed with a traceback instead of being raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    address = proxy['proxy']
    proxies = {
        'http': 'http://' + address,
        'https': 'http://' + address
    }

    def store():
        # Create the store only when a write is actually needed.
        return db if db is not None else MongoDB()

    try:
        start_time = time.time()
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # from the original, confirm this is intended for proxy probing.
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=5, verify=False)
        delay = round(time.time() - start_time, 2)
        if resp.status_code == 200:
            proxy['delay'] = delay
            if method == 'insert':
                store().insert(proxy)
            elif method == 'check':
                store().update({'proxy': address}, {'delay': proxy['delay']})
        elif method == 'check':
            store().delete({'proxy': address})
    except (ProxyError, ConnectionError, requests.exceptions.Timeout):
        # Read timeouts are not ConnectionError subclasses; without
        # Timeout here a dead proxy would never be removed during 'check'.
        if method == 'check':
            store().delete({'proxy': address})
    except Exception:
        traceback.print_exc()

为了帮助大家更轻松地学好Python,我给大家分享一套Python学习资料,希望对正在学习的你有所帮助!

获取方式:关注并私信小编 “ 学习 ”,即可免费获取!

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值