程序员教你怎样利用Python构建代理IP池

在众多的网站防爬措施中,有一种是根据IP的访问频率进行限制:在某一时间段内,当某个IP的访问次数达到一定的阈值时,该IP就会被拉黑,在一段时间内被禁止访问。应对的办法是搭建一个IP代理池,使用不同的IP轮流进行爬取。

获取模块

import requests

import chardet
import traceback
from lxml import etree

class Downloader(object):
    """Download a proxy listing page and extract its ``ip:port`` entries."""

    def __init__(self):
        # Browser-like User-Agent sent with every request made by download().
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }

    def download(self, url, timeout=10):
        """Fetch *url* and return the proxies parsed from the response.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server. Without a timeout
                ``requests.get`` may block indefinitely on a stalled host.

        Returns:
            Whatever ``xpath_parse`` returns for the page text, or ``None``
            when the request fails or the status code is not 200. Errors
            are printed with a traceback rather than raised.
        """
        print('正在下载页面:{}'.format(url))
        try:
            resp = requests.get(url, headers=self.headers, timeout=timeout)
            # Decode the body with the encoding detected from its raw bytes.
            resp.encoding = chardet.detect(resp.content)['encoding']
            if resp.status_code == 200:
                return self.xpath_parse(resp.text)
            # Route a non-200 answer through the same error report below.
            raise ConnectionError
        except Exception:
            print('下载页面出错:{}'.format(url))
            traceback.print_exc()
        return None

    def xpath_parse(self, resp):
        """Extract proxies from the HTML text *resp*.

        Rows are read from the table under ``<div id="list">``; the first
        cell is used as the IP and the second as the port.

        Returns:
            A list of ``{'proxy': 'ip:port'}`` dicts, or ``None`` if parsing
            fails (the error is printed with a traceback). A row missing
            either cell aborts the whole parse.
        """
        try:
            page = etree.HTML(resp)
            trs = page.xpath('//div[@id="list"]/table/tbody/tr')
            proxy_list = []
            for tr in trs:
                ip = tr.xpath('./td[1]/text()')[0]
                port = tr.xpath('./td[2]/text()')[0]
                proxy = {
                    'proxy': ip + ':' + port
                }
                proxy_list.append(proxy)
            return proxy_list
        except Exception:
            print('解析IP地址出错')
            traceback.print_exc()
        return None


if __name__ == '__main__':
    print(Downloader().download('https://www.kuaidaili.com/free/inha/1/'))

存储模块

import pymongo

from pymongo.errors import DuplicateKeyError


class MongoDB(object):
    """Small wrapper around the ``proxies`` collection of ``proxypool3``."""

    def __init__(self):
        # MongoClient() with no arguments uses the driver's default address.
        self.client = pymongo.MongoClient()
        self.db = self.client['proxypool3']
        self.proxies = self.db['proxies']
        # Unique index on 'proxy': inserting an address that is already
        # stored raises DuplicateKeyError, which insert() relies on.
        # create_index requires the key, and replaces the legacy
        # ensure_index helper.
        self.proxies.create_index('proxy', unique=True)

    def insert(self, proxy):
        """Store the *proxy* document; an already-stored address is ignored."""
        try:
            self.proxies.insert_one(proxy)
            print('插入成功:{}'.format(proxy))
        except DuplicateKeyError:
            # Deliberate: the unique index makes re-inserting a no-op.
            pass

    def delete(self, conditions):
        """Remove every document matching the *conditions* filter."""
        self.proxies.delete_many(conditions)
        print('删除成功:{}'.format(conditions))

    def update(self, conditions, values):
        """Set the fields in *values* on one document matching *conditions*."""
        self.proxies.update_one(conditions, {"$set": values})
        print('更新成功:{},{}'.format(conditions, values))

    def get(self, count, conditions=None):
        """Return up to *count* matching documents, ordered by ascending 'delay'.

        Args:
            count: Maximum number of documents; converted with ``int()``.
            conditions: Optional query filter; a falsy value matches all.

        Returns:
            A list of the matching documents.
        """
        conditions = conditions if conditions else {}
        count = int(count)
        items = self.proxies.find(conditions, limit=count).sort('delay', pymongo.ASCENDING)
        items = list(items)
        return items

    def get_count(self):
        """Return the number of documents in the collection."""
        return self.proxies.count_documents({})


if __name__ == '__main__':
    m = MongoDB()
    print(m.get(3))

检测模块

import requests

import time
import traceback
from requests.exceptions import ProxyError, ConnectionError

from db.mongo_db import MongoDB

from multiprocessing.pool import ThreadPool

def valid_many(proxy_list, method):
    """Validate every proxy in *proxy_list* using a pool of 16 threads.

    Args:
        proxy_list: Iterable of proxy dicts, each with a 'proxy' key.
        method: Passed through to ``valid_one`` ('insert' or 'check').
    """
    pool = ThreadPool(16)
    for proxy in proxy_list:
        pool.apply_async(valid_one, args=(proxy, method))
    # Stop accepting work, then block until every queued check has finished.
    pool.close()
    pool.join()


def valid_one(proxy, method, url='https://www.baidu.com'):
    """Request *url* through *proxy* and record the outcome in MongoDB.

    Args:
        proxy: Dict with a 'proxy' key holding 'ip:port'. On success its
            'delay' key is set to the elapsed seconds, rounded to 2 places.
        method: 'insert' stores a working proxy. 'check' updates the delay
            of a working proxy and deletes one that fails.
        url: Page requested through the proxy.

    A non-200 answer, a proxy or connection error, or a timeout counts as
    a failure. Any other exception is printed with a traceback and
    otherwise ignored.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    # The same HTTP proxy endpoint carries both plain and TLS traffic.
    proxies = {
        'http': 'http://' + proxy['proxy'],
        'https': 'http://' + proxy['proxy']
    }
    try:
        start_time = time.time()
        # Certificate verification is switched off for this request.
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=5, verify=False)
        delay = round(time.time() - start_time, 2)
        if resp.status_code == 200:
            proxy['delay'] = delay
            if method == 'insert':
                MongoDB().insert(proxy)
            elif method == 'check':
                MongoDB().update({'proxy': proxy['proxy']}, {'delay': proxy['delay']})
        else:
            if method == 'check':
                MongoDB().delete({'proxy': proxy['proxy']})
    except (ProxyError, ConnectionError, requests.exceptions.Timeout):
        # Timeout is listed explicitly: a read timeout is not a
        # ConnectionError, so a stalled proxy would otherwise never be
        # removed during a check.
        if method == 'check':
            MongoDB().delete({'proxy': proxy['proxy']})
    except Exception:
        traceback.print_exc()


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值