Python 爬虫:代理池的维护(代理抓取模块)

import json
import re

from pyquery import PyQuery as pq

from .utils import get_page


class ProxyMetaclass(type):
    """Metaclass that auto-registers proxy crawlers.

    Every attribute whose name contains ``crawl_`` is recorded in the class
    attribute ``__CrawlFunc__`` (a list of method names) and the total is
    stored in ``__CrawlFuncCount__``, so ``Crawler`` can enumerate all of
    its proxy sources without listing them by hand.
    """

    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        # Only the attrs dict's values are mutated below (list append),
        # never its key set, so iterating it directly is safe.
        for k in attrs:
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    """Aggregates free-proxy websites; each ``crawl_*`` generator yields 'ip:port'."""

    def get_proxies(self, callback):
        """Run the crawl method named *callback* and collect its proxies.

        :param callback: name of one of the ``crawl_*`` generator methods
                         (as listed in ``__CrawlFunc__``)
        :return: list of 'ip:port' strings
        """
        proxies = []
        # getattr() replaces the original eval("self.{}()".format(callback)):
        # same dynamic dispatch without evaluating an arbitrary string.
        for proxy in getattr(self, callback)():
            print('成功获取到代理', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        """Crawl www.66ip.cn (daili66).

        :param page_count: number of listing pages to fetch
        :return: yields 'ip:port' strings
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                # tr:gt(0) skips the table header row.
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_ip3366(self):
        """Crawl www.ip3366.net free-proxy pages; yields 'ip:port'."""
        for page in range(1, 4):
            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            html = get_page(start_url)
            if html:
                # \s* absorbs the whitespace/newlines between the cells.
                # NOTE(review): the HTML tags in this pattern were stripped by
                # the scrape; reconstructed as <tr>/<td> pairs — verify
                # against the live page markup.
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')

    def crawl_kuaidaili(self):
        """Crawl www.kuaidaili.com free proxies; yields 'ip:port'."""
        for i in range(1, 4):
            start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
            html = get_page(start_url)
            if html:
                # NOTE(review): tags reconstructed after the scrape — confirm.
                ip_address = re.compile(r'<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile(r'<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_xicidaili(self):
        """Crawl www.xicidaili.com; sends browser-like headers with a session cookie."""
        for i in range(1, 3):
            start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                'Host': 'www.xicidaili.com',
                'Referer': 'http://www.xicidaili.com/nn/3',
                'Upgrade-Insecure-Requests': '1',
            }
            html = get_page(start_url, options=headers)
            if html:
                # NOTE(review): tags reconstructed after the scrape — confirm.
                find_trs = re.compile(r'<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(html)
                for tr in trs:
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    def crawl_ip3366_index(self):
        """Crawl the www.ip3366.net front-page listing (/?stype=1).

        NOTE(review): the source defined ``crawl_ip3366`` twice, so this
        second version silently shadowed the first; renamed (keeping the
        ``crawl_`` prefix the metaclass scans for) so both sources run.
        """
        for i in range(1, 4):
            start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
            html = get_page(start_url)
            if html:
                # NOTE(review): tags reconstructed after the scrape — confirm.
                find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(html)
                # trs[0] is the table header row, hence range(1, ...).
                for s in range(1, len(trs)):
                    find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile(r'<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    def crawl_iphai(self):
        """Crawl www.iphai.com; yields 'ip:port'."""
        start_url = 'http://www.iphai.com/'
        html = get_page(start_url)
        if html:
            # NOTE(review): tags reconstructed after the scrape — confirm.
            find_tr = re.compile(r'<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            # Skip the header row at index 0.
            for s in range(1, len(trs)):
                find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_data5u(self):
        """Crawl www.data5u.com; the hard-coded session cookie below is required."""
        start_url = 'http://www.data5u.com/free/gngn/index.shtml'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = get_page(start_url, options=headers)
        if html:
            # NOTE(review): the <span>/<li> tags in this pattern were stripped
            # by the scrape; reconstructed from the page structure — confirm.
            ip_address = re.compile(r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值