代理池搭建学习
最近学习爬虫需要更换代理 IP,就按照书籍搭建了自己的代理池,仅供参考。
代理网站获取proxy
##代理网站爬取模块,daili66网站--------------------------------------------------------------
import requests
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
##定义元类,增加代理的属性attrs['__CrawlFunc__'],一个代理网站一个方法
class ProxyMetaclass(type):
    """Metaclass that registers proxy-crawling methods on its classes.

    Any attribute whose name contains ``'crawl'`` is treated as a proxy
    source.  The metaclass stores the matching method names in
    ``__CrawlFunc__`` (a list, in definition order) and their count in
    ``__CrawlFuncCount__``, so the class can enumerate its own crawlers
    at runtime without hard-coding them.
    """

    def __new__(cls, name, bases, attrs):
        # Collect every crawler method name in class-body order.
        crawl_funcs = [key for key in attrs if 'crawl' in key]
        attrs['__CrawlFunc__'] = crawl_funcs
        attrs['__CrawlFuncCount__'] = len(crawl_funcs)
        return type.__new__(cls, name, bases, attrs)
class Crawler(object,metaclass=ProxyMetaclass):
##调用Crawler中所有的网站获取proxy方法(这里只有daili66,想要增加网站在后面写方法)
def get_proxies(self, callback):
    """Run the crawler method named *callback* and collect its proxies.

    :param callback: name of a ``crawl_*`` method on this instance
        (normally one of the entries in ``__CrawlFunc__``).
    :return: list of the proxy strings produced by that method.
    """
    # getattr replaces the original eval("self.{}()".format(callback)):
    # eval on a runtime-built string is a code-injection risk and slower,
    # while getattr performs the same attribute dispatch safely.
    proxies = []
    for proxy in getattr(self, callback)():
        proxies.append(proxy)
    return proxies
##获取网站页面,需要页面进行一定调整,例如转码问题gbk、ISO、UTF8等
def get_page(self, url, encoding='gbk'):
    """Download *url* and return its HTML decoded with *encoding*.

    The sites crawled here (e.g. daili66) serve GBK-encoded pages,
    hence the default; pass another codec for UTF-8/ISO sites.

    :param url: page URL to fetch.
    :param encoding: codec used to decode the response body.
    :return: decoded HTML string, or None when the fetch fails
        (callers test the result's truthiness, so None is safe).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    try:
        # Bug fix: the original built `headers` but never sent it, so every
        # request went out with requests' default User-Agent (easy to block).
        # A timeout keeps one dead site from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    # errors='ignore' so a stray byte in a scraped page cannot abort the run.
    return response.content.decode(encoding, errors='ignore')
##daili66网址解析获取proxy
def crawl_daili66(self,page_count=4):
'''
获取代理66
:param page_count:
:return:
'''
start_url = 'http://www.66ip.cn/{}.html'
urls = [start_url.format(page) for page in range(1,page_count+1)]
for url in urls:
print('Crawling',url)
html = self.get_page(url)
if html:
# soup = BeautifulSoup(html,'lxml')
# print(soup)
doc = pq(html)
trs = doc('.containerbox table tr:gt(0)').items()
for tr in trs:
ip = tr.find('td:nth-child(1)').text