前言
网络爬虫在运行过程中并不总是顺利,经常会遇到各种各样的问题,比如网站的反爬虫策略,它会试图阻止网络爬虫的运行,限制我们的IP,所以我们需要在请求的时候加上代理IP,避免真实IP被封禁。本文在某代理网站批量抓取IP,为搭建IP代理池做基础。
代理ip种类
1.透明代理:如果你使用了该形式的代理,服务器端知道你使用了代理机制也知道你的真实ip。
2.匿名代理:知道你使用了代理,但是不知道你的真实ip。
3.高匿代理:不知道你使用了代理,也不知道你的真实ip。
代理的类型
1.http:代理只能转发http协议的请求
2.https:代理只能转发https协议的请求
定义爬虫类
class ProxyPool(object):
    """Scrapes free proxy listing pages and stores proxies in Redis by protocol."""

    # Maps protocol name -> Redis list key (module-level db_map).
    databases_map = db_map

    def __init__(self, start, end):
        # Listing pages to scrape; range() excludes `end`, so this covers
        # pages start .. end-1.
        self.url_list = ['https://free.kuaidaili.com/free/inha/' + str(i)
                         for i in range(start, end)]
        # Browser-like User-Agent so trivial bot filters don't reject us.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
        # Local Redis connection; decode_responses=True returns str, not bytes.
        # Fix: dropped the deprecated `charset` kwarg -- redis-py treats it as
        # a legacy alias of `encoding`, and passing both is redundant.
        self.__r = redis.Redis(host='127.0.0.1', port=6379,
                               decode_responses=True, encoding='utf-8')
解析数据
def get_html(self, url):
    """Fetch one listing page and return its HTML text.

    Fix: the original called requests.get with no timeout, so a dead or
    throttling server could hang the worker thread forever.
    """
    response = requests.get(url=url, headers=self.headers, timeout=10)
    return response.text
def parse(self, html):
    """Pull ip/port pairs out of the listing table and store each proxy."""
    tree = etree.HTML(html)
    # Column 1 of the table holds the IP address, column 2 the port.
    addresses = tree.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
    ports = tree.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
    for address, port_no in zip(addresses, ports):
        # Assemble the full proxy URL, e.g. http://1.2.3.4:8080
        self.save('http://{}:{}'.format(address, port_no), 'http')
保存至redis
def save(self, proxy, protocol):
    """Push one proxy URL onto the Redis list for its protocol.

    Fix: the success message used to print unconditionally, even after
    lpush raised (e.g. unknown protocol -> key None -> error).  It now
    prints only when the push actually succeeded.
    """
    try:
        self.__r.lpush(self.databases_map.get(protocol, None), proxy)
    except Exception as e:
        print('出错了:', e)
    else:
        print('保存成功', proxy)
main方法设置多线程爬取
def main(self):
    """Scrape every configured listing page and store the proxies found."""
    for url in self.url_list:
        html = self.get_html(url)
        self.parse(html)


# Launch 10 worker threads.
# Fix: every worker used to be built as ProxyPool(1, 10), so all 10 threads
# scraped the exact same pages -- 10x duplicated work.  Each worker now gets
# its own disjoint 9-page slice (the original per-worker batch size).
thread_list = []
for i in range(10):
    crawler = ProxyPool(i * 9 + 1, (i + 1) * 9 + 1)
    thread_list.append(Thread(target=crawler.main, args=()))
for t in thread_list:
    t.start()
# Block until every worker finishes before the script exits.
for t in thread_list:
    t.join()
完整代码
import redis
import requests
from lxml import etree
from threading import Thread
# Redis list keys used to store harvested proxies, one list per protocol.
db_map = {proto: 'proxies:%s:v1' % proto for proto in ('http', 'https')}
class ProxyPool(object):
    """Scrapes free proxy listing pages and stores proxies in Redis by protocol."""

    # Maps protocol name -> Redis list key (module-level db_map).
    databases_map = db_map

    def __init__(self, start, end):
        # Listing pages to scrape; range() excludes `end`, so this covers
        # pages start .. end-1.
        self.url_list = ['https://xxxxxxxx' + str(i) for i in range(start, end)]
        # Browser-like User-Agent so trivial bot filters don't reject us.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
        # Local Redis connection; decode_responses=True returns str, not bytes.
        # Fix: dropped the deprecated `charset` kwarg -- redis-py treats it as
        # a legacy alias of `encoding`, and passing both is redundant.
        self.__r = redis.Redis(host='127.0.0.1', port=6379,
                               decode_responses=True, encoding='utf-8')

    def get_html(self, url):
        """Fetch one listing page and return its HTML text.

        Fix: added a timeout so a dead or throttling server cannot hang
        the worker thread forever.
        """
        response = requests.get(url=url, headers=self.headers, timeout=10)
        return response.text

    def parse(self, html):
        """Pull ip/port pairs out of the listing table and store each proxy."""
        tree = etree.HTML(html)
        # Column 1 of the table holds the IP address, column 2 the port.
        ip = tree.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
        port = tree.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
        for ips, ports in zip(ip, port):
            # Assemble the full proxy URL, e.g. http://1.2.3.4:8080
            http_proxy_url = 'http' + '://' + ips + ':' + ports
            self.save(http_proxy_url, 'http')

    def save(self, proxy, protocol):
        """Push one proxy URL onto the Redis list for its protocol.

        Fix: the success message used to print unconditionally, even after
        lpush raised; it now prints only when the push actually succeeded.
        """
        try:
            self.__r.lpush(self.databases_map.get(protocol, None), proxy)
        except Exception as e:
            print('出错了:', e)
        else:
            print('保存成功', proxy)

    def main(self):
        """Scrape every configured listing page and store the proxies found."""
        for url in self.url_list:
            html = self.get_html(url)
            self.parse(html)
# --- script entry point ----------------------------------------------------
# Fixes two defects in the original:
#   1. The 10-thread fan-out ran at module level, i.e. at import time, and
#      the __main__ guard then ran an 11th full scrape on top of it.
#   2. Every worker was built as ProxyPool(1, 10), so all 10 threads scraped
#      the exact same pages -- 10x duplicated work.
# Threads now start only when run as a script, and each worker gets its own
# disjoint 9-page slice (the original per-worker batch size).
if __name__ == '__main__':
    thread_list = []
    for i in range(10):
        crawler = ProxyPool(i * 9 + 1, (i + 1) * 9 + 1)
        thread_list.append(Thread(target=crawler.main, args=()))
    for t in thread_list:
        t.start()
    # Block until every worker finishes before the process exits.
    for t in thread_list:
        t.join()
免费的ip似乎不是很稳定,果然好的东西都是花钱买的