python 爬取之通过Redis构建IP代理池
0x00 主要流程
1、通过开源的IP代理网站获取免费的IP代理;
http://www.66ip.cn/
http://www.66ip.cn/areaindex_33/
https://ip.jiangxianli.com
https://ip.ihuan.me
2、爬取到的IP代理存入Redis数据库,数据结构说明:
Redis存储方式为:zset
代理IP的权值最高为:100
代理IP的权值最低为:0
代理IP的权值初始为:10
3、构建IP代理池:
1、爬取IP代理:通过getIPproxy_XXX()类函数,其中XXX表示提供代理IP的网站说明
2、通过summaryProxies(sub, res) 汇总爬取的IP代理
3、通过mainCreateIPpools()构建IP代理池,并测试代理IP的可用性:首先、将所有获得的IP入库,初始权值为10;测试可用,将IP的权值设为100;不可用,则从数据库中删除;
4、构建IP代理池完毕
0x02 Redis数据库设计
redisop.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/25/025 23:47
# @Author : H
# @File : redisop.py
import redis
import random
MAX_SCORE = 100 # 最高分
MIN_SCORE = 0 # 最低分
INITIAL_SCORE = 10 # 初始分数
REDIS_HOST = "localhost"
REDIS_PORT = 6379
class RedisClient(object):
    """Redis-backed proxy pool.

    Storage is a single sorted set (zset): member = "ip:port" string,
    score = proxy quality in [MIN_SCORE, MAX_SCORE].
    """

    def __init__(self):
        # decode_responses=True so zrange/zscore return str, not bytes
        self.db = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=0,
                                    password='', decode_responses=True)
        self.key = "proxies"

    def add(self, proxy, score=INITIAL_SCORE):
        """
        Add a proxy to the pool with the given score.
        No-op if the proxy is already present (its score is preserved).
        :param proxy: "ip:port" string
        :param score: initial score
        :return: None
        """
        if not self.is_exist(proxy):
            # redis-py 3.x zadd takes a {member: score} mapping
            self.db.zadd(self.key, {proxy: score})

    def is_exist(self, proxy):
        """
        Return True if the proxy is in the pool.
        Fixed: compare zscore against None rather than relying on
        truthiness — a proxy whose score is exactly 0 previously looked
        "absent", so decrease() could never remove it.
        :param proxy: "ip:port" string
        :return: True or False
        """
        return self.db.zscore(self.key, proxy) is not None

    def random(self):
        """
        Return a usable proxy: prefer a max-score (verified) one; if none
        exist, fall back to a random proxy of any score. Returns None
        (after printing a notice) when the pool is empty.
        :return: "ip:port" string or None
        """
        result = self.db.zrangebyscore(self.key, MAX_SCORE, MAX_SCORE)
        if result:
            return random.choice(result)
        result = self.db.zrangebyscore(self.key, MIN_SCORE, MAX_SCORE)
        if result:
            return random.choice(result)
        print("代理池已空!")

    def decrease(self, proxy):
        """
        Lower the proxy's score by 1; once at the floor, remove it.
        :param proxy: "ip:port" string
        :return: None
        """
        if self.is_exist(proxy):
            score = self.db.zscore(self.key, proxy)
            if score > MIN_SCORE:
                self.db.zadd(self.key, {proxy: score - 1})
            else:
                self.delete(proxy)

    def max(self, proxy):
        """
        Promote the proxy to the maximum score (verified working).
        :param proxy: "ip:port" string
        :return: None
        """
        if self.is_exist(proxy):
            self.db.zadd(self.key, {proxy: MAX_SCORE})

    def delete(self, proxy):
        """
        Remove the proxy from the pool.
        :param proxy: "ip:port" string
        :return: None
        """
        if self.is_exist(proxy):
            self.db.zrem(self.key, proxy)

    def all(self):
        """
        Return every proxy in the pool.
        Fixed: zrange takes rank indices, not scores — the old call
        zrange(key, MIN_SCORE, MAX_SCORE) silently capped the result at
        the first 101 members; 0..-1 returns the whole set.
        :return: list of "ip:port" strings, or None when the pool is empty
        """
        if self.count():
            return self.db.zrange(self.key, 0, -1)

    def count(self):
        """
        Return the number of proxies in the pool.
        :return: int
        """
        return self.db.zcard(self.key)
if __name__ == '__main__':
    # Smoke test: connect to Redis and print the current pool size
    conn = RedisClient()
    print(conn.count())
0x03 获取IP构建代理池
getproxies.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/23/023 16:27
# @Author : H
# @File : getproxies.py
import requests
from bs4 import BeautifulSoup
import re
import random
from packages.redisop import RedisClient
def getProxyIP_61(page):
    """
    Crawl proxy IPs from www.66ip.cn (docstring previously named the
    wrong site).
    :param page: page number of the site to crawl
    :return: list ['ip:port']
    """
    base_url = f"http://www.66ip.cn/{page}.html"
    res = requests.get(base_url)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        table = soup.find_all('table')[2]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Robustness fix: rows without two <td> cells (or with empty
            # cells) used to raise IndexError/TypeError; skip them, along
            # with the "ip" header row.
            if len(cells) < 2:
                continue
            ip, port = cells[0].string, cells[1].string
            if ip and port and ip != "ip":
                sub.append(ip + ":" + port)
    return sub
def getProxyIP_61_areaindex_1(page):
    """
    Crawl proxy IPs from www.66ip.cn/areaindex_33.
    :param page: page number of the site to crawl
    :return: list ['ip:port']
    """
    base_url = f"http://www.66ip.cn/areaindex_33/{page}.html"
    res = requests.get(base_url)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        table = soup.find_all('table')[2]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Robustness fix: rows without two <td> cells (or with empty
            # cells) used to raise IndexError/TypeError; skip them, along
            # with the "ip" header row.
            if len(cells) < 2:
                continue
            ip, port = cells[0].string, cells[1].string
            if ip and port and ip != "ip":
                sub.append(ip + ":" + port)
    return sub
def getProxyIP_xicaidaili(page):
    """
    Crawl proxy IPs from ip.jiangxianli.com.
    :param page: page number of the site to crawl
    :return: list ['ip:port']
    """
    base_url = f"https://ip.jiangxianli.com/?page={page}"
    res = requests.get(base_url)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        table = soup("table")[0]
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Robustness fix: the old `len != 0` guard still crashed on
            # single-cell rows and on empty .string values.
            if len(cells) >= 2 and cells[0].string and cells[1].string:
                sub.append(cells[0].string + ":" + cells[1].string)
    return sub
def getIPproxy_ihuan(page):
    """
    Crawl proxy IPs from ip.ihuan.me (docstring previously named the
    wrong site).
    :param page: page token of the site (this site uses opaque tokens
                 instead of numeric pages)
    :return: list ['ip:port']
    """
    # Known page tokens: b97827cc -> 1, 4ce63706 -> 2,
    #                    5crfe930 -> 3, f3k1d581 -> 4
    base_url = f"https://ip.ihuan.me/?page={page}"
    res = requests.get(base_url)
    sub = []
    if res.ok:
        soup = BeautifulSoup(res.content, "html.parser")
        tbody = soup.find_all('tbody')[0]
        for row in tbody.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                continue
            anchors = cells[0].find_all("a")
            port = cells[1].string
            # Robustness fix: the old code did re.findall(...)[0] on
            # str(anchor), which raised IndexError on non-matching rows;
            # get_text() extracts the same anchor text safely.
            if anchors and port:
                sub.append(anchors[0].get_text() + ":" + port)
    return sub
def summaryProxies(sub, res):
    """
    Merge newly crawled proxies into the running accumulator (in place).
    :param sub: accumulator list; mutated and also returned
    :param res: iterable of 'ip:port' strings to append
    :return: sub (list ['ip:port'])
    """
    # Idiom fix: list.extend replaces the manual append loop.
    sub.extend(res)
    return sub
def mainCreateIPpools():
    """
    Build the proxy pool: crawl proxies from the free sites, store them
    in Redis at the initial score, then probe each one — working proxies
    are promoted to the max score, dead ones are removed.
    """
    # Crawling agent IP (ihuan uses opaque page tokens; disabled here)
    pages = ['b97827cc', '4ce63706', '5crfe930']
    proxies = []
    # for page in pages:
    #     summaryProxies(proxies, getIPproxy_ihuan(page))
    for page in range(1, 5):
        summaryProxies(proxies, getProxyIP_61(page))
        summaryProxies(proxies, getProxyIP_61_areaindex_1(page))
        summaryProxies(proxies, getProxyIP_xicaidaili(page))
    # Write proxy IP to redis and build IP address pool
    conn = RedisClient()
    for proxy in proxies:
        print(f"[+]爬取的代理:\t{proxy}")
        conn.add(proxy)
    print(f"[+]本次爬取代理的数量:\t{len(proxies)}")
    # Load the User-Agent list once (fix: close the file via `with`)
    with open("user-agent.txt", "r", encoding="utf-8") as fh:
        UserAgents = [line.strip() for line in fh]
    # Test the IP availability of each proxy against httpbin
    url = "http://httpbin.org/get"
    for proxy in conn.all():
        # Hoisted above the try so the except branch can't hit an
        # unbound proxy_host (old code assigned it inside the try).
        proxy_host = "https://" + proxy
        try:
            headers = {
                # Bug fix: the header name is 'User-Agent'; the old
                # 'UserAgent' key was silently ignored by servers.
                'User-Agent': random.choice(UserAgents)
            }
            proxy_temp = {
                "http": proxy_host,
                "https": proxy_host
            }
            res = requests.get(url, headers=headers, proxies=proxy_temp, timeout=10)
            if res.ok:
                print("[+]可用代理:\t" + proxy_host)
                conn.max(proxy)
        except Exception as e:
            print("[-]代理不可用:\t" + proxy_host)
            print('error', e.args)
            conn.delete(proxy)
    print(f"[+]====>Agent IP address pool completed and the number of IP addresses is:\t{conn.count()}")
if __name__ == '__main__':
    # Entry point: crawl, store and validate the whole proxy pool
    mainCreateIPpools()
0x04 从代理池中取出代理
getproxy.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/24 10:43
# @Author : H
# @File : get_proxy.py
from packages.redisop import RedisClient
import random
import requests
def getProxy():
    """
    Pick one random proxy and one random User-Agent from the pool.
    :return: [proxies_dict, headers_dict] — ready to pass to
             requests.get(url, proxies=data[0], headers=data[1])
    """
    data = []
    conn = RedisClient()
    # Take out a proxy IP
    # NOTE(review): conn.random() returns None when the pool is empty;
    # callers should build the pool first — confirm before relying on this.
    proxy = conn.random()
    data.append({"https": "https://" + proxy})
    # Load the User-Agent list (fix: close the file via `with`)
    with open("user-agent.txt", "r", encoding="utf-8") as fh:
        UserAgents = [line.strip() for line in fh]
    headers = {
        # Bug fix: header name is 'User-Agent', not 'UserAgent'
        'User-Agent': random.choice(UserAgents),
    }
    data.append(headers)
    return data
0x05 使用代理
test.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/3/6/006 11:07
# @Author : H
# @File : test.py
from packages.redisop import RedisClient
import random
import requests
# Demo: fetch a page through a random proxy from the pool, demoting the
# proxy on connection failure.
conn = RedisClient()
# Load the User-Agent list once (fix: close the file via `with`)
UserAgents = []
with open("../data/user-agent.txt", "r", encoding="utf-8") as fh:
    for line in fh:
        UserAgents.append(line.strip())
url = "http://www.baidu.com"
try:
    # Take out a proxy IP (the old script fetched a proxy and built the
    # headers twice — once before the try and again inside it; the first
    # copy was dead code and could crash on an empty pool, so it's gone)
    proxy = conn.random()
    proxy_host = "https://" + proxy
    proxy_temp = {"http": proxy_host}
    headers = {
        # Bug fix: header name is 'User-Agent', not 'UserAgent'
        'User-Agent': random.choice(UserAgents),
    }
    res = requests.get(url, headers=headers, proxies=proxy_temp)
    print(res.headers)
except requests.exceptions.ConnectionError as e:
    # Connection failed: lower the proxy's score in the pool
    conn.decrease(proxy)
    print('error', e.args)
0x06 总结说明
1、首先、运行Redis服务器服务端:
2、运行proxies.py爬取并构建IP代理池
3、取出代理运用到爬虫中:
运行test.py文件,很多免费的IP代理不可用,所以会出现代理连接错误
0x07 附录
user-agent.txt:也可以通过fake_useragent获取
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246
Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1
Mozilla/5.0 (Linux; Android 5.1; AFTS Build/LMY47O) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/41.99900.2250.0242 Safari/537.36
Mozilla/5.0 (PlayStation 4 3.11) AppleWebKit/537.73 (KHTML, like Gecko)
Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.36 Safari/525.19
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.4 (KHTML, like Gecko) Chrome/6.0.481.0 Safari/534.4
Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.86 Safari/533.4
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.3 Safari/532.2
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.173.1 Safari/530.5
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.558.0 Safari/534.10
Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/540.0 (KHTML,like Gecko) Chrome/9.1.0.0 Safari/540.0
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.600.0 Safari/534.14
Mozilla/5.0 (X11; U; Windows NT 6; en-US) AppleWebKit/534.12 (KHTML, like Gecko) Chrome/9.0.587.0 Safari/534.12
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20
Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.872.0 Safari/535.2
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7
Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.45 Safari/535.19
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.38 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; ko; rv:1.9.1b2) Gecko/20081201 Firefox/3.1b2
Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1.9b5) Gecko/2008032620 Firefox/3.0b5
Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1.12) Gecko/20080214 Firefox/2.0.0.12
Mozilla/5.0 (Windows; U; Windows NT 5.1; cs; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8
Mozilla/5.0 (X11; U; OpenBSD i386; en-US; rv:1.8.0.5) Gecko/20060819 Firefox/1.5.0.5
Mozilla/5.0 (Windows; U; Windows NT 5.0; es-ES; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3
Mozilla/5.0 (Windows; U; WinNT4.0; en-US; rv:1.7.9) Gecko/20050711 Firefox/1.0.5
Mozilla/5.0 (Windows; Windows NT 6.1; rv:2.0b2) Gecko/20100720 Firefox/4.0b2
Mozilla/5.0 (X11; Linux x86_64; rv:2.0b4) Gecko/20100818 Firefox/4.0b4
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2) Gecko/20100308 Ubuntu/10.04 (lucid) Firefox/3.6 GTB7.1
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b7) Gecko/20101111 Firefox/4.0b7
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b8pre) Gecko/20101114 Firefox/4.0b8pre
Mozilla/5.0 (X11; Linux x86_64; rv:2.0b9pre) Gecko/20110111 Firefox/4.0b9pre
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b9pre) Gecko/20101228 Firefox/4.0b9pre
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.2a1pre) Gecko/20110324 Firefox/4.2a1pre
Mozilla/5.0 (X11; U; Linux amd64; rv:5.0) Gecko/20100101 Firefox/5.0 (Debian)
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2
Mozilla/5.0 (X11; Linux i686 on x86_64; rv:12.0) Gecko/20100101 Firefox/12.0
Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2
Mozilla/5.0 (X11; Ubuntu; Linux armv7l; rv:17.0) Gecko/20100101 Firefox/17.0
Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0
Mozilla/5.0 (Windows NT 5.1; rv:25.0) Gecko/20100101 Firefox/25.0
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:25.0) Gecko/20100101 Firefox/25.0
Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0
Mozilla/5.0 (X11; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0
Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0