想爬数据,怕搞太多被封了,所以创建一个ip代理池,降低被封的危险,成功爬取数据。
1. 创建数据库和表-mysql版
你爬取的东西要传到数据库中,方便你后续使用。这里使用的是mysql。创建的表元素只包含ip和score,这个score充当ip可用度的衡量。
-- Proxy pool table: one row per proxy, keyed by its "ip:port" string.
CREATE TABLE `proxies` (
  -- The longest IPv4 "ip:port" value is 21 characters
  -- (255.255.255.255:65535), so the column must be at least varchar(21).
  `ip` varchar(21) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT 'ip+端口',
  -- Availability score; raised when a proxy passes a test, lowered when it fails.
  `score` int(5) DEFAULT NULL COMMENT '评估代理ip分数',
  PRIMARY KEY (`ip`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
2. 创建数据库连接
这里我创建了mysqlpywwl.py文件,集成项目对数据库的所有操作。
import mysql.connector
import re
class MySQLClient:
    """Wrap a single MySQL connection and expose the proxy-pool operations.

    Every method works on the ``proxies`` table, which stores one row per
    proxy: ``ip`` (an ``"address:port"`` string) and ``score`` (an integer
    used to rank how usable the proxy is).
    """

    MAX_SCORE = 10   # highest score a proxy can reach
    MIN_SCORE = 0    # lowest score; a proxy failing at this score is deleted
    INIT_SCORE = 2   # score given to a newly inserted proxy
    # Soft cap on the number of stored proxies. Nothing in this class enforces
    # it; callers compare it with count_ip() before inserting, so the table
    # can temporarily hold more rows than this.
    MAX_NUM = 100

    # Shape of a valid proxy string: dotted IPv4-like address, colon, port.
    _PROXY_PATTERN = re.compile(r"\d+\.\d+\.\d+\.\d+:\d+")

    def __init__(self):
        """Open the connection and create the cursor shared by all methods."""
        self.db_config = {
            "host": "localhost",
            "port": 3306,
            "user": "root",
            "password": "hls",
            "database": "food"
        }
        # autocommit=True: without it the first SELECT on a long-lived
        # connection opens a transaction that is only ended by a later write,
        # so repeated reads (count_ip/all_ip in a polling loop) can keep
        # returning the same stale InnoDB snapshot.
        self.connect = mysql.connector.connect(
            **self.db_config, charset='utf8mb4', autocommit=True
        )
        self.cursor = self.connect.cursor()

    def insert_data(self, sql, data):
        """Run ``sql`` once per parameter tuple in ``data`` and commit.

        Args:
            sql: Parameterised statement using ``%s`` placeholders.
            data: Sequence of parameter tuples; one tuple is fine too.
                An empty sequence is a no-op.
        """
        if not data:
            return
        self.cursor.executemany(sql, data)
        self.connect.commit()

    def select_data(self, sql, data=None):
        """Execute a query with parameters ``data`` and return all rows."""
        self.cursor.execute(sql, data)
        return self.cursor.fetchall()

    def add_ip(self, proxy):
        """Insert one proxy, or add 1 to its score if it is already stored.

        Raises:
            ValueError: If ``proxy`` is not of the form ``a.b.c.d:port``.
        """
        # fullmatch (not match) so trailing garbage after the port is rejected.
        if not self._PROXY_PATTERN.fullmatch(proxy):
            print("代理不合规范", proxy, "丢弃")
            raise ValueError(f"{proxy}代理不规范,请重新输入")
        sql = "insert into proxies (ip,score) values (%s,%s) on duplicate key update score = score + 1"
        self.cursor.execute(sql, (proxy, self.INIT_SCORE))
        self.connect.commit()

    def exists_ip(self, proxy):
        """Return 1 if ``proxy`` is stored, otherwise 0."""
        sql = "select exists(SELECT * FROM proxies WHERE ip = %s)"
        self.cursor.execute(sql, (proxy,))
        return self.cursor.fetchone()[0]

    def count_ip(self):
        """Return the number of stored proxies."""
        self.cursor.execute("select count(ip) FROM proxies")
        return self.cursor.fetchone()[0]

    def all_ip(self):
        """Return every stored proxy string, highest score first."""
        self.cursor.execute("select ip from proxies order by score desc")
        return [row[0] for row in self.cursor.fetchall()]

    def max_ip(self, proxy):
        """Add 1 to the score of ``proxy`` unless it is already at MAX_SCORE.

        Raises:
            LookupError: If ``proxy`` is not in the table.
        """
        self.cursor.execute("select score from proxies where ip = %s", (proxy,))
        row = self.cursor.fetchone()
        if row is None:
            print("没有对应的ip,请重新输入")
            raise LookupError("没有对应ip,请重新输入")
        # ">=": the UPDATE below stops at MAX_SCORE, so the score can never
        # exceed it and a strict ">" would never report the cap.
        if row[0] >= self.MAX_SCORE:
            print("此ip可用,且已经达到最大分数")
        sql = "update proxies set score = score + 1 where ip = %s and score < %s"
        self.cursor.execute(sql, (proxy, self.MAX_SCORE))
        self.connect.commit()

    def decrease_ip(self, proxy):
        """Subtract 1 from the score of ``proxy``; delete it once at MIN_SCORE.

        A proxy whose score is above MIN_SCORE is decremented. A proxy that
        is already at MIN_SCORE cannot go lower and is removed instead.
        """
        sql = "update proxies set score = score - 1 where ip = %s and score > %s"
        self.cursor.execute(sql, (proxy, self.MIN_SCORE))
        # cursor.execute() returns None in mysql-connector; the affected-row
        # count lives in cursor.rowcount.
        if self.cursor.rowcount == 0:
            # "<=": the score bottoms out at MIN_SCORE, so "<" never matched.
            sql = "delete from proxies where ip = %s and score <= %s"
            self.cursor.execute(sql, (proxy, self.MIN_SCORE))
        self.connect.commit()

    def random_ip(self):
        """Return the proxy with the highest score.

        Despite the name, the choice is not random: the query takes the first
        row when ordered by score descending.

        Raises:
            Exception: If the table holds no proxies.
        """
        self.cursor.execute("select ip from proxies order by score desc limit 1")
        row = self.cursor.fetchone()
        if row is None:
            print("ip表为空,请先输入ip")
            raise Exception("没有ip数据,请先输入ip")
        return row[0]

    def close(self):
        """Close the cursor, then the connection."""
        self.cursor.close()
        self.connect.close()
if __name__ == '__main__':
    # Manual smoke test: raise the score of one known proxy and print the
    # method's return value. exists_ip() can be exercised the same way.
    client = MySQLClient()
    outcome = client.max_ip("182.34.18.206:9999")
    print(outcome)
3. 爬取代理IP
这里我创建了getterip.py文件(文件名与后文 `from proxyip.getterip import Getter` 的导入保持一致),需要一些爬虫知识基础。
import re
from mysqlpywwl import MySQLClient
import requests
class Getter():
    """Scrape free proxy listing pages and store the proxies in MySQL."""

    # Two adjacent table cells: an address cell followed by a port cell.
    _CELL_PATTERN = re.compile(r"<td>\s*([\d.]+)\s*</td>\s*<td>\s*(\d+)\s*</td>")
    # Shape of a valid proxy string: dotted IPv4-like address, colon, port.
    _PROXY_PATTERN = re.compile(r"\d+\.\d+\.\d+\.\d+:\d+")
    # New proxies are inserted; already-known ones get their score raised by 1.
    _INSERT_SQL = "insert into proxies (ip,score) values (%s,%s) on duplicate key update score = score + 1"

    def __init__(self):
        """Open the database client used to store scraped proxies."""
        self.mysql = MySQLClient()
        self.MAX_COUNT = 300

    def get_page(self, url, headers, timeout=10):
        """Download ``url`` and return its text, or None on any failure.

        Args:
            url: Page to fetch.
            headers: Request headers to send.
            timeout: Seconds to wait before giving up, so that one
                unresponsive site cannot hang the crawler.
        """
        print("正在抓取", url)
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print("抓取失败", url, exc)
            return None
        if response.status_code == 200:
            return response.text
        print("抓取失败", url)
        return None

    def _harvest(self, source, url, headers):
        """Fetch one listing page and store every well-formed proxy on it.

        Nothing is fetched when the table already holds MAX_NUM proxies, and
        a page that could not be downloaded is reported and skipped.

        Raises:
            RuntimeError: If counting, parsing or storing fails; the original
                error is chained as the cause.
        """
        try:
            if self.mysql.count_ip() >= self.mysql.MAX_NUM:
                print("最多能有ip数量为", self.mysql.MAX_NUM)
                return
            html = self.get_page(url, headers)
            if html is None:
                print(f"{source}数据ip查询失败")
                return
            rows = []
            for address, port in self._CELL_PATTERN.findall(html):
                ip = f"{address}:{port}"
                if not self._PROXY_PATTERN.fullmatch(ip):
                    print("代理不合规范", ip, "丢弃")
                    continue
                rows.append((ip, self.mysql.INIT_SCORE))
            if rows:
                self.mysql.insert_data(self._INSERT_SQL, rows)
            print(f"{source}数据ip查询成功,已放入数据库")
        except Exception as exc:
            # An exception cannot be concatenated with a str; wrap and chain.
            raise RuntimeError(f"{source}数据ip查询失败") from exc

    def getip_from_89(self):
        """Collect proxies from the first page of 89ip.cn."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
            'cookie': 'Hm_lvt_f9e56acddd5155c92b9b5499ff966848=1741702155; HMACCOUNT=8A7BAB81E3639D4A; Hm_lpvt_f9e56acddd5155c92b9b5499ff966848=1741703248; https_waf_cookie=5c19846d-ed08-4f9bfcae4c4a5af26f064b6661b773c6e9b9; https_ydclearance=e83e1a1a12c17af7b5f183aa-978c-4b66-9586-b6a8d9e66abf-1741710531',
            'referer': 'https://www.89ip.cn/index_1.html'
        }
        self._harvest("89", "https://www.89ip.cn", headers)

    def getip_from_ip3366(self, page=1):
        """Collect proxies from one listing page of ip3366.net.

        Args:
            page: Page number substituted into the listing URL.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
            'cookie': 'Hm_lvt_c4dd741ab3585e047d56cf99ebbbe102=1741496242; HMACCOUNT=8A7BAB81E3639D4A; Hm_lpvt_c4dd741ab3585e047d56cf99ebbbe102=1741683219',
            'referer': 'https://www.89ip.cn/index_1.html'
        }
        # The "{}" placeholder must be filled in before the URL is requested.
        url = "http://www.ip3366.net/free/?stype=1&page={}".format(page)
        self._harvest("ip3366", url, headers)

    def run(self):
        """Scrape every supported site once."""
        self.getip_from_ip3366()
        self.getip_from_89()
if __name__ == '__main__':
    # Manual smoke test: scrape ip3366 once and print the method's return value.
    crawler = Getter()
    outcome = crawler.getip_from_ip3366()
    print(outcome)
4. 测试ip表
这里我创建了testerip.py文件,创建了wei bo 和bai du tieba充当测试网站验证代理ip是否可用。这里数量少,没有用异步功能。
import time
import requests
from mysqlpywwl import MySQLClient
class Tester:
    """Probe every stored proxy and adjust its score in MySQL."""

    def __init__(self):
        """Open the database client and prepare the request headers."""
        self.mysql = MySQLClient()
        self.test_url = "https://www.baidu.com/"
        self.OK_STATUS_CODES = [200, 302]  # status codes counted as success
        # Headers actually sent with each probe. Only a browser User-Agent:
        # the site-specific header sets below carry login cookies that must
        # not be sent to an unrelated host through an untrusted proxy.
        self.headers_test = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
        }
        # Header set captured from a Baidu Tieba session; not used by
        # test_proxy(). The cookie is split into adjacent literals purely to
        # keep it a valid single string value.
        self.headers_baidu_tieba = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
            'cookie': (
                'BAIDUID=2600D63DBC7430873E99DF615FE987BE:FG=1; BIDUPSID=963B47D0567B93149014CB525003FB32; BAIDUID_BFESS=2600D63DBC7430873E99DF615FE987BE:FG=1; BAIDU_WISE_UID=wapp_1714884951712_936; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1714884951,1716512301; Hm_lvt_287705c8d9e2073d13275b18dbd746dc=1724346687; __bid_n=192195fde65982476b05b2; H_WISE_SIDS=110085_619664_1991829_1992049_626068_628198_632164_632115_632143_633359_633369_633616_633569_634600_635511_636124_636641_639041_637511_639608_640074_627286_640396_640445_637860_640309_640453_640507_640588_640603_640380_639930_640805_640832_640852_640965_640977_641048_641072_641080_641053_641124_641119_641114_641121_640917_641173_641143_641219_641195_641310_641319_641400_641427_639697_641459_641470_641479_641515_641492_641456_641266_641663_641591_641590_641593_641585_641587_641758_637753_641807_641806_641802_641797_641838_641850_641261_641952_641905_641911_639099_642055_642072_640772_640865_642203_641702_642323_642327_641154_641325_642408; newlogin=1; H_WISE_SIDS_BFESS=110085_619664_1991829_1992049_626068_628198_632164_632115_632143_633359_633369_633616_633569_634600_635511_636124_636641_639041_637511_639608_640074_627286_640396_640445_637860_640309_640453_640507_640588_640603_640380_639930_640805_640832_640852_640965_640977_641048_641072_641080_641053_641124_641119_641114_641121_640917_641173_641143_641219_641195_641310_641319_641400_641427_639697_641459_641470_641479_641515_641492_641456_641266_641663_641591_641590_641593_641585_641587_641758_637753_641807_641806_641802_641797_641838_641850_641261_641952_641905_641911_639099_642055_642072_640772_640865_642203_641702_642323_642327_641154_641325_642408; PSTM=1740817557; BDRCVFR[77Ms7oRaB-6]=mk3SLVN4HKm; H_PS_PSSID=61027_61674_62080_62325_62346_62329_62373_62401_62421_62423_62433; USER_JUMP=-1; Hm_lvt_292b2e1608b0823c1cb6beef7243ef34=1740465740,1740646655,1741454325,1741767466; HMACCOUNT=8A7BAB81E3639D4A; st_key_id=17; arialoadData=false; video_bubble0=1; '
                'ppfuid=FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGkjS1Q+e/k7Rs6uiFpI37bSGEimjy3MrXEpSuItnI4KDzKu30suSE3sF8hPJkvLugjgwNSQKKIDdXA6eDfuiw2FGGpKToFqEAWa32dvn91pcwK5E0lYft4devFzJRK+rynGgLbz7OSojK1zRbqBESR5Pdk2R9IA3lxxOVzA+Iw1TWLSgWjlFVG9Xmh1+20oPSbrzvDjYtVPmZ+9/6evcXmhcO1Y58MgLozKnaQIaLfWRHYJbniad6MOTaDR3XV1dTJbe1wmaetIVuYIt+6PfjR+9o+ncGd7mqfxmbDMsEYcm3eTNkbS2el0J2+pbyoXJb3meBF6m1/WfXOHFmmRXSJdqfhwWryYacZfOfIdenDLSkJsc4rBzsbBPyjKAzWGBO7nCxNtgYtDo26K+8ukl31Y+/geIrmTvn+xVA1gAbbf1lkKhylX1zGsOVlJip30kecMEGvjdNWpsel/qfsfe5JBpqDTksMVoBr7nszRboiUHbedcq1mi/UXvX2b3lxbCLv4Mxoy+dFS3Fr9jSAmssiPARPZutqXQT8krr+KVakPUpdbkwv/8CHDu0C/Z5vtDeiYLQpEgFjmQoey69Fz+kM7Y5cg925MGCeBU4jWp2g2g9u5Ac21q2nG0Jd6BjPtwYOv0TevNnC+snZY5pf/dtFJYdBrxFfXZ8kxisvFxYwkIkhpOuhw8bYyjFOBzzWtHbb/bZjYuTi397a21776RWy1hAfIX+LOYnlmJLoIfeQJGyV88cwcM1svp+zVDjUpc+tX+hKCutoGOSxoeD09wtPvKLw/HwVNQvfr0aMx4YQVLRx8VJ2MfThAR9JY85BUmPjqOlbhrblXFkBQG7n4JKiZyalk9X22FXe2MsdipOHFxSUHSqbPo2PPJzEor12m6lThR7YgmlfLt7kntJD5XeM7GEQw3OLo5dsSUeQDd6vDnkb72/TMYcbPW48WuSnSgMDL820G6v5sII3fbIl1IswRGEQw3OLo5dsSUeQDd6vDntVllC1+aCCc8K28RWpYmU0S24R9DDZVx3j3+tLLpw3BRuF/lI7yGQ5dEntCEMtnVf7zN/nnvSeKksVSFNh8SLgU5ZZ2QvNYYVj4McvsdmZl2ssb09Yk1KUaxhNd9iuw6w==; ZFY=H1J3QnfX0qL0WFDHtu35fDe27W7UoER5iYfPRm6GQ:Bk:C; BDUSS=g3dGxudGNwQ2JIcVlXemhnc0VqdS03RjF3d3pmME1lM2tzUW5weURKWlQwZmhuSVFBQUFBJCQAAAAAAAAAAAEAAACKf4hFeWHKq-f3eWEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFNE0WdTRNFnSE; BDUSS_BFESS=g3dGxudGNwQ2JIcVlXemhnc0VqdS03RjF3d3pmME1lM2tzUW5weURKWlQwZmhuSVFBQUFBJCQAAAAAAAAAAAEAAACKf4hFeWHKq-f3eWEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFNE0WdTRNFnSE; STOKEN=1adca3127648d319ae7a45b38cb2bbb59b15633771751bf686f8eefe8b7f6b14; 1166573450_FRSVideoUploadTip=1; video_bubble1166573450=1; Hm_lpvt_292b2e1608b0823c1cb6beef7243ef34=1741767963; XFI=a39f7130-ff1b-11ef-a46c-29f4f20871a7; '
                'BA_HECTOR=808025al0l8lah258181040k8fc34c1jt2h8t23; ab_sr=1.0.1_ZjlhZmQwNDY3NjUyMDc2OTFkMTYyM2ZjOGViMjljMjVmZGYyODhkZTNiZjVmYWM3NmZkNTEzNzY2ZDhmYThmYjNiYTEyODhkOWU4MTc1NmJhMTBlMjk5NWYyODQyNTdjNWEwODdjMjMzZGViYmY0NGNkN2MzYTY3Njc2YzhkMDU0NjQzNjc2MDBiMmM5NTI5OTgzY2Q3MmUxYTY4ZDVkNDI4YjU5MGJlYzdhM2RlYzlkYjMxZDQ4OTNlZjM4NDQ2; st_data=ae80f3e7575a4ee2dd79c54fe0f4cf336a6622f4179a15274bb2e2c88af7befa12e9a2b3d56f5d255544899c6bb0372d4140ca02366a89ff9dc27d48ccfa08cca0b0b5312b228a8e1ea39216fd8567668d0ef94af814fd797d58748582b1a0c4f9e73385461ac8702b904c010570ee1ceb5c26f01e0a345a8b45f9f23a52c848347c4ec0c4f999dd62c06fbd1f2133e3; st_sign=83a4207e; XFCS=16265C386CB97F7F0F572E802CCAACDF3CF24529F5453AC816D42F3D3BC39547; XFT=dEHRlPDQB2R9I1uZZr1FnWYjgkLa6OywQriKX/emVUg=; RT="z=1&dm=baidu.com&si=3b57f827-448f-4733-9862-da7b84e7cbdc&ss=m85ndjw7&sl=n&tt=12b4&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=atty&ul=bgjw'
            ),
            'referer': 'https://tieba.baidu.com/f?ie=utf-8&kw=%E9%A3%9F%E5%93%81%E5%AE%89%E5%85%A8&fr=search'
        }
        # Header set captured from a Weibo session; not used by test_proxy().
        # NOTE(review): the referer value looks mangled in the source this was
        # taken from - confirm the intended URL before relying on it.
        self.headers_weibo = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0',
            'cookie': 'SINAGLOBAL=1044639185962.6592.1712316604472; SCF=AnxWnG3dvUdy5RFrS7iO3jbXmEOZEd0y5adI8CQRr4n1pFleQPnxDb2cIdNArTE4qRZIPBrrMuCidgq5uMwIS3I.; UOR=,,cn.bing.com; ALF=1744086497; SUB=_2A25KyWixDeRhGeFH41EX9SzPzzyIHXVpp-R5rDV8PUJbkNAYLUn8kW1NelMmaEshza0QxN_Dw8O6Kqu00iwVOg6T; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5H5R_iAK7bM1FURjYbiYrs5JpX5KMhUgL.FoM41hecSKz0Sh52dJLoIpRLxKBLB.BLBK5LxK-LBK-LB.Bpeo2EeKzp1KnXeh2t; _s_tentry=www.food_python.com; Apache=1425942933226.263.1741768420160; ULV=1741768420263:5:3:2:1425942933226.263.1741768420160:1741494499683',
            'referer': 'https: // www.food_python.com /'
        }

    def test_proxy(self, proxy, timeout=10):
        """Request the test URL through ``proxy`` and update its score.

        The score is raised when the response status is in OK_STATUS_CODES
        and lowered when the request fails or returns any other status.

        Args:
            proxy: ``"address:port"`` string (bytes are decoded as UTF-8).
            timeout: Seconds to wait before treating the proxy as dead.

        Returns:
            True if the proxy produced an acceptable response, else False.
        """
        if isinstance(proxy, bytes):
            proxy = proxy.decode("utf-8")
        # requests selects the proxy by the scheme of the target URL. The test
        # URL is https, so an "http"-only mapping would bypass the proxy and
        # every proxy would appear to work.
        real_proxy = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
        print("正在测试" + proxy)
        try:
            response = requests.get(
                self.test_url,
                proxies=real_proxy,
                headers=self.headers_test,
                timeout=timeout,
            )
        except requests.RequestException:
            # A dead proxy is an expected outcome, not an error for the caller.
            self.mysql.decrease_ip(proxy)
            print("代理不可用", proxy)
            return False
        if response.status_code in self.OK_STATUS_CODES:
            self.mysql.max_ip(proxy)
            print("代理可用", proxy)
            return True
        self.mysql.decrease_ip(proxy)
        print("请求响应不合法", response.status_code, "ip", proxy)
        return False

    def run(self, delay=1):
        """Test every stored proxy once, best-scored first.

        The database connection is left open so that run() can be called
        repeatedly on the same instance; call close() when finished.

        Args:
            delay: Seconds to pause after each proxy.
        """
        print("测试函数开始运行")
        try:
            all_ip = self.mysql.all_ip()
        except Exception as e:
            print('测试函数发生错误', e.args)
            return
        print(f"现在还有 {len(all_ip)} 个代理")
        for num, proxy in enumerate(all_ip):
            if num % 10 == 0:
                print(f"现在为第{num}个代理")
            try:
                self.test_proxy(proxy)
            except Exception as e:
                # One failing proxy must not stop the rest from being tested.
                print('测试函数发生错误', e.args)
            time.sleep(delay)

    def close(self):
        """Close the underlying database connection."""
        self.mysql.close()
if __name__ == '__main__':
    # Run a single scoring pass over every stored proxy.
    Tester().run()
5. 用 Flask 搭建一个 API 服务器
这里我创建了api_ip.py模块。Flask 提供了一个内置的开发服务器,用于快速启动和测试应用;但它不能承受高并发压力,只适合开发和测试环境,不能用于生产环境。
import time
from mysqlpywwl import MySQLClient
from flask import Flask,g
# `g` is Flask's namespace for the current application context. A fresh one is
# created for every request, so anything stored on it lives for one request.
# The Flask application object is named `ip` because other modules import it
# under that name.
ip = Flask(__name__)


def get_connect():
    """Return the MySQLClient for the current request, creating it on first use.

    Repeated calls within one request return the same client; each new
    request gets a new one.
    """
    if "mysql" not in g:
        g.mysql = MySQLClient()
    return g.mysql


@ip.teardown_appcontext
def _close_connect(exception=None):
    """Close the request's MySQLClient, if one was opened.

    Without this, every request would leave a database connection open.
    """
    client = g.pop("mysql", None)
    if client is None:
        return
    try:
        client.close()
    except Exception:
        # Teardown handlers should not raise; record the failure instead.
        ip.logger.exception("failed to close MySQL connection")
@ip.route('/')
def index():
    """Serve the landing page of the proxy pool API."""
    banner = '<h2>Welcome to Proxy Pool System</h2>'
    return banner
# Return one proxy from the pool as plain text.
@ip.route('/random')
def random_ip():
    """Return the proxy chosen by ``MySQLClient.random_ip()``.

    Responds with HTTP 503 and a short message when no proxy can be
    supplied (empty pool or database error) instead of an unhandled 500.
    """
    try:
        return get_connect().random_ip()
    except Exception:
        # Boundary handler: log the details server-side, keep the reply terse.
        ip.logger.exception("could not fetch a proxy")
        return "no proxy available", 503
@ip.route('/count')
def count_ip():
    """Return the number of proxies in the pool as text."""
    total = get_connect().count_ip()
    return str(total)
if __name__ == '__main__':
    # Start Flask's built-in server; the API is then reachable on
    # 127.0.0.2, port 5001.
    ip.run(host="127.0.0.2", port=5001)
5. 封装上面三个程序,提供简洁调度
这里我创建了entrance.py模块。因为后续会在其他包中调用这个模块,直接按模块名导入会查找失败,所以导入时加上了包名 proxyip,确保程序能成功运行。
import time
from proxyip.getterip import Getter
from proxyip.testerip import Tester
from proxyip.api_ip import ip
from multiprocessing import Process
class Entrance:
    """Launch the crawler, tester and API server, each in its own process."""

    def entrance_apiip(self, cycletime=300):
        """Run the Flask API server; blocks until the server stops.

        Args:
            cycletime: Unused; kept so all entry points share one signature.
        """
        print("api接口模块运行中----------")
        host = "127.0.0.2"
        port = 8083  # an integer, as a port number should be
        ip.run(host, port)

    def entrance_getterip(self, cycletime=300):
        """Scrape proxies forever, pausing ``cycletime`` seconds per round."""
        getter = Getter()
        while True:
            print("获取ip模块运行中,若所爬网址首页ip正常,则不换------------")
            try:
                getter.run()
            except Exception as exc:
                # A failed round must not end the loop; try again next cycle.
                print("获取ip模块发生错误", exc)
            time.sleep(cycletime)

    def entrance_testerip(self, cycletime=300):
        """Test stored proxies forever, pausing ``cycletime`` seconds per round."""
        tester = Tester()
        while True:
            print("测试模块运行中----------------")
            try:
                tester.run()
            except Exception as exc:
                # A failed round must not end the loop; try again next cycle.
                print("测试模块发生错误", exc)
            time.sleep(cycletime)

    def run_all(self):
        """Start the crawler and tester processes.

        Returns:
            Tuple ``(getter_process, tester_process)`` of started processes,
            so the caller can join or terminate them.
        """
        print('代理池开始运行')
        # target takes the bound method itself, not the result of calling it.
        getter_process = Process(target=self.entrance_getterip)
        getter_process.start()
        tester_process = Process(target=self.entrance_testerip)
        tester_process.start()
        return getter_process, tester_process

    def run_api(self):
        """Start the API server process.

        Returns:
            The started process; call ``terminate()`` on it to stop the API.
        """
        api_process = Process(target=self.entrance_apiip)
        api_process.start()
        return api_process
if __name__ == '__main__':
    # Only the API server is launched here. Entrance.run_all() would start
    # the crawler and tester processes as well.
    launcher = Entrance()
    launcher.run_api()
6.成果
7.借鉴
若想使用redis的,请翻阅以下大神: