Kuaidaili (快代理) IP proxy pool

The script below scrapes free proxies from www.kuaidaili.com, stores them in a MySQL table, and then validates each stored proxy against httpbin.org, deleting the ones that no longer work.
import requests
import logging
import time
from lxml import etree
import pymysql
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s - %(funcName)s",
)

MAX_WORKERS = 5
class IpPoolProject:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
        }
        # MySQL connection; replace the password with your own credentials.
        self.db = pymysql.connect(
            host="localhost",
            user="root",
            password="xxxxx",
            database="ippool"
        )
        self.cursor = self.db.cursor()
        self.pool = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    def getResponse(self, url):
        # Fetch a listing page; return the response on HTTP 200, otherwise None.
        response = requests.get(url=url, headers=self.headers, timeout=10)
        response.encoding = "utf-8"
        if response.status_code == 200:
            return response
        logging.info(f"Request failed: {url}")
        return None
    def parseResponse(self, response):
        # Parse the proxy table and store each row in MySQL.
        if response is None:
            return
        tree = etree.HTML(response.text)
        tr_list = tree.xpath("//div[@id='list']//tbody/tr")
        for tr in tr_list:
            try:
                ip = tr.xpath(".//td[1]/text()")[0]
                port = tr.xpath(".//td[2]/text()")[0]          # td[2] holds the port
                proxy_type = tr.xpath(".//td[4]/text()")[0]    # HTTP / HTTPS
                check_time = tr.xpath(".//td[7]/text()")[0]    # last-verified time
                print(ip, port, proxy_type, check_time)
                # Parameterized query; the port goes into the 'host' column.
                sql = "insert into ippool(ip, host, stype, datetime) values (%s, %s, %s, %s)"
                self.cursor.execute(sql, (ip, port, proxy_type, check_time))
                self.db.commit()
            except Exception:
                continue
    def parseIp(self):
        # Validate every stored proxy against httpbin; remove the ones that fail.
        HTTPBIN_URL = "https://httpbin.org/get?show_env=1"
        sql = "select ip, host from ippool"
        self.cursor.execute(sql)
        results = self.cursor.fetchall()
        for ip, port in results:
            try:
                proxy = f"http://{ip}:{port}"
                proxies = {
                    "http": proxy,
                    "https": proxy
                }
                logging.info(f"Trying proxy: {ip}:{port}")
                response = requests.get(url=HTTPBIN_URL, headers=self.headers, proxies=proxies, timeout=10).text
                print(response)
                logging.info("Current proxy is valid")
                time.sleep(2)
            except Exception:
                logging.info("Current proxy is invalid")
                del_sql = "delete from ippool where ip=%s"
                self.cursor.execute(del_sql, (ip,))
                self.db.commit()
                logging.info("Invalid proxy deleted")
                continue
    def main(self):
        # Crawl the free-proxy listing pages, then validate whatever was stored.
        for page in range(1, 5000):
            URL = f"https://www.kuaidaili.com/free/inha/{page}/"
            response = self.getResponse(url=URL)
            self.pool.submit(self.parseResponse, response)
            time.sleep(3)
        self.parseIp()
        self.db.close()
if __name__ == '__main__':
    while True:
        spider = IpPoolProject()
        spider.main()
        time.sleep(36000)
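
The script assumes a MySQL database named ippool that already contains a table of the same name with columns ip, host, stype and datetime (the port is kept in the host column). The original schema is not shown above; a minimal sketch of a table that matches the INSERT statement could be created like this (column types and lengths are assumptions):

# One-off setup sketch: create the ippool table the crawler writes into.
# Column names come from the INSERT statement above; the types are assumptions.
import pymysql

db = pymysql.connect(host="localhost", user="root", password="xxxxx", database="ippool")
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS ippool (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ip VARCHAR(64) NOT NULL,      -- proxy IP address
        host VARCHAR(16) NOT NULL,    -- proxy port (the script stores it in 'host')
        stype VARCHAR(16),            -- proxy type, e.g. HTTP
        `datetime` VARCHAR(32)        -- last-verified time scraped from the page
    ) CHARACTER SET utf8mb4
""")
db.commit()
db.close()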