python 爬取kuaidaili ops页-提取数据
爬取快代理,开放代理页
- 提取相关的数据
- 导入excel表(后期直接写入数据库)
import os
import re
import time
from urllib.parse import urljoin

import openpyxl
import requests
from bs4 import BeautifulSoup
class KuaiDaiLi(object):
    """Scrape the kuaidaili.com open-proxy ("ops") listing into an Excel file.

    The workflow driven by :meth:`main` is:

    1. request the site home page with a shared ``requests`` session,
    2. open (or create) the output workbook,
    3. collect the page links found on the ``/ops/`` page,
    4. parse the proxy table of every linked page and append the rows to
       the first worksheet.

    Attributes:
        session: ``requests`` session reused for every HTTP request.
        excel: the ``openpyxl`` workbook being written, or ``None`` until
            :meth:`initialize_excel` has run.
        headers: HTTP headers sent with every request.
        count: not read anywhere in this class (presumably an intended
            retry count -- TODO confirm before relying on it).
        time: pause, in seconds, between two page requests in :meth:`main`.
    """

    # A table row is only usable when it has all eight columns:
    # IP, port, anonymity, type, GET/POST support, location, speed,
    # last verification time (same order as the header row written by
    # initialize_excel).
    _COLUMN_COUNT = 8

    def __init__(self):
        self.session = requests.Session()
        self.excel = None
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.count = 3
        self.time = 2

    def get_status(self, url, timeout=10):
        """Fetch *url* and return the response only when it succeeded.

        Args:
            url: address to request with the shared session and headers.
            timeout: seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            The response object when the status code is 200, otherwise
            ``None`` (also on connection errors and timeouts).
        """
        try:
            response = self.session.get(url, headers=self.headers, timeout=timeout)
        except requests.RequestException:
            # Connection refused, DNS failure, timeout, ...: report it the
            # same way as a non-200 answer instead of crashing the crawl.
            print("网络连接失败!")
            return None
        if response.status_code == 200:
            return response
        print("网络连接失败!")
        return None

    def index(self, url):
        """Return ``True`` when *url* could be fetched successfully."""
        return self.get_status(url) is not None

    def initialize_excel(self, path):
        """Load the workbook at *path*, or create it with a header row.

        The header row is written only for a newly created workbook, so
        re-running the script against an existing file does not insert a
        second header in the middle of the data.

        Args:
            path: location of the ``.xlsx`` file.
        """
        if os.path.exists(path):
            self.excel = openpyxl.load_workbook(path)
            return
        self.excel = openpyxl.Workbook()
        headline_data = {
            "IP": ["PORT", "匿名度", "类型", "get/post支持", "位置", "响应速度 ", "最后验证时间"],
        }
        self.write_to_excel(path, headline_data)

    def write_to_excel(self, path, info_dic):
        """Append one row per entry of *info_dic* to the first sheet and save.

        Args:
            path: file the workbook is saved to after appending.
            info_dic: mapping of IP -> list of the seven remaining column
                values. ``None`` or an empty mapping is accepted and makes
                the call a no-op (nothing appended, nothing saved).
        """
        if not info_dic:
            return
        sheet = self.excel[self.excel.sheetnames[0]]
        for ip, fields in info_dic.items():
            # Only the first seven values belong to the table columns.
            sheet.append((ip, *fields[:7]))
        self.excel.save(path)

    def page_parse(self, url):
        """Yield the absolute URL of every page link found on *url*.

        Links are the ``<a>`` elements matched by ``#listnav > ul > li > a``.
        Anchors without an ``href`` are skipped. Nothing is yielded when the
        page cannot be fetched.

        Args:
            url: address of the listing page holding the navigation links.

        Yields:
            Absolute page URLs, resolved against *url*.
        """
        response = self.get_status(url)
        if response is None:
            return
        soup = BeautifulSoup(response.text, "html5lib")
        for anchor in soup.select("#listnav > ul > li > a"):
            href = anchor.get("href")
            if not href:
                continue
            # urljoin handles both site-absolute ("/ops/...") and relative
            # hrefs, and never fails on a base URL without a path.
            yield urljoin(url, href)

    @classmethod
    def _cells_to_record(cls, texts):
        """Turn the cell texts of one table row into ``(ip, fields)``.

        Args:
            texts: text of each ``<td>`` of the row, in column order.

        Returns:
            ``(ip, [port, anonymity, type, get_post_support, location,
            speed, last_verification_time])`` with surrounding whitespace
            removed, or ``None`` when the row has fewer than eight cells or
            an empty IP cell.
        """
        if len(texts) < cls._COLUMN_COUNT:
            return None
        cells = [text.strip() for text in texts[:cls._COLUMN_COUNT]]
        ip = cells[0]
        if not ip:
            return None
        return ip, cells[1:]

    def parse(self, url):
        """Extract the proxy table of the page at *url*.

        Rows are the ``<tr>`` elements matched by ``#freelist tbody > tr``.
        Incomplete rows are skipped. Each accepted row is also printed.

        Args:
            url: address of one proxy listing page.

        Returns:
            A dict mapping IP -> list of the seven other column values, or
            ``None`` when the page could not be fetched. Because the dict
            is keyed by IP, a later row with the same IP replaces an
            earlier one.
        """
        print(url)
        response = self.get_status(url)
        if response is None:
            return None
        soup = BeautifulSoup(response.text, "html5lib")
        info_dic = {}
        for tr in soup.select("#freelist tbody > tr"):
            record = self._cells_to_record([td.text for td in tr.find_all("td")])
            if record is None:
                continue
            ip, fields = record
            print(ip, *fields)
            info_dic[ip] = fields
        return info_dic

    def main(self):
        """Crawl every linked listing page and store the rows in Excel.

        The output file is ``代理IP信息.xlsx`` in the current working
        directory. Pages that fail to download or contain no usable rows
        are skipped. The workbook is closed even if the crawl is
        interrupted by an exception.
        """
        # The home page is requested first and the outcome is ignored
        # (presumably to pick up session cookies -- TODO confirm).
        self.index("https://www.kuaidaili.com")
        path = os.path.abspath(os.path.join(os.getcwd(), "代理IP信息.xlsx"))
        self.initialize_excel(path)
        try:
            for page_url in self.page_parse("https://www.kuaidaili.com/ops/"):
                info_dic = self.parse(page_url)
                if info_dic:
                    self.write_to_excel(path, info_dic)
                # Pause between requests to avoid hammering the site.
                time.sleep(self.time)
        finally:
            self.excel.close()
# Script entry point: build the scraper and run the whole crawl.
if __name__ == '__main__':
    KuaiDaiLi().main()