简介
- 抓取快代理-开放代理栏目;
- 默认抓取所有的数据,共10页;
- 抓取关键词: ip port;
- 抓取时间间隔默认为2s,间隔太短会导致抓取失败;
- 保存形式:text文本
创建KuaiDaiLiOps.py
import os
import re
import time
import requests
from bs4 import BeautifulSoup
class KuaiDaiLiOps(object):
    """Scrape the kuaidaili.com open-proxy ("ops") listing and save ip:port pairs to a text file."""

    def __init__(self):
        # One shared session so cookies set by the index page carry over to
        # the listing pages.
        self.session = requests.session()
        self.proxies = None        # optional requests-style proxy mapping
        self.timeout = 5           # per-request timeout in seconds
        self.time_interval = 2     # seconds between page fetches; too short gets blocked
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,"
                      "application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch, br",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Connection": "Keep-Alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/55.0.2883.87 Safari/537.36",
        }

    def get_status(self, url):
        """
        Fetch *url* and return the response when the request succeeds.

        :param url: absolute URL to request
        :return: requests.Response on HTTP 200, otherwise False
        """
        # FIX: a timeout or connection error previously propagated out of
        # session.get() and killed the whole run; treat it like a bad status.
        try:
            response = self.session.get(
                url=url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=self.timeout,
            )
        except requests.RequestException as e:
            print("ERROR: 网络连接失败! error: %s url: %s" % (e, url))
            return False
        if response.status_code == 200:
            return response
        print("ERROR: 网络连接失败! status: %s url: %s" % (response.status_code, url))
        return False

    def get_index(self, url):
        """
        Visit the site index once to establish the session (cookies).

        :param url: site root URL
        :return: True when the index page was reachable, False otherwise
        """
        response = self.get_status(url)
        if response:
            print("首页,建立连接...")
            return True
        print("ERROR: 首页访问失败!")
        return False

    def parse_page(self, url):
        """
        Collect the pagination links of the listing (the site exposes ~10 pages).

        :param url: URL of the first listing page
        :return: list of absolute page URLs, or None when the page could not
            be fetched or the base URL could not be derived
        """
        response = self.get_status(url)
        if not response:
            return None
        soup = BeautifulSoup(response.text, "html5lib")
        pages = soup.select("#listnav > ul > li > a")
        # FIX: the scheme+host was re-extracted inside the loop from a
        # variable the loop itself overwrote; derive it once from the input.
        match = re.findall(r"(https://.*?)/", url)
        if not match:
            return None
        base_url = match[0]
        return [base_url + page.get("href") for page in pages]

    def parse_html(self, url):
        """
        Parse one listing page and extract its proxies as "ip:port\\n" strings.

        :param url: listing page URL
        :return: list of "ip:port\\n" strings (may be empty), or None when the
            page could not be fetched
        """
        print(url)
        response = self.get_status(url)
        if not response:
            return None
        soup = BeautifulSoup(response.text, "html5lib")
        ip_port_list = []
        for tr in soup.select("#freelist tbody > tr"):
            tds = tr.find_all("td")
            # FIX: the original looped range(len(tds)) while always reading
            # tds[0]/tds[1], appending the same entry once per table column.
            # Read the ip/port cells exactly once per row.
            if len(tds) < 2:
                continue
            ip = tds[0].text
            port = tds[1].text
            if not ip:
                continue
            ip_port_list.append(ip + ":" + port + "\n")
        return ip_port_list

    @staticmethod
    def write_to_text(path, content):
        """
        Append *content* (an iterable of strings) to the text file at *path*.

        :param path: target file path; normalized to an absolute path
        :param content: iterable of lines to append; falsy values (e.g. the
            None returned by a failed parse_html) are silently skipped
        """
        # FIX: writelines(None) raised TypeError whenever a page fetch failed.
        if not content:
            return
        path = os.path.abspath(path)
        with open(path, 'a+', encoding='utf-8') as f:
            f.writelines(content)

    def main(self):
        """Crawl every listing page and append the scraped ip:port pairs to IP.txt."""
        url = "https://www.kuaidaili.com"
        self.get_index(url)
        url = "https://www.kuaidaili.com/ops/"
        url_list = self.parse_page(url)
        # FIX: parse_page returns None when the listing page is unreachable;
        # iterating None crashed here.
        if not url_list:
            print("ERROR: 页面解析失败!")
            return
        path = os.path.abspath(os.path.join(os.getcwd(), "IP.txt"))
        for url in url_list:
            ip_port_list = self.parse_html(url)
            self.write_to_text(path, ip_port_list)
            # Throttle between pages so the site does not block us.
            time.sleep(self.time_interval)
if __name__ == '__main__':
    # Script entry point: build the scraper and run the full crawl.
    KuaiDaiLiOps().main()