import requests
import time
from bs4 import BeautifulSoup
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 Edg/81.0.416.64'
}
ip_dict = {}

# Fetch a URL and return the parsed BeautifulSoup object, or None on failure.
def get_source(url):
    r = requests.get(url, headers=header)
    if r.status_code == 200:
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        return soup
    else:
        print('Failed to fetch the page')
        return None
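# Optional hardening (a sketch, not part of the original script): a Session
# mounted with a urllib3 Retry policy retries transient failures with backoff
# instead of giving up on the first error. get_source_with_retry is a
# hypothetical drop-in for get_source above.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_source_with_retry(url, retries=3):
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=0.5)))
    r = session.get(url, headers=header, timeout=10)
    if r.status_code == 200:
        r.encoding = 'utf-8'
        return BeautifulSoup(r.text, 'html.parser')
    return None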
def get_url(URL):  # fetch one listing page when paginating
    soup = get_source(URL)
    return soup
# Parse one listing page and record each proxy in ip_dict,
# keyed by IP with values like 'HTTP://1.2.3.4:8080'.
def get_info(soup):
    items = soup.select('#list > table > tbody > tr')
    for item in items:
        IP = item.find('td', attrs={'data-title': 'IP'}).string
        PORT = item.find('td', attrs={'data-title': 'PORT'}).string
        TYPE = item.find('td', attrs={'data-title': '类型'}).string  # the 类型 (type) column
        ip_dict[IP] = TYPE + '://' + IP + ':' + PORT
# Validate each proxy by fetching Baidu through it; drop any proxy that
# errors out or takes longer than 0.1 s to respond.
def check_ip():
    for IP in list(ip_dict.keys()):
        try:
            proxy = ip_dict[IP].lower()  # requests expects a lowercase scheme
            # Both keys must be set, otherwise an https request bypasses the proxy.
            r = requests.get('https://www.baidu.com/',
                             proxies={'http': proxy, 'https': proxy},
                             headers=header, timeout=0.1)
            r.raise_for_status()  # raise on any non-2xx status code
        except requests.RequestException:
            del ip_dict[IP]
    print('All IPs checked; {} responded within 0.1 s.'.format(len(ip_dict)))
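# Optional variant (a sketch, not part of the original script): the sequential
# loop above blocks on every timeout, which is slow for large lists. A thread
# pool runs the same probe concurrently; check_ip_concurrent is a hypothetical
# helper with the same effect on ip_dict.
from concurrent.futures import ThreadPoolExecutor

def _probe(IP):
    try:
        proxy = ip_dict[IP].lower()
        r = requests.get('https://www.baidu.com/',
                         proxies={'http': proxy, 'https': proxy},
                         headers=header, timeout=0.1)
        r.raise_for_status()
        return IP, True
    except requests.RequestException:
        return IP, False

def check_ip_concurrent(workers=20):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for IP, ok in pool.map(_probe, list(ip_dict.keys())):
            if not ok:
                del ip_dict[IP]
    print('All IPs checked; {} responded within 0.1 s.'.format(len(ip_dict)))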
# Append the surviving proxies to ip.txt, one per line. Mode 'a' creates the
# file if it does not exist and appends after any existing content; other
# modes ('w+', 'w', 'wb', ...) could be substituted.
def save_ip():
    with open('ip.txt', 'a') as f:
        for IP in list(ip_dict.keys()):
            print(ip_dict[IP])
            f.write(ip_dict[IP])
            f.write('\n')  # newline, so each proxy sits on its own line
if __name__ == '__main__':
    url = 'https://www.kuaidaili.com/free/'
    for page_num in range(1, 2):  # widen the range to crawl more pages
        URL = url + 'inha/{}/'.format(page_num)
        print(URL)
        time.sleep(1)  # pause between pages to avoid hammering the site
        soup = get_url(URL)
        if soup is not None:  # skip pages that failed to download
            get_info(soup)
    check_ip()
    save_ip()
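Once the script has run, ip.txt holds one proxy per line. A minimal consumption sketch (a separate snippet, not part of the original post; httpbin.org/ip is just an example echo endpoint):

import requests

# Read the proxies saved by save_ip(), one per line.
with open('ip.txt') as f:
    proxies = [line.strip() for line in f if line.strip()]

if proxies:
    proxy = proxies[0].lower()  # requests expects a lowercase scheme
    # Route a request through the proxy; the response echoes the exit IP.
    r = requests.get('https://httpbin.org/ip',
                     proxies={'http': proxy, 'https': proxy},
                     timeout=5)
    print(r.text)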