# Imports
import requests
import time
from lxml import etree
from fake_useragent import UserAgent
import json
# Check whether a proxy is usable
def get_status(url, ht, ip, port):
    """Probe a proxy by fetching a test page through it.

    Args:
        url: Kept for interface compatibility with existing callers; the
            probe always targets Baidu (replace the hard-coded URL below
            if you have your own target site).
        ht: Proxy scheme as reported by the listing (e.g. "HTTP").
        ip: Proxy host address.
        port: Proxy port.

    Returns:
        int: HTTP status code of the probe (200 means the proxy works),
        or 0 if the request failed (dead proxy, timeout, refused, ...).
    """
    # Probe Baidu to judge usability; pass your own target via `url` if needed.
    url = 'https://www.baidu.com'
    headers = {
        'user-agent': UserAgent().random
    }
    proxies = {
        "http": "{ht}://{ip}:{port}".format(ht=ht, ip=ip, port=port),
    }
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        return res.status_code
    except requests.exceptions.RequestException:
        # Dead or slow proxies raise (Timeout, ConnectionError, ProxyError...);
        # previously that crashed the whole crawl. Report "unusable" instead.
        return 0
# Fetch and validate the proxy entries from a single listing page
def get_content(url):
    """Scrape one free-proxy listing page and persist the usable proxies.

    Parses the proxy table at `url`, validates each entry with
    ``get_status``, and appends working proxies to ``ip.json`` (one JSON
    object per line).

    Args:
        url: URL of a kuaidaili free-proxy listing page.

    Returns:
        list[dict]: One dict per row parsed from the page, regardless of
        whether the proxy validated — so the caller can distinguish an
        empty *page* (stop paginating) from a page of dead proxies.
    """
    headers = {
        'user-agent': UserAgent().random
    }
    res = requests.get(url, headers=headers, timeout=10)
    response = etree.HTML(res.content.decode())
    # The table rows may or may not be wrapped in <tbody>, depending on
    # how the page was served — match both layouts.
    tr_list = response.xpath('//*[@id="list"]/table/tbody/tr|//*[@id="list"]/table/tr')
    datas = []
    for tr in tr_list:
        try:
            data = {
                'IP': tr.xpath('./td[1]/text()')[0],
                'PORT': tr.xpath('./td[2]/text()')[0],
                '匿名度': tr.xpath('./td[3]/text()')[0],
                '类型': tr.xpath('./td[4]/text()')[0],
                '位置': tr.xpath('./td[5]/text()')[0],
                '响应速度': tr.xpath('./td[6]/text()')[0],
                '最后验证时间': tr.xpath('./td[7]/text()')[0],
            }
        except IndexError:
            # Header or malformed row with fewer than 7 cells — skip it
            # instead of crashing the whole page.
            continue
        # Record every parsed row, so the pagination loop in __main__ stops
        # on a genuinely empty page — not on a page whose proxies all failed.
        datas.append(data)
        try:
            # Validate the proxy: 200 means it is usable.
            statu = get_status(url, data['类型'], data['IP'], data['PORT'])
        except Exception:
            # A dead proxy must not abort the crawl of the rest of the page.
            continue
        print(statu)
        if statu == 200:
            with open('ip.json', 'a+', encoding='utf-8') as f:
                # ensure_ascii=False keeps the Chinese field values readable.
                da = json.dumps(data, ensure_ascii=False)
                f.write(da + '\n')
    return datas
# Crawl multiple pages in a loop
if __name__ == '__main__':
    # Walk the listing pages one by one until an empty page signals the end.
    page = 1
    while page < 3590:
        # Build the URL of the current listing page.
        page_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        print(page)
        page_rows = get_content(page_url)  # scrape and validate this page
        time.sleep(1)  # throttle: one page per second
        if len(page_rows) == 0:
            # Nothing parsed from this page — we have run off the end.
            break
        page += 1