Scrape proxy IPs from the free proxy site 66ip.cn (66代理网)
Tech stack:
- python3
- requests
- xpath (via lxml)
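
The listing imports a local userAgent module that is not shown here. Below is a minimal hypothetical sketch of what that helper might look like: only the interface, User_Agent.get_user_agent('pc') returning a headers dict, is inferred from the calls in the listing, and the User-Agent strings are placeholder examples, not values from the original project.

    # userAgent.py -- hypothetical stand-in for the helper module imported below.
    import random

    class User_Agent:
        PC_AGENTS = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
            '(KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        ]

        @staticmethod
        def get_user_agent(kind='pc'):
            # The returned dict is passed straight to requests.get(headers=...).
            return {'User-Agent': random.choice(User_Agent.PC_AGENTS)}

The scraper itself: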
import userAgent
import requests
from lxml import etree
class Proxies:
    """
    Scrape free proxies from 66ip.cn.

    Usage: proDicList = Proxies.get_proxies(4)
    """
    url = 'http://www.66ip.cn/'  # nationwide listing
    # url = 'http://www.66ip.cn/areaindex_1/'  # alternative: per-region listing
    proxies = []  # verified proxies, accumulated as a class attribute
    geshu = 0  # target number of proxies ("geshu" means "count")
    @staticmethod
    def get_one_proxy(count=1):
        proxy = Proxies.get_proxies(count)
        return proxy[0]
    @staticmethod
    def get_proxies(count):
        """
        Return a list of `count` verified proxy dicts.
        """
        Proxies.geshu = count
        i = 1  # incremented before use, so scraping starts at page 2
        while True:  # keep fetching pages until enough proxies are verified
            i += 1
            url_new = Proxies.url + str(i) + '.html'
            content = Proxies.get_content(url_new)
            Proxies.get_info(content)  # parse and verify one page of proxies
            if len(Proxies.proxies) >= count:
                return Proxies.proxies[:count]
    @staticmethod
    def get_content(url):
        """
        Fetch the page body for the given URL.
        """
        headers = userAgent.User_Agent.get_user_agent('pc')
        # timeout is an added safeguard so a dead page cannot hang the crawl
        response = requests.get(url=url, headers=headers, timeout=5)
        return response.text
    @staticmethod
    def get_info(content):
        """
        Parse IPs and ports out of the listing table, then verify each pair.
        """
        tree = etree.HTML(content)
        # Column 1 is the IP, column 2 the port; position()>1 skips the header row.
        datas_ips = tree.xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[1]/text()')
        datas_ports = tree.xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[2]/text()')
        for ip, port in zip(datas_ips, datas_ports):
            Proxies.verif_ip(ip, port)  # keep only proxies that pass verification
            if len(Proxies.proxies) >= Proxies.geshu:
                break
    @staticmethod
    def verif_ip(ip, port):
        """
        Check that a proxy actually works by fetching a known page through it.
        """
        headers = userAgent.User_Agent.get_user_agent('pc')
        # Try the proxy over http first, then https. The very short timeout
        # deliberately filters out slow proxies.
        for scheme in ('http', 'https'):
            # The test URL must match the proxy scheme, otherwise requests
            # ignores the proxy entry and the check proves nothing.
            test_url = f'{scheme}://www.baidu.com'
            proxies = {scheme: f'{scheme}://{ip}:{port}'}
            try:
                res = requests.get(url=test_url, proxies=proxies,
                                   headers=headers, timeout=0.1)
                if res.status_code == 200:
                    Proxies.proxies.append(proxies)  # valid: save on the class attribute
                    return True
            except requests.RequestException:
                continue
        return False
if __name__ == '__main__':
    print(Proxies.get_one_proxy())  # a single proxy dict
    print(Proxies.get_proxies(10))  # a list of ten proxy dicts
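
Each entry in Proxies.proxies is already shaped as a requests-style proxy mapping, so it can be passed straight to requests.get. A minimal usage sketch; http://httpbin.org/ip is only an example echo endpoint used here for illustration, not something the scraper itself relies on:

    # Route a request through one scraped proxy. httpbin.org/ip echoes the
    # IP the request arrived from, which should be the proxy's address.
    proxy = Proxies.get_one_proxy()  # e.g. {'http': 'http://1.2.3.4:8080'}
    res = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
    print(res.text)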