import time
import requests
from lxml import etree
from fake_useragent import UserAgent
def getPage(url):
    """Fetch *url* and return the requests.Response on HTTP 200, else None.

    A browser-like User-Agent header is sent because the target site may
    reject requests carrying the default client UA.

    Fixes over the original: a timeout so a dead server cannot hang the
    crawl forever, and connection errors are caught so one bad page does
    not abort the whole run.
    """
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 72.0.3626.81 Safari / 537.36 SE 2.X MetaSr 1.0'
    }
    try:
        res = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        # Network failure / timeout: report and signal "no page" to main().
        print("请求失败")
        return None
    if res.status_code == 200:
        print("请求成功")
        return res
    # Non-200 responses: explicit None (the original fell off the end).
    return None
def parseHTML(html):
    """Extract (ip, port) string pairs from a proxy-list page.

    html: a requests.Response whose body contains the proxy table
          (table id "GridViewOrder"; column 2 = IP, column 3 = port).
    Returns a list of (ip, port) tuples; an EMPTY LIST on parse failure.

    Bug fix: the original returned False on failure, and main() passed
    that straight into testIp(), which crashed on len(False). An empty
    list is still falsy, so existing truthiness checks keep working.
    """
    response = html.content.decode("utf-8")
    try:
        res_html = etree.HTML(response)
        ips = res_html.xpath('//table[@id="GridViewOrder"]//tr/td[2]/text()')
        ports = res_html.xpath('//table[@id="GridViewOrder"]//tr/td[3]/text()')
        return list(zip(ips, ports))
    except Exception:
        # Narrowed from a bare except: (which also swallowed
        # KeyboardInterrupt / SystemExit).
        print("错")
        return []
def testIp(alist):
    """Probe each (ip, port) proxy against httpbin and return the good ones.

    alist: iterable of (ip, port) string tuples.
    Returns the list of (ip, port) pairs that answered HTTP 200 within
    5 seconds — the original returned None even though main() assigned
    the result to `oklist`.
    """
    ua = UserAgent()
    url = 'http://httpbin.org/get'
    headers = {
        'User-Agent': ua.random
    }
    ok = []
    for ip, port in alist:
        # requests/urllib3 require a scheme on proxy URLs; the original's
        # bare "ip:port" form is rejected by current urllib3 releases.
        proxy = 'http://' + ip + ':' + port
        proxies = {
            'http': proxy,
            'https': proxy
        }
        try:
            res = requests.get(url, headers=headers, proxies=proxies, timeout=5)
            if res.status_code == 200:
                # ValueError from .json() (non-JSON body) is also caught below.
                print(res.json())
                ok.append((ip, port))
        except (requests.RequestException, ValueError):
            # Narrowed from a bare except: only network/decode failures.
            print('请求失败')
    return ok
def main(num):
    """Crawl page *num* of the proxy list and test every proxy found.

    num: 1-based page index; page 1 has no numeric suffix in its URL,
         later pages use the "index_<n>.htm" form.
    """
    if num == 1:
        url = 'http://ip.yqie.com/proxygaoni/index.htm'
    else:
        url = 'http://ip.yqie.com/proxygaoni/index_' + str(num) + '.htm'
    html = getPage(url)
    print(url)
    if html:
        print(1010)
        alist = parseHTML(html)
        print(alist)
        # Bug fix: parseHTML can return a falsy non-list on failure;
        # feeding that to testIp() raised a TypeError in the original.
        if alist:
            oklist = testIp(alist)
if __name__ == '__main__':
    # Crawl proxy-list pages 1 through 10, pausing two seconds between
    # pages so the target site is not hammered.
    for page in range(1, 11):
        print('当前正在爬取第', page, '页')
        main(page)
        time.sleep(2)