# Paginated data scraping (proxy-list crawler)
import time
import requests
from lxml import etree
from fake_useragent import UserAgent
# Page-request helper
def getPage(url):
    """Fetch *url* and return the requests.Response on HTTP 200.

    Returns None when the request fails or the server answers with a
    non-200 status, so callers must check the result before using it
    (main() already does: ``if html:``).
    """
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 72.0.3626.81 Safari / 537.36 SE 2.X MetaSr 1.0'
    }
    try:
        # timeout prevents the crawler from hanging forever on a dead host
        res = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException:
        return None
    if res.status_code == 200:
        print("请求成功")
        return res
    # explicit None (the original fell through and returned None implicitly)
    return None
# Page-parsing helper
def parseHTML(html):
    """Extract (ip, port) pairs from a proxy-list page.

    html: a requests.Response whose body is UTF-8 HTML containing a
          table with id "GridViewOrder" (ip in column 2, port in column 3).
    Returns a list of (ip, port) string tuples, or False on parse failure
    (main() must guard against the False case before iterating).
    """
    response = html.content.decode("utf-8")
    try:
        res_html = etree.HTML(response)
        ips = res_html.xpath('//table[@id="GridViewOrder"]//tr/td[2]/text()')
        ports = res_html.xpath('//table[@id="GridViewOrder"]//tr/td[3]/text()')
        return list(zip(ips, ports))
    # narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate
    except Exception:
        print("错")
        return False
# Test whether each scraped proxy actually works
def testIp(alist):
    """Probe each (ip, port) proxy against httpbin and report the live ones.

    alist: iterable of (ip, port) string tuples as produced by parseHTML().
    Returns the list of (ip, port) tuples that answered with HTTP 200
    (the original collected nothing, although main() assigns the result).
    """
    ua = UserAgent()
    url = 'http://httpbin.org/get'
    headers = {
        'User-Agent': ua.random
    }
    working = []
    for ip, port in alist:
        # requests expects a scheme on proxy URLs; a bare host:port is
        # interpreted inconsistently, so build full http:// URLs.
        proxy = 'http://' + ip + ':' + port
        proxies = {
            'http': proxy,
            'https': proxy
        }
        try:
            res = requests.get(url, headers=headers, proxies=proxies, timeout=5)
            if res.status_code == 200:
                data = res.json()
                print(data)
                working.append((ip, port))
        # narrowed from a bare `except:`; ValueError covers a non-JSON body
        except (requests.RequestException, ValueError):
            print('请求失败')
    return working
# Main driver: build the page URL, fetch, parse, then test the proxies.
def main(num):
    """Scrape page *num* of the proxy list and test every proxy found.

    num: 1-based page number; page 1 has no index suffix in its URL.
    """
    # URL assembly
    if num == 1:
        url = 'http://ip.yqie.com/proxygaoni/index.htm'
    else:
        url = 'http://ip.yqie.com/proxygaoni/index_' + str(num) + '.htm'
    # fetch the page
    html = getPage(url)
    print(url)
    if html:
        print(1010)
        # parse out the (ip, port) pairs
        alist = parseHTML(html)
        print(alist)
        # parseHTML returns False on failure; the original passed that
        # straight to testIp, which then crashed on len(False)
        if alist:
            # probe the scraped proxies to see which ones work
            oklist = testIp(alist)
        # TODO: write the working proxies (oklist) to a file
if __name__ == '__main__':
    # Crawl pages 1 through 10, pausing briefly between requests
    # to avoid hammering the server.
    for page in range(1, 11):
        print('当前正在爬取第', page, '页')
        main(page)
        time.sleep(2)
# (leftover text from the source web page, kept as comments so the file parses)
# python 分页爬取数据
# 最新推荐文章于 2023-10-06 11:00:59 发布