一般来说,不同页码的 URL 只在末尾的分页参数(如 page=、p 等)上有区别,翻页时只需替换该参数后面对应的数值即可;总页数可以从"尾页"链接对应的 URL 中找到最后一页的页码得到。
案例一:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql # 导入 pymysql
import re
import time
import datetime
import requests
import string
from lxml import etree
# Walk every results page of the wdzj.com platform search and print, for each
# listed platform, its name, tag text, address and overall score.
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever
SEARCH_URL = 'https://www.wdzj.com/dangan/search?filter=e3'
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Fetch the first page once to read the pager: the last `currentnum`
# attribute found under #showTable is taken as the total number of pages.
response = requests.get(SEARCH_URL, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
r = response.text
html = etree.HTML(r, etree.HTMLParser())
s = html.xpath('//*[@id="showTable"]/div/div//@currentnum')
if not s:
    # Fail with a readable message instead of a bare IndexError on s[-1].
    raise RuntimeError('pager not found on ' + SEARCH_URL + '; cannot determine the page count')
total_page = int(s[-1])

for i in range(1, total_page + 1):
    # The page number is passed as a second query parameter, so it must be
    # joined with '&' ("&currentPage="); a mangled separator would make the
    # server ignore the page number.
    final_url = SEARCH_URL + '&currentPage=' + str(i)
    response = requests.get(final_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    r = response.text
    html = etree.HTML(r, etree.HTMLParser())
    r1 = html.xpath('//h2/a')
    r2 = html.xpath('//h2/div[@class="itemTag"]')
    r3 = html.xpath('//a[@class="itemConLeft"]/div[@class="itemConBox"][3]')
    r4 = html.xpath('//a[@class="itemConLeft"]/div[@class="itemConBox bgBox"]/strong')
    # Iterate the four node lists in lockstep rather than over a fixed
    # range(25): zip() stops at the shortest list, so a page with fewer
    # entries (typically the last one) no longer raises IndexError.
    for name_node, problem_node, address_node, score_node in zip(r1, r2, r3, r4):
        # string(.) concatenates all text beneath the node.
        name = name_node.xpath('string(.)')
        problem = problem_node.xpath('string(.)')
        address = address_node.xpath('string(.)')
        score = score_node.xpath('string(.)')
        print(name, problem.strip(), address, "综合评分:", score)
案例二:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql # 导入 pymysql
import re
import time
import datetime
import requests
from lxml import etree
import string
from bs4 import BeautifulSoup
# Walk every page of the p2peye.com overall ranking and print, for each row,
# its rank, platform name, score and detail-page link.
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever
RANKING_URL = 'https://www.p2peye.com/platform/zonghe/'
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Fetch the first page once and read the href of the 7th pager link; the page
# number embedded in that href is used as the total number of pages.
response = requests.get(RANKING_URL, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
r = response.text
html = etree.HTML(r, etree.HTMLParser())
s = html.xpath('//*[@id="nv_platform"]/div[14]/div/div[1]/div/div/div[3]/div/div/a[7]/@href')
if not s:
    # Fail with a readable message instead of a bare IndexError on s[0].
    raise RuntimeError('last-page link not found on ' + RANKING_URL + '; cannot determine the page count')
# Use the LAST run of digits in the href. Joining every digit in the string
# would also pick up the "2" of the host name "p2peye" whenever the href is an
# absolute URL, yielding a wrong page count.
digit_runs = re.findall(r'\d+', str(s[0]))
if not digit_runs:
    raise RuntimeError('no page number in last-page link: ' + str(s[0]))
total_page = int(digit_runs[-1])

for i in range(1, total_page + 1):
    # Page N of the ranking lives at <RANKING_URL>pN.
    final_url = RANKING_URL + 'p' + str(i)
    response = requests.get(final_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    r = response.text
    html = etree.HTML(r, etree.HTMLParser())
    r0 = html.xpath('//td[1]')
    r1 = html.xpath('//td[2]/a[@class="color-blue"]')
    r2 = html.xpath('//tbody//td[3]')
    r3 = html.xpath('//tbody//td[2]/a/@href')
    # Iterate the four result lists in lockstep rather than over a fixed
    # range(40): zip() stops at the shortest list, so a page with fewer rows
    # (typically the last one) no longer raises IndexError.
    for rank_node, name_node, score_node, url in zip(r0, r1, r2, r3):
        # string(.) concatenates all text beneath the node.
        rank = rank_node.xpath('string(.)')
        name = name_node.xpath('string(.)')
        score = score_node.xpath('string(.)')
        print(rank, name, score, url)