python 爬取所有页面的对应数据

最新推荐文章于 2024-07-07 08:00:00 发布

物是人非gxd

最新推荐文章于 2024-07-07 08:00:00 发布

阅读量4.1k

点赞数

分类专栏： python 爬虫文章标签： python

本文链接：https://blog.csdn.net/wushirenfeig/article/details/83787077

版权

python 同时被 2 个专栏收录

13 篇文章 0 订阅

订阅专栏

爬虫

1 篇文章 0 订阅

订阅专栏

一般来说，不同页码的 URL 只是末尾 page= 或 p 等参数后面的数值不同，因此只需依次替换该数值即可遍历所有页面；总页数可以从“尾页”链接对应的 URL 中找到（最后一页的页码即总页数）。

案例一：

#!/usr/bin/env python

# -*- coding: utf-8 -*-
import pymysql  # 导入 pymysql
import re
import time
import datetime
import requests
import string
from lxml import etree
# Crawl every result page of the wdzj.com platform search and print, for each
# listed platform, its name, problem tag, address and overall score.

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Upper bound (seconds) for each HTTP request, so a stalled connection cannot
# hang the crawl forever (requests has no timeout by default).
timeout = 10

# Fetch the first page only to discover how many pages exist.
response = requests.get('https://www.wdzj.com/dangan/search?filter=e3', headers=headers, timeout=timeout)
response.raise_for_status()  # fail loudly instead of parsing an error page
r = response.text
html = etree.HTML(r, etree.HTMLParser())
# The last "currentnum" attribute found under #showTable is used as the total
# page count.
s = html.xpath('//*[@id="showTable"]/div/div//@currentnum')
if not s:
    raise RuntimeError('page count not found on the first page; the page layout may have changed')
total_page = int(s[-1])

for i in range(1, total_page + 1):
    # Pages differ only in the value of the currentPage query parameter.
    final_url = 'https://www.wdzj.com/dangan/search?filter=e3&currentPage=' + str(i)
    response = requests.get(final_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    r = response.text
    html = etree.HTML(r, etree.HTMLParser())
    r1 = html.xpath('//h2/a')                        # platform name links
    r2 = html.xpath('//h2/div[@class="itemTag"]')    # problem tags
    # The third "itemConBox" of each entry is treated as the address field.
    r3 = html.xpath('//a[@class="itemConLeft"]/div[@class="itemConBox"][3]')
    r4 = html.xpath('//a[@class="itemConLeft"]/div[@class="itemConBox bgBox"]/strong')  # scores
    # Walk the four node lists in lockstep. zip() stops at the shortest list,
    # so a page with fewer entries than usual (e.g. the last page) is handled
    # without an IndexError and without assuming a fixed page size.
    for name_node, problem_node, address_node, score_node in zip(r1, r2, r3, r4):
        name = name_node.xpath('string(.)')
        problem = problem_node.xpath('string(.)')
        address = address_node.xpath('string(.)')
        score = score_node.xpath('string(.)')
        print(name, problem.strip(), address, "综合评分：", score)

案例二：

#!/usr/bin/env python

# -*- coding: utf-8 -*-
import pymysql  # 导入 pymysql
import re
import time
import datetime
import requests
from lxml import etree
import string
from bs4 import BeautifulSoup
# Crawl every page of the p2peye.com overall platform ranking and print, for
# each table row, its rank, platform name, score and detail-page URL.

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Upper bound (seconds) for each HTTP request, so a stalled connection cannot
# hang the crawl forever (requests has no timeout by default).
timeout = 10

# Fetch the first page only to discover how many pages exist.
response = requests.get('https://www.p2peye.com/platform/zonghe/', headers=headers, timeout=timeout)
response.raise_for_status()  # fail loudly instead of parsing an error page
r = response.text
html = etree.HTML(r, etree.HTMLParser())
# NOTE(review): presumably the 7th pager link is the "last page" link; this
# absolute XPath is tied to the page layout - verify against the live page.
s = html.xpath('//*[@id="nv_platform"]/div[14]/div/div[1]/div/div/div[3]/div/div/a[7]/@href')
if not s:
    raise RuntimeError('last-page link not found on the first page; the page layout may have changed')
# Use only the LAST run of digits in the href as the page count. Stripping
# every non-digit character would also keep the "2" of "p2peye" whenever the
# href is an absolute URL, producing a wrong count.
page_numbers = re.findall(r'\d+', str(s[0]))
if not page_numbers:
    raise RuntimeError('no page number found in last-page link: ' + str(s[0]))
total_page = int(page_numbers[-1])

for i in range(1, total_page + 1):
    # Pages differ only in the number that follows the trailing "p".
    final_url = 'https://www.p2peye.com/platform/zonghe/p' + str(i)
    response = requests.get(final_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    r = response.text
    html = etree.HTML(r, etree.HTMLParser())
    r0 = html.xpath('//td[1]')                          # rank cells
    r1 = html.xpath('//td[2]/a[@class="color-blue"]')   # platform name links
    r2 = html.xpath('//tbody//td[3]')                   # score cells
    r3 = html.xpath('//tbody//td[2]/a/@href')           # detail-page URLs
    # Walk the four result lists in lockstep. zip() stops at the shortest
    # list, so a page with fewer rows than usual (e.g. the last page) is
    # handled without an IndexError and without assuming a fixed page size.
    for rank_node, name_node, score_node, url in zip(r0, r1, r2, r3):
        rank = rank_node.xpath('string(.)')
        name = name_node.xpath('string(.)')
        score = score_node.xpath('string(.)')
        print(rank, name, score, url)