使用requests+pyquery爬取dd373地下城跨五最新商品信息

废话不多说直接上代码:

  可以使用openpyxl库将爬取的信息写入Excel表格中,这部分代码我就不上传了

import requests
from urllib.parse import urlencode
from requests import RequestException
from pyquery import PyQuery as pq

def open_sh():
    """Download the first search-result page from dd373.

    Returns:
        The response body as text when the server answers with status 200;
        None when the status is anything else or the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    # Query-string filter appended to the listing URL.
    data = {
        "minPrice":333,
        "maxPrice":""
    }
    url = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-0.html?"+urlencode(data)
    try:
        # Without a timeout a stalled connection would block forever.
        # Timeout errors derive from RequestException, so they are handled
        # by the except clause below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print("链接错误",url)
        return None
    if response.status_code == 200:
        return response.text
    return None

def doc_page(html):
    """Print one dict per listing found in a search-result page.

    Args:
        html: Markup of a result page; it is parsed with PyQuery.

    Each printed dict holds the listing link ("地址"), the link text
    ("账号信息"), the price text with '元' appended ("价格") and the text of
    the "div.num.left" element ("是否存在").
    """
    listings = pq(html)("div.content").find(".box.money_ner")
    for listing in listings.items():
        # The title anchor supplies both the link and the description text.
        title_link = listing.find("a.titleText")
        price_text = listing.find("div.money_text strong span").text()
        stock_text = listing.find("div.num.left").text()
        product = {
            "地址": title_link.attr("href"),
            "账号信息": title_link.text(),
            "价格": price_text + '元',
            "是否存在": stock_text,
        }
        print(product)
def page_sh(pagebox):
    """Fetch result pages 1..pagebox and print the listings on each one.

    Every page is downloaded once. Its listings are printed through
    doc_page only when the response status is 200 and the pager inside the
    returned HTML highlights the page number that was requested.

    Args:
        pagebox: Number of result pages to walk (inclusive upper bound).

    Raises:
        requests.RequestException: Propagated when a request fails or
            times out.
        ValueError: Propagated from page_currentpage when the highlighted
            pager text is not an integer.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    data = {
        "minPrice": 333,
        "maxPrice": ""
    }
    # The query string is the same for every page, so build it once.
    query = urlencode(data)
    for page in range(1, pagebox + 1):
        url = "https://www.dd373.com/s/rbg22w-x9kjbs-wwf11b-0-0-0-qquvn4-0-0-0-0-0-0-0-%s.html?%s" % (page, query)
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            continue
        html = response.text
        # page_currentpage parses HTML, so it must receive the downloaded
        # markup rather than the URL string.
        if page_currentpage(html) == page:
            doc_page(html)

def page_currentpage(html):
    """Return the page number that the pager marks as current.

    Args:
        html: Markup of a result page; it is parsed with PyQuery.

    Returns:
        The text of the "a.nb.currentpage" element converted to int.
        Callers compare it with the page number they requested.
    """
    highlighted = pq(html)("a.nb.currentpage")
    return int(highlighted.text())

def page_box(html):
    """Return the total page count read from the pager.

    Args:
        html: Markup of a result page; it is parsed with PyQuery.

    Returns:
        The pager label text, minus its first nine characters and its last
        character, converted to int.
    """
    label = pq(html)(".pagebox.clear ul li.yeshu").text()
    # NOTE(review): the fixed slice assumes the label always carries a
    # 9-character prefix and a 1-character suffix around the number --
    # confirm against the live page.
    count_text = label[9:-1]
    return int(count_text)

def main():
    """Fetch the first result page, read the page count, then scrape every page."""
    html = open_sh()
    if not html:
        # open_sh returns None on a connection error or a non-200 status;
        # without markup there is no pager to read the page count from.
        print("获取首页失败")
        return
    page = page_box(html)
    page_sh(page)



# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

  

转载于:https://www.cnblogs.com/zhmiao/p/10684570.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值