python 爬虫-养生之道_爬虫爬养生类文章-CSDN博客

本文链接：https://blog.csdn.net/weixin_45668089/article/details/126637278

Bs 最近开始养生之道，需要一些材料。

所谓养生，动词也，亦可为名词。原指道家通过各种方法颐养生命、增强体质、预防疾病，从而达到延年益寿的一种医事活动。养，即调养、保养、补养之意；生，即生命、生存、生长之意。现代意义的“养生”指的是根据人的生命过程规律主动进行物质与精神的身心养护活动。

1.目标

http://jlxw.jiudaifu.com/

请添加图片描述

定位元素，有一个 href。

请添加图片描述

详情页的 url = 主页的 url + href。

请添加图片描述

2.开始获取所有的详情页 href

"""
    作者：mldsh
    日期：2022年08月26日10:20
    使用工具：PyCharm
"""
import requests
from icecream import ic
from lxml import etree


class AcuPoint(object):
    """Scraper that lists the sub-menu links found on the site's home page."""

    def __init__(self, timeout=10):
        """Store the home page URL and the request timeout.

        Args:
            timeout: Seconds to wait for the server before giving up; it is
                forwarded to ``requests.get``.
        """
        # Home page URL
        self.url = 'http://jlxw.jiudaifu.com/'
        self.timeout = timeout

    def get_index_html(self):
        """Download the home page and return its HTML as text.

        Raises:
            requests.RequestException: If the connection fails, the request
                times out, or the server answers with an HTTP error status.
        """
        # Without an explicit timeout, requests waits indefinitely on a
        # server that stops responding.
        response = requests.get(url=self.url, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        return response.text

    def get_href(self):
        """Print the href values of every sub-menu entry on the home page.

        Each ``<li class="menu_s center">`` element is queried for the href
        of the link inside its ``<div>``; the resulting list is printed with
        ``ic``. Nothing is returned.
        """
        index_html = self.get_index_html()
        tree = etree.HTML(index_html)
        detail_href = tree.xpath('//li[@class="menu_s center"]')
        for href_path in detail_href:
            href = href_path.xpath('./div/a/@href')
            ic(href)


# Script entry point: build the scraper and print the home page hrefs.
if __name__ == '__main__':
    AcuPoint().get_href()

看到第一个 href 是 “xinfo/70”

请添加图片描述

跑出来的第一个结果也是一样。

请添加图片描述

再看最后一个，也是一样的，说明主页的 href 已全部获取完成。

3.拼接详情页的 url，定位元素，获取数据。

"""
    作者：mldsh
    日期：2022年08月26日10:20
    使用工具：PyCharm
"""
import requests
from icecream import ic
from lxml import etree


class AcuPoint(object):
    """Scraper that maps the home page menu and dumps every theme page."""

    def __init__(self, timeout=10):
        """Store the home page URL and the request timeout.

        Args:
            timeout: Seconds to wait for the server before giving up; it is
                forwarded to ``requests.get``.
        """
        # Home page URL
        self.url = 'http://jlxw.jiudaifu.com/'
        self.timeout = timeout

    def _absolute_url(self, href):
        """Resolve *href* against the home page URL."""
        from urllib.parse import urljoin

        # urljoin also copes with hrefs that start with "/" or that are
        # already absolute, which plain string concatenation does not.
        return urljoin(self.url, href)

    def _get_html(self, url):
        """Download *url* and return the response body as text.

        Raises:
            requests.RequestException: If the connection fails, the request
                times out, or the server answers with an HTTP error status.
        """
        # Without an explicit timeout, requests waits indefinitely on a
        # server that stops responding.
        response = requests.get(url=url, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        return response.text

    def get_index_html(self):
        """Download the home page and return its HTML as text."""
        return self._get_html(self.url)

    def get_href(self):
        """Collect the theme and sub-entry links from the home page menu.

        Returns:
            A dict shaped like
            ``{theme name: {theme page URL: {entry name: entry URL}}}``.
            Menu items without a link or a caption are skipped.
        """
        all_url = {}
        tree = etree.HTML(self.get_index_html())
        for menu in tree.xpath('//li[@class="menu center"]'):
            theme_url = menu.xpath('./a/@href')
            theme_name = menu.xpath('./a/span/text()')
            if not theme_url or not theme_name:
                # Nothing to key this item on; indexing [0] would raise.
                continue

            # Read name and href from the same <a> element so that one
            # anchor without a caption cannot shift every following pair.
            sub_links = {}
            for anchor in menu.xpath('./ul/li[@class="menu_s center"]/div/a'):
                entry_href = anchor.xpath('./@href')
                entry_name = anchor.xpath('./span/text()')
                if entry_href and entry_name:
                    sub_links[entry_name[0]] = self._absolute_url(entry_href[0])

            all_url[theme_name[0]] = {
                self._absolute_url(theme_url[0]): sub_links
            }
        return all_url

    def detail_data(self):
        """Fetch every theme page and print its titles, text and image URLs.

        Only the theme page URLs are requested; the sub-entry URLs gathered
        by ``get_href`` are not visited here. For each ``<div class="main">``
        the heading texts and the paragraph texts (with the image ``src``
        values appended) are printed with ``ic``. Nothing is returned.

        Raises:
            requests.RequestException: If any page cannot be downloaded.
        """
        detail_url_dict = self.get_href()
        for theme_links in detail_url_dict.values():
            # Each value is a one-entry dict keyed by the theme page URL.
            theme_url = next(iter(theme_links))
            tree = etree.HTML(self._get_html(theme_url))
            for main_div in tree.xpath('//div[@class="main"]'):
                title = main_div.xpath('./ul/li/h2/text()')
                text = main_div.xpath('./ul/li/div/text()')
                img_url = main_div.xpath('./ul/li/div/img/@src')
                text.extend(img_url)
                ic(title, text)
                # No early return here: every theme page is processed, not
                # just the first one.



# Script entry point: build the scraper and dump the detail page data.
if __name__ == '__main__':
    AcuPoint().detail_data()

请添加图片描述