Python 爬虫：如果解析到的文本为 style="display: none"、没有找到相关内容，该怎样解决——修改元素属性以获取新的内容

最新推荐文章于 2024-05-13 17:11:26 发布

置顶 Hou_Monkey

最新推荐文章于 2024-05-13 17:11:26 发布

阅读量7k

点赞数 1

分类专栏：爬虫文章标签： python 正则表达式

本文链接：https://blog.csdn.net/hou9876543210/article/details/105881759

版权

爬虫专栏收录该内容

15 篇文章 4 订阅

订阅专栏

在爬取网页内容的时候，有时会遇到 text 的内容为空的情况，如下图所示。这应该就是 display: none 的问题；遇到这样的问题，需要改变 CSS 的 style 中的内容，这需要用到 pyquery 这个库。

在这里插入图片描述

下图是在 F12 开发者工具中看到的原始页面，其中 display: block 的内容是显示出来的。所以爬取的时候，内容以实际得到的内容为准，也就是 requests.get(url = " ",headers = " ").text 的返回值。

在这里插入图片描述

不多说，直接上代码。下面有详细代码，本文是用正则表达式来提取内容的，代码有不懂的地方可以评论哈。最终结果如图所示。

import requests
import re
from pyquery import PyQuery as pq


"""
        Scrape brand information from soupu.com.
        url = http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0
"""

class SoPu(object):
    """Scraper for the brand listing page on soupu.com.

    ``get_url_list`` downloads the listing page and ``get_content`` extracts
    one record per listed brand from that HTML using regular expressions.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    # Placeholder used for a field whose pattern does not match.
    UNKNOWN = '未知'

    # Cuts the page into one HTML fragment per listed brand.
    _ENTRY_RE = re.compile(r'class="table_style2">(.*?)</p> </div>', re.M | re.S)

    # (label, pattern) pairs in output order.  The label is both the printed
    # prefix and the key in the returned record.  Patterns compiled without
    # re.S only match when the whole field sits on a single line.
    _FIELD_PATTERNS = (
        ('Shop_name', re.compile(r"<img.*?alt='(.*?)'.*? />", re.M | re.S)),
        ('Shop_img_url', re.compile(r"<img.*?src='(.*?)'.*? />", re.M | re.S)),
        ('Compay_namme', re.compile(r" <td width='255'>(.*?)</td>", re.M | re.S)),
        ('Compay_adress', re.compile(r"<td width='170'>(.*?)</td>")),
        ('Compay_type', re.compile(r"<td width='170'>.*? <td>(.*?)</td>")),
        ('Compay_eara', re.compile(r"<td width='170'>.*? <td>.*?<td>(.*?)</td>")),
        ('Compay_extend', re.compile(r"<td width='255'>.*?<td width='255'>(.*?)</td>")),
        ('Compay_data', re.compile(r"<td width='170'>.*?<td width='170'>(.*?)</td>")),
        ('Compay_updata_data', re.compile(r"<p class='a999'>(.*?)</p>")),
        ('Compay_follow', re.compile(r"<span class='a_f_Georgia'>(.*?)</span>")),
    )

    def __init__(self) -> None:
        """Store the listing URL and the request headers sent with it."""
        self.url = "http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0"
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }

    def get_url_list(self):
        """Download the listing page and return its HTML.

        Despite the name, this returns the page source as a single string,
        not a list of URLs.

        Returns:
            The response body as ``str``, or ``None`` when the request fails
            (connection error, timeout, or an HTTP error status).  A message
            is printed in the failure case.
        """
        # ``response.text`` is the raw page source, so elements styled with
        # ``display: none`` are already part of it.  Editing attributes on a
        # parsed copy of the page cannot change what the server sends, which
        # is why a single request is enough here.
        try:
            response = requests.get(
                url=self.url,
                headers=self.header,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as err:
            print("无法获取本页面内容", err)
            return None
        return response.text

    def get_content(self, html):
        """Extract and print the fields of every brand entry found in *html*.

        For each entry, every field is printed as ``<label>: <value>`` in the
        order given by ``_FIELD_PATTERNS``; a field that cannot be found is
        reported as ``UNKNOWN``.

        Args:
            html: Page source as returned by ``get_url_list``.  ``None`` or
                an empty string yields no entries.

        Returns:
            A list with one ``dict`` per entry, mapping each label to the
            extracted value.
        """
        records = []
        if not html:
            return records

        for fragment in self._ENTRY_RE.findall(html):
            record = {}
            for label, pattern in self._FIELD_PATTERNS:
                found = pattern.search(fragment)
                value = found.group(1) if found is not None else self.UNKNOWN
                record[label] = value
                print(label + ':', value)
            records.append(record)
        return records

    def net_page(self):
        """Placeholder; not implemented."""
        pass
if __name__ == '__main__':
    # Script entry point: download the listing page, then print every brand
    # entry parsed out of it.
    sopu = SoPu()
    html = sopu.get_url_list()
    # get_url_list() has a failure path that returns None; skip parsing then
    # instead of handing None to the regex-based parser.
    if html is not None:
        sopu.get_content(html)

Hou_Monkey

关注

1
点赞
踩
11

收藏

觉得还不错? 一键收藏
6
评论
Python 爬虫：如果解析到的文本为 style="display: none"、没有找到相关内容，该怎样解决——修改元素属性以获取新的内容

在爬取网页内容的时候，有时会遇到 text 的内容为空的情况，如下图所示。这应该就是 display: none 的问题；遇到这样的问题，需要改变 CSS 的 style 中的内容，这需要用到 pyquery 这个库。下图是在 F12 开发者工具中看到的原始页面，其中 display: block 的内容是显示出来的。所以爬取的时候，内容以实际得到的内容为准，也就是 requests.get(url = " ",headers = " ").text 的返回值。不多说，直接上代码，下面有详...
复制链接

扫一扫