Python 解析 HTML，提取注释部分

最新推荐文章于 2024-08-27 10:00:00 发布

含泪呵呵

最新推荐文章于 2024-08-27 10:00:00 发布

阅读量2.4k

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/weixin_41710606/article/details/86089605

版权

python 专栏收录该内容

3 篇文章 1 订阅

订阅专栏

在这里插入图片描述

from bs4 import BeautifulSoup,Comment
import requests
def get_name(url):
    """Scrape company names and financial figures from a listing page.

    Downloads ``url``, then walks every ``<ul class="hot-search clear">``
    and each ``<div class="inf">`` inside it. For every such div the text
    of its ``<h2>`` is recorded as the company name. The financial figures
    live in markup that the page has commented out: the *second* HTML
    comment inside the div is re-parsed as HTML, and for each ``<p>`` whose
    text is exactly one of the known labels, the text of the next
    ``<span>`` is recorded.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A dict of lists with the keys ``company_name``, ``code``, ``city``,
        ``industry``, ``register``, ``income`` and ``profit``. ``city`` and
        ``industry`` are never filled by this function. A financial list
        only receives an entry when its label is found, so the lists are
        not guaranteed to have equal lengths.

    Raises:
        AttributeError: If a company div has no ``<h2>`` element.
    """
    test = {
        'company_name': [],
        'code': [],
        'city': [],
        'industry': [],
        'register': [],
        'income': [],
        'profit': [],
    }
    # Label text of a <p> in the commented-out markup -> result key.
    label_to_key = {
        '注册资金': 'register',
        '营业收入': 'income',
        '净利润': 'profit',
    }

    # NOTE(review): `headers` is not defined in this function; it has to
    # exist at module level before the call — confirm where it is set.
    # NOTE(review): verify=False disables TLS certificate verification;
    # kept as in the original, but confirm it is really required.
    req = requests.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(req.text, 'html.parser')

    # Iterating an empty result set is a no-op, so no separate
    # "is there anything?" query is needed before each loop.
    for i in soup.find_all('ul', {'class': 'hot-search clear'}):
        for j in i.find_all('div', {'class': 'inf'}):
            name = j.find('h2').text  # company name
            test['company_name'].append(name)
            # NOTE(review): `code` is not assigned anywhere in this
            # function; unless it is a module-level name this line raises
            # NameError — confirm how the company code should be obtained.
            test['code'].append(code)

            comments = j.find_all(
                string=lambda text: isinstance(text, Comment))
            if len(comments) < 2:
                # No second comment -> no financial markup for this company.
                continue
            info = BeautifulSoup(comments[1], 'html.parser')
            for s in info.find_all('p'):
                key = label_to_key.get(s.text)
                if key is None:
                    continue
                span = s.find_next('span')
                if span is not None:
                    test[key].append(span.text)
    return test
# Run the scraper against one listing page and keep the resulting dict of
# lists in `info`.
info = get_name(
    'https://www.ccotc.cn/Enterprise/index/cid/69/lid/84/industryid/C/cityid/10/businessIncomeId/A/order/4/p/2.html'
)