python xpath 解析html文件

该代码段主要用于解析HTML网页,提取项目名称、URL、类别评分、项目标签等信息,并将数据写入CSV文件。它遍历指定目录下的多个文件,对每个文件进行相同的操作,包括获取项目网站、项目评分、发布时间和问题及其回答等关键数据。
摘要由CSDN通过智能技术生成

def _strip_layout_whitespace(text):
    """Remove double spaces, then newlines, then carriage returns from *text*."""
    return text.replace('  ', '').replace('\n', '').replace('\r', '')


def jiexi_html_files():
    """Parse the saved HTML pages and write one CSV file per project.

    The page at ``files_dir_path`` is read first to build a mapping from the
    text of each project link to its ``href``.  Every file in the current
    directory whose name starts with ``pqrs_project`` is then parsed, and one
    CSV row per question is written to ``output_path + <symbol> + '.csv'``.

    Relies on names defined outside this function: ``files_dir_path``,
    ``output_path``, ``transform_month``, ``etree`` and ``os``.

    Raises:
        IndexError: XPath results and the file-name pieces are indexed
            directly, so a page or file name that lacks an expected part
            aborts the run.
    """
    # NOTE(review): the input pages are read with the locale's default
    # encoding while the CSV is written as UTF-8 -- confirm the encoding of
    # the saved pages and pass it to open() explicitly.
    with open(files_dir_path, 'r') as index_file:
        index_page = etree.HTML(index_file.read())

    # Link text -> href for every project link on the index page.
    project_url_dict = {}
    for link in index_page.xpath("//a[@class='ReviewCell_review_head_project_name__iy66Q']"):
        href = link.xpath("@href")[0]
        name = link.xpath('text()')[0]
        project_url_dict[name] = href

    files_dir = './'
    title = 'clumn1,clumn2,clumn3,clumn4,clumn5,clumn6,clumn7,clumn8,clumn9,clumn10,clumn11,clumn12,clumn13\n'

    for file in os.listdir(files_dir):
        if not file.startswith('pqrs_project'):
            continue

        # Read the whole page and release the handle before parsing.
        with open(os.path.join(files_dir, file), 'r') as page_file:
            print('当前处理文件:', file)
            html = etree.HTML(page_file.read())

        # Project website; stays at the placeholder unless exactly one
        # website tile is present.
        web_url = '无'
        website = html.xpath("//div[@class='OverviewSection_text_tile__3fjPH OverviewSection_protocol_website__1kGWZ']")
        if len(website) == 1:
            web_url = website[0].xpath("./a/text()")[0]

        # Category name -> score, with a '%' suffix appended to the score.
        class_dict = {}
        for head in html.xpath("//div[@class='AuditSummary_section_head__34n4V']"):
            class_name = _strip_layout_whitespace(head.xpath('./button/text()')[0])
            class_value = head.xpath('./span/text()')[0]
            class_dict[class_name] = '{}%'.format(class_value)

        # The project symbol is the fourth underscore-separated piece of the
        # file name, cut at its first dot.
        project_symbol = file.split('_')[3].split('.')[0]
        project_pqr_url = project_url_dict.get(project_symbol, '无')

        # Overall project score.
        score_node = html.xpath("//span[@class='OverviewSection_final_score_value__1QwX4']")[0]
        project_pqr_score = '{}%'.format(score_node.xpath('text()')[0])

        # Release date and version come from the plain overview tiles.
        tiles = html.xpath("//div[@class='OverviewSection_text_tile__3fjPH']")
        project_date = transform_month(tiles[0].xpath('./span/text()')[1])
        project_version = tiles[1].xpath('./span/text()')[2]

        # The first eleven columns are identical for every row of a page, so
        # format them once instead of once per question.
        row_prefix = '{},{},{},{},{},{},{},{},{},{},{}'.format(
            project_symbol,
            web_url,
            project_pqr_url,
            project_pqr_score,
            project_version,
            class_dict.get('Code And Team'),
            class_dict.get('Documentation'),
            class_dict.get('Testing'),
            class_dict.get('Security'),
            class_dict.get('Access Controls'),
            project_date,
        )

        # One CSV line per question row; commas in the question text are
        # replaced with dots so they do not split the column.
        lines = []
        for row in html.xpath("//div[contains(@class, 'AuditSummary_table_row__2Prwu')]"):
            pro_question = _strip_layout_whitespace(row.xpath('./button/text()')[0]).replace(',', '.')
            pro_question_value = row.xpath('./span')[1].xpath('text()')[0]
            lines.append('{},{},{}\n'.format(row_prefix, pro_question, pro_question_value))

        with open(output_path + project_symbol + '.csv', "w", encoding="utf-8") as csv_file:
            csv_file.write(title)
            csv_file.write(''.join(lines))

1)查找class属性中包含'AuditSummary_table_row__2Prwu'的所有div标签

flist = html.xpath("//div[contains(@class, 'AuditSummary_table_row__2Prwu')]")

2)python字典更新,字典初始化:  val = {project: http_url}
project_url_dict.update(val)    # 向字典里更新数据

3)python字典获取: project_pqr_url = project_url_dict.get(project_symbol)
      if project_pqr_url is None:
              project_pqr_url = '无'

4)xpath解析要点:

 

 

详见网址: 

XPath 语法 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值