python xpath 解析html文件

gwd777

已于 2022-09-23 15:47:41 修改

阅读量525

点赞数

文章标签： python html 前端

于 2022-09-23 15:45:26 首次发布

本文链接：https://blog.csdn.net/gwd777/article/details/127011892

版权

该代码段主要用于解析HTML网页，提取项目名称、URL、类别评分、项目标签等信息，并将数据写入CSV文件。它遍历指定目录下的多个文件，对每个文件进行相同的操作，包括获取项目网站、项目评分、发布时间和问题及其回答等关键数据。

摘要由CSDN通过智能技术生成


# Remove layout whitespace (double spaces and line breaks) from a text node.
def _strip_layout_whitespace(text):
    return text.replace('  ', '').replace('\n', '').replace('\r', '')


# Parse the saved HTML pages and write one CSV file per project.
def jiexi_html_files():
    """Extract project data from saved HTML pages into per-project CSV files.

    First reads the page at ``files_dir_path`` and maps each project name to
    the ``href`` of its link. Then, for every file in the current directory
    whose name starts with ``pqrs_project``, it extracts the project website,
    the per-category scores, the overall score, the date, the version and the
    question/answer rows, and writes them to
    ``output_path + <project symbol> + '.csv'`` (UTF-8, one line per question).

    Depends on names defined outside this function: ``files_dir_path``,
    ``output_path``, ``transform_month``, ``etree`` and ``os``.

    Returns:
        None. The result is the set of CSV files written to disk.

    Raises:
        IndexError: element lookups are indexed without guards, so a page
            (or file name) that lacks an expected part aborts the run.
    """
    # NOTE(review): the input pages are opened with the platform default
    # encoding while the CSV is written as UTF-8 -- confirm the encoding the
    # pages were saved with and pass it explicitly if it is known.

    # Map project name -> review page URL, taken from the index page.
    project_url_dict = {}
    with open(files_dir_path, 'r') as index_file:
        index_markup = index_file.read()
    index_tree = etree.HTML(index_markup)
    for anchor in index_tree.xpath("//a[@class='ReviewCell_review_head_project_name__iy66Q']"):
        http_url = anchor.xpath("@href")[0]
        project_name = anchor.xpath('text()')[0]
        project_url_dict[project_name] = http_url

    files_dir = './'
    title = 'clumn1,clumn2,clumn3,clumn4,clumn5,clumn6,clumn7,clumn8,clumn9,clumn10,clumn11,clumn12,clumn13\n'
    row_template = '{},{},{},{},{},{},{},{},{},{},{},{},{}'

    for file in os.listdir(files_dir):
        if not file.startswith('pqrs_project'):
            continue

        # Read the whole page, then release the handle before parsing.
        with open(files_dir + file, 'r') as page_file:
            print('当前处理文件:', file)
            html_string = page_file.read()
        html = etree.HTML(html_string)

        # Project website; stays at the placeholder unless exactly one tile matches.
        web_url = '无'
        website = html.xpath("//div[@class='OverviewSection_text_tile__3fjPH OverviewSection_protocol_website__1kGWZ']")
        if len(website) == 1:
            web_url = website[0].xpath("./a/text()")[0]

        # Category name -> category score with a '%' suffix.
        class_dict = {}
        for section in html.xpath("//div[@class='AuditSummary_section_head__34n4V']"):
            class_name = _strip_layout_whitespace(section.xpath('./button/text()')[0])
            class_value = section.xpath('./span/text()')[0]
            class_dict[class_name] = '{}%'.format(class_value)

        # The project symbol is the fourth '_'-separated part of the file
        # name, with everything from the first '.' dropped.
        project_symbol = file.split('_')[3].split('.')[0]

        project_pqr_url = project_url_dict.get(project_symbol)
        if project_pqr_url is None:
            project_pqr_url = '无'

        # Overall project score.
        score_node = html.xpath("//span[@class='OverviewSection_final_score_value__1QwX4']")[0]
        project_pqr_score = '{}%'.format(score_node.xpath('text()')[0])

        # Date (first tile, second text node) and version (second tile, third text node).
        text_tiles = html.xpath("//div[@class='OverviewSection_text_tile__3fjPH']")
        project_date = text_tiles[0].xpath('./span/text()')[1]
        project_date = transform_month(project_date)
        project_version = text_tiles[1].xpath('./span/text()')[2]

        # Columns that are identical on every row of this project's CSV.
        code_and_team = class_dict.get('Code And Team')
        documentation = class_dict.get('Documentation')
        testing = class_dict.get('Testing')
        security = class_dict.get('Security')
        access_controls = class_dict.get('Access Controls')

        # One CSV line per question row; collected in a list and written in
        # one go instead of growing a string inside the loop.
        csv_lines = []
        for row in html.xpath("//div[contains(@class, 'AuditSummary_table_row__2Prwu')]"):
            # Commas inside the question would split the column, so they
            # are replaced with dots.
            pro_question = _strip_layout_whitespace(row.xpath('./button/text()')[0]).replace(',', '.')
            pro_question_value = row.xpath('./span')[1].xpath('text()')[0]
            csv_lines.append(
                row_template.format(
                    project_symbol, web_url, project_pqr_url, project_pqr_score,
                    project_version, code_and_team, documentation, testing,
                    security, access_controls, project_date, pro_question,
                    pro_question_value,
                ) + '\n'
            )

        with open(output_path+project_symbol+'.csv', "w", encoding="utf-8") as csv_file:
            csv_file.write(title)
            csv_file.writelines(csv_lines)

1）查找 class 属性中包含 'AuditSummary_table_row__2Prwu' 的所有 div 标签

flist = html.xpath("//div[contains(@class, 'AuditSummary_table_row__2Prwu')]")

2）python字典更新，字典初始化： val = {project: http_url}
project_url_dict.update(val) # 向字典里更新数据

3）python字典获取： project_pqr_url = project_url_dict.get(project_symbol)
if project_pqr_url is None:
    project_pqr_url = '无'

4）xpath解析要点：