实验一：huggingface数据集网页数据抽取_huggingface download last month-CSDN博客

本文链接：https://blog.csdn.net/Xm041206/article/details/136933756

这段代码展示了如何从HuggingFace的数据集网页中抓取数据简介、页面链接、下载次数、数据大小和数据数目，使用字符串查找和正则表达式处理HTML内容，适用于类似网站的数据提取任务。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

目标网页：

https://huggingface.co/datasets

目标信息：

数据简介、数据页面链接、下载次数、数据大小、数据数目。

过程：

先将目标网页另存为“仅 HTML”格式的本地文件，再用下面的代码读取并解析该文件。

代码：

import re
import sys
def save_file(info, file_out=r"E:\Procedure\Python\Experiment\output.txt"):
    """Write every ``key: value`` pair of *info* to a UTF-8 text file.

    Args:
        info: Mapping of field name (str) to the extracted value.
        file_out: Destination path. Defaults to the location the script has
            always written to. The file is overwritten on every call.

    Raises:
        OSError: If the destination file cannot be opened for writing.
        TypeError: If a key of *info* is not a string.
    """
    with open(file_out, "w", encoding="utf-8") as f:
        for key, value in info.items():
            # Print straight to the file instead of swapping sys.stdout, so an
            # exception in the loop cannot leave stdout redirected to a
            # closed file.
            print(key + ":", value, file=f)

def remove_tags(content):
    """Return *content* with every ``<...>`` markup tag removed.

    Args:
        content: HTML fragment as a string.

    Returns:
        The fragment with its tags deleted. Text between tags is kept
        unchanged (character entities such as ``&amp;`` are not decoded).
    """
    # ``[^>]*`` also crosses line breaks, so a tag whose attributes are wrapped
    # over several lines is stripped too (``.`` would stop at a newline).
    return re.sub(r"<[^>]*>", "", content)

def Get_introduction(content):
    """Extract the introduction text of a dataset page.

    The introduction is the span that starts at the first
    ``<h1 class="relative group flex items-center">`` heading found after the
    first ``<!-- HTML_TAG_START -->`` marker and ends just before the next
    ``<!-- HTML_TAG_END --></div>``. Markup tags inside the span are removed.

    Args:
        content: Full HTML source of the page as a string.

    Returns:
        The introduction as plain text, or ``""`` when any of the three
        markers cannot be found.
    """
    start_marker = '<!-- HTML_TAG_START -->'
    heading_tag = '<h1 class="relative group flex items-center">'
    end_marker = '<!-- HTML_TAG_END --></div>'

    marker_pos = content.find(start_marker)
    if marker_pos == -1:
        return ""

    start = content.find(heading_tag, marker_pos)
    if start == -1:
        return ""

    end = content.find(end_marker, start)
    if end == -1:
        # str.find() signals "not found" with -1; slicing with it would return
        # nearly the whole rest of the page instead of the introduction.
        return ""

    return remove_tags(content[start:end])

def get_Number(my_str):
    """Return the first run of decimal digits found in *my_str*.

    Args:
        my_str: Text to search.

    Returns:
        The digits as a string (e.g. ``"123"``), or the integer ``0`` when
        *my_str* is empty or contains no digit at all.
    """
    match = re.search(r"\d+", my_str)
    # Fall back to 0, the value already used for empty input, instead of
    # raising IndexError when the text holds no digit.
    return match.group(0) if match else 0
def get_Size(my_str):
    """Return the first integer or decimal number found in *my_str*.

    Args:
        my_str: Text to search.

    Returns:
        The number as a string (e.g. ``"1.5"`` or ``"42"``), or the integer
        ``0`` when *my_str* is empty or contains no digit at all.
    """
    # The decimal alternative comes first so "1.5" is not cut down to "1".
    match = re.search(r"\d+\.\d+|\d+", my_str)
    # Fall back to 0, the value already used for empty input, instead of
    # raising IndexError when the text holds no number.
    return match.group(0) if match else 0
def Get_type(size_str):
    """Return the size unit mentioned in *size_str*.

    The units are checked in the fixed order ``kB``, ``MB``, ``GB`` and the
    first one that occurs anywhere in the text is returned.

    Args:
        size_str: Text containing a size such as ``"1.5 MB"``.

    Returns:
        ``'kB'``, ``'MB'`` or ``'GB'``; ``None`` when none of them occurs.
    """
    for unit in ('kB', 'MB', 'GB'):
        if unit in size_str:
            return unit
    return None
def extract_info(file_path):
    """Extract dataset metadata from a locally saved dataset page.

    Args:
        file_path: Path of the saved HTML file (read as UTF-8).

    Returns:
        A dict with these keys:
            "数据简介": introduction text (from ``Get_introduction``).
            "数据页面链接": value of the ``og:url`` meta tag, ``""`` if absent.
            "下载次数": number following "Downloads last month", 0 if absent.
            "数据大小": downloaded size converted to kB, e.g. ``"1536.0 kB"``.
            "数据数目": number following "Number of rows", 0 if absent.

    Raises:
        OSError: If *file_path* cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # --- Dataset page link -------------------------------------------------
    link_tag = '<meta property="og:url" content='
    link = ""
    link_start = content.find(link_tag)
    if link_start != -1:
        link_end = content.find('/>', link_start)
        if link_end != -1:
            raw_link = content[link_start + len(link_tag):link_end]
            # The slice still carries the attribute's surrounding quotes and
            # the whitespace before "/>"; keep only the URL itself.
            link = raw_link.strip().strip('"\'')

    # --- Introduction ------------------------------------------------------
    intro = Get_introduction(content)

    # --- Downloads last month ----------------------------------------------
    download_count = 0
    label_pos = content.find('Downloads last month')
    if label_pos != -1:
        download_end = content.find('</dd>', label_pos)
        value_pos = content.find('font-semibold', label_pos)
        # A -1 here means a marker is missing; slicing with it would scan
        # unrelated parts of the page, so the count stays 0 instead.
        if download_end != -1 and value_pos != -1:
            download_str = content[value_pos:download_end].replace(',', '')
            download_count = get_Number(download_str)

    # --- Downloaded size, normalised to kB ---------------------------------
    size_kb = 0.0
    unit = None
    size_start = content.find('Size of downloaded')
    if size_start != -1:
        # The size text ends at the first "B" after the label (kB/MB/GB).
        size_end = content.find('B', size_start)
        if size_end != -1:
            size_str = content[size_start:size_end + 1]
            size_kb = float(get_Size(size_str))
            unit = Get_type(size_str)
    if unit == 'MB':
        size_kb *= 1024
    elif unit == 'GB':
        size_kb *= 1024 * 1024
    size = f"{size_kb:.1f} kB"

    # --- Number of rows ----------------------------------------------------
    num_count = 0
    num_start = content.find('Number of rows:&quot;')
    if num_start != -1:
        num_end = content.find(']', num_start)
        if num_end != -1:
            num_str = content[num_start:num_end].replace(',', '')
            num_count = get_Number(num_str)

    return {
        "数据简介": intro,
        "数据页面链接": link,
        "下载次数": download_count,
        "数据大小": size,
        "数据数目": num_count
    }


# Replace with the path of the HTML file you saved locally.
file_path = r"E:\Procedure\Python\Experiment\TempHTML\DIBT_10k_prompts_ranked · Datasets at Hugging Face.html"

# Parse the saved page, then write the extracted fields to the output file.
info = extract_info(file_path)
save_file(info)
# Alternative kept for debugging: print the fields to the console instead.
#for key, value in info.items():
    #print(key + ":", value)