目标网页:
https://huggingface.co/datasets
目标信息:
数据简介、数据页面链接、下载次数、数据大小、数据数目。
过程:
先在浏览器中将目标数据集页面另存为“仅 HTML”格式的本地文件,再用下面的代码读取并解析该文件,提取上述信息。
代码:
import re
import sys
def save_file(info, file_out=r"E:\Procedure\Python\Experiment\output.txt"):
    """Write every entry of *info* to a UTF-8 text file, one ``key: value`` per line.

    Args:
        info: Mapping whose keys are strings; each item is written as
            ``key: value`` followed by a newline, in iteration order.
        file_out: Path of the file to write. Defaults to the location this
            script has always used. The file is created or truncated.
    """
    with open(file_out, "w", encoding="utf-8") as f:
        for key, value in info.items():
            # Print directly into the file rather than swapping sys.stdout:
            # the output is the same, but stdout can never be left pointing
            # at a closed file if a write raises part-way through.
            print(key + ":", value, file=f)
def remove_tags(content):
    """Return *content* with every ``<...>`` markup tag removed.

    Args:
        content: Text that may contain HTML/XML-style tags.

    Returns:
        The text with each ``<`` ... ``>`` span deleted; everything outside
        the tags (including whitespace and newlines) is kept unchanged.
    """
    # "[^>]*" also matches newlines, so a tag whose attributes wrap onto the
    # next line is stripped as well. The earlier "<.*?>" pattern left such
    # tags in place because "." does not match a newline by default.
    return re.sub(r"<[^>]*>", "", content)
def Get_introduction(content):
    """Extract the plain-text introduction from the page HTML.

    The introduction is the text between the first
    ``<h1 class="relative group flex items-center">`` heading that follows the
    first ``<!-- HTML_TAG_START -->`` marker and the next
    ``<!-- HTML_TAG_END --></div>`` marker, with all tags removed.

    Args:
        content: Full HTML source of the page.

    Returns:
        The tag-free introduction text. An empty string is returned when the
        start marker or the heading cannot be found. When only the end marker
        is missing, the text runs to the end of *content*.
    """
    start_marker = r'<!-- HTML_TAG_START -->'
    heading_tag = r'<h1 class="relative group flex items-center">'
    end_marker = r'<!-- HTML_TAG_END --></div>'

    marker_pos = content.find(start_marker)
    if marker_pos == -1:
        return ""

    start = content.find(heading_tag, marker_pos)
    if start == -1:
        return ""

    end = content.find(end_marker, start)
    if end == -1:
        # Without this guard the -1 would be used as a slice bound and
        # silently drop the last character of the page.
        end = len(content)

    return remove_tags(content[start:end])
def get_Number(my_str):
    """Return the first run of decimal digits found in *my_str*.

    Args:
        my_str: Text to search.

    Returns:
        The matched digits as a string (no numeric conversion is done), or
        the integer ``0`` when *my_str* is empty or contains no digit.
    """
    match = re.search(r"\d+", my_str)
    # Text without any digit falls back to the same 0 used for empty input
    # instead of raising IndexError on an empty match list.
    return match.group(0) if match else 0
def get_Size(my_str):
    """Return the first decimal or integer number found in *my_str*.

    A number with a fractional part (``12.5``) is preferred over a bare
    integer at the same position.

    Args:
        my_str: Text to search.

    Returns:
        The matched number as a string (no numeric conversion is done), or
        the integer ``0`` when *my_str* is empty or contains no digit.
    """
    match = re.search(r"\d+\.\d+|\d+", my_str)
    # Text without any digit falls back to the same 0 used for empty input
    # instead of raising IndexError on an empty match list.
    return match.group(0) if match else 0
def Get_type(size_str):
    """Return the size unit mentioned in *size_str*.

    The units are tried in the order ``kB``, ``MB``, ``GB``, ``TB`` and the
    first one that occurs anywhere in the text wins.

    Args:
        size_str: Text containing a size such as ``"12.5 MB"``.

    Returns:
        ``'kB'``, ``'MB'``, ``'GB'`` or ``'TB'``, or ``None`` when none of
        these units appears in *size_str*.
    """
    for unit in ("kB", "MB", "GB", "TB"):
        if unit in size_str:
            return unit
    # Made explicit: callers receive None for an unrecognised unit.
    return None
def extract_info(file_path):
    """Parse a saved dataset page and collect its key facts.

    The file is read as UTF-8 text and searched with plain substring markers.
    A field whose marker is missing falls back to an empty or zero value
    rather than being built from a meaningless ``-1`` slice.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        A dict with these keys:
            "数据简介": introduction text from ``Get_introduction``.
            "数据页面链接": value of the ``og:url`` meta tag, without the
                surrounding quotes ("" when the tag is missing).
            "下载次数": first number after ``Downloads last month`` as
                returned by ``get_Number`` (0 when missing).
            "数据大小": size after ``Size of downloaded`` converted to kB
                with a factor of 1024 per unit step, formatted as
                ``"%.1f kB"``.
            "数据数目": first number after ``Number of rows:"`` as returned
                by ``get_Number`` (0 when missing).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Dataset page link.
    link_tag = '<meta property="og:url" content='
    link = ""
    link_start = content.find(link_tag)
    if link_start != -1:
        link_end = content.find('/>', link_start)
        if link_end != -1:
            raw_link = content[link_start + len(link_tag): link_end]
            # The attribute value is quoted and followed by a space before
            # "/>"; strip both so only the URL itself is returned.
            link = raw_link.strip().strip('"\'')

    # Dataset introduction.
    intro = Get_introduction(content)

    # Download count: the number sits after the first "font-semibold" that
    # follows the label, and before the closing </dd>.
    download_str = ""
    label_pos = content.find('Downloads last month')
    if label_pos != -1:
        download_end = content.find('</dd>', label_pos)
        if download_end == -1:
            download_end = len(content)
        value_pos = content.find('font-semibold', label_pos)
        if value_pos != -1:
            download_str = content[value_pos: download_end].replace(',', '')
    download_count = get_Number(download_str)

    # Dataset size: take the text from the label up to and including the
    # first "B", which ends the unit (kB / MB / GB ...).
    size_str = ""
    size_start = content.find('Size of downloaded')
    if size_start != -1:
        size_end = content.find('B', size_start)
        if size_end != -1:
            size_str = content[size_start: size_end + 1]
    size_kb = float(get_Size(size_str))
    unit = Get_type(size_str)  # named "unit" to avoid shadowing builtin type()
    # Factors that convert each unit to kB; anything else is taken as kB.
    to_kb = {'MB': 1024, 'GB': 1024 * 1024, 'TB': 1024 * 1024 * 1024}
    size_kb *= to_kb.get(unit, 1)
    size = "%.1f kB" % (size_kb)

    # Number of rows: the value lies between the label and the next "]".
    num_str = ""
    num_start = content.find('Number of rows:"')
    if num_start != -1:
        num_end = content.find(']', num_start)
        if num_end == -1:
            num_end = len(content)
        num_str = content[num_start: num_end].replace(',', '')
    num_count = get_Number(num_str)

    return {
        "数据简介": intro,
        "数据页面链接": link,
        "下载次数": download_count,
        "数据大小": size,
        "数据数目": num_count
    }
# Replace this with the path of the HTML file you saved.
file_path = r"E:\Procedure\Python\Experiment\TempHTML\DIBT_10k_prompts_ranked · Datasets at Hugging Face.html"
# Parse the saved page, then write the extracted fields to the output file.
info = extract_info(file_path)
save_file(info)
# Debug alternative (disabled): print the fields to the console instead.
#for key, value in info.items():
#print(key + ":", value)