import os.path
import pandas as pd
import re
from datetime import datetime
import os
import tkinter as tk
from tkinter import filedialog
def get_folder():
    """Ask the user to choose a directory through a Tk folder dialog.

    A Tk root window is created only so the dialog has an application to
    attach to; it is hidden immediately and destroyed once the dialog has
    closed so the Tk interpreter does not linger for the rest of the run.

    Returns:
        The value returned by ``filedialog.askdirectory()`` (the chosen
        directory path).
        NOTE(review): the dialog presumably returns an empty value when it
        is cancelled -- confirm and handle in the caller if needed.
    """
    root = tk.Tk()
    # Hide the empty main window; only the dialog should be visible.
    root.withdraw()
    try:
        folder_path = filedialog.askdirectory()
    finally:
        # Release the hidden root window even if the dialog raises.
        root.destroy()
    return folder_path
def count_files_in_folder(folder_path):
    """Count the entries directly inside ``folder_path``.

    Every name returned by ``os.listdir`` is counted, so sub-directories
    are included alongside regular files.

    Args:
        folder_path: Path of the directory to inspect.

    Returns:
        The number of entries, or ``None`` (after printing a message) when
        ``os.listdir`` raises ``FileNotFoundError``.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"文件夹 '{folder_path}' 不存在")
        return None
    return len(entries)
def create_dataframe_adjacent_pairs(names):
    """Build a two-column DataFrame pairing each item with its successor.

    Args:
        names: A sliceable sequence of values.

    Returns:
        A ``pandas.DataFrame`` whose column ``'a'`` holds every item except
        the last and whose column ``'b'`` holds every item except the
        first, so row *i* is ``(names[i], names[i + 1])``.
    """
    predecessors = names[:-1]
    successors = names[1:]
    pairs = pd.DataFrame({'a': predecessors, 'b': successors})
    return pairs
def read_html(html_file):
    """Return the complete text of ``html_file``, decoded as UTF-8.

    Args:
        html_file: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(html_file, encoding='utf-8') as handle:
        return handle.read()
def get_links(pattern, html_content):
    """Collect every match of ``pattern`` in ``html_content``, stripped.

    The search runs with ``re.DOTALL`` so ``.`` also matches newlines.

    Args:
        pattern: Regular expression to search for. As with ``re.findall``,
            a pattern with one capture group yields that group's text.
            NOTE: a pattern with several groups yields tuples, which this
            function cannot strip.
        html_content: Text to search.

    Returns:
        A list of the matched strings with surrounding whitespace removed,
        in order of appearance.
    """
    regex = re.compile(pattern, re.DOTALL)
    return [found.strip() for found in regex.findall(html_content)]
# Script body: read the numbered HTML pages in a user-chosen folder, pull
# the project records out of each page, and save them all to one .xlsx file.

# One list per extracted record; turned into DataFrame rows at the end.
data_list = []
# Captured at start-up; used only to build the output file name.
current_time = datetime.now()
html_path = get_folder()
file_count = count_files_in_folder(html_path)
print(file_count)
if file_count is None:
    # count_files_in_folder() has already printed why the folder could not
    # be listed; stop cleanly instead of failing with a TypeError on
    # `file_count + 1` below.
    raise SystemExit(1)

# The patterns do not change between pages, so build them once up front.
# Title/link pattern: group(1) is the href, group(2) is the anchor text.
title_link_re = re.compile(r'<strong\s.*?><a\s.*?href="(.*?)".*?>(.*?)</a></strong>')
pattern2 = r'<p>负责人:(.*?)</p>'
pattern3 = r'<span class="goUnit">单位:(.*?)</span>'
pattern4 = r'<p>金额:(.*?)</p>'
pattern5 = r'<p>类型:(.*?)</p>'
pattern6 = r'<p>学科代码:(.*?)</p>'
pattern7 = r'<p>开始时间:(.*?)</p>'

# Running record number across all pages.
index1 = 0
# Pages are expected to be named 1.html, 2.html, ... up to the number of
# entries found in the folder.
for k in range(1, file_count + 1):
    print(k)
    file_name = str(k) + '.html'
    html_file = os.path.join(html_path, file_name)
    html_content = read_html(html_file)
    # Extract the title links and the per-record fields from this page.
    matches = title_link_re.finditer(html_content)
    fzr = get_links(pattern2, html_content)
    dw = get_links(pattern3, html_content)
    je = get_links(pattern4, html_content)
    lx = get_links(pattern5, html_content)
    xkdm = get_links(pattern6, html_content)
    sj = get_links(pattern7, html_content)
    # NOTE: zip() stops at the shortest sequence, so if a page is missing
    # one field for some record, trailing records on that page are dropped
    # and the remaining fields may be paired with the wrong record.
    for match, fzr1, dw1, je1, lx1, xkdm1, sj1 in zip(matches, fzr, dw, je, lx, xkdm, sj):
        index1 += 1
        print('--' + str(index1))
        print(match.group(2))
        print(match.group(1))
        print(fzr1)
        print(dw1)
        print(je1)
        print(lx1)
        print(xkdm1)
        print(sj1)
        data_list.append([str(index1), match.group(2), fzr1, dw1, je1, lx1, xkdm1, sj1, match.group(1)])

# Build the DataFrame from the collected rows.
columns = ['No','标题','负责人', '单位', '金额', '类型', '代码', '时间','链接']
df = pd.DataFrame(data_list, columns=columns)
# Format the start-up time; underscores replace colons in the time part.
formatted_time = current_time.strftime("%Y-%m-%d %H_%M_%S")
print(formatted_time)
file_out = 'output '+formatted_time+'.xlsx'
# Save as an .xlsx file without the DataFrame index column.
df.to_excel(file_out, index=False)
print("已保存为"+file_out+"文件")
基金检索2
于 2024-02-19 10:05:51 首次发布
![](https://img-home.csdnimg.cn/images/20240611030827.png)