#!/usr/bin/env python
# -*-coding:utf-8 -*-
import os.path
import bs4,shutil,time
from pandas.core.frame import DataFrame
def get_html_tabledata(htmlpath,tableindex: int = 0):
"""
html文件,获取表格数据
:param htmlpath: html文件路径
:param tableindex: table索引,int,默认为0
:return:字典列表
"""
with open(htmlpath, 'r+',encoding='UTF-8') as f:
s = f.read()
wb = s.strip().replace('\ufeff', '')
soup = bs4.BeautifulSoup(wb, 'lxml') # 解析html
# 获取指定表格的数据
table=soup.findAll("table")[tableindex] # 读取索引为tableindex的表格(默认0,即第一个表格)
table_rows = table.findAll("tr") # 获得表格中行的集合
# 获取表格第一行作为字典的key
keys = [table_rows[0].findAll(['th', 'td'])[i].getText().strip() for i in range(len(table_rows[0].findAll(['th', 'td']))) ]
tabledata = []
for table_row in table_rows[1:]:
row = table_row.f
Python:获取html表格数据、html表格保存Excel
于 2022-06-20 18:42:43 首次发布