import pandas as pd
import numpy as np
import os
class markdownParser():
    """Extract a pipe table from a Markdown file as a pandas DataFrame.

    On construction the Markdown file is copied to a sibling ``.txt`` file,
    which is then read as UTF-8 text. The copy is deleted again when the
    parser object is destroyed.

    Attributes set by ``__init__``:
        mdPath: path of the Markdown file, as given by the caller.
        txtPath: path of the temporary text copy.
        file: open text handle on ``txtPath`` (closed in ``__del__``).
        lines: list of the file's lines, each including its trailing newline.
    """

    def __init__(self, path):
        """Copy the Markdown file at *path* to a ``.txt`` file and read it.

        Args:
            path: path of the Markdown file. The last three characters are
                replaced by ``.txt`` to name the temporary copy, so a
                ``.md`` suffix is expected.

        Raises:
            OSError: if the source cannot be read or the copy cannot be
                written.
            UnicodeDecodeError: if the content is not valid UTF-8.
        """
        self.mdPath = path
        # NOTE: an existing file with this name is overwritten and later
        # removed by __del__.
        self.txtPath = path[:-3] + '.txt'
        # The copy must be flushed and closed before it is re-opened for
        # reading; otherwise small files are still sitting in the write
        # buffer and the reader sees an empty file.
        with open(self.mdPath, 'rb') as mdtmp, open(self.txtPath, 'wb') as txttmp:
            txttmp.write(mdtmp.read())
        self.file = open(self.txtPath, 'r', encoding='utf-8')
        self.lines = self.file.readlines()

    def __del__(self):
        """Close the text handle and delete the temporary ``.txt`` copy."""
        handle = getattr(self, 'file', None)
        if handle is None:
            # __init__ never got as far as opening the copy, so there is
            # nothing of ours to clean up (and nothing we should delete).
            return
        handle.close()
        try:
            os.remove(self.txtPath)
        except FileNotFoundError:
            # The copy is already gone; cleanup is best-effort.
            pass

    @staticmethod
    def _splitRow(line):
        """Split one pipe-table line into a list of stripped cell strings."""
        text = line.strip()
        # Drop exactly one framing pipe on each side so that empty edge
        # cells ("||") are not swallowed.
        if text.startswith('|'):
            text = text[1:]
        if text.endswith('|'):
            text = text[:-1]
        return [cell.strip() for cell in text.split('|')]

    def getDataFrame(self):
        """Return the first table found in the file as a DataFrame.

        A table is recognised by its alignment row, i.e. a line starting
        with ``'| :'``. The line directly above it is the header, and the
        table ends at the first following line that does not start with
        ``'| '`` (or at the end of the file).

        Returns:
            pandas.DataFrame whose columns are the header cells and whose
            rows are the table's body rows. All cells are strings.

        Raises:
            ValueError: if no table is found (message ``"No Table"``), or
                if a body row has a different number of cells than the
                header.
        """
        start = -1
        end = len(self.lines)
        for i, line in enumerate(self.lines):
            if start == -1:
                # An alignment row on the very first line has no header
                # above it, so it cannot start a table.
                if i > 0 and line[:3] == '| :':
                    start = i - 1
            elif line[:2] != '| ':
                # First non-table line after the table: stop here so that
                # later text is not swept into the table.
                end = i
                break
        if start == -1:
            raise ValueError("No Table")

        tableList = self.lines[start:end]
        columns = self._splitRow(tableList[0])
        rows = []
        # tableList[1] is the alignment row; the body starts after it.
        for line in tableList[2:]:
            cells = self._splitRow(line)
            if len(cells) != len(columns):
                raise ValueError(
                    f"table row has {len(cells)} cells, expected {len(columns)}"
                )
            rows.append(cells)
        # Build the frame in one go instead of growing it row by row.
        return pd.DataFrame(rows, columns=columns, dtype=object)
# markdown文档解析器 (Markdown document parser)
# 于 2023-01-16 19:51:43 首次发布 (first published 2023-01-16 19:51:43)