最近开始学习数据库,有一堆html的表格,但用pandas直接读取提示not found table。没有找到解决的办法后,使用Excel打开然后另存为xlsx。
得到的xlsx,我还需要利用xlwings删除第一行。现在想想是不是可以直接删除。之前想用pandas直接另存为xlsx,不过用pandas处理的时候出现了问题,所以就没有继续尝试了。
最后xlsx储存为csv再合并csv导入navicat.继续我的数据库之路。
import numpy as np
import pandas as pd
import xlwings as xw
import re
import win32com.client as win32
import os
from xlwings.constants import DeleteShiftDirection
import glob
def combine_csv():
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
# export to csv
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
def xls_to_xlsx():
excel = win32.gencache.EnsureDispatch('Excel.Application')
for filename in os.listdir():
wb = excel.Workbooks.Open(os.path.join(fname,filename))
wb.SaveAs(filename+"x", FileFormat = 51) #FileFormat = 51 is for .xlsx extension
wb.Close() #FileFormat = 56 is for .xls extension
excel.Application.Quit()
def xlsx_to_csv():
for filename in os.listdir():
data_xls = pd.read_excel(filename, index_col=0)
data_xls.to_csv(filename + '.csv', encoding='utf-8') # convert xlsx to csv
def delete_firstrow():
app = xw.App()
print(filename)
for filename in os.listdir():
if re.match(r'.+?\.xlsx?',filename):#避免打开其他的报错
wb2 = app.books.open(filename)
try:
sht= wb2.sheets[0]
# print(sht.range("A1").value)
if sht.range("A1").value == "str":
print(1)
sht.range('1:1').api.Delete(DeleteShiftDirection.xlShiftUp)
wb2.save()
wb2.close()
except:
print('shinbaida')
wb2.close()
app.kill()
break
app.kill()
fname = r"path"
os.chdir(fname)
combine_csv()