# python pandas 文件保存/转换/处理
"""
@File :readFile.py
@Author :xin-w
@Date :2023/7/10 19:22
@IDE :PyCharm
"""
import os
import json
import traceback
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
class Handle:
    """Batch converters that turn folders of txt/html files into CSV files.

    Each ``*2csv`` method scans ``path`` for entries whose name contains
    ``fname``, extracts rows from every matching file, and writes the combined
    rows to a CSV file encoded as ``utf-8-sig`` without the index column.
    """

    # keyNum -> names of the keyword arguments whose values are followed,
    # in order, to reach the payload inside the parsed JSON document.
    _NESTED_KEYS = {
        1: ('dic_1',),
        2: ('dic_1', 'dic_2'),
        31: ('dic_1', 'dic_2', 'dic_3'),
    }

    @staticmethod
    def _matching_files(path, fname):
        """Yield ``(name, full_path)`` for entries of *path* whose name contains *fname*."""
        needle = f'{fname}'
        for name in os.listdir(path):
            if needle in name:
                # os.path.join instead of a hard-coded backslash so the
                # path is valid on every platform.
                yield name, os.path.join(path, name)

    @staticmethod
    def _bank_account_rows(data, name, dic):
        """Return the ``keyNum == 32`` records of one parsed file.

        Looks at the first element of ``data[dic_1][dic_2]`` and of
        ``data[dic_1][dic_3]``; every one that contains ``'bankAccountID'`` is
        tagged with ``id`` (the file name up to the first ``'-'``) and returned.
        """
        first = data[dic['dic_1']][dic['dic_2']][0]
        second = data[dic['dic_1']][dic['dic_3']][0]
        rows = [rec for rec in (first, second) if 'bankAccountID' in rec]
        file_id = name.split('-')[0]
        for rec in rows:
            rec['id'] = file_id
        return rows

    def txt2csv(self, path, fname, filename, keyNum, **dic):
        """Collect JSON payloads from txt files and save them as one CSV.

        :param path: directory to scan
        :param fname: substring a file name must contain to be processed
        :param filename: output name; the CSV is written to ``./{filename}.csv``
        :param keyNum: extraction mode --
            ``1``  extend the rows with ``data[dic_1]``;
            ``2``  extend with ``data[dic_1][dic_2]``;
            ``31`` extend with ``data[dic_1][dic_2][dic_3]``;
            ``32`` add the first element of ``data[dic_1][dic_2]`` and of
            ``data[dic_1][dic_3]`` when it contains ``'bankAccountID'``,
            tagged with an ``id`` taken from the file name.
            Any other value collects nothing from the file.
        :param dic: ``dic_1`` / ``dic_2`` / ``dic_3`` keyword arguments giving
            the keys to follow inside the parsed JSON
        :return: ``(fail_file, df)`` -- names of the files that could not be
            read or did not have the expected structure, and the DataFrame
            that was written
        """
        result = []
        fail_file = []
        for name, file in self._matching_files(path, fname):
            try:
                with open(file, "r", encoding="utf-8") as f:
                    data = json.loads(f.read())
            except Exception:
                # Unreadable or non-JSON file: record it and carry on.
                fail_file.append(name)
                continue
            try:
                if keyNum in self._NESTED_KEYS:
                    node = data
                    for arg in self._NESTED_KEYS[keyNum]:
                        node = node[dic[arg]]
                    result += node
                    if keyNum == 1:
                        print(f'----{name} read successfully.----')
                elif keyNum == 32:
                    rows = self._bank_account_rows(data, name, dic)
                    if rows:
                        result += rows
                        print(f'----{name}文档读取成功----')
            except Exception:
                # Missing key / unexpected structure: best effort, keep going.
                fail_file.append(name)
        print('---------------------------')
        print(np.unique(np.array(fail_file)))
        df = pd.DataFrame(result)
        df.to_csv(f'./{filename}.csv', encoding='utf-8-sig', index=False)
        print(f'{filename} csv 文件保存成功')
        return fail_file, df

    def html2csv(self, path, fname, filename):
        """Export rows of the second ``<table>`` of each html file to one CSV.

        For every matching file the second ``<table>`` is located, its first
        ``<tr>`` is skipped, and cells 0, 1, 3, 4, 5, 6, 7 and 8 of each
        remaining row are exported (cell 2 is not used).

        :param path: directory to scan
        :param fname: substring a file name must contain to be processed
        :param filename: output name; the CSV is written to ``./{filename}.csv``
        :return: ``(fail_read, df)`` -- names of the directory entries that
            were not read because their name does not contain ``fname``, and
            the DataFrame that was written
        :raises IndexError: if a matching file has fewer than two tables or a
            row has fewer than nine cells (parsing errors are not caught here)
        """
        columns = (
            ('id-number', 0),
            ('store-alias', 1),
            ('pay', 3),
            ('store-num', 4),
            ('level-list', 5),
            ('paymethod', 6),
            ('count_amount', 7),
            ('operation', 8),
        )
        fail_read = []
        list_dict = []
        for name in os.listdir(path):
            # NOTE(review): the else branch is taken to belong to the file
            # name filter, i.e. skipped entries are what gets reported --
            # confirm this matches the intended meaning of "failed".
            if f'{fname}' not in name:
                fail_read.append(name)
                continue
            with open(os.path.join(path, name), mode='r', encoding='utf-8') as f:
                html_text = f.read()
            soup = BeautifulSoup(html_text, 'html.parser')
            table = soup.find_all('table')[1]
            for tr in table.find_all('tr')[1:]:
                tds = tr.find_all('td')
                list_dict.append({key: tds[idx].text for key, idx in columns})
            print(f'{name} read successfully.')
        df = pd.DataFrame(list_dict)
        df.to_csv(f'./{filename}.csv', encoding='utf-8-sig', index=False)
        return fail_read, df

    def htmlPd2csv(self, path, fname, filename):
        """Read the first table of each html file with pandas and save one CSV.

        :param path: directory to scan
        :param fname: substring a file name must contain to be processed
        :param filename: output name; the CSV is written to ``./{filename}.csv``
        :return: None; the names of files that failed are only printed
        """
        tables = []
        fail_file = []
        for name, file in self._matching_files(path, fname):
            try:
                tables.append(pd.read_html(file)[0])
                print(f'{name} read completely.')
            except Exception:
                fail_file.append(name)
                traceback.print_exc()
                print(f'{name} read failed.')
        print('---------------------------')
        print(f'{fail_file} file read failed.')
        # pd.concat raises ValueError on an empty list, so fall back to an
        # empty frame when no table could be read.
        df = pd.concat(tables) if tables else pd.DataFrame()
        df.to_csv(f'./{filename}.csv', encoding='utf-8-sig', index=False)
        print(f'{filename} file save successfully.')

    def txtHtml2csv(self, path, fname, filename, **dic):
        """Convert txt files holding JSON-wrapped html into one CSV.

        Each matching file is parsed as JSON; the html fragment found at
        ``data[dic_1][dic_2]`` is parsed and the stripped text of its first
        seven ``<td>`` cells becomes one output row.

        :param path: directory to scan
        :param fname: substring a file name must contain to be processed
        :param filename: output name; the CSV is written to ``./{filename}.csv``
        :param dic: ``dic_1`` and ``dic_2`` keyword arguments giving the keys
            that lead to the html fragment
        :return: names of the files that could not be converted
        """
        columns = (
            'agentAccount',
            'username',
            'statu',
            'limit',
            'newAddDate',
            'lastLoginIp',
            'lastLoginTime',
        )
        list_dict = []
        fail_file = []
        for name, file in self._matching_files(path, fname):
            try:
                with open(file, mode='r', encoding='utf-8') as f:
                    html_text = json.loads(f.read())
                soup = BeautifulSoup(html_text[dic['dic_1']][dic['dic_2']], 'html.parser')
                tds = soup.find_all('td')
                list_dict.append(
                    {key: tds[idx].text.strip() for idx, key in enumerate(columns)}
                )
                print(f'{name} read completely.')
            except Exception:
                fail_file.append(name)
                traceback.print_exc()
        print('---------------------------')
        print(f'{fail_file} file read failed.')
        df = pd.DataFrame(list_dict)
        df.to_csv(f'./{filename}.csv', encoding='utf-8-sig', index=False)
        print(f'{filename} file save successfully.')
        # The documented contract promises the failed files; return them.
        return fail_file

    def list2csv(self, path, fname, filename, **dic):
        """Convert txt files whose JSON payload is a list of lists into one CSV.

        ``data[dic_1]`` is expected to be a list of rows. The number of
        columns is taken from the first row and the columns are named
        ``'0'``, ``'1'``, ... Files with an empty payload are skipped.

        :param path: directory to scan
        :param fname: substring a file name must contain to be processed
        :param filename: output name; the CSV is written to
            ``./handled/{filename}.csv``
        :param dic: ``dic_1`` keyword argument giving the key of the payload
        :return: ``(fail_file, df)`` -- names of the files that could not be
            converted, and the DataFrame that was written
        """
        rowList = []
        fail_file = []
        for name, file in self._matching_files(path, fname):
            try:
                with open(file, mode='r', encoding='utf-8') as f:
                    data = json.loads(f.read())
                d = data[dic['dic_1']]
                if len(d) == 0:
                    continue
                width = len(d[0])
                for row in d:
                    rowList.append({f'{col}': row[col] for col in range(width)})
                print(f'{name} read successfully.')
            except Exception:
                fail_file.append(name)
        print('---------------------------')
        print(f'{fail_file} file read failed.')
        df = pd.DataFrame(rowList)
        out_path = f'./handled/{filename}.csv'
        # The output folder may not exist yet; create it instead of failing.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        df.to_csv(out_path, encoding='utf-8-sig', index=False)
        print(f'{filename} file save successfully.')
        return fail_file, df

    def basisHandle(self, data, filename):
        """Basic clean-up of a DataFrame, performed in place.

        Drops rows and columns that are entirely empty, drops duplicated rows,
        then drops every column that holds a single repeated value. The shape
        is printed before and after the constant-column step.

        :param data: DataFrame to clean; it is modified in place
        :param filename: label used only in the progress message
        :return: the same (modified) DataFrame
        """
        data.dropna(axis=0, how='all', inplace=True)
        data.dropna(axis=1, how='all', inplace=True)
        data.drop_duplicates(inplace=True)
        print(data.shape)
        row_count = len(data)
        # With a single row every column is trivially "constant"; dropping
        # them would throw the whole record away, so only test when there is
        # more than one row.
        if row_count > 1:
            constant = [
                col for col in data.columns
                if data[col].duplicated().sum() == row_count - 1
            ]
            if constant:
                data.drop(labels=constant, axis=1, inplace=True)
        print(data.shape)
        print(f'{filename} handle successfully.')
        return data

    def split_str(self, data):
        """Extract one float per cell from a column of formatted strings.

        For each cell, the third ``': '``-separated field is taken and the
        text between its first pair of single quotes is converted to float;
        an empty quoted text yields ``0.0``.

        :param data: column (object exposing ``.values``) of strings
        :return: list of floats, one per cell
        """
        values = []
        for cell in data.values:
            quoted = cell.split(': ')[2].split('\'')[1]
            values.append(float(quoted) if quoted != '' else 0.00)
        return values