xlxs文件批量转csv
import os
import time
import pandas as pd
def excel_add(file_path):
time1 = time.time()
for parent, dirnames, filenames in os.walk(file_path):
for filename in filenames:
df_all = pd.DataFrame()
xls_file = pd.ExcelFile(os.path.join(parent, filename))
# 显示出读入excel文件中各个sheet表的名字
sheet_names = xls_file.sheet_names
for i in sheet_names:
# df = pd.read_excel(os.path.join(parent, filename), sheet_name=i,skiprows = [1,2,3,4])
df = pd.read_excel(os.path.join(parent, filename), sheet_name=i)
# df_all = df_all.append(df)
df_all = pd.concat([df_all, df])
# 第一个参数是说把dataframe写入到D盘下的TEST2.csv文件中,参数sep表示字段之间用’, ’分隔,header表示是否需要头部,index表示是否需要行号。
file_out = f'datacsv/{filename.split(".")[0]}.csv'
df_all.to_csv(file_out, sep=',', header=True, index=False)
if __name__ == '__main__':
file_path = r'data'
excel_add(file_path)
之后进行多梯度因素可视化和差异性分析
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
filename = '4d盐胁迫.csv'.replace('.csv',"")
f = pd.read_csv(rf"datacsv\{filename}.csv")
columns_names = f.columns.tolist()[1:]
print(columns_names)
data_csv = pd.DataFrame(columns=['Management', 'Character', 'p value', 'significance'])
ticks = f['RHIZO 2016a'].astype(str).str.split('-').str[0].unique()
ticks_ = f['RHIZO 2016a'].astype(str).str.split('-').str[0]
def draw(character, ticks):
plt.figure(figsize=(30, 15))
for n,i in enumerate(ticks):
plt.xlabel(character, color="r", fontsize=13)
i_character = f[ticks_ == i][character].dropna().values
color = plt.cm.viridis(n / len(ticks)) # 根据索引值选择颜色
plt.bar(i, i_character.mean(), width=0.35,color = color)
plt.scatter(np.full((1, i_character.shape[0]), i)[0], i_character, s=20, color='cornflowerblue')
pos_1 = 0
for pos_2 in range(1, 5):
sig(ticks, pos_1, pos_2, character)
def sig(ticks, pos_1, pos_2, character): # 0,1 0,2 0,3 0,4
# 添加显著性差异信息
data_pos1 = f[ticks_ == ticks[pos_1]][character].dropna().values
data_pos2 = f[ticks_ == ticks[pos_2]][character].dropna().values
t, p = stats.ttest_ind(data_pos1, data_pos2, equal_var=False)
mark = 'ns'
if 0.01 < p < 0.05:
mark = "*"
elif 0.001 < p < 0.01:
mark = "**"
elif p < 0.001:
mark = "***"
c_significance = "dimgray"
plt.text((2 * pos_1 + 1) / 2, (1+0.1*pos_2) * max(data_pos1.mean(),data_pos2.mean()), mark, ha='center', color='r', fontsize=14)
plt.hlines((1+0.1*pos_2) * max(data_pos1.mean(), data_pos2.mean()), pos_1, pos_2, c_significance)
plt.plot([pos_1, pos_1], [(1+0.1*pos_2) * max(data_pos1.mean(), data_pos2.mean()), data_pos1.mean()], c_significance)
plt.plot([pos_2, pos_2], [(1+0.1*pos_2) * max(data_pos1.mean(), data_pos2.mean()), data_pos2.mean()], c_significance)
print(ticks[pos_1], '/', ticks[pos_2], "在", character, " 的差异p值为", p, "——", mark)
data_csv.loc[len(data_csv)] = [ticks[pos_1] + '/' + ticks[pos_2], character, p, mark]
plt.savefig('pics\\'+filename +'_' + character + '.png')
columns_names = f.columns.tolist()[1:]
# print(columns_names)
for n, i in enumerate(columns_names):
print(i)
draw(i, ticks)
plt.close()
data_csv.to_csv('anas//'+filename+'_分析.csv',index=False)
可视化结果
显著性结果 (具体为表格)