当我们需要对多个表格表格中的某一列进行统计时,且该列数据存在异常极端值,可通过百分位数的方法将极端值筛选掉,在本次案例将小于5%的分位数和大于95%分位数的数值去掉,统计的5%~95%内的数据的平均值。
代码如下
import pandas as pd
import glob
import os
from pandas.core.frame import DataFrame
List=glob.glob(r"D:\xls\*.xls")
# data=[]
namedata=[]
p5data=[]
p95data=[]
meandata=[]
for i in List:
name=os.path.basename(i)[:-4]
print(name)
namedata.append(name)
df = pd.read_excel(i, usecols=[2], names=None)
p5 = df.grid_code.quantile(0.05)
p5data.append(p5)
p95 = df.grid_code.quantile(0.95)
p95data.append(p95)
dfsx = df[(df["grid_code"] > p5) & (df["grid_code"] < p95)]
mean = dfsx.grid_code.mean()
meandata.append(mean)
data={"name":namedata,"p5":p5data,"p95":p95data,"mean":meandata}
dfout=DataFrame(data)
print(dfout)
dfout.to_csv(r"D:\Cimouth.csv")