数据处理之蚀变数据筛选

目录

一、写在前面

二、常用代码片段

1.忽略警告信息

2.获取目录下的所有文件名

3.修改文件夹下二级子目录所有文件后缀

4.删除DataFrame中的全0行

5.保存文件;如果路径不存在则新建文件

6.DataFrame第一列插入数据

三、完整代码


一、写在前面

1.仅作代码记录

2.筛选规则:①白云母或者伊利石含量为0的数值,Assay表里Pos2200,Dep2200,Dep1900,以及IC值都不需要②如果白云母或伊利石与蒙脱石同时存在,Pos2200,Dep2200,Dep1900,以及IC值设为空值③如果绿泥石(各种绿泥石,只要含有绿泥石字段就算)在直方图中的含量全为0,Pos2250,Dep2250这两个值设为空值

二、常用代码片段

1.忽略警告信息

warnings.filterwarnings("ignore")

2.获取目录下的所有文件名

base_dir = r"C:\Users\zsllsz2022\Desktop\蚀变基础数据"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

3.修改文件夹下二级子目录所有文件后缀

changePath = []
base_dir = r"C:\Users\zsllsz2022\Desktop\蚀变基础数据"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

for eachParent in fileParent:
    filesSon = [os.path.join(eachParent, file) for file in os.listdir(eachParent)]
    for eachFile in filesSon:
        if ").osm" in eachFile:
            changePath.append(eachFile)
            
for eachPath in changePath:
    newName = eachPath.split(".")[0]+".csv"
    os.rename(eachPath,newName)

4.删除DataFrame中的全0行

#删除CSV文件多余数据并按各工程新建文件夹后保存CSV文件
for eachPath in changePath:
    data = pd.read_csv(eachPath,encoding="gbk",on_bad_lines='skip')[:-1] #删除最后一行中文
    data = data.loc[(data!=0).any(axis=1)] #删除全0行,只保存有效数据部分

5.保存文件;如果路径不存在则新建文件

#删除CSV文件多余数据并按各工程新建文件夹后保存CSV文件
for eachPath in changePath:
    data = pd.read_csv(eachPath,encoding="gbk",on_bad_lines='skip')[:-1] #删除最后一行中文
    data = data.loc[(data!=0).any(axis=1)] #删除全0行,只保存有效数据部分
    
    #路径不存在时候新建工程路径
    if not os.path.exists("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+eachPath.split("\\")[-2]):
        os.makedirs( "C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+eachPath.split("\\")[-2])
    #保存文件
    data.to_csv("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+eachPath.split("\\")[-2] 
                + "\\" + eachPath.split("\\")[-1],header=0)

6.DataFrame第一列插入数据

tempData.insert(loc=0,column="column",value=[filesSon[0].split("\\")[-2]]*len(tempData[2]))

三、完整代码

矿物基础数据与直方图-生成总表

import os
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings("ignore")

#①寻找文件夹中的所有osm文件
#②将osm文件后缀改为CSV文件
 
changePath = []
base_dir = r"C:\Users\zsllsz2022\Desktop\蚀变基础数据"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

for eachParent in fileParent:
    filesSon = [os.path.join(eachParent, file) for file in os.listdir(eachParent)]
    for eachFile in filesSon:
        if ").osm" in eachFile:
            changePath.append(eachFile)
            
for eachPath in changePath:
    newName = eachPath.split(".")[0]+".csv"
    os.rename(eachPath,newName)

#重新寻找所有CSV后缀文件
changePath = []
base_dir = r"C:\Users\zsllsz2022\Desktop\蚀变基础数据"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

for eachParent in fileParent:
    filesSon = [os.path.join(eachParent, file) for file in os.listdir(eachParent)]
    for eachFile in filesSon:
        if ").csv" in eachFile:
            changePath.append(eachFile)

#删除CSV文件多余数据并按各工程新建文件夹后保存CSV文件
for eachPath in changePath:
    data = pd.read_csv(eachPath,encoding="gbk",on_bad_lines='skip')[:-1] #删除最后一行中文
    data = data.loc[(data!=0).any(axis=1)] #删除全0行,只保存有效数据部分
    
    #路径不存在时候新建工程路径
    if not os.path.exists("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+eachPath.split("\\")[-2]):
        os.makedirs( "C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+eachPath.split("\\")[-2])
    #保存文件
    data.to_csv("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+eachPath.split("\\")[-2] 
                + "\\" + eachPath.split("\\")[-1],header=0)

#建立各工程矿物基础数据
#2200的表分列以后得E列放到矿物基础数据的F列,I列放到G列,J列放到H列
#2250的表E列放I列,J列放J列

base_dir = r"C:\Users\zsllsz2022\Desktop\提取蚀变基础数据"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

count = 0  #如果正确打开,则count应该为67*2
for eachParent in fileParent:

    filesSon = [os.path.join(eachParent, file) for file in os.listdir(eachParent)]
#     print(filesSon[0].split("\\")[-2])  #工程名-备用
#     print(eachParent)
#     for eachPath in filesSon:
#         print(eachPath)
#     print("---")
#     print(filesSon)   
    if "2200" in filesSon[0]:
        data2200 = pd.read_csv(filesSon[0],header=None)
        count += 1
    if "2250" in filesSon[1]:
        data2250 = pd.read_csv(filesSon[1],header=None)
        count += 1
    
    tempData = data2250.iloc[:,:4].copy()
    tempData["4"] = 0  #蚀变矿物
    #A0 B1 C2 D3 E4 F5 G6 H7 I8 J9
    tempData["5"] = data2200.iloc[:,4] #2200第E列放到基础数据F列
    tempData["6"] = data2200.iloc[:,8] #2200第I列放到基础数据G列
    tempData["7"] = data2200.iloc[:,9] #2200第J列放到基础数据H列
    
    tempData["8"] = data2250.iloc[:,4] #2250第J列放到基础数据I列
    tempData["9"] = data2250.iloc[:,9] #2250第J列放到基础数据J列
    tempData["10"] = tempData["5"]/tempData["6"]
    
    tempData.insert(loc=0,column="column",value=[filesSon[0].split("\\")[-2]]*len(tempData[2]))
    tempData.insert(loc=1,column="样品顺序",value=[i for i in range(1,len(tempData[2])+1)])

    tempData.to_excel("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+filesSon[0].split("\\")[-2] 
                + "\\" + "矿物基础数据.xlsx",header=0,index=False)

print(count)

#提取基础矿物数据Z坐标
base_dir = r"C:\Users\zsllsz2022\Desktop\2024年新处理"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

for eachParent in fileParent:
    filesSon = [os.path.join(eachParent, file) for file in os.listdir(eachParent)]
    for eachPath in filesSon:
        if "矿物基础数据" in eachPath and "$" not in eachPath: #400kz有问题,不知道为什么有一个~$文件?
            projectName = eachPath.split("\\")[-2]
#             print(eachPath)
            dataZ = pd.read_excel(eachPath)
            data = pd.read_excel("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+projectName+"\\矿物基础数据.xlsx",header=None)
            data[5] = dataZ["Z坐标"]
            data[6] = dataZ["蚀变矿物"]
            data.to_excel("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\\"+projectName+"\\矿物基础数据.xlsx",header=0,index=False)
    

#汇总总表表头
count = 0
base_dir = r"C:\Users\zsllsz2022\Desktop\提取蚀变基础数据"
fileParent = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]

allTable = pd.DataFrame(columns=['工程名', "样品顺序",'样品号', 'X',"Y","Z","蚀变矿物","Dep2200",
                                "IC","Pos2200","Dep2250","Pos2250","Dep1900"])

for eachParent in fileParent:
    filesSon = [os.path.join(eachParent, file) for file in os.listdir(eachParent)]
    for eachPath in filesSon:
        if "矿物基础数据" in eachPath and "$" not in eachPath:
            count += 1
            tempData = pd.read_excel(eachPath,header=None)
            tempData.columns = ['工程名', "样品顺序",'样品号', 'X',"Y","Z","蚀变矿物","Dep2200",
                                "IC","Pos2200","Dep2250","Pos2250","Dep1900"]
            allTable = pd.concat([allTable,tempData])

allTable.to_excel("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\总表.xlsx",index=False)
print("处理工程数=",count)

#根据直方图筛选
data = pd.read_excel("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\总表.xlsx")
for i in range(len(data["样品顺序"])):
# for i in range(600):
    sampleNum = data["样品顺序"].iloc[i]

    if data["蚀变矿物"].iloc[i] in ["白云母","伊利石","蒙脱石","高岭石"]:
        if data["蚀变矿物"].iloc[i] == "伊利石":
            dataSelct = pd.read_excel("C:\\Users\\zsllsz2022\\Desktop\\2024年新处理\\"+data["工程名"].iloc[i]+"\\矿物直方图.xlsx")
            if "蒙脱石" in list(dataSelct.columns[5:]):
                value = dataSelct["蒙脱石"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
            if "高岭石" in list(dataSelct.columns[5:]):
                value = dataSelct["高岭石"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
                    
        if data["蚀变矿物"].iloc[i] == "白云母":
            dataSelct = pd.read_excel("C:\\Users\\zsllsz2022\\Desktop\\2024年新处理\\"+data["工程名"].iloc[i]+"\\矿物直方图.xlsx")
            if "蒙脱石" in list(dataSelct.columns[5:]):
                value = dataSelct["蒙脱石"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
            if "高岭石" in list(dataSelct.columns[5:]):
                value = dataSelct["高岭石"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
                    
        if data["蚀变矿物"].iloc[i] == "蒙脱石":
            dataSelct = pd.read_excel("C:\\Users\\zsllsz2022\\Desktop\\2024年新处理\\"+data["工程名"].iloc[i]+"\\矿物直方图.xlsx")
            if "伊利石" in list(dataSelct.columns[5:]):
                value = dataSelct["伊利石"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
            if "白云母" in list(dataSelct.columns[5:]):
                value = dataSelct["白云母"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
                    
        if data["蚀变矿物"].iloc[i] == "高岭石":
            dataSelct = pd.read_excel("C:\\Users\\zsllsz2022\\Desktop\\2024年新处理\\"+data["工程名"].iloc[i]+"\\矿物直方图.xlsx")
            if "伊利石" in list(dataSelct.columns[5:]):
                value = dataSelct["伊利石"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN
            if "白云母" in list(dataSelct.columns[5:]):
                value = dataSelct["白云母"].iloc[sampleNum-1] #-1是因为直方图中有表头
                if value != 0:
                    print(i,data["工程名"].iloc[i],data["样品顺序"].iloc[i])
                    data["Dep2200"].iloc[i] = np.NaN
                    data["IC"].iloc[i] = np.NaN
                    data["Pos2200"].iloc[i] = np.NaN
                    data["Dep1900"].iloc[i] = np.NaN

data.to_excel("C:\\Users\\zsllsz2022\\Desktop\\提取蚀变基础数据\总表-筛选.xlsx",index=False)

根据直方图对总表进行筛选

import os
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings("ignore")

#根据直方图筛选
data = pd.read_excel(r"C:\Users\zsllsz2022\Desktop\assay.xlsx")
# for i in range(len(data["样品顺序"])):
for i in range(11706):  #最后一个样品是11706
#     print(data.iloc[i])
    sampleNum = data["sample_id"].iloc[i]
    holdId = data["hole_id"].iloc[i]
    z = data["Z"].iloc[i]
#     print(holdId,sampleNum,z)
    
    tempData = "C:\\Users\\zsllsz2022\\Desktop\\2024年新处理\\{}\\矿物直方图.xlsx".format(holdId)
#     print(tempData)
    tempData = pd.read_excel(tempData,header=None)
    tempData.columns = [str(each).replace(" ","") for each in list(tempData.iloc[0])]
    tempData = tempData[1:]
    
    #绿泥石字段
    chlorite = [element for element in tempData.columns if "绿泥石" in element]
    drawData = tempData[tempData["Z坐标"]==z]
    
#     print(drawData)
    if "白云母" in tempData.columns:
        if float(drawData["白云母"].iloc[0])==0:
            print(holdId,sampleNum,z,":处理白云母")
            data["Dep2200"].iloc[i] = np.NaN
            data["IC"].iloc[i] = np.NaN
            data["Pos2200"].iloc[i] = np.NaN
            data["Dep1900"].iloc[i] = np.NaN
            
    if "伊利石" in tempData.columns:
        if float(drawData["伊利石"].iloc[0])==0:
            print(holdId,sampleNum,z,":处理伊利石")
            data["Dep2200"].iloc[i] = np.NaN
            data["IC"].iloc[i] = np.NaN
            data["Pos2200"].iloc[i] = np.NaN
            data["Dep1900"].iloc[i] = np.NaN
    
    if "伊利石" in tempData.columns and "蒙脱石" in tempData.columns:
        if float(drawData["伊利石"].iloc[0]) != 0 and float(drawData["蒙脱石"].iloc[0]) != 0:
            print(holdId,sampleNum,z,":处理组合①")
            data["Dep2200"].iloc[i] = np.NaN
            data["IC"].iloc[i] = np.NaN
            data["Pos2200"].iloc[i] = np.NaN
            data["Dep1900"].iloc[i] = np.NaN
            
    if "伊利石" in tempData.columns and "高岭石" in tempData.columns:
        if float(drawData["伊利石"].iloc[0]) != 0 and float(drawData["高岭石"].iloc[0]) != 0:
            print(holdId,sampleNum,z,":处理组合②")
            data["Dep2200"].iloc[i] = np.NaN
            data["IC"].iloc[i] = np.NaN
            data["Pos2200"].iloc[i] = np.NaN
            data["Dep1900"].iloc[i] = np.NaN
            
    if "白云母" in tempData.columns and "蒙脱石" in tempData.columns:
        if float(drawData["白云母"].iloc[0]) != 0 and float(drawData["蒙脱石"].iloc[0]) != 0:
            print(holdId,sampleNum,z,":处理组合③")
            data["Dep2200"].iloc[i] = np.NaN
            data["IC"].iloc[i] = np.NaN
            data["Pos2200"].iloc[i] = np.NaN
            data["Dep1900"].iloc[i] = np.NaN
            
    if "白云母" in tempData.columns and "高岭石" in tempData.columns:  
        if float(drawData["白云母"].iloc[0]) != 0 and float(drawData["高岭石"].iloc[0]) != 0:
            print(holdId,sampleNum,z,":处理组合④")
            
            data["Dep2200"].iloc[i] = np.NaN
            data["IC"].iloc[i] = np.NaN
            data["Pos2200"].iloc[i] = np.NaN
            data["Dep1900"].iloc[i] = np.NaN
        
    value = 0
    for eachChlorite in chlorite:
        value += float(drawData[eachChlorite].iloc[0])
    
    if value == 0:
        print(holdId,sampleNum,z,":处理绿泥石")
        data["Dep2250"].iloc[i] = np.NaN
        data["Pos2250"].iloc[i] = np.NaN
    
#     print(drawData)

data.to_excel("C:\\Users\\zsllsz2022\\Desktop\\assay-全筛选.xlsx",index=False)

  • 3
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

阿木霖

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值