合并过滤CSV

俊俊熊

已于 2023-03-18 17:36:14 修改

阅读量112

点赞数

文章标签： python pandas 机器学习

于 2023-03-14 10:21:21 首次发布

本文链接：https://blog.csdn.net/m0_49615569/article/details/129518826

版权

合并几个文件，然后过滤出需要的列useful_cols

import os #合并几个文件，然后过滤出需要的列useful_cols
import pandas as pd 
import numpy as np
import glob
from pandas.io.excel import ExcelWriter
df_all = pd.DataFrame()
useful_cols = ['Date', 'Time', 'SerialNum', 'TestDesc', 'PassFail', 'ResValue', 'ResMin', 'ResMax', 
            'SetValue1','SetValue2', 'SetValue4','SetValue5']
#test_list = ["VSG Power Accuracy High Outp","VSG Power Accuracy Low", "VSA Power Accuracy"]

for i in glob.glob('D:\CertData_collect\Test python\IQXL02939_20140503\*.csv', recursive=True):
    df = pd.read_csv(i, low_memory=False, parse_dates=[['Date', 'Time']], usecols=useful_cols)
    df_all = pd.concat([df_all,df], ignore_index=True) #合并所有CSV成一个，筛选出有用的列
df_all.to_excel("2939.xlsx" ,sheet_name="sheet1", index=False)

选择需要第一列里面的某一类，然后再过滤另一列的某几类!

import pandas as pd #选择需要第一列里面的某一类，然后再过滤另一列的某几类!
import numpy as np
data =pd.read_csv(r"D:\CertData_collect\Test python\IQXFinal20221205.csv",index_col=0,low_memory=False)
test_list = ["VSG Power Accuracy High Outp","VSG Power Accuracy Low","VSA Power Accuracy"] # "VSG Power Accuracy High Outp","VSG Power Accuracy Low","VSA Power Accuracy"
#temp = data.loc[data["TestDesc"].isin(test_list),:]
temp = data.loc[(data['SerialNum']=='IQXL02697') & (data["TestDesc"].isin(test_list)),:"SetUnit4"]
#选取第'SerialNum'列的里面是'IQ1218A3694'这一项 ，并且只选取列"TestDesc" 里面其中某几项的项目

#temp.iloc[:,:25].to_excel('3306.xlsx') #保存所以行以及前24列的数据保存未excel

temp.to_excel('3306.xlsx')

过滤某几项

import pandas as pd #过滤某几项
import numpy as np

data=pd.DataFrame()
test_list = ["VSG Power Accuracy High Outp","VSG Power Accuracy Low","VSA Power Accuracy"] #此项漏了，加上
data =pd.read_excel(r"D:\CertData_collect\Test python\2938.xlsx",index_col=0)
temp = data.loc[(data['SerialNum']=='IQXL02697') & (data["TestDesc"].isin(test_list)),:]
#选取第'SerialNum'列的里面是'IQ1218A3694'这一项 ，并且只选取列"TestDesc" 里面其中某几项的项目
temp.to_excel('3306.xlsx')

上面两个程序合并到一起

import os #上面两个程序1&2合并到一起
import pandas as pd 
import numpy as np
import glob

df_all = pd.DataFrame()
useful_cols = ['Date', 'Time', 'SerialNum', 'TestDesc', 'PassFail', 'ResValue', 'ResMin', 'ResMax', 
            'SetValue1','SetValue2', 'SetValue4','SetValue5']
test_list = ["VSG Power Accuracy High Outp","VSG Power Accuracy Low", "VSA Power Accuracy"]

for i in glob.glob('D:\CertData_collect\Test python\IQXL02939_20140503\*.csv', recursive=True):
    df = pd.read_csv(i, low_memory=False, parse_dates=[['Date', 'Time']], usecols=useful_cols)
    df_all = pd.concat([df_all,df], ignore_index=True) #合并所有CSV成一个，筛选出有用的列
    #df_all.to_excel("2938.xlsx" ,sheet_name="sheet1", index=False)

    temp = df_all.loc[(df_all['SerialNum']=='IQXL02939') & (df_all["TestDesc"].isin(test_list)),:] #筛选出需要的VSA VSG
    
temp.to_excel("2939.xlsx" ,sheet_name="sheet1", index=False)

遍历文件夹下面的CSV文件，然后合并去重，参考Yangzy

import pandas as pd #遍历文件夹下面的CSV文件，然后合并去重，参考Yangzy
import numpy as np
from tqdm import tqdm
import os 

result =pd.DataFrame()
path = r"D:\CertData_collect\Test python" 
#path = r"D:\CertData_collect\Test python\IQXL02939_20140503\" 

for file in os.listdir(path):
    if file.endswith(".csv"): 
        df = pd.read_csv(file)
        result = pd.concat([result,df[['Date','Time','SerialNum','TestDesc','PassFail','ResValue','ResMin','SetValue2','SetUnit5']]])
        

print(result.drop_duplicates())
uiqu_df = result.drop_duplicates()
uniq_df.to_csv("2312.csv")

对表格的多个列排序

sort1=pd.DataFrame()  #对表格的多列排序
file = pd.read_excel(r"D:\CertData_collect\Test python\2939_sort_soure_vsa.xlsx",sheet_name="sheet1")
sort1=file.sort_values(["SetValue4","SetValue0","SetValue1"],ascending=[True,True,True])
#vsa 的setvalue is 4-0-1, vsg is 4-0-2
sort1.to_excel("2939sort_vsa.xlsx",index=False)

对表格进行分割成不同的sheet，写到一个excel里面


# slice the 937clums data to vsa &vsg high &vsg low
import os 
import pandas as pd 
import numpy as np
import glob
from pandas.io.excel import ExcelWriter
from matplotlib import pyplot as plt

df = pd.read_excel(r"D:\CertData_collect\Test python\2939.xlsx") #将937列整的数据分切到不同的df,然后写入不同的sheet
df_acc = df.groupby("TestDesc")
df_vsg_low=df_acc.get_group('VSG Power Accuracy Low')
df_vsg_high=df_acc.get_group('VSG Power Accuracy High Outp')
df_vsa=df_acc.get_group('VSA Power Accuracy')

with pd.ExcelWriter('2939_vsghighlow_vsa.xlsx') as writer:# 将多个df写到不同的sheet
    df_vsg_low.to_excel(writer,sheet_name="VSG_Low", index=False)
    df_vsg_high.to_excel(writer,sheet_name="VSG_High", index=False)
    df_vsa.to_excel(writer,sheet_name="VSA", index=False)

切分937项到三个不同df，然后把df按照frequency，power accuracy 排序，后面生成三个不同的sheet

df = pd.read_excel(r"D:\CertData_collect\Test python\2939.xlsx") #切分937项到三个不同df，然后把df按照frequency，power accuracy 排序，后面生成三个不同的sheet
df_acc = df.groupby("TestDesc")

df_vsg_low=df_acc.get_group('VSG Power Accuracy Low')#切需要的列里面的子项
df_sort_vsg_low = df_vsg_low.sort_values(["SetValue5","SetValue0","SetValue2"],ascending=[True,True,True])#多列排序
    
df_vsg_high=df_acc.get_group('VSG Power Accuracy High Outp')
df_sort_vsg_high = df_vsg_high.sort_values(["SetValue4","SetValue0","SetValue2"],ascending=[True,True,True])

df_vsa=df_acc.get_group('VSA Power Accuracy')
df_sort_vsa = df_vsa.sort_values(["SetValue4","SetValue0","SetValue2"],ascending=[True,True,True])

    
with pd.ExcelWriter('2939_vsghighlow_vsa.xlsx') as writer:# 将多个df写到不同的sheet
    df_sort_vsg_low.to_excel(writer,sheet_name="VSG_Low", index=False)
    df_sort_vsg_high.to_excel(writer,sheet_name="VSG_High", index=False)
    df_sort_vsa.to_excel(writer,sheet_name="VSA", index=False)

一个工作表分成不同的工作簿

file_path=r"D:\jupyter\good\3066.xlsx"
data=pd.read_excel(file_path,sheet_name= "sheet1")
pro_data= data.groupby("Date_Time")

"""for i,j in pro_data:# 一个工作表分成不同的工作簿
    new_file_path="D:\jupyter\good\split" + i +".xlsx"
    j.to_excel(new_file_path,sheet_name=i,index=False)"""
#workbook=pd.ExcelWriter("3066split.xlsx")
workbook=pd.ExcelWriter(r"D:\jupyter\good\split\3066split.xlsx") #把一个工作表分成不同的工作表
for i,j in pro_data:
    j.to_excel(workbook,sheet_name=i,index = False)
workbook.save()