task 2 利用python完成excel中的简单数据处理

本文详细介绍了如何使用Python的Pandas库在Excel中进行数据读取、特定单元格操作(如字符串查找和替换)、排序、筛选以及数据汇总,包括同一Sheet和不同Sheet的数据处理,并展示了将这些操作封装为类的方法。
摘要由CSDN通过智能技术生成

task2.1 同一个excel中的同一个sheet里的数据处理,使用pandas

1. 读取excel,注意路径

import pandas as pd
df = pd.read_excel('../task2_data.xlsx', sheet_name='Answer')

2.处理数据,注意在excel中是第33行到第58行,在读取excel时,去掉表头一行,开始行数-1,即row_start = 32,pandas类型的数据,索引是从0开始的,故而row_start再-1,最终row_start=31,

row_end= 56,因为range后面是左开右闭区间,故而最终range里的范围是(31,57)在实际中如果怕算不清楚,可以通过简单的test确定range范围。

对excel中某一单元格进行操作,使用iloc()

for i in range(31,57):
    df.iloc[i,9]=df.iloc[i,0]

3.写入excel,注意路径

df.to_excel('../task2.1/task2.1.xlsx', sheet_name='Answer', index=False, header=True)

task2.2 同一个excel中的同一个sheet里的数据处理,包括对单元格内数据的处理,使用pandas,string

1. 读取excel,注意路径

import pandas as pd
df = pd.read_excel('../task2_data.xlsx', sheet_name='Answer')

2.处理数据

利用string.fing()对查找特定字符串,返回起始index

for row in range(31,57):
    b_string=df.iloc[row,1]
    i=b_string.find("GCR = ")
    j=b_string.find(" WaitingTime")
    df.iloc[row,11] = b_string[i+6:j]
    df.iloc[row,12] = b_string[8:i]

3.写入excel,注意路径

df.to_excel('../task2.2/task2.2.xlsx',sheet_name='Answer',index=False, header=True)

task 2.3 同一个excel中的不同的sheet里的数据处理,包括简单的排序,求和,筛选

1. 读取excel,注意路径

import pandas as pd

df = pd.read_excel('../task2_data.xlsx', sheet_name='Data_Plot')

2.处理数据

list_QC=['QC', 'box_2', 'first_job', 'last_job']
row = 0
for i in range(1, 27):
    if i < 10:
        # 判断某列['QC']中是否有某个数值['QC0' + str(i)]).any()
        if (df['QC'].isin(['QC0' + str(i)]).any()):
            QCi = df[df['QC'] == 'QC0' + str(i)]
            # 排序,ascending=True表示升序
            QCi_sort = QCi.sort_values(by='Time', ascending=True)
            # 根据条件筛选数据
            QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
            # print(QCi_1800)
            QCi_Num = len(QCi_sort)
            row_num = len(QCi_1800)
            row += (QCi_Num - row_num)
            box_2 = QCi_1800['BoxCnt'].sum()#对某一列简单求和
            first_job = QCi_sort.loc[row, 'Time']
            last_index = row + row_num - 1
            last_job = QCi_sort.loc[last_index, 'Time']
            row += row_num
            # print(row)
            list_QCi = ['QC0' + str(i), str(box_2), first_job, last_job]
            list_QC += list_QCi
            #print(list_QCi, '\n')

        else:
            # print('there is no'+'QC0'+str(i)+'\n')
            list_QCi = ['QC0' + str(i), 'NA', 'NA', 'NA']
            list_QC += list_QCi
            #print(list_QCi, '\n')
    else:
        if (df['QC'].isin(['QC' + str(i)]).any()):
            QCi = df[df['QC'] == 'QC' + str(i)]
            # print('QC' + str(i))
            QCi_sort = QCi.sort_values(by='Time', ascending=True)
            QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
            # print(QCi_1800)
            QCi_Num = len(QCi_sort)
            row_num = len(QCi_1800)
            row += (QCi_Num - row_num)
            # print(row)
            box_2 = QCi_1800['BoxCnt'].sum()
            first_job = QCi_sort.loc[row, 'Time']
            last_index = row + row_num - 1
            last_job = QCi_sort.loc[last_index, 'Time']
            row += row_num
            list_QCi = ['QC' + str(i), str(box_2), first_job, last_job]
            list_QC += list_QCi
            #print(list_QCi, '\n')
        else:
            # print('there is no'+'QC0'+str(i)+'\n')
            list_QCi = ['QC' + str(i), 'NA', 'NA', 'NA']
            list_QC += list_QCi
            #print(list_QCi, '\n')

#print(list_QC)
df_answer = pd.read_excel('../task2_data.xlsx', sheet_name='Answer')
for row in range(31, 57):
    df_answer.iloc[row, 13] = list_QC[(row - 30) * 4 + 1]
    df_answer.iloc[row, 16] = list_QC[(row - 30) * 4 + 2]
    df_answer.iloc[row, 17] = list_QC[(row - 30) * 4 + 3]

3.写入excel,注意路径

df_answer.to_excel('task2.3.xlsx', sheet_name='Answer', index=False, header=True)

task2.4 将task2.1至task2.3的脚本改写为类,在main中生成对象并且调用成员函数,注意,类的成员函数变量要包含self

task2_4.py

import pandas as pd

class AnswerofData:
    df_data = pd.DataFrame()
    def data_input (self, data_file_to_read,sheetname_to_read):
        df = pd.read_excel(data_file_to_read, sheet_name=sheetname_to_read)
        return df
    def process_QC (self, df_data, row_QC_start, row_QC_end, list_QC_original, list_QC_destination):
        for i in range(row_QC_start, row_QC_end):
            df_data.iloc[i, list_QC_destination] = df_data.iloc[i, list_QC_original]
    def data_output (self, df_data, data_file_to_write,sheetname_to_Write):
        df_data.to_excel(data_file_to_write, sheet_name=sheetname_to_Write, index=False, header=True)
    def process_box1_GCR(self,df_data, row_QC_start, row_QC_end, list_GCR, list_BOX_1):
        for row in range(row_QC_start, row_QC_end):
            b_string = df_data.iloc[row, 1]
            i = b_string.find("GCR = ")
            j = b_string.find(" WaitingTime")
            df_data.iloc[row, list_GCR] = b_string[i + 6:j]
            df_data.iloc[row, list_BOX_1] = b_string[8:i]
    def process_box2_first_last(self,QC_num, df_data_plot, df_data, list_scrab_num, row_QC_start, row_QC_end,list_Box_2, list_first_job, list_last_job):
        list_QC = ['QC', 'box_2', 'first_job', 'last_job']
        row = 0
        for i in range(1, QC_num+1):
            if i < 10:
                if (df_data_plot['QC'].isin(['QC0' + str(i)]).any()):
                    QCi = df_data_plot[df_data_plot['QC'] == 'QC0' + str(i)]
                    QCi_sort = QCi.sort_values(by='Time', ascending=True)
                    QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
                    # print(QCi_1800)
                    QCi_Num = len(QCi_sort)
                    row_num = len(QCi_1800)
                    row += (QCi_Num - row_num)
                    box_2 = QCi_1800['BoxCnt'].sum()
                    first_job = QCi_sort.loc[row, 'Time']
                    last_index = row + row_num - 1
                    last_job = QCi_sort.loc[last_index, 'Time']
                    row += row_num
                    # print(row)
                    list_QCi = ['QC0' + str(i), str(box_2), first_job, last_job]
                    list_QC += list_QCi
                    # print(list_QCi, '\n')

                else:
                    # print('there is no'+'QC0'+str(i)+'\n')
                    list_QCi = ['QC0' + str(i), 'NA', 'NA', 'NA']
                    list_QC += list_QCi
                    # print(list_QCi, '\n')
            else:
                if (df_data_plot['QC'].isin(['QC' + str(i)]).any()):
                    QCi = df_data_plot[df_data_plot['QC'] == 'QC' + str(i)]
                    # print('QC' + str(i))
                    QCi_sort = QCi.sort_values(by='Time', ascending=True)
                    QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
                    # print(QCi_1800)
                    QCi_Num = len(QCi_sort)
                    row_num = len(QCi_1800)
                    row += (QCi_Num - row_num)
                    # print(row)
                    box_2 = QCi_1800['BoxCnt'].sum()
                    first_job = QCi_sort.loc[row, 'Time']
                    last_index = row + row_num - 1
                    last_job = QCi_sort.loc[last_index, 'Time']
                    row += row_num
                    list_QCi = ['QC' + str(i), str(box_2), first_job, last_job]
                    list_QC += list_QCi
                    # print(list_QCi, '\n')
                else:
                    # print('there is no'+'QC0'+str(i)+'\n')
                    list_QCi = ['QC' + str(i), 'NA', 'NA', 'NA']
                    list_QC += list_QCi
                    #print(list_QCi, '\n')
        #print(list_QC)
        for row in range(row_QC_start, row_QC_end):
            df_data.iloc[row, list_Box_2] = list_QC[(row - row_QC_start+1) * list_scrab_num + 1]
            df_data.iloc[row, list_first_job] = list_QC[(row - row_QC_start+1) * list_scrab_num + 2]
            df_data.iloc[row, list_last_job] = list_QC[(row - row_QC_start+1) * list_scrab_num + 3]



main.py

from 文件名 import 类名 (要求文件和main在同一目录下)

def print_hi(name):
    # 在下面的代码行中使用断点来调试脚本。
    print(f'Hi, {name}')  # 按 Ctrl+F8 切换断点。

from task2_4 import AnswerofData

# 按间距中的绿色按钮以运行脚本。
if __name__ == '__main__':
    print_hi('PyCharm')
    data_file_to_read = '../task2_data.xlsx'
    sheetname_to_read = 'Answer'
    row_QC_start = 31
    row_QC_end = 57
    list_QC_original = 0
    list_QC_destination = 9
    data_file_to_write = '../task 2.4/task2.4.xlsx'
    sheetname_to_write = 'Answer'
    list_GCR = 11
    list_BOX_1 = 12
    QC_num = 26
    list_Box_2 = 13
    list_first_job = 16
    list_last_job = 17
    list_crab_num = 4
    MyAnswer = AnswerofData()
    df_data = MyAnswer.data_input(data_file_to_read, sheetname_to_read)
    df_data_plot = MyAnswer.data_input(data_file_to_read, 'Data_Plot')
    MyAnswer.process_QC(df_data, row_QC_start, row_QC_end, list_QC_original, list_QC_destination)
    MyAnswer.process_box1_GCR(df_data, row_QC_start, row_QC_end, list_GCR, list_BOX_1)
    MyAnswer.process_box2_first_last(QC_num, df_data_plot, df_data, list_crab_num, row_QC_start, row_QC_end, list_Box_2,list_first_job,list_last_job)
    MyAnswer.data_output(df_data, data_file_to_write, sheetname_to_write)

若需要调用的文件和main在不同的文件夹下,可参考以下做法

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值