task2.1 同一个excel中的同一个sheet里的数据处理,使用pandas
1. 读取excel,注意路径
import pandas as pd
df = pd.read_excel('../task2_data.xlsx', sheet_name='Answer')
2.处理数据,注意在excel中是第33行到第58行,在读取excel时,去掉表头一行,开始行数-1,即row_start = 32,pandas类型的数据,索引是从0开始的,故而row_start再-1,最终row_start=31,
row_end= 56,因为range后面是左开右闭区间,故而最终range里的范围是(31,57)在实际中如果怕算不清楚,可以通过简单的test确定range范围。
对excel中某一单元格进行操作,使用iloc()
for i in range(31,57):
df.iloc[i,9]=df.iloc[i,0]
3.写入excel,注意路径
df.to_excel('../task2.1/task2.1.xlsx', sheet_name='Answer', index=False, header=True)
task2.2 同一个excel中的同一个sheet里的数据处理,包括对单元格内数据的处理,使用pandas,string
1. 读取excel,注意路径
import pandas as pd
df = pd.read_excel('../task2_data.xlsx', sheet_name='Answer')
2.处理数据
利用string.fing()对查找特定字符串,返回起始index
for row in range(31,57):
b_string=df.iloc[row,1]
i=b_string.find("GCR = ")
j=b_string.find(" WaitingTime")
df.iloc[row,11] = b_string[i+6:j]
df.iloc[row,12] = b_string[8:i]
3.写入excel,注意路径
df.to_excel('../task2.2/task2.2.xlsx',sheet_name='Answer',index=False, header=True)
task 2.3 同一个excel中的不同的sheet里的数据处理,包括简单的排序,求和,筛选
1. 读取excel,注意路径
import pandas as pd
df = pd.read_excel('../task2_data.xlsx', sheet_name='Data_Plot')
2.处理数据
list_QC=['QC', 'box_2', 'first_job', 'last_job']
row = 0
for i in range(1, 27):
if i < 10:
# 判断某列['QC']中是否有某个数值['QC0' + str(i)]).any()
if (df['QC'].isin(['QC0' + str(i)]).any()):
QCi = df[df['QC'] == 'QC0' + str(i)]
# 排序,ascending=True表示升序
QCi_sort = QCi.sort_values(by='Time', ascending=True)
# 根据条件筛选数据
QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
# print(QCi_1800)
QCi_Num = len(QCi_sort)
row_num = len(QCi_1800)
row += (QCi_Num - row_num)
box_2 = QCi_1800['BoxCnt'].sum()#对某一列简单求和
first_job = QCi_sort.loc[row, 'Time']
last_index = row + row_num - 1
last_job = QCi_sort.loc[last_index, 'Time']
row += row_num
# print(row)
list_QCi = ['QC0' + str(i), str(box_2), first_job, last_job]
list_QC += list_QCi
#print(list_QCi, '\n')
else:
# print('there is no'+'QC0'+str(i)+'\n')
list_QCi = ['QC0' + str(i), 'NA', 'NA', 'NA']
list_QC += list_QCi
#print(list_QCi, '\n')
else:
if (df['QC'].isin(['QC' + str(i)]).any()):
QCi = df[df['QC'] == 'QC' + str(i)]
# print('QC' + str(i))
QCi_sort = QCi.sort_values(by='Time', ascending=True)
QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
# print(QCi_1800)
QCi_Num = len(QCi_sort)
row_num = len(QCi_1800)
row += (QCi_Num - row_num)
# print(row)
box_2 = QCi_1800['BoxCnt'].sum()
first_job = QCi_sort.loc[row, 'Time']
last_index = row + row_num - 1
last_job = QCi_sort.loc[last_index, 'Time']
row += row_num
list_QCi = ['QC' + str(i), str(box_2), first_job, last_job]
list_QC += list_QCi
#print(list_QCi, '\n')
else:
# print('there is no'+'QC0'+str(i)+'\n')
list_QCi = ['QC' + str(i), 'NA', 'NA', 'NA']
list_QC += list_QCi
#print(list_QCi, '\n')
#print(list_QC)
df_answer = pd.read_excel('../task2_data.xlsx', sheet_name='Answer')
for row in range(31, 57):
df_answer.iloc[row, 13] = list_QC[(row - 30) * 4 + 1]
df_answer.iloc[row, 16] = list_QC[(row - 30) * 4 + 2]
df_answer.iloc[row, 17] = list_QC[(row - 30) * 4 + 3]
3.写入excel,注意路径
df_answer.to_excel('task2.3.xlsx', sheet_name='Answer', index=False, header=True)
task2.4 将task2.1至task2.3的脚本改写为类,在main中生成对象并且调用成员函数,注意,类的成员函数变量要包含self
task2_4.py
import pandas as pd
class AnswerofData:
df_data = pd.DataFrame()
def data_input (self, data_file_to_read,sheetname_to_read):
df = pd.read_excel(data_file_to_read, sheet_name=sheetname_to_read)
return df
def process_QC (self, df_data, row_QC_start, row_QC_end, list_QC_original, list_QC_destination):
for i in range(row_QC_start, row_QC_end):
df_data.iloc[i, list_QC_destination] = df_data.iloc[i, list_QC_original]
def data_output (self, df_data, data_file_to_write,sheetname_to_Write):
df_data.to_excel(data_file_to_write, sheet_name=sheetname_to_Write, index=False, header=True)
def process_box1_GCR(self,df_data, row_QC_start, row_QC_end, list_GCR, list_BOX_1):
for row in range(row_QC_start, row_QC_end):
b_string = df_data.iloc[row, 1]
i = b_string.find("GCR = ")
j = b_string.find(" WaitingTime")
df_data.iloc[row, list_GCR] = b_string[i + 6:j]
df_data.iloc[row, list_BOX_1] = b_string[8:i]
def process_box2_first_last(self,QC_num, df_data_plot, df_data, list_scrab_num, row_QC_start, row_QC_end,list_Box_2, list_first_job, list_last_job):
list_QC = ['QC', 'box_2', 'first_job', 'last_job']
row = 0
for i in range(1, QC_num+1):
if i < 10:
if (df_data_plot['QC'].isin(['QC0' + str(i)]).any()):
QCi = df_data_plot[df_data_plot['QC'] == 'QC0' + str(i)]
QCi_sort = QCi.sort_values(by='Time', ascending=True)
QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
# print(QCi_1800)
QCi_Num = len(QCi_sort)
row_num = len(QCi_1800)
row += (QCi_Num - row_num)
box_2 = QCi_1800['BoxCnt'].sum()
first_job = QCi_sort.loc[row, 'Time']
last_index = row + row_num - 1
last_job = QCi_sort.loc[last_index, 'Time']
row += row_num
# print(row)
list_QCi = ['QC0' + str(i), str(box_2), first_job, last_job]
list_QC += list_QCi
# print(list_QCi, '\n')
else:
# print('there is no'+'QC0'+str(i)+'\n')
list_QCi = ['QC0' + str(i), 'NA', 'NA', 'NA']
list_QC += list_QCi
# print(list_QCi, '\n')
else:
if (df_data_plot['QC'].isin(['QC' + str(i)]).any()):
QCi = df_data_plot[df_data_plot['QC'] == 'QC' + str(i)]
# print('QC' + str(i))
QCi_sort = QCi.sort_values(by='Time', ascending=True)
QCi_1800 = QCi_sort[QCi['Time'] >= 1800]
# print(QCi_1800)
QCi_Num = len(QCi_sort)
row_num = len(QCi_1800)
row += (QCi_Num - row_num)
# print(row)
box_2 = QCi_1800['BoxCnt'].sum()
first_job = QCi_sort.loc[row, 'Time']
last_index = row + row_num - 1
last_job = QCi_sort.loc[last_index, 'Time']
row += row_num
list_QCi = ['QC' + str(i), str(box_2), first_job, last_job]
list_QC += list_QCi
# print(list_QCi, '\n')
else:
# print('there is no'+'QC0'+str(i)+'\n')
list_QCi = ['QC' + str(i), 'NA', 'NA', 'NA']
list_QC += list_QCi
#print(list_QCi, '\n')
#print(list_QC)
for row in range(row_QC_start, row_QC_end):
df_data.iloc[row, list_Box_2] = list_QC[(row - row_QC_start+1) * list_scrab_num + 1]
df_data.iloc[row, list_first_job] = list_QC[(row - row_QC_start+1) * list_scrab_num + 2]
df_data.iloc[row, list_last_job] = list_QC[(row - row_QC_start+1) * list_scrab_num + 3]
main.py
from 文件名 import 类名 (要求文件和main在同一目录下)
def print_hi(name):
# 在下面的代码行中使用断点来调试脚本。
print(f'Hi, {name}') # 按 Ctrl+F8 切换断点。
from task2_4 import AnswerofData
# 按间距中的绿色按钮以运行脚本。
if __name__ == '__main__':
print_hi('PyCharm')
data_file_to_read = '../task2_data.xlsx'
sheetname_to_read = 'Answer'
row_QC_start = 31
row_QC_end = 57
list_QC_original = 0
list_QC_destination = 9
data_file_to_write = '../task 2.4/task2.4.xlsx'
sheetname_to_write = 'Answer'
list_GCR = 11
list_BOX_1 = 12
QC_num = 26
list_Box_2 = 13
list_first_job = 16
list_last_job = 17
list_crab_num = 4
MyAnswer = AnswerofData()
df_data = MyAnswer.data_input(data_file_to_read, sheetname_to_read)
df_data_plot = MyAnswer.data_input(data_file_to_read, 'Data_Plot')
MyAnswer.process_QC(df_data, row_QC_start, row_QC_end, list_QC_original, list_QC_destination)
MyAnswer.process_box1_GCR(df_data, row_QC_start, row_QC_end, list_GCR, list_BOX_1)
MyAnswer.process_box2_first_last(QC_num, df_data_plot, df_data, list_crab_num, row_QC_start, row_QC_end, list_Box_2,list_first_job,list_last_job)
MyAnswer.data_output(df_data, data_file_to_write, sheetname_to_write)
若需要调用的文件和main在不同的文件夹下,可参考以下做法