DataFrame 数据切片
df.iloc[3:7,:8] 截取3至7行,0至8列的数据表格
df[3:7] 截取3至7行
df.iloc[6,7] 获数据表格中获取第7行,第8列的数据
df = df[['开发部门','应用工作','是否重点','是否重要','是否更改','监管']] 按表头获取 只取这些列
DataFrame 数据筛选
#对一列进行数据筛选
df = df[df['中心项目编号']=='k20160838']
#对多列进行数据筛选 and or & |
df = df[(df['开发部门'].str.find('开发')!= -1) & (df['项目状态'].str.find('合并')== -1) ]
#Pandas 文本数据方法 contains() 是否包含查找的字符串
df = df[df['产品代码'].str.contains("wj")]
#match() 是否匹配正则
df = df[df['邮箱'].str.match("^[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+$")]
#当值为多个时 使用isin方式更佳
df = df[df['中心项目编号'].isin(['k20160838','k20160841'])]
DataFrame 其他常用方法
df.shape 返回一个元组 (行数,列数)
df.empty 判断dataframe 是否为空 也可以使用df.shape[0] == 0 ; len(df) == 0
column_list = df.columns.tolist() 获取表头 list
data_list = df.values.tolist() 数据表格转list 二维数组
df['中心项目编号'].tolist() 数据表格指定列转list
df['修改时间'] = '' 在dataframe最后添加一个新的列,值均为空串
DEMO:读取工单处理结果日志文件 构造 DataFrame 并输出 excel
#result_flg 处理结果
def write_log(work_code,job_type,result_flg):
today_file = open(code_path+’\\’+today_flg,’a+’)
if work_code is None or not work_code.strip():
pass
else:
today_file.write(job_type + ’;’ + work_code + ’;’ + result_flg)
today_file.close()
def write_log_to_excel():
data_titles = ['处理类型','业务编号','处理结果']
data_values = []
result_file_name = 'car_result%s.xlsx' % yesterday_flg
with open(code_path+'\\'+ yesterday_flg) as file:
try:
for line in file:
line = line.strip()
line = line.strip('\n')
if not line:
continue
else:
tmp_data = line.split(';')
if len(tmp_data) != 3:
continue
data_values.append(tuple(tmp_data))
except Exception as e:
pass
df = pd.DataFrame.from_records(data_values,columns = data_titles)
result_file_path = code_path + '\\' + result_file_name
df.to_excel(result_file_path,index=False)
DataFrame groupby 使用方法
# 按col1分组并按col2求和
print df.groupby(by='col1').agg({'col2':sum}).reset_index()
# 按col1分组并按col2求最值
print df.groupby(by='col1').agg({'col2':['max', 'min']}).reset_index()
# 按col1 ,col3分组并按col2求和
print df.groupby(by=['col1', 'col3']).agg({'col2':sum}).reset_index()
DEMO: 按类型筛选分 sheet ,邮箱号相同的数据金额列求和保存
def init_csv_data(csv_path,out_path):
print('数据源位置:'+csv_path)
if os.path.exists(csv_path):
pass
else:
return False
df = pd.read_csv(csv_path,header=None,engine='python')
group_df = df.groupby(df[0])
my_heads=['类型','地址','姓名','金额','邮箱','卡号','日期']
writer = pd.ExcelWriter(out_path)
df.to_excel(writer,'sheet1',index=False,header=my_heads)
for one_group in group_df:
one_type = one_group[0]
tmp_df = df[df[0] == one_type]
result_df = pd.DataFrame()
group_tmp_df = tmp_df.groupby(df[4])
for one_another_group in group_tmp_df:
address = one_another_group[0]
one_address_df = tmp_df[tmp_df[4] == address]
sum_amount = one_address_df.iloc[:, 3].sum()
sum_tmp_df = one_address_df[0:1]
sum_tmp_df[3] = sum_amount
result_df = result_df.append(sum_tmp_df)
result_df.to_excel(writer, one_type, index=False, header=my_heads)
writer.save()
print('数据筛选处理成功,输出路径:'+out_path)
return True