最近找数据整理用excel一直重复操作给恶心吐了,主要重复三种操作,多个面板的合并汇总,面板转横截面数据、截面数据转面板数据。所以自己写了代码解放双手,秒出结果,直接复制文件地址即可实现!
合并面板数据
def merge_panel(file_path,file_finish):
file_name_list = os.listdir(file_path)
file_list = []
for i in file_name_list:
a = file_path + '\\' + i
file_list.append(a)
# print(a)
data1 = pd.DataFrame(columns=['临时'])
for file in file_list:
data = pd.read_excel(file, sheet_name=0)
names_list = data.columns.tolist()
# print(names_list)
# data.reindex(columns=names_list)
data['临时'] = data[names_list[0]].astype(str) + data[names_list[1]].str[:2].astype(str)
names_list.insert(names_list.index(names_list[2]), '临时')
# print(names_list)
data = data.reindex(columns=names_list)
data = data.set_index([names_list[0], names_list[1]])
data1 = data1.merge(data, how='outer', on='临时')
# print(data1)
# print(data1)
# print(data[names_list[1]].str[:2])
# print(data)
data1["time"] = data1["临时"].str.extract('(\d+)')
# print(df)
data1["place"] = data1["临时"].str.extract('(\D+)', expand=False).str.strip()
names_list1 = data1.columns.tolist()
# print(names_list1)
names_list1.insert(names_list1.index(names_list1[1]), 'time')
# print(names_list1)
names_list1.insert(names_list1.index('time') + 1, "place")
data1 = data1.reindex(columns=names_list1[:-2])
data1 = data1.drop(['临时'], axis=1)
# data1 = data1.set_index(0)
# print(names_list1[:-2])
# print(data1.columns.tolist())
names_list2 = data1.columns.values
data1 = data1.set_index(names_list2[1])
data1.to_excel(file_finish)
截面数据转面板数据
def to_panel(file,file_finish):
data = pd.read_excel(file, sheet_name=0)
name = data.columns.values
data = data.melt(id_vars=name[0], value_vars=name[1:]).set_index(name[0])
# data.to_excel(file_finish)
data["time"] = data["variable"].str.extract('(\d+)')
# print(df)
data["variable"] = data["variable"].str.extract('(\D+)', expand=False).str.strip()
data = data.loc[:, ['time', 'variable', 'value']]
data = data.pivot_table(index=[name[0], 'time'], columns=["variable"]).reset_index().set_index(name[0])
# print(data.columns.values)
name12 = data.columns.values[:1]
name_list1 = []
for i in name12:
name_list1.append(i[0])
names = data.columns.values[1:]
name_list = []
for i in names:
name_list.append(i[1])
name_sum = name_list1 + name_list
# print(name_sum)
data.set_axis(name_sum, axis='columns', inplace=True)
# data.drop(data.index[0:3], inplace=True)
data = data.reset_index()
data = data.set_index('time')
data.to_excel(file_finish)
面板数据转截面数据
def reshape_wide(file,file_finish):
data = pd.read_excel(file, sheet_name=0)
# 获取列名
list_name = data.columns.values
# 读取省份(城市)
city_list = list(set(data[list_name[0]]))
data_Section = pd.DataFrame()
city_list.sort()
data_Section[list_name[0]] = city_list
# 读取年份
year_list = list(set(data[list_name[1]]))
year_list.sort()
# 截面变量
for para in list_name[2:]:
for year in year_list:
para_name = str(year) + para
data_Section[para_name] = ""
# 读取行值
for index, row in data.iterrows():
# 城市
row_city = row[0]
# 年份
row_year = row[1]
# 变量
row_para = row[2:]
paras_name = list_name[2:]
# 将两列表合成元组
hecheng = list(zip(row_para, str(row_year) + paras_name))
# print(hecheng)
city_weizhi_hang = data_Section[(data_Section[list_name[0]] == row_city)].index.tolist()
for num, paraname in hecheng:
data_Section.loc[city_weizhi_hang, paraname] = num
data_Section = data_Section.set_index(list_name[0])
data_Section.to_excel(file_finish)
全部代码
import pandas as pd
import numpy as np
import os
# 消去空值
def read_excel(file,file_finish):
data = pd.read_excel(file, sheet_name=0)
print(data)
data = data.dropna()
data.to_excel(r"C:\Users\ZhangX\Desktop\stata.xlsx")
# 面板数据转截面数据
def reshape_wide(file,file_finish):
data = pd.read_excel(file, sheet_name=0)
# 获取列名
list_name = data.columns.values
# 读取省份(城市)
city_list = list(set(data[list_name[0]]))
data_Section = pd.DataFrame()
city_list.sort()
data_Section[list_name[0]] = city_list
# 读取年份
year_list = list(set(data[list_name[1]]))
year_list.sort()
# 截面变量
for para in list_name[2:]:
for year in year_list:
para_name = str(year) + para
data_Section[para_name] = ""
# 读取行值
for index, row in data.iterrows():
# 城市
row_city = row[0]
# 年份
row_year = row[1]
# 变量
row_para = row[2:]
paras_name = list_name[2:]
# 将两列表合成元组
hecheng = list(zip(row_para, str(row_year) + paras_name))
# print(hecheng)
city_weizhi_hang = data_Section[(data_Section[list_name[0]] == row_city)].index.tolist()
for num, paraname in hecheng:
data_Section.loc[city_weizhi_hang, paraname] = num
data_Section = data_Section.set_index(list_name[0])
data_Section.to_excel(file_finish)
# 截面数据转面板数据
def to_panel(file,file_finish):
data = pd.read_excel(file, sheet_name=0)
name = data.columns.values
data = data.melt(id_vars=name[0], value_vars=name[1:]).set_index(name[0])
# data.to_excel(file_finish)
data["time"] = data["variable"].str.extract('(\d+)')
# print(df)
data["variable"] = data["variable"].str.extract('(\D+)', expand=False).str.strip()
data = data.loc[:, ['time', 'variable', 'value']]
data = data.pivot_table(index=[name[0], 'time'], columns=["variable"]).reset_index().set_index(name[0])
# print(data.columns.values)
name12 = data.columns.values[:1]
name_list1 = []
for i in name12:
name_list1.append(i[0])
names = data.columns.values[1:]
name_list = []
for i in names:
name_list.append(i[1])
name_sum = name_list1 + name_list
# print(name_sum)
data.set_axis(name_sum, axis='columns', inplace=True)
# data.drop(data.index[0:3], inplace=True)
data = data.reset_index()
data = data.set_index('time')
data.to_excel(file_finish)
# 合并面板数据
def merge_panel(file_path,file_finish):
file_name_list = os.listdir(file_path)
file_list = []
for i in file_name_list:
a = file_path + '\\' + i
file_list.append(a)
# print(a)
data1 = pd.DataFrame(columns=['临时'])
for file in file_list:
data = pd.read_excel(file, sheet_name=0)
names_list = data.columns.tolist()
# print(names_list)
# data.reindex(columns=names_list)
data['临时'] = data[names_list[0]].astype(str) + data[names_list[1]].str[:2].astype(str)
names_list.insert(names_list.index(names_list[2]), '临时')
# print(names_list)
data = data.reindex(columns=names_list)
data = data.set_index([names_list[0], names_list[1]])
data1 = data1.merge(data, how='outer', on='临时')
# print(data1)
# print(data1)
# print(data[names_list[1]].str[:2])
# print(data)
data1["time"] = data1["临时"].str.extract('(\d+)')
# print(df)
data1["place"] = data1["临时"].str.extract('(\D+)', expand=False).str.strip()
names_list1 = data1.columns.tolist()
# print(names_list1)
names_list1.insert(names_list1.index(names_list1[1]), 'time')
# print(names_list1)
names_list1.insert(names_list1.index('time') + 1, "place")
data1 = data1.reindex(columns=names_list1[:-2])
data1 = data1.drop(['临时'], axis=1)
# data1 = data1.set_index(0)
# print(names_list1[:-2])
# print(data1.columns.tolist())
names_list2 = data1.columns.values
data1 = data1.set_index(names_list2[1])
data1.to_excel(file_finish)
# ****面板数据合并格式********** #
# 年份 省份 变量1 变量2 ...
# 2003 重庆 123 1234
# ... ... ...
# **********************#
# ****面板数据转截面数格式****** #
# 省份 年份 变量1 变量2 ...
# 重庆 2003 123 1234
# ... ... ... ...
# ************************** #
# 读取需要转换的文件路径
# 合并面板数据数据指定文件夹路径即可
file_read = r"C:\Users\ZhangX\Desktop\999测试"
# 文件保存路径
file_finish = r"C:\Users\ZhangX\Desktop\999测试\汇总.xlsx"
if __name__ == '__main__':
merge_panel(file_read,file_finish)
# to_panel(file_read)
# reshape_wide(file_read)