背景:
这是一个朋友在处理excel数据的时候,原数据量小,想通过随机生成一些数据做后续统计模型的分析。由于要不断通过随机生成的样本量试错,苦于用excel一遍一遍操作起来非常繁琐。于是问我有没有好的办法批量生成。借着正好复习一下pandas的操作,写下了这些代码,为了重复调用和试错,我把它封装成一个类了。分享出来和大家共同探讨学习。
过程:
原数据,不一定符合正态分布。故写了几种情况搭配起来试错。
1、按照各列的均值和标准差随机生成指定样本量的数据
def create_mean_std_data(self):
mean_std_num = int(input('请输入要生成的样本数:'))
mean_std_dic = {}
for col in self.data.columns:
mean = np.mean(self.data[col])
std = np.std(self.data[col])
row_m_s = np.random.normal(mean, std, mean_std_num)
row_m_s = np.round(row_m_s)
row_m_s = np.clip(row_m_s, 1, 5)
mean_std_dic[col] = row_m_s
mean_std_data = pd.DataFrame(mean_std_dic)
return mean_std_data
2、按照正态分布随机生成指定样本量的数据
def create_normal_data(self):
normal_num = int(input('请输入要生成的样本数:'))
normal_dic = {}
for col in self.data.columns:
row_norm = np.random.normal(0, 1, normal_num)
row_norm = 2 * ((row_norm - np.min(row_norm)) / (np.max(row_norm) - np.min(row_norm))) + 1
row_norm = np.round(row_norm)
normal_dic[col] = row_norm
normal_data = pd.DataFrame(normal_dic)
return normal_data
3、按照指定的数字范围随机生成指定样本量的数据
def create_random_data(self):
how_time_want = int(input('打算重复操作几次:'))
random_data_list = [] # 空列表,用于存放多个生成的dataframe表
for i in range(how_time_want):
random_num = int(input('请输入要生成的样本数:'))
user_num = input('在几与几之间生成随机数(英文逗号隔开):')
low_num, high_num = user_num.split(',')
random_dic = {}
for col in self.data.columns:
random_row = np.random.randint(int(low_num), int(high_num), size=random_num)
random_dic[col] = random_row
time_random_data = pd.DataFrame(random_dic)
random_data_list.append(time_random_data)
random_data = pd.concat(random_data_list)
return random_data
完整代码
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
class CreateDataExcel:
def __init__(self):
self.data = pd.read_excel(r'***.xlsx') # 原文件路径
self.data = self.data.iloc[:, 1:]
# 按各列的均值和标准差生成数据
def create_mean_std_data(self):
mean_std_num = int(input('请输入要生成的样本数:'))
mean_std_dic = {}
for col in self.data.columns:
mean = np.mean(self.data[col])
std = np.std(self.data[col])
row_m_s = np.random.normal(mean, std, mean_std_num)
row_m_s = np.round(row_m_s)
row_m_s = np.clip(row_m_s, 1, 5)
mean_std_dic[col] = row_m_s
mean_std_data = pd.DataFrame(mean_std_dic)
return mean_std_data
# 按正态分布生成数据,并映射到1-5上
def create_normal_data(self):
normal_num = int(input('请输入要生成的样本数:'))
normal_dic = {}
for col in self.data.columns:
row_norm = np.random.normal(0, 1, normal_num)
row_norm = 2 * ((row_norm - np.min(row_norm)) / (np.max(row_norm) - np.min(row_norm))) + 1
row_norm = np.round(row_norm)
normal_dic[col] = row_norm
normal_data = pd.DataFrame(normal_dic)
return normal_data
# 各列随机生成指定的整数
def create_random_data(self):
how_time_want = int(input('打算重复操作几次:'))
random_data_list = [] # 空列表,用于存放多个生成的dataframe表
for i in range(how_time_want):
random_num = int(input('请输入要生成的样本数:'))
user_num = input('在几与几之间生成随机数(英文逗号隔开):')
low_num, high_num = user_num.split(',')
random_dic = {}
for col in self.data.columns:
random_row = np.random.randint(int(low_num), int(high_num), size=random_num)
random_dic[col] = random_row
time_random_data = pd.DataFrame(random_dic)
random_data_list.append(time_random_data)
random_data = pd.concat(random_data_list)
return random_data
# 数据拼接
def concat_data(self):
data_frames = []
if input('是否按各列均值和标准差生成数据(Y或N):').lower() == 'y':
data_frames.append(self.create_mean_std_data())
if input('是否按正态分布生成数据(Y或N):').lower() == 'y':
data_frames.append(self.create_normal_data())
if input('是否按指定整数生成数据(Y或N):').lower() == 'y':
data_frames.append(self.create_random_data())
finally_data = pd.concat(data_frames)
finally_data.dropna(how='any', inplace=True)
return finally_data
if __name__ == '__main__':
create_data = CreateDataExcel()
df_finally = create_data.concat_data()
file_num = input('生成文件名后缀,防止重复命名excel表):')
df_finally.to_excel(r'**\数据生成完成版{}.xlsx'.format(file_num), index=None) # 生成后的文件路径和名称
print('==================运行结果如下=====================')
print(f'数据生成完成,请从**查找文件名为》》》数据生成完成版{file_num}《《《的excel文件')