需求
现将数据集按照一定的数量大小切割成若干份,剩余的数据另行保存。其目的是:切分出一百个节点的训练集,剩下的数据集当做测试集的抽取来源。
代码
代码1:
直接从上到下按照数量截取。但由于垃圾邮件的比例分布肯定是不均匀的,这样切分并不合理;代码也比较粗糙,因而正确性没有校验。
import pandas as pd

# Paths: the pure/remainder test-set pool produced by the later scripts.
root_path = '../data/second-experiment/test/'
test_set_path = root_path + 'totalclient/puretr07.csv'

# Plain read of the whole CSV.
# (Original had `testset_path`, an undefined name — the variable is `test_set_path`.)
df = pd.read_csv(test_set_path, encoding='utf-8', sep=',')

# Rows to ignore, counted from the start of the file (row numbers are 0-based,
# row 0 is the header, so this skips data rows 1..32231).
# Use list(range(...)) instead of a manual append loop, and do not shadow
# the builtin name `list`.
skip_rows = list(range(1, 32232))
df = pd.read_csv(test_set_path, encoding='utf-8', sep=',', skiprows=skip_rows)
代码2
只按数量,不按比例截取数据集。
import pandas as pd

# x[i]: number of rows to draw for user i; the trailing 0 means user 100 gets
# no rows. `...` is the author's placeholder — fill in the real 101 sizes.
x = [6295, 5946, ..., 0]

df = pd.read_csv('../data/second-experiment/test/totalclient/trec07.csv', encoding='utf-8', sep=',')
columns = df.columns.values  # e.g. ['label' 'message']

for i in range(0, 101):
    if i == 100:
        # Final iteration: whatever rows are still unclaimed become the
        # test-set pool.
        df.to_csv('../data/second-experiment/test/totalclient/puretr07.csv',
                  index=False, encoding='utf-8', columns=columns)
    # Draw x[i] rows for user i without replacement.
    df2 = df.sample(n=x[i], replace=False, random_state=None, axis=0)
    # Remove the sampled rows by index. The original appended df2 twice and
    # ran drop_duplicates(keep=False), which also destroys any rows that were
    # duplicated in the source data; drop-by-index has no such side effect.
    # (DataFrame.append is also removed in pandas >= 2.0.)
    df = df.drop(df2.index)
    user_path = '../data/second-experiment/pure/user/user' + str(i) + '.csv'
    df2.to_csv(user_path, index=False, encoding='utf-8', columns=columns)
代码3
按比例和数量,截取数据集。
import pandas as pd
import math

# x[i]: number of rows to draw for user i; trailing 0 means user 100 is empty.
# `...` is the author's placeholder — fill in the real 101 sizes.
x = [6295, 5946, ..., 0]

df = pd.read_csv('../data/second-experiment/test/totalclient/trec07.csv', encoding='utf-8', sep=',')
# Encode the label: 'spam' -> 1, everything else -> 0.
df['label'] = df.label.apply(lambda x: 1 if x == 'spam' else 0)
columns = df.columns.values

# Split by class. 0: not spam; 1: spam.
df0 = df.loc[df['label'] == 0]
df1 = df.loc[df['label'] == 1]

# Shuffle both classes.
df0 = df0.sample(frac=1.0)
df1 = df1.sample(frac=1.0)

# Fraction of spam in every per-user slice.
ratio = 0.7

for i in range(0, 101):
    print(len(df0), '============', len(df1))
    if i == 100:
        # Final iteration: the remainder of both classes becomes the
        # test-set pool. Write a local concat instead of appending df0 into
        # df1 (the original mutated df1 here, mixing ham into the spam frame).
        remainder = pd.concat([df1, df0])
        remainder.to_csv('../data/second-experiment/test/totalclient/puretr07.csv',
                         index=False, encoding='utf-8', columns=columns)
    # Re-shuffle before each draw.
    df0 = df0.sample(frac=1.0)
    df1 = df1.sample(frac=1.0)
    # Per-class counts for this user: num1 spam rows, num0 ham rows.
    num1 = math.floor(x[i] * ratio)
    num0 = x[i] - num1
    df0_1 = df0.sample(n=num0, replace=False, random_state=None, axis=0)
    df1_1 = df1.sample(n=num1, replace=False, random_state=None, axis=0)
    # Remove the sampled rows by index — unlike append-twice +
    # drop_duplicates(keep=False), this never discards rows that happened to
    # be duplicated in the source data. (DataFrame.append is removed in
    # pandas >= 2.0.)
    df0 = df0.drop(df0_1.index)
    df1 = df1.drop(df1_1.index)
    # Combine the two class samples into one per-user file.
    user_df = pd.concat([df1_1, df0_1])
    user_path = '../data/second-experiment/pure/user/user' + str(i) + '.csv'
    user_df.to_csv(user_path, index=False, encoding='utf-8', columns=columns)
既然说到按一定分布截取数据,那就离不开合并数据。
import pandas as pd

# Every mixed client file is topped up to this many rows.
# (Was a hard-coded 28366 inline; named so the target size is changed in one place.)
TOTAL_ROWS = 28366

df2 = pd.read_csv("../data/first/split/train_data.csv")  # filler data
print(len(df2))

# x[i]: number of pure rows kept for user i. `...` is the author's
# placeholder — fill in the real sizes.
x = [6295, ..., 0]

for i in range(0, 100):
    # Per-user pure data that needs to be filled up to TOTAL_ROWS.
    df1 = pd.read_csv('../data/second-experiment/pure/user/user' + str(i) + '.csv')
    # First x[i] pure rows, then filler rows x[i]..TOTAL_ROWS from df2, so
    # every output has TOTAL_ROWS rows (assuming len(df2) >= TOTAL_ROWS —
    # TODO confirm against the filler file).
    df = pd.concat([df1[:x[i]], df2[x[i]:TOTAL_ROWS]])
    # NOTE(review): unlike the earlier scripts this writes the index column
    # too (no index=False) — confirm whether that is intentional.
    df.to_csv('../data/second-experiment/mix/client-source/user' + str(i) + '.csv')