先来附上错误的源代码
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os
def dataProcess_X(Data):
listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"] # 读取非数字的column
listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn] # 数字的column
ObjectData = Data[listObjectColumn] #非数字的列
NonObjectData = Data[listNonObjedtColumn] # 数字的列
# insert set into nonobject data with male = 0 and female = 1
# set every element in object rows as an attribute
ObjectData = pd.get_dummies(ObjectData) #one-hot编码
Data = pd.concat([NonObjectData, ObjectData], axis=1) # 指定轴axis=1进行全连接
Data_x = Data.astype("int64")
# normalize
Data_x = (Data_x - Data_x.mean()) / Data_x.std()
return Data_x
def dataProcess_Y(rawData):
    """Return the label column 'Clicked on Ad' as a one-column DataFrame."""
    return pd.DataFrame(rawData['Clicked on Ad'])
def sigmoid(z):
    """Logistic function, clipped away from 0 and 1 so log() stays finite."""
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, (1 - (1e-8)))
def _shuffle(X, Y): # X and Y are np.array
randomize = np.arange(X.shape[0])
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])
def split_valid_set(X, Y, percentage):
    """Shuffle (X, Y) and split off the first `percentage` fraction as validation.

    Returns (X_train, Y_train, X_valid, Y_valid).
    """
    total = X.shape[0]
    n_valid = int(floor(total * percentage))
    X, Y = _shuffle(X, Y)
    return X[n_valid:], Y[n_valid:], X[:n_valid], Y[:n_valid]
# Evaluate the model on a labelled set and report accuracy.
def valid(X, Y, w):
    """Print the accuracy of linear weights w on (X, Y); return the 0/1 predictions."""
    probs = sigmoid(np.dot(w, X.T))
    preds = np.around(probs)  # threshold at 0.5
    hits = (np.squeeze(Y) == preds)
    print('Valid acc = %f' % (float(hits.sum()) / hits.shape[0]))
    return preds
def train(X_train, Y_train):
    """Train logistic-regression weights with mini-batch gradient descent.

    Fixes vs. the original: the batch index started at 1, so the first
    mini-batch of every epoch was silently skipped, and the epoch counter
    started at 1, dropping one epoch. Unused locals (s_grad, loss) removed.

    Returns the learned weight vector w (bias assumed to be column 0 of X).
    """
    w = np.zeros(len(X_train[0]))
    l_rate = 0.001
    batch_size = 32  # mini-batch size
    train_data_size = len(X_train)
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 300  # number of training epochs
    list_cost = []
    for epoch in range(epoch_num):  # was range(1, epoch_num): one epoch short
        total_loss = 0.0  # reset accumulated loss each epoch
        X_train, Y_train = _shuffle(X_train, Y_train)  # reshuffle for stochasticity
        for idx in range(step_num):  # was range(1, step_num): batch 0 was never used
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]
            z = np.dot(X, w)
            y = sigmoid(z)
            # mean cross-entropy of this batch (for the loss curve only)
            cross_entropy = -1 * (np.dot(np.squeeze(Y.T), np.log(y)) + np.dot((1 - np.squeeze(Y.T)), np.log(1 - y))) / len(Y)
            total_loss += cross_entropy
            # gradient of the cross-entropy w.r.t. w
            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)
            w = w - l_rate * grad
        list_cost.append(total_loss)
    # visualize the per-epoch loss curve
    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.show()
    return w
if __name__ == "__main__":
    trainData = pd.read_csv("train.csv")
    testData = pd.read_csv("test.csv")
    # NOTE(review): later versions pass header=None here — confirm whether
    # test_answer.txt really has a header row
    ans = pd.read_csv("test_answer.txt")
    # here is one more attribute in trainData
    # NOTE(review): the label 'Clicked on Ad' is NOT dropped, so it leaks into
    # the features; the test set also one-hot encodes to a different column
    # set than the training set — this mismatch is the crash discussed below.
    x_train = dataProcess_X(trainData.drop(['Country'], axis=1)).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans[:].values
    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # prepend bias column
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)
    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)
    w_train = train(X_train, Y_train)  # fit on the training split
    # NOTE(review): this evaluates on the same data used for training, so
    # accuracy is trivially inflated; should be valid(X_valid, Y_valid, w_train)
    valid(X_train, Y_train, w_train)
    w = train(x_train, y_train)  # fit on the whole training set
    y_ = valid(x_test, y_ans, w)  # test-set evaluation
    # save the predictions
    df = pd.DataFrame(y_,columns=['label'])
    df.to_csv('lr_output.csv')
报错的截图(原文为图片,此处略)。
- 这个错误折磨了我很久。报错的原因,来听听大佬的说法:
因为训练集和测试集的特征种类不一样,测试集相应特征的类别较少。get_dummies 函数功能比较单一,只能把当前传入的数据转成 one-hot 编码形式,等于对训练集和测试集各做了一次互不相关的 one-hot 编码。解决办法是自己重写 one-hot 编码函数,或者用 sklearn 自带的编码器:在训练集上调用 fit_transform(先拟合再转换),在测试集上只调用 transform。
之后我用sklearn的onehot编码方式就可以解决这个问题了,但是有的同学依然用的是pandas的onehot编码就可以成功运行,所以我心里还是有点不甘心的,也想找到pandas的onehot编码运行的方法。
先来看看我用sklearn是怎么解决的吧。
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from math import floor, log
import os
output_dir = "./output/"  # NOTE(review): defined but never referenced in this script — confirm before removing
def dataProcess_X(Data):
    """Integer-encode the four text columns in place, then z-score normalize.

    NOTE: despite the original comments, LabelEncoder produces integer labels,
    not one-hot vectors. The input DataFrame is mutated (its text columns are
    replaced by the encoded integers).
    """
    label_encoder = LabelEncoder()
    # Replace each text column with its integer-label encoding.
    for col in ("Ad Topic Line", "City", "Country", "Timestamp"):
        Data[col] = label_encoder.fit_transform(list(Data[col]))
    Data_x = Data.astype("int64")
    # standardize each column to zero mean / unit variance
    return (Data_x - Data_x.mean()) / Data_x.std()
def dataProcess_Y(rawData):
    """Return the label column 'Clicked on Ad' wrapped in a DataFrame."""
    labels = rawData['Clicked on Ad']
    return pd.DataFrame(labels)
def sigmoid(z):
    """Numerically safe logistic function: result is clipped to (1e-8, 1-1e-8)."""
    value = 1 / (1.0 + np.exp(-z))
    return np.clip(value, 1e-8, 1 - (1e-8))
def _shuffle(X, Y): # 随机选择
randomize = np.arange(X.shape[0])
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])
def split_valid_set(X, Y, percentage):
    """Shuffle the data, then carve off the leading `percentage` as validation.

    Returns (X_train, Y_train, X_valid, Y_valid).
    """
    n_valid = int(floor(X.shape[0] * percentage))
    # randomize row order before splitting
    X, Y = _shuffle(X, Y)
    return X[n_valid:], Y[n_valid:], X[:n_valid], Y[:n_valid]
def valid(X, Y, w):
    """Print accuracy of weights w on labelled set (X, Y); return rounded predictions."""
    scores = np.dot(w, X.T)
    preds = np.around(sigmoid(scores))
    correct = (np.squeeze(Y) == preds)
    print('Valid acc = %f' % (float(correct.sum()) / correct.shape[0]))
    return preds
def train(X_train, Y_train):
    """Train logistic-regression weights with mini-batch gradient descent.

    Fixes vs. the original: the batch index started at 1, so the first
    mini-batch of every epoch was silently skipped, and the epoch counter
    started at 1, dropping one epoch. Unused locals (s_grad, loss) and the
    placeholder string were removed.

    Returns the learned weight vector w.
    """
    w = np.zeros(len(X_train[0]))
    l_rate = 0.001
    batch_size = 24
    train_data_size = len(X_train)  # number of training rows
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 500  # number of training epochs
    list_cost = []
    for epoch in range(epoch_num):  # was range(1, epoch_num): one epoch short
        total_loss = 0.0
        X_train, Y_train = _shuffle(X_train, Y_train)  # reshuffle each epoch
        for idx in range(step_num):  # was range(1, step_num): batch 0 was never used
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]
            z = np.dot(X, w)
            y = sigmoid(z)
            # summed (not averaged) cross-entropy of this batch, for the loss curve
            cross_entropy = -1 * (np.dot(np.squeeze(Y), np.log(y)) + np.dot((1 - np.squeeze(Y)), np.log(1 - y)))
            total_loss += cross_entropy
            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)  # gradient
            w = w - l_rate * grad
        list_cost.append(total_loss)
    # visualize the per-epoch loss curve
    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.show()
    return w
if __name__ == "__main__":
    trainData =pd.read_csv("train.csv")
    testData = pd.read_csv("test.csv")
    ans = pd.read_csv("test_answer.txt",header=None)  # answer file has no header row
    # here is one more attribute in trainData
    # NOTE(review): unlike the lines below, x_train keeps the DataFrame (no
    # .values) — np.concatenate converts it, but this is inconsistent; confirm.
    x_train = dataProcess_X(trainData.drop(["Clicked on Ad"], axis=1))
    # x_train = dataProcess_X(trainData).drop(["Country"], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans[:].values
    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # prepend bias column
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)
    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train,y_train,valid_set_percentage)
    w_train = train(X_train, Y_train)  # weights fitted on the training split
    # NOTE(review): this evaluates on the very data w_train was fitted on, so
    # accuracy is trivially ~1.0 (the post discusses this at the end); it
    # should be valid(X_valid, Y_valid, w_train).
    valid(X_train, Y_train, w_train)
    w = train(x_train,y_train)  # weights fitted on all of train.csv
    y_ = valid(x_test, y_ans, w)  # test-set evaluation
    # print(y_)
    '''
    结果输出保存
    your code
    '''
    df = pd.DataFrame(y_,columns=["label"])
    df.to_csv('one.csv')
参考了大佬的代码后给出pandas下onehot的解决办法
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os
def dataProcess_X(Data):
listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"] # 读取非数字的column
listNonObjedtColumn = [x for x in list(Data) if x not in listObjectColumn] # 数字的column
ObjectData = Data[listObjectColumn] #非数字的列
NonObjectData = Data[listNonObjedtColumn] # 数字的列
# set every element in object rows as an attribute
ObjectData = pd.get_dummies(ObjectData) #one-hot编码
Data = pd.concat([NonObjectData, ObjectData], axis=1) # 指定轴axis=1进行全连接
Data_x = Data.astype("int64")
# normalize
Data_x = (Data_x - Data_x.mean()) / Data_x.std()
return Data_x
def dataProcess_Y(rawData):
    """Extract the 'Clicked on Ad' label column as a single-column DataFrame."""
    return pd.DataFrame(rawData['Clicked on Ad'])
def sigmoid(z):
    """Logistic function clipped to (1e-8, 1 - 1e-8) so np.log never sees 0 or 1."""
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, (1 - (1e-8)))
def _shuffle(X, Y): # X and Y are np.array
randomize = np.arange(X.shape[0])
np.random.shuffle(randomize)
return (X[randomize], Y[randomize])
def split_valid_set(X, Y, percentage):
    """Randomly split (X, Y): first `percentage` fraction is validation, rest training.

    Returns (X_train, Y_train, X_valid, Y_valid).
    """
    n_valid = int(floor(X.shape[0] * percentage))
    X, Y = _shuffle(X, Y)
    X_valid, Y_valid = X[:n_valid], Y[:n_valid]
    X_train, Y_train = X[n_valid:], Y[n_valid:]
    return X_train, Y_train, X_valid, Y_valid
# Report accuracy of the learned weights on a labelled set.
def valid(X, Y, w):
    """Print accuracy of weights w on (X, Y) and return the rounded 0/1 predictions."""
    activations = sigmoid(np.dot(w, X.T))
    predictions = np.around(activations)
    matches = (np.squeeze(Y) == predictions)
    print('Valid acc = %f' % (float(matches.sum()) / matches.shape[0]))
    return predictions
def train(X_train, Y_train):
    """Train logistic-regression weights with mini-batch gradient descent.

    Fixes vs. the original: the batch index started at 1, so the first
    mini-batch of every epoch was silently skipped, and the epoch counter
    started at 1, dropping one epoch. Unused locals (s_grad, loss) removed.

    Returns the learned weight vector w (bias assumed to be column 0 of X).
    """
    w = np.zeros(len(X_train[0]))
    l_rate = 0.001
    batch_size = 32  # mini-batch size
    train_data_size = len(X_train)
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 300  # number of training epochs
    list_cost = []
    for epoch in range(epoch_num):  # was range(1, epoch_num): one epoch short
        total_loss = 0.0  # reset accumulated loss each epoch
        X_train, Y_train = _shuffle(X_train, Y_train)  # reshuffle for stochasticity
        for idx in range(step_num):  # was range(1, step_num): batch 0 was never used
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]
            z = np.dot(X, w)
            y = sigmoid(z)
            # mean cross-entropy of this batch (for the loss curve only)
            cross_entropy = -1 * (np.dot(np.squeeze(Y.T), np.log(y)) + np.dot((1 - np.squeeze(Y.T)), np.log(1 - y))) / len(Y)
            total_loss += cross_entropy
            # gradient of the cross-entropy w.r.t. w
            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)
            w = w - l_rate * grad
        list_cost.append(total_loss)
    # visualize the per-epoch loss curve
    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.show()
    return w
if __name__ == "__main__":
    trainData = pd.read_csv("train.csv")
    testData = pd.read_csv("test.csv")
    # drop the label, then one-hot encode the non-numeric training columns
    x_train_hot = dataProcess_X(trainData.drop(['Clicked on Ad'], axis=1))
    x_test_hot = dataProcess_X(testData)  # one-hot encode the test columns
    # Align both frames on the training columns (left join on column labels) so
    # train and test end up with the same one-hot feature set
    x_train, x_test = x_train_hot.align(x_test_hot,join='left',axis=1)
    x_test.replace(np.nan,0,inplace = True)  # categories unseen in test become 0
    x_train = x_train.values  # both forms yield <class 'numpy.ndarray'>
    x_test = np.array(x_test)
    y_train = dataProcess_Y(trainData).values
    ans = pd.read_csv("test_answer.txt", header=None)
    y_ans = ans[:].values
    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # prepend bias column
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)
    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)
    w_train = train(X_train, Y_train)  # fit on the training split
    valid(X_valid, Y_valid, w_train)  # held-out validation accuracy
    w = train(x_train, y_train)  # fit on the full training set
    y_ = valid(x_test, y_ans, w)  # test-set accuracy
    # save the predictions
    df = pd.DataFrame(y_,columns=['label'])
    df.to_csv('lr_output.csv')
不知道为啥,代码明明一样,数据集也一样,但是第一次 valid 出来的结果居然是 1。好奇怪,一时找不出是哪里出了问题。
主函数中的导致valid结果为1的错误代码
w_train = train(X_train, Y_train) # 在验证集上做训练
valid(X_train, Y_train, w_train) # 验证集上的结果
- 后来听大佬说,是因为第一次 valid 调用的就是训练数据集,而传进去的 w_train 正是用这部分训练集数据训练出来的。也就是说,用来验证的数据之前已经参与过训练,再拿来验证当然几乎会百分百预测正确。这就好比考试前老师发了模拟题,考试时又拿同一份模拟题来考:只要做过模拟题(并记住了答案)的学生,肯定都能考满分。
好了问题都解决了,如果有收获记得点击关注哦。一键三连也可以。哈哈,谢谢您嘞!