Data dimension mismatch: ValueError: shapes (2778,) and (269,67) not aligned: 2778 (dim 0) != 269 (dim 0)

First, the source code that triggers the error:

import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os

def dataProcess_X(Data):
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]  # non-numeric columns
    listNonObjectColumn = [x for x in list(Data) if x not in listObjectColumn]  # numeric columns

    ObjectData = Data[listObjectColumn]        # the non-numeric columns
    NonObjectData = Data[listNonObjectColumn]  # the numeric columns

    # turn every distinct value of each object column into its own attribute
    ObjectData = pd.get_dummies(ObjectData)  # one-hot encoding

    Data = pd.concat([NonObjectData, ObjectData], axis=1)  # join along axis=1 (columns)
    Data_x = Data.astype("int64")

    # normalize
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()

    return Data_x


def dataProcess_Y(rawData):
    df_y = rawData['Clicked on Ad']
    Data_y = pd.DataFrame(df_y)
    return Data_y


def sigmoid(z):
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, (1 - (1e-8)))


def _shuffle(X, Y):  # X and Y are np.array
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])


def split_valid_set(X, Y, percentage):
    all_size = X.shape[0]
    valid_size = int(floor(all_size * percentage))

    X, Y = _shuffle(X, Y)
    X_valid, Y_valid = X[: valid_size], Y[: valid_size]
    X_train, Y_train = X[valid_size:], Y[valid_size:]

    return X_train, Y_train, X_valid, Y_valid


# compute the accuracy of weights w on a given (X, Y) set
def valid(X, Y, w):
    a = np.dot(w, X.T)
    y = sigmoid(a)
    y_ = np.around(y)
    result = (np.squeeze(Y) == y_)
    print('Valid acc = %f' % (float(result.sum()) / result.shape[0]))
    return y_


def train(X_train, Y_train):

    w = np.zeros(len(X_train[0]))

    l_rate = 0.001
    batch_size = 32  # minibatch size
    train_data_size = len(X_train)
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 300  # number of training epochs
    list_cost = []

    total_loss = 0.0
    for epoch in range(1, epoch_num):  # note: this runs epoch_num - 1 epochs
        total_loss = 0.0  # reset the running loss each epoch
        X_train, Y_train = _shuffle(X_train, Y_train)  # reshuffle every epoch for stochasticity

        # train on each minibatch
        for idx in range(1, step_num):  # note: starting at 1 skips the first minibatch
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]

            s_grad = np.zeros(len(X[0]))  # (unused leftover)

            z = np.dot(X, w)
            y = sigmoid(z)
            loss = y - np.squeeze(Y)  # (unused; the gradient is formed directly below)

            # cross entropy, averaged over the batch
            cross_entropy = -1 * (np.dot(np.squeeze(Y.T), np.log(y)) + np.dot((1 - np.squeeze(Y.T)), np.log(1 - y))) / len(Y)
            total_loss += cross_entropy
            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)  # gradient of the summed cross entropy
            w = w - l_rate * grad

        list_cost.append(total_loss)

    # valid(X_valid, Y_valid, w)
    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.show()

    return w


if __name__ == "__main__":
    trainData = pd.read_csv("train.csv")
    testData = pd.read_csv("test.csv")
    ans = pd.read_csv("test_answer.txt")

    # trainData has one extra column (the label 'Clicked on Ad') compared with testData
    x_train = dataProcess_X(trainData.drop(['Country'], axis=1)).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans[:].values

    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # prepend a ones column for the bias b
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)

    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)

    w_train = train(X_train, Y_train)  # train on the training split
    valid(X_train, Y_train, w_train)  # accuracy check (note: this evaluates on the training split)

    w = train(x_train, y_train)  # retrain on the whole training set

    y_ = valid(x_test, y_ans, w)  # accuracy on the test set

    # save the predictions
    df = pd.DataFrame(y_, columns=['label'])
    df.to_csv('lr_output.csv')
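As an aside (my annotation, not part of the original homework), the quantities train() computes for each minibatch are the standard logistic-regression ones. With \(\hat{y}_i = \sigma(w^{\top}x_i)\):

\[
L(w) = -\frac{1}{N}\sum_{i=1}^{N}\Big[y_i\ln\hat{y}_i + (1-y_i)\ln(1-\hat{y}_i)\Big]
\qquad\text{(the code's cross\_entropy, averaged over the batch)}
\]

\[
\nabla_w \sum_{i}\ell_i = \sum_{i=1}^{N}(\hat{y}_i - y_i)\,x_i
\qquad\text{(the code's grad, the unaveraged sum)}
\]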

The traceback (screenshot omitted): np.dot(w, X.T) fails inside valid(), because w, learned from the one-hot-encoded training matrix, has 2778 components, while the encoded test matrix has only 269 feature columns.

  • This error tormented me for ages. The explanation, as a more experienced classmate put it:

The training set and the test set do not contain the same set of categorical values: the test set has fewer distinct categories in some columns. pd.get_dummies does one thing only, converting whatever data it is handed into one-hot form, so calling it separately on the two sets is effectively fitting two independent one-hot encodings, each with its own columns. Either write your own one-hot encoder, or use sklearn's: call fit_transform on the training set (fit and transform in one step), then call only transform on the test set, so both sets share the encoding learned from the training data (a minimal sketch follows).
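For reference, a minimal sketch of that fit-on-train / transform-on-test pattern (the toy "City" column and the city names are made up for illustration, and handle_unknown="ignore" is my assumption for coping with categories that only appear in the test set):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

train = pd.DataFrame({"City": ["Paris", "Lima", "Paris"]})
test = pd.DataFrame({"City": ["Lima", "Osaka"]})  # "Osaka" never appears in train

enc = OneHotEncoder(handle_unknown="ignore")
X_train = enc.fit_transform(train[["City"]]).toarray()  # fit the category list on train only
X_test = enc.transform(test[["City"]]).toarray()        # reuse it on test: same columns
print(X_train.shape, X_test.shape)  # (3, 2) (2, 2): the column counts now agree
# the unseen "Osaka" row becomes all zeros instead of producing a new column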

Using sklearn's encoding did get rid of the error for me (strictly speaking, the code below label-encodes each text column into integers rather than one-hot encoding it, which is exactly why the column counts match). Still, some classmates ran the pandas one-hot version just fine, so I was not quite satisfied and also wanted to find a way to make pandas' encoding work.
Let's first look at how I solved it with sklearn.

import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from math import floor, log
import os


output_dir = "./output/"

def dataProcess_X(Data):
    # encode the feature columns
    label_encoder = LabelEncoder()
    # integer-encode (label-encode) each non-numeric column. Note this is NOT
    # one-hot: every column stays a single integer feature, so train and test
    # always end up with the same number of columns, which fixes the shape error.
    # Caveat: a fresh fit_transform per call means train and test may assign
    # different integers to the same category.
    integer_encoded = label_encoder.fit_transform(list(Data["Ad Topic Line"]))
    integer_encoded1 = label_encoder.fit_transform(list(Data["City"]))
    integer_encoded2 = label_encoder.fit_transform(list(Data["Country"]))
    integer_encoded3 = label_encoder.fit_transform(list(Data["Timestamp"]))
    # write the encoded columns back into Data, replacing the string values
    Data["Ad Topic Line"] = integer_encoded
    Data["City"] = integer_encoded1
    Data["Country"] = integer_encoded2
    Data["Timestamp"] = integer_encoded3
    Data_x = Data.astype("int64")

    # normalize
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()
    return Data_x

def dataProcess_Y(rawData):
    df_y = rawData['Clicked on Ad']
    Data_y = pd.DataFrame(df_y)
    return Data_y


def sigmoid(z):
    h = 1 / (1.0 + np.exp(-z))
    return np.clip(h, 1e-8, 1-(1e-8))

def _shuffle(X, Y):  # shuffle X and Y in unison
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def split_valid_set(X, Y, percentage):
    # split off a validation set; returns training and validation features/labels
    all_size = X.shape[0]
    valid_size = int(floor(all_size * percentage))

    # shuffle the original data before splitting
    X, Y = _shuffle(X, Y)

    # the first valid_size rows become the validation set, the rest the training set
    X_train, Y_train = X[valid_size:], Y[valid_size:]
    X_valid, Y_valid = X[: valid_size], Y[: valid_size]

    return X_train, Y_train, X_valid, Y_valid



def valid(X, Y, w):
    a = np.dot(w, X.T)
    y = sigmoid(a)
    y_ = np.around(y)
    result = (np.squeeze(Y) == y_)
    print('Valid acc = %f' % (float(result.sum()) / result.shape[0]))
    return y_

def train(X_train, Y_train):

    w = np.zeros(len(X_train[0]))
    l_rate = 0.001
    batch_size = 24
    train_data_size = len(X_train)  # number of training samples
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 500  # number of training epochs
    list_cost = []

    for epoch in range(epoch_num):
        total_loss = 0.0
        X_train, Y_train = _shuffle(X_train, Y_train)  # reshuffle every epoch for stochasticity

        # train batch by batch, batch_size samples at a time
        # (the range starts at 0 here so the first minibatch is no longer skipped)
        for idx in range(step_num):
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]

            z = np.dot(X, w)  # linear scores
            y = sigmoid(z)    # predicted probabilities

            # cross entropy, summed (not averaged) over the batch
            cross_entropy = -1 * (np.dot(np.squeeze(Y), np.log(y)) + np.dot((1 - np.squeeze(Y)), np.log(1 - y)))
            total_loss += cross_entropy

            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)  # gradient

            w = w - l_rate * grad

        list_cost.append(total_loss)

    # valid(X_valid, Y_valid, w)
    # visualize the training curve
    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.show()

    return w

if __name__ == "__main__":
    trainData =pd.read_csv("train.csv")
    testData = pd.read_csv("test.csv")
    ans = pd.read_csv("test_answer.txt",header=None)

    # trainData has one extra column (the label) compared with testData
    x_train = dataProcess_X(trainData.drop(["Clicked on Ad"], axis=1)).values  # drop the label before encoding

    # x_train = dataProcess_X(trainData).drop(["Country"], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans[:].values


    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # prepend a ones column for the bias b
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)

    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)

    w_train = train(X_train, Y_train)  # weights learned on the X_train split
    valid(X_train, Y_train, w_train)  # note: this evaluates on the training split itself, see the discussion at the end

    w = train(x_train, y_train)  # weights learned on all of train.csv

    y_ = valid(x_test, y_ans, w)  # accuracy on the test set
    # print(y_)

    # save the predictions
    df = pd.DataFrame(y_, columns=["label"])
    df.to_csv('one.csv')
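One caveat about the version above: because dataProcess_X calls fit_transform separately for the training and test files, the same category can be mapped to different integers in the two sets. A sketch of one way around that, fitting a single encoder per column on the union of both files (the helper name and the fit-on-the-union idea are mine, not part of the original code):

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_consistently(train_col, test_col):
    """Fit one LabelEncoder on all values so train and test share a mapping."""
    le = LabelEncoder()
    le.fit(pd.concat([train_col, test_col]))  # learn every category exactly once
    return le.transform(train_col), le.transform(test_col)

# usage:
# trainData["City"], testData["City"] = encode_consistently(trainData["City"], testData["City"])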

After studying a working example, here is the fix that keeps pandas' one-hot encoding:

import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
import matplotlib.pyplot as plt
from math import floor, log
import os

def dataProcess_X(Data):
    listObjectColumn = [col for col in Data.columns if Data[col].dtypes == "object"]  # non-numeric columns
    listNonObjectColumn = [x for x in list(Data) if x not in listObjectColumn]  # numeric columns

    ObjectData = Data[listObjectColumn]        # the non-numeric columns
    NonObjectData = Data[listNonObjectColumn]  # the numeric columns

    # turn every distinct value of each object column into its own attribute
    ObjectData = pd.get_dummies(ObjectData)  # one-hot encoding

    Data = pd.concat([NonObjectData, ObjectData], axis=1)  # join along axis=1 (columns)
    Data_x = Data.astype("int64")

    # normalize
    Data_x = (Data_x - Data_x.mean()) / Data_x.std()

    return Data_x


def dataProcess_Y(rawData):
    df_y = rawData['Clicked on Ad']
    Data_y = pd.DataFrame(df_y)
    return Data_y


def sigmoid(z):
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, (1 - (1e-8)))


def _shuffle(X, Y):  # X and Y are np.array
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])


def split_valid_set(X, Y, percentage):
    all_size = X.shape[0]
    valid_size = int(floor(all_size * percentage))

    X, Y = _shuffle(X, Y)
    X_valid, Y_valid = X[: valid_size], Y[: valid_size]
    X_train, Y_train = X[valid_size:], Y[valid_size:]

    return X_train, Y_train, X_valid, Y_valid


# compute the accuracy of weights w on a given (X, Y) set
def valid(X, Y, w):
    a = np.dot(w, X.T)
    y = sigmoid(a)
    y_ = np.around(y)
    result = (np.squeeze(Y) == y_)
    print('Valid acc = %f' % (float(result.sum()) / result.shape[0]))
    return y_


def train(X_train, Y_train):
    w = np.zeros(len(X_train[0]))

    l_rate = 0.001
    batch_size = 32  # minibatch size
    train_data_size = len(X_train)
    step_num = int(floor(train_data_size / batch_size))
    epoch_num = 300  # number of training epochs
    list_cost = []

    for epoch in range(epoch_num):
        total_loss = 0.0  # reset the running loss each epoch
        X_train, Y_train = _shuffle(X_train, Y_train)  # reshuffle every epoch for stochasticity

        # train on each minibatch (starting at 0, so no batch is skipped)
        for idx in range(step_num):
            X = X_train[idx * batch_size:(idx + 1) * batch_size]
            Y = Y_train[idx * batch_size:(idx + 1) * batch_size]

            z = np.dot(X, w)
            y = sigmoid(z)

            # cross entropy, averaged over the batch
            cross_entropy = -1 * (np.dot(np.squeeze(Y.T), np.log(y)) + np.dot((1 - np.squeeze(Y.T)), np.log(1 - y))) / len(Y)
            total_loss += cross_entropy
            grad = np.sum(-1 * X * (np.squeeze(Y) - y).reshape((batch_size, 1)), axis=0)  # gradient
            w = w - l_rate * grad

        list_cost.append(total_loss)

    # valid(X_valid, Y_valid, w)
    plt.plot(np.arange(len(list_cost)), list_cost)
    plt.title("Train Process")
    plt.xlabel("epoch_num")
    plt.ylabel("Cost Function (Cross Entropy)")
    plt.show()

    return w


if __name__ == "__main__":
    trainData = pd.read_csv("train.csv")
    testData = pd.read_csv("test.csv")


    # trainData has one extra column (the label) compared with testData
    x_train_hot = dataProcess_X(trainData.drop(['Clicked on Ad'], axis=1))  # drop the label, then one-hot encode the non-numeric training columns
    x_test_hot = dataProcess_X(testData)  # one-hot encode the non-numeric test columns

    # align the two DataFrames on their columns with a left join: x_test is
    # reindexed to exactly x_train's columns, in the same order; categories
    # that never occur in the test set become columns of NaN
    x_train, x_test = x_train_hot.align(x_test_hot, join='left', axis=1)

    x_test.replace(np.nan, 0, inplace=True)  # fill the missing dummy columns with 0
    x_train = x_train.values  # both ways below yield a <class 'numpy.ndarray'>
    x_test = np.array(x_test)

    y_train = dataProcess_Y(trainData).values

    ans = pd.read_csv("test_answer.txt", header=None)
    y_ans = ans[:].values

    x_test = np.concatenate((np.ones((x_test.shape[0], 1)), x_test), axis=1)  # prepend a ones column for the bias b
    x_train = np.concatenate((np.ones((x_train.shape[0], 1)), x_train), axis=1)

    valid_set_percentage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, valid_set_percentage)

    w_train = train(X_train, Y_train)  # train on the training split
    valid(X_valid, Y_valid, w_train)  # accuracy on the validation split (X_valid this time, not X_train)

    w = train(x_train, y_train)  # retrain on the whole training set

    y_ = valid(x_test, y_ans, w)  # accuracy on the test set

    # save the predictions
    df = pd.DataFrame(y_, columns=['label'])
    df.to_csv('lr_output.csv')
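To make the align trick concrete, here is a toy reproduction (the "City" column and city names are invented for illustration) of how get_dummies produces mismatched columns and how align(join='left') repairs the test frame:

import numpy as np
import pandas as pd

train = pd.get_dummies(pd.DataFrame({"City": ["Paris", "Lima", "Paris"]}))
test = pd.get_dummies(pd.DataFrame({"City": ["Lima", "Osaka"]}))
print(list(train.columns))  # ['City_Lima', 'City_Paris']
print(list(test.columns))   # ['City_Lima', 'City_Osaka']  -- mismatched!

train, test = train.align(test, join="left", axis=1)  # reindex test to train's columns
test = test.fillna(0)  # 'City_Paris' never occurs in test, so it becomes all zeros
print(list(test.columns))   # ['City_Lima', 'City_Paris'] -- now matches train

Note that the test-only column 'City_Osaka' is silently dropped, which mirrors what the main code does with categories that appear only in the test set.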

Oddly, even though the code and the data set were otherwise the same, the first valid() call reported an accuracy of exactly 1, and at first I had no idea where the problem was.
The lines in the main function that produced that accuracy of 1:

 w_train = train(X_train, Y_train)  # train on the training split
    valid(X_train, Y_train, w_train)  # "validate" on that same training split
  • The explanation I later got: that first valid() call is fed the training data itself, and w_train was fitted on exactly that data, so every sample being "validated" has already been used for training, and of course the model reproduces its labels. It is like a teacher handing out a mock exam before the test and then setting that very mock exam as the real test: any student who worked through the mock paper (and remembered the answers) scores full marks. Ha, the analogy is actually pretty apt. The fix is what the final version above does: validate on X_valid / Y_valid instead.
    That's both problems solved. If you got something out of this, consider following, liking, and bookmarking. Thanks!