第一次写具体问题的代码,碰到很多问题,特别是数据处理,总结一下
1.用pandas读入,数据以DataFrame存储,但是经过一些函数切片后,变成numpy。
2.原先照着训练模型写的,并没有把预测的数据集并入训练的数据集一起预测,导致数据处理了两次,非常繁琐
3.总结一下数据处理的流程,第一步先读入,第二步删去无关列,第三步用平均值和众数去填补缺失值,第四步把一些字符映射成相关数值,第五步删除重复行并重置索引。但是测试集不要第三步和第五步。第六步数据分成data和label两个数据集,即x和y。第七步把标准化后训练集划分成测试集和训练集,这样在训练的时候就可以大致知道模型的准确率。避免过拟合和欠拟合。
4.在算损失的时候,经常遇到forward的结果和label维度不一样,这时候用view处理。还有在DataFrame取列的时候要把series转成numpy,才能和新的标签生成新的DataFrame.
5.最后把生成的csv文件提交到kaggle能有0.75左右。
import numpy as np
import torch
import sklearn
import pandas as pd
from sklearn import preprocessing as ps
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
# Load the raw training data and clean it into a purely numeric table.
first_data = pd.read_csv("train.csv")
# print(first_data)
# Data cleaning
# Sanity check: is every "Name" unique, i.e. could it have served as a key?
print(len(first_data["Name"].unique()) == first_data.shape[0])
# Drop columns this simple model does not use.
first_data = first_data.drop(["Cabin", "Name", 'PassengerId', 'Ticket'], axis=1)
# Impute missing values: truncated mean for Age, most frequent value for Embarked.
age = float(int(first_data["Age"].mean()))
embarked = first_data["Embarked"].value_counts().index[0]
first_data.fillna({"Age": age, "Embarked": embarked}, inplace=True)
# str -> int: encode categorical columns as 1-based codes in order of first appearance.
sex = first_data["Sex"].unique().tolist()
emb = first_data["Embarked"].unique().tolist()
first_data["Sex"] = first_data["Sex"].map({v: i + 1 for i, v in enumerate(sex)})
first_data["Embarked"] = first_data["Embarked"].map({v: i + 1 for i, v in enumerate(emb)})
# Remove duplicate rows, then rebuild a contiguous 0..n-1 index.
first_data.drop_duplicates(inplace=True)
first_data.reset_index(drop=True, inplace=True)
# Define the row-index split for the training set and the held-out set.
import random

n_rows = first_data.shape[0]
# Fix: the original hard-coded 277 test indices drawn from range(777), which
# silently breaks whenever drop_duplicates leaves a different number of rows.
# Derive both counts from the actual frame, keeping the same ~36% test ratio.
n_test = max(1, round(n_rows * 277 / 777))
# random.sample draws distinct indices directly -- no retry-until-unique loop.
test_list = random.sample(range(n_rows), n_test)
test_set = set(test_list)  # O(1) membership instead of O(n) list scans
train_list = [i for i in range(n_rows) if i not in test_set]
random.shuffle(train_list)  # decorrelate training order from file order
print(test_list)
print(train_list)
print(len(test_list), len(train_list))
print(len(test_list) + len(train_list) == first_data.shape[0])
# Split the cleaned frame into features X and target Y (the "Survived" column).
X = first_data.drop(columns="Survived")
Y = first_data["Survived"]
# Standardize every feature column to zero mean / unit variance.
# NOTE: ps.scale returns a plain numpy array, not a DataFrame.
X = ps.scale(X)
print(X)
print(Y)
# These prints document the types we are about to index/convert.
print(type(X[train_list]))
print(type(Y[train_list]))
print(type(Y[train_list].values))

def _as_f32(arr):
    # numpy array -> float32 torch tensor
    return torch.from_numpy(arr).type(torch.float32)

# Materialize the train/held-out split as tensors.
x_train = _as_f32(X[train_list])
y_train = _as_f32(Y[train_list].values)
x_test = _as_f32(X[test_list])
y_test = _as_f32(Y[test_list].values)
# Initialize hyper-parameters and the data pipeline.
batch = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = TensorDataset(x_train, y_train)
# Fix: the original used shuffle=False, so every epoch iterated the training
# samples in the same fixed order; reshuffling each epoch is standard SGD practice.
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
test_dataset = TensorDataset(x_test, y_test)
# Evaluation order does not matter, so no shuffle here.
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=False)
class Model(torch.nn.Module):
    """Logistic-regression classifier: one linear layer squashed by a sigmoid.

    Args:
        size: number of input features per sample.
    """

    def __init__(self, size):
        super().__init__()
        self.l1 = torch.nn.Linear(size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        # Project to a single logit, then map it into (0, 1) as a probability.
        return self.sigmoid(self.l1(x))
# Instantiate the model on the selected device; .to() returns the module itself.
model = Model(x_train.shape[1]).to(device)
# Binary cross-entropy loss (model already outputs sigmoid probabilities)
# optimized with plain SGD.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
def train(epoch):
    """Run one full pass over train_loader, updating the model in place.

    Prints the epoch number followed by the summed (not averaged) batch losses.
    """
    loss_sum = 0.0
    # Unused enumerate index removed -- only the batch contents are needed.
    for inputs, target in train_loader:
        inputs, target = inputs.to(device), target.to(device)
        y_pred = model(inputs)
        # BCELoss requires matching shapes: targets arrive as (B,), outputs as (B, 1).
        target = target.view(y_pred.size())
        loss = criterion(y_pred, target)
        loss_sum += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(epoch, loss_sum)
def test():
    """Print classification accuracy over test_loader (no gradient tracking)."""
    with torch.no_grad():
        total_correct = 0
        total_samples = 0
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)
            probs = model(data)
            # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
            preds = (probs > 0.5).float()
            total_correct += torch.eq(preds, label.view_as(preds)).sum().item()
            total_samples += label.size(0)
        acc = total_correct / total_samples
        print("acc", acc)
# Prepare the Kaggle test set with the same cleaning pipeline as the training data.
second_data = pd.read_csv("test.csv")
passenger_id = second_data["PassengerId"].values  # kept for the submission file
second_data = second_data.drop(["Cabin", "Name", 'PassengerId', 'Ticket'], axis=1)
# Impute with the statistics computed from the training set.
second_data.fillna({"Age": age, "Embarked": embarked}, inplace=True)
# Fix: Kaggle's test.csv also contains a missing "Fare" value that the original
# never filled; one NaN makes ps.scale produce NaN for the whole column and
# poisons every prediction. Fill any remaining numeric holes with column means.
# NOTE(review): ideally these fallback means would also come from train.csv.
second_data.fillna(second_data.mean(numeric_only=True), inplace=True)
# str -> int: apply the same code mappings that were built from the training data.
second_data["Sex"] = second_data["Sex"].apply(lambda x: sex.index(x) + 1)
second_data["Embarked"] = second_data["Embarked"].apply(lambda x: emb.index(x) + 1)
# Rebuild a contiguous 0..n-1 index.
second_data.reset_index(drop=True, inplace=True)
# NOTE(review): scaling here refits mean/std on the test set instead of reusing
# training statistics -- acceptable for this exercise, but a fitted
# StandardScaler shared with the training pipeline would be more correct.
s = ps.scale(second_data)
s = torch.from_numpy(s).type(torch.float32)
s = s.to(device)
if __name__ == '__main__':
    # Train for 100 epochs, reporting held-out accuracy every 4th epoch.
    for i in range(100):
        train(i)
        if i % 4 == 0:
            test()

    # Inference hygiene: eval mode and no autograd graph. This model has no
    # dropout/batch-norm, so outputs are identical -- but no_grad avoids
    # building gradients for the whole test-set forward pass.
    model.eval()
    with torch.no_grad():
        y_pre = model(s)
    # Hard 0/1 labels from the sigmoid probabilities.
    y_pre = torch.where(y_pre > 0.5,
                        torch.tensor([1], device=device),
                        torch.tensor([0], device=device))
    y_pre = y_pre.cpu().numpy().astype(int).flatten()

    # Write the Kaggle submission file.
    pred_df = pd.DataFrame({"PassengerId": passenger_id, "Survived": y_pre})
    pred_df.to_csv('titanic_pred.csv', index=False)