[数据挖掘] 数据挖掘算法之python基础操作(一)

[数据挖掘] 数据挖掘算法之python基础操作(一)

本人在读研究生,将与大家分享机器学习、深度学习算法学习资料,所有算法将提供可运行的python源程序,基于anaconda3.7, pycharm_x64 2019.2.3,tensorflow平台实现。这是【数据挖掘】系列第一篇博文,该系列所有算法实现将采用同一数据集(titanic)便于大家熟悉各类算法,并对各种机器学习算法进行比较分析,数据集下载链接如下:
https://www.kaggle.com/c/titanic https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii 。

一、 导入数据集

import pandas as pd 
from pandas import DataFrame 
import numpy as np 
import matplotlib.pyplot as plt 

# Load the Titanic training and test sets; header=0 uses the first CSV row as column names.
# NOTE(review): hard-coded absolute Windows paths — adjust to your own machine.
data=pd.read_csv("D:/机器学习算法集/DecisionTree/DecisionTree-master/titanic_train.csv",header=0)
data_test=pd.read_csv("D:/机器学习算法集/DecisionTree/DecisionTree-master/test.csv",header=0)

二、 查看数据集

查看变量信息

data.info()

在这里插入图片描述

预览数据

print(data.head())

统计信息描述

data.describe()
data_test.describe()
在这里插入图片描述

查看字段类型

print(data.dtypes )

在这里插入图片描述

处理各字段信息

import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt


# Load the Titanic training and test sets; header=0 uses the first CSV row as column names.
# NOTE(review): hard-coded absolute Windows paths — adjust to your own machine.
data=pd.read_csv("D:/机器学习算法集/DecisionTree/DecisionTree-master/titanic_train.csv",header=0)
data_test=pd.read_csv("D:/机器学习算法集/DecisionTree/DecisionTree-master/test.csv",header=0)


# Remove columns that will not be used as model features.
# PassengerId is kept in data_test so submission rows can be identified later.
data = data.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
data_test = data_test.drop(['Name', 'Ticket'], axis=1)


# Encode the Sex column numerically: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
data['Gender'] = data['Sex'].map(sex_codes).astype(int)
data_test['Gender'] = data_test['Sex'].map(sex_codes).astype(int)
# Bare expressions below only display output in an interactive session.
data['Gender'].head()
data_test['Gender'].head()


# ----- Age column: exploration -----
# Mean and median of Age (bare expressions: only display in an interactive session).
data['Age'].mean()
data['Age'].median()

# Filter examples: elderly passengers, and rows where Age is missing.
data[data['Age']>60]
data[data['Age']>60][['Sex','Pclass','Age','Survived']]
data[data['Age'].isnull()][['Sex','Pclass','Age']]

# Histogram of the non-missing ages, 16 bins over [0, 80].
data['Age'].dropna().hist(bins=16,range=(0,80),alpha=0.5 )
plt.show()

#===================== Handle missing values in the Age column ==================================
# Use the age that was typical in each class.
# Compute the MEDIAN age (despite the original Chinese comment saying "mean",
# the code uses .median()) for each (Gender, Pclass) combination.
median_ages=np.zeros((2,3))  # 2 genders x 3 passenger classes
median_ages_test=np.zeros((2,3))
# Bare expressions: only display output in an interactive session.
median_ages
median_ages_test
for i in range(0,2):
    for j in range(0,3):
        # Pclass is 1-based, hence j + 1.
        median_ages[i, j] = data[(data['Gender'] == i) & (data['Pclass'] == j + 1)]['Age'].dropna().median()
        median_ages_test[i, j] = data_test[(data_test['Gender'] == i) & (data_test['Pclass'] == j + 1)]['Age'].dropna().median()
print(median_ages)
print(median_ages_test)

# Fill missing ages: copy Age into AgeFill, then replace NaNs with the
# per-(Gender, Pclass) median computed above.
data['AgeFill'] = data['Age']
data_test['AgeFill'] = data_test['Age']
# Bare expressions: only display output in an interactive session.
data.head()
data[data['Age'].isnull()][['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)
for i in range(0, 2):
    for j in range(0, 3):
        data.loc[(data.Age.isnull()) & (data.Gender == i) & (data.Pclass == j + 1), 'AgeFill'] = median_ages[i, j]
        # BUG FIX: the column name was misspelled 'Age Fill' (with a space),
        # which silently created a spurious column and left
        # data_test['AgeFill'] full of NaNs (later causing dropna() to
        # discard those test rows).
        data_test.loc[(data_test.Age.isnull()) & (data_test.Gender == i) & (data_test.Pclass == j + 1), 'AgeFill'] = median_ages_test[i, j]
data[data['Age'].isnull()][['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)
data_test[data_test['Age'].isnull()][['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)

# Flag which Age values were originally missing (1 = was NaN, 0 = present),
# so the imputed rows remain identifiable after filling.
data['AgeIsNull'] = data['Age'].isnull().astype(int)
data_test['AgeIsNull'] = data_test['Age'].isnull().astype(int)
print(data['AgeIsNull'].head())


# Derive a 'Person' category from sex and age.
# Children (age < ~16) appear to have a higher chance of survival,
# so classify passengers as male / female / child.
def get_person(passenger):
    """Return 'child' for passengers younger than 16, otherwise their sex."""
    age, sex = passenger
    if age < 16:
        return 'child'
    return sex
data['Person'] = data[['Age', 'Sex']].apply(get_person, axis=1)
data_test['Person'] = data_test[['Age', 'Sex']].apply(get_person, axis=1)
# One-hot encode Person; drop 'Male' (lowest average survival) as the
# dummy-variable baseline.
# NOTE(review): renaming columns positionally assumes all three categories
# (child/female/male) appear in each frame — confirm for the test set.
person_dummies = pd.get_dummies(data['Person'])
person_dummies.columns = ['Child', 'Female', 'Male']
person_dummies.drop(['Male'], axis=1, inplace=True)
person_dummies_test = pd.get_dummies(data_test['Person'])
person_dummies_test.columns = ['Child', 'Female', 'Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True)
data = data.join(person_dummies)
# BUG FIX: the original joined person_dummies (built from the TRAIN set)
# onto data_test, attaching the wrong rows' values by index alignment;
# join the test-set dummies instead.
data_test = data_test.join(person_dummies_test)
print(data['Person'].value_counts())


# ----- Embarked column: exploration -----
print(data['Embarked'].describe())
print(data["Embarked"].value_counts())

# Bar chart of passenger counts per port of embarkation.
port_counts = data['Embarked'].value_counts()
port_counts.plot(kind='bar')
plt.show()

# Bar chart of the survival rate per port of embarkation.
survival_by_port = data.pivot_table(values="Survived", index=["Embarked"], aggfunc=lambda x: x.mean())
survival_by_port.plot(kind='bar')
plt.show()

# Two options for Embarked: (a) keep it and create dummy variables, dropping
# 'S' as the baseline and leaving 'C' and 'Q' (which seem to have a better
# survival rate), or (b) drop the column entirely, since logically it may not
# be useful for predicting survival. The dummy-variable route is taken here.
embarked_dummies = pd.get_dummies(data['Embarked'])
embarked_dummies.drop(['S'], axis=1, inplace=True)
embarked_dummies_test = pd.get_dummies(data_test['Embarked'])
embarked_dummies_test.drop(['S'], axis=1, inplace=True)
# Join the dummy columns back onto the original frames.
data = data.join(embarked_dummies)
data_test = data_test.join(embarked_dummies_test)
print(data.head())


# ----- Fare column -----
# Only data_test has missing Fare values; fill them with the test-set median.
data_test["Fare"].fillna(data_test["Fare"].median(),inplace=True)
# Standardize fares (z-score).
# NOTE(review): the test set is standardized with its OWN mean/std rather
# than the training set's statistics — confirm this is intended.
Fare_avg_test=data_test["Fare"].mean()
Fare_std_test=data_test["Fare"].std()
data_test["Fare"]=(data_test["Fare"]-Fare_avg_test)/Fare_std_test
Fare_avg = data["Fare"].mean()
Fare_std = data["Fare"].std()
data["Fare"] = (data["Fare"] - Fare_avg) / Fare_std
# BUG FIX: the original called .plot(kind='bar') on the Axes object returned
# by .hist(), which raises an error ('kind' is not a valid Line2D property).
# Drawing the histogram and showing it is sufficient.
data["Fare"].hist(bins=40)
plt.show()

# ----- FamilySize column -----
# Combine SibSp and Parch, then binarize: 1 = travelling with family, 0 = alone.
data['FamilySize'] = data['SibSp'] + data['Parch']
data_test['FamilySize'] = data_test['SibSp'] + data_test['Parch']
# Assign through the frame's own .loc (the original used chained indexing
# data[col].loc[mask] = ..., which triggers SettingWithCopyWarning and may
# silently fail to write in modern pandas). The original's second statement
# (setting == 0 rows to 0) was a no-op and is dropped.
data.loc[data['FamilySize'] > 0, 'FamilySize'] = 1
# BUG FIX: the original binarized only the training set, leaving raw family
# counts in data_test — an inconsistent feature encoding between train and
# test. Apply the same transformation to both.
data_test.loc[data_test['FamilySize'] > 0, 'FamilySize'] = 1
data['FamilySize'].value_counts()
temp4 = data['FamilySize'].value_counts()
temp4.plot(kind='bar')
plt.show()

# ----- Pclass column -----
# Bar chart of the survival rate per passenger class.
class_survival = data.pivot_table(values="Survived", index=["Pclass"], aggfunc=lambda x: x.mean())
class_survival.plot(kind='bar')
plt.show()
# One-hot encode Pclass; drop class_3 (lowest average survival) as the
# dummy-variable baseline.
pclass_dummies = pd.get_dummies(data['Pclass'])
pclass_dummies.columns = ['class_1', 'class_2', 'class_3']
pclass_dummies.drop(['class_3'], axis=1, inplace=True)
pclass_dummies_test = pd.get_dummies(data_test['Pclass'])
pclass_dummies_test.columns = ['class_1', 'class_2', 'class_3']
pclass_dummies_test.drop(['class_3'], axis=1, inplace=True)
data = data.join(pclass_dummies)
# BUG FIX: the original joined pclass_dummies (built from the TRAIN set)
# onto data_test; join the test-set dummies instead.
data_test = data_test.join(pclass_dummies_test)

# Drop intermediate and unused columns, then remove rows containing NaNs.
unused_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin', 'Embarked',
               'Gender', 'AgeFill', 'AgeIsNull', 'Person']
data.drop(unused_cols, axis=1, inplace=True)
data = data.dropna()
data_test.drop(unused_cols, axis=1, inplace=True)
# NOTE(review): dropna() on the test set discards any test passenger with a
# missing value — a Kaggle submission needs all test rows; confirm intended.
data_test = data_test.dropna()

# Build the training features/labels and the test feature matrix
# (PassengerId is excluded from features but kept in data_test for output).
X_train = data.drop("Survived", axis=1)
Y_train = data["Survived"]
X_test = data_test.drop("PassengerId", axis=1).copy()

总结

至此,数据预处理结束,定义完成训练集和测试集,接下来将借助该训练集与测试集进行建模分析,采用SVC、Linear、KNN、Logistic、Random Forest 、Naive Bayes等机器学习算法进行机器学习分析,详见下文。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值