《Python机器学习kaggle案例》-- 网易云课堂
https://study.163.com/course/courseMain.htm?courseId=1003551009
numpy--python科学计算库
pandas--Python数据分析处理库
scikit-learn -- Python机器学习库
titannic数据
Variable | Definition | Key |
---|---|---|
survival | Survival | 0 = No, 1 = Yes |
pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
sex | Sex | |
Age | Age in years | |
sibsp | # of siblings / spouses aboard the Titanic | |
parch | # of parents / children aboard the Titanic | |
ticket | Ticket number | |
fare | Passenger fare | |
cabin | Cabin number | |
embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.
https://study.163.com/course/courseMain.htm?courseId=1003551009
LinearRegression
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 1 09:24:27 2018
@author: zh
"""
import pandas as pd
import numpy as np
titanic = pd.read_csv('train.csv')
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
titanic['Embarked'] = titanic['Embarked'].replace('nan', np.nan).fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train, test in kf:
train_predictors = (titanic[predictors].iloc[train, :])
train_target = titanic['Survived'].iloc[train]
alg.fit(train_predictors, train_target)
test_predictions = alg.predict(titanic[predictors].iloc[test, :])
predictions.append(test_predictions)
predictions = np.concatenate(predictions, axis=0)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0
accuracy = sum(predictions == titanic['Survived'])/len(predictions)
#accuracy = 0.7833894500561167
LogisticRegression
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 1 09:34:55 2018
@author: zh
"""
import pandas as pd
import numpy as np
titanic = pd.read_csv('train.csv')
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
titanic['Embarked'] = titanic['Embarked'].replace('nan', np.nan).fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
accuracy = scores.mean()
#accuracy = 0.7878787878787877
RandomForestClassifier
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 1 09:37:31 2018
@author: zh
"""
import pandas as pd
import numpy as np
titanic = pd.read_csv('train.csv')
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
titanic['Embarked'] = titanic['Embarked'].replace('nan', np.nan).fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()
#accuracy = 0.7856341189674523
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()
#accuracy = 0.8159371492704826
max_acc = 0
for n_estimators in range(1,60,10):
for min_samples_split in range(2,10):
for min_samples_leaf in range(1,10):
alg = RandomForestClassifier(random_state=1, n_estimators=n_estimators, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()
if accuracy>max_acc:
print(n_estimators,min_samples_split,min_samples_leaf)
max_acc = accuracy
print(max_acc)
#max_acc = 0.8316498316498316
feature_selection
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 1 09:52:38 2018
@author: zh
"""
import pandas as pd
import numpy as np
titanic = pd.read_csv('train.csv')
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
titanic['Embarked'] = titanic['Embarked'].replace('nan', np.nan).fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))
import re
def get_title(name):
title_search = re.search(' ([A-Za-z]+)\.', name)
if title_search:
return title_search.group(1)
return ''
titles = titanic['Name'].apply(get_title)
#pd.value_counts(titles)
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6, 'Col': 7, 'Major': 8, 'Mlle': 9, 'Capt': 10, 'Ms': 11, 'Jonkheer': 12, 'Don':13, 'Sir':14, 'Countess':15, 'Lady':16, 'Mme':17}
for k,v in title_mapping.items():
titles[titles==k]=v
#pd.value_counts(titles)
titanic['Title'] = titles
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'Title', 'NameLength']
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])
scores = -np.log10(selector.pvalues_)
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
predictors = ['Pclass', 'Sex', 'Fare', 'Title']
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
accuracy = scores.mean()
#accuracy=0.8114478114478114
GradientBoostingClassifier
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 1 09:52:38 2018
@author: zh
"""
import pandas as pd
import numpy as np
titanic = pd.read_csv('train.csv')
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1
titanic['Embarked'] = titanic['Embarked'].replace('nan', np.nan).fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))
import re
def get_title(name):
title_search = re.search(' ([A-Za-z]+)\.', name)
if title_search:
return title_search.group(1)
return ''
titles = titanic['Name'].apply(get_title)
#pd.value_counts(titles)
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6, 'Col': 7, 'Major': 8, 'Mlle': 9, 'Capt': 10, 'Ms': 11, 'Jonkheer': 12, 'Don':13, 'Sir':14, 'Countess':15, 'Lady':16, 'Mme':17}
for k,v in title_mapping.items():
titles[titles==k]=v
#pd.value_counts(titles)
titanic['Title'] = titles
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
algorithms = [
[GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']],
[LogisticRegression(random_state=1), ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title']]
]
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)
predictions = []
for train, test in kf:
train_target = titanic['Survived'].iloc[train]
full_test_predictions = []
for alg, predictors in algorithms:
alg.fit(titanic[predictors].iloc[train, :], train_target)
test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:,1]
full_test_predictions.append(test_predictions)
test_predictions = (full_test_predictions[0]*3 + full_test_predictions[1])/4
test_predictions[test_predictions <= 0.5] = 0
test_predictions[test_predictions > 0.5] = 1
predictions.append(test_predictions)
predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic['Survived'])/len(predictions)
#accuracy=0.8204264870931538
posted on 2018-08-30 09:44 sundaygeek 阅读(...) 评论(...) 编辑 收藏