学习笔记
第一课
数据与可视化
# NumPy: numerical computing toolbox.
import numpy as np
# Build 1000 synthetic samples with 20 features each using make_classification.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2,
                           n_redundant=2, n_classes=2, random_state=0)
# Store features and label side by side in one DataFrame.
from pandas import DataFrame
# hstack appends the label as an extra column.  list(range(...)) keeps the
# column-name concatenation valid on Python 3, where range() is not a list.
df = DataFrame(np.hstack((X, y[:, None])),
               columns=list(range(20)) + ["class"])
df[:6]
import matplotlib.pyplot as plt
import seaborn as sns
# pairplot shows how the samples are distributed for each pair of the
# selected feature dimensions.
_ = sns.pairplot(df[:50], vars=[8, 11, 12, 14, 19], hue="class", size=1.5)
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 10))
# sns.corrplot has been removed from seaborn; a heatmap of the correlation
# matrix gives the same overview with a function that still exists.
_ = sns.heatmap(df.corr(), annot=False)
plt.show()
修改学习曲线
from sklearn.svm import LinearSVC
from sklearn.learning_curve import learning_curve
# Plot a learning curve to see how the model is doing.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot the learning curve of a model on the given data.

    Parameters
    ----------
    estimator : the classifier to evaluate.
    title : title of the figure.
    X : input features (numpy array).
    y : target vector.
    ylim : optional (ymin, ymax) tuple with the limits of the y axis.
    cv : cross-validation setting forwarded to learning_curve.  None keeps
        the 5-fold split this function has always used.
    train_sizes : training-set sizes forwarded to learning_curve.
    """
    # The cv argument used to be ignored in favour of a hard-coded 5; an
    # explicit value is now honoured and None falls back to that old default.
    if cv is None:
        cv = 5
    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Shaded bands: mean score plus/minus one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    # A boolean flag instead of the string "on".
    plt.grid(True)
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()
# Learning curve when only a small share of the samples is used for training.
plot_learning_curve(
    LinearSVC(C=10.0),
    "LinearSVC(C=10.0)",
    X,
    y,
    ylim=(0.8, 1.01),
    train_sizes=np.linspace(.05, 0.2, 5),
)
# The same model with a larger number of training samples.
plot_learning_curve(
    LinearSVC(C=10.0),
    "LinearSVC(C=10.0)",
    X,
    y,
    ylim=(0.8, 1.1),
    train_sizes=np.linspace(.1, 1.0, 5),
)
# Few samples again, restricted to feature columns 11 and 14.
plot_learning_curve(
    LinearSVC(C=10.0),
    "LinearSVC(C=10.0) Features: 11&14",
    X[:, [11, 14]],
    y,
    ylim=(0.8, 1.0),
    train_sizes=np.linspace(.05, 0.2, 5),
)
模型融合:stacking融合方法
"""Kaggle competition: Predicting a Biological Response.
Blending {RandomForests, ExtraTrees, GradientBoosting} + stretching to
[0,1]. The blending scheme is related to the idea Jose H. Solorzano
presented here:
http://www.kaggle.com/c/bioresponse/forums/t/1889/question-about-the-process-of-ensemble-learning/10950#post10950
'''You can try this: In one of the 5 folds, train the models, then use
the results of the models as 'variables' in logistic regression over
the validation data of that fold'''. Or at least this is the
implementation of my understanding of that idea :-)
The predictions are saved in submission.csv. The code below created my best
submission to the competition:
- public score (25%): 0.43464
- private score (75%): 0.37751
- final rank on the private leaderboard: 17th over 711 teams :-)
Note: if you increase the number of estimators of the classifiers,
e.g. n_estimators=1000, you get a better score/rank on the private
test set.
Copyright 2012, Emanuele Olivetti.
BSD license, 3 clauses.
"""
from __future__ import division
import numpy as np
import load_data
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
def logloss(attempt, actual, epsilon=1.0e-15):
    """Return the mean log loss, i.e. the score of the bioresponse competition.

    attempt : predicted probabilities (array or plain sequence).
    actual : true 0/1 labels, same length as ``attempt``.
    epsilon : predictions are clipped to [epsilon, 1 - epsilon] so that both
        logarithms stay finite.
    """
    # Accept plain Python sequences too: ``1.0 - actual`` is undefined for a list.
    actual = np.asarray(actual)
    # np.clip moves every value outside the interval onto its nearest edge,
    # e.g. for [0, 1] anything below 0 becomes 0 and anything above 1 becomes 1.
    attempt = np.clip(np.asarray(attempt), epsilon, 1.0 - epsilon)
    return -np.mean(actual * np.log(attempt) +
                    (1.0 - actual) * np.log(1.0 - attempt))
# Blending script: every base classifier produces out-of-fold predictions for
# the training set and fold-averaged predictions for the submission set; a
# logistic regression is then fitted on those predictions.
if __name__ == '__main__':

    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 10
    verbose = True
    shuffle = False

    X, y, X_submission = load_data.load()

    if shuffle:
        idx = np.random.permutation(y.size)  # random ordering of the samples
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))  # stratified K-fold (train, test) index pairs

    clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
            ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
            GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

    # Single-argument print(...) calls give the same output on Python 2 and 3
    # and match the print(...) style used elsewhere in this file.
    print("Creating train and test sets for blending.")

    # One column per base classifier.
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):  # enumerate yields (index, classifier) pairs
        print("%s %s" % (j, clf))
        # One column per fold; averaged once all folds are done.
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold %s" % i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            # Out-of-fold predictions fill the rows of the held-out fold.
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print("")
    print("Blending.")
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print("Saving Results.")
    # Two columns: 1-based molecule id and predicted probability.
    tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T
    np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',  # save as text
               header='MoleculeId,PredictedProbability', comments='')
# Basic CSV read/write helpers.
# The given training data has to be read before any further (feature) processing.
def read_data(file_name):
    """Read a numeric CSV file, skipping its header line.

    Returns a list with one entry per data line; each entry is the list of
    that line's comma-separated fields converted to float.  Blank lines are
    skipped.  Raises ValueError when a field is not a valid float.
    """
    samples = []
    # The with-block closes the file even when a field fails to parse.
    with open(file_name) as f:
        f.readline()  # ignore header
        for line in f:
            line = line.strip()
            if not line:
                # e.g. an empty line at the end of the file
                continue
            samples.append([float(x) for x in line.split(",")])
    return samples
def write_delimited_file(file_path, data, header=None, delimiter=","):
    """Write ``data`` to ``file_path``, one output line per item.

    header : optional sequence of column names, written first and joined
        with ``delimiter``.
    data : iterable of rows.  A row that is a string is written unchanged;
        any other row is treated as a sequence of fields, each converted
        with str() and joined with ``delimiter``.
    """
    # The with-block closes the file even if writing a row fails.
    with open(file_path, "w") as f_out:
        if header is not None:
            f_out.write(delimiter.join(header) + "\n")
        for line in data:
            # A string must be written whole; joining it would split it into characters.
            if isinstance(line, str):
                f_out.write(line + "\n")
            else:
                # str() lets numeric fields be written without pre-formatting.
                f_out.write(delimiter.join(str(field) for field in line) + "\n")
#!/usr/bin/env python
bio competition https://www.kaggle.com/c/bioresponse#description
from sklearn.linear_model import LogisticRegression
import csv_io ##此模块?
import math
import scipy
def train_and_predict():
    """Fit a logistic regression on train.csv and predict test.csv.

    The first column of every training row is the target, the remaining
    columns are the features.  For each test row, column 1 of predict_proba
    is formatted with "%f" and written, one value per line, to
    lr_solution.csv.
    """
    # read in the training file
    # (read_data is the CSV helper; the original notes keep it in a
    # hand-written csv_io module)
    train = read_data("train.csv")
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('读取训练数据完毕\n...\n')
    # set the training responses
    target = [x[0] for x in train]
    # set the training features
    train = [x[1:] for x in train]
    # read in the test file
    realtest = read_data("test.csv")
    print('读取待预测数据\n...\n')

    # code for logistic regression
    lr = LogisticRegression()
    lr.fit(train, target)
    print('Logistic Regression训练完毕!\n...\n')
    predicted_probs = lr.predict_proba(realtest)

    # write solutions to file
    predicted_probs = ["%f" % x[1] for x in predicted_probs]
    write_delimited_file("lr_solution.csv", predicted_probs)

    print('Logistic Regression预测完毕! 请提交lr_solution.csv文件到Kaggle')


if __name__=="__main__":
    train_and_predict()
Kaggle旧金山犯罪类型分类问题,https://www.kaggle.com/c/sf-crime
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import numpy as np
# Get to know the data first.
train = pd.read_csv('sf_data/train.csv', parse_dates=['Dates'])  # parse the Dates column as datetimes
test = pd.read_csv('sf_data/test.csv', parse_dates=['Dates'])
train.head()
test.head()
# Addresses of train and test together, so both share one vocabulary.
all_addr = np.array(train.Address.tolist() + test.Address.tolist())
list(all_addr)
stop_words = ['dr', 'wy', 'bl', 'av', 'st', 'ct', 'ln', 'block', 'of']
vectorizer = CountVectorizer(max_features=300, stop_words=stop_words)
features = vectorizer.fit_transform(all_addr).toarray()  # toarray() turns the sparse matrix into a dense array
features[0,:]
# The first train.shape[0] rows of the feature matrix belong to the training set.
X = features[:train.shape[0]]
y = train.Category
# Split into 80% training data and 20% validation data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)
log_model = LogisticRegression().fit(X=X_train, y=y_train)
results = log_model.predict_proba(X_test)
np.round(results[1], 3)
log_loss_score = log_loss(y_test, results)
print('log loss score: {0}'.format(round(log_loss_score, 3)))
# Refit on the whole training set and predict the rows that came from test.csv.
log_model = LogisticRegression().fit(X=features[:train.shape[0]], y=train.Category)
results = log_model.predict_proba(features[train.shape[0]:])
results
submission = pd.DataFrame(results)  # wrap in a DataFrame for the steps below
submission.columns = sorted(train.Category.unique())
# set_index returns a new frame; the result has to be kept, otherwise the
# submission is still indexed 0..n-1 instead of by the test Ids.
submission = submission.set_index(test.Id)
submission.index.name="Id"
submission.to_csv('py_submission_logreg_addr_300.csv')
经典又兼具备趣味性的Kaggle案例 https://www.kaggle.com/c/titanic
# This IPython notebook records my approach to the Kaggle Titanic problem.
import pandas as pd  # data analysis
import numpy as np  # numerical computing
from pandas import Series,DataFrame
data_train = pd.read_csv("Train.csv")
data_train.columns
#data_train[data_train.Cabin.notnull()]['Survived'].value_counts()
data_train.info()
data_train.describe()
import matplotlib.pyplot as plt
fig = plt.figure()
fig.set(alpha=0.2)  # alpha setting of the figure
plt.subplot2grid((2,3),(0,0))  # several small plots laid out inside one figure
data_train.Survived.value_counts().plot(kind='bar')  # bar graph of those who survived vs those who did not
plt.title(u"获救情况 (1为获救)")  # puts a title on our graph
plt.ylabel(u"人数")
plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind="bar")
plt.ylabel(u"人数")
plt.title(u"乘客等级分布")
plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.ylabel(u"年龄")  # sets the y axis label
# The on/off flag is passed positionally: its keyword name (b) was renamed
# in later matplotlib releases, while the position is unchanged.
plt.grid(True, which='major', axis='y')  # formats the grid line style of our graphs
plt.title(u"按年龄看获救分布 (1为获救)")
plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')  # kernel density estimate of the ages of the 1st class passengers
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.xlabel(u"年龄")  # sets the x axis label
plt.ylabel(u"密度")
plt.title(u"各等级的乘客年龄分布")
plt.legend((u'头等舱', u'2等舱',u'3等舱'),loc='best')  # sets our legend for our graph.
plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数")
plt.ylabel(u"人数")
plt.show()
# Survival counts per passenger class.
fig = plt.figure()
fig.set(alpha=0.2) # alpha setting of the figure
# Pclass value counts of the passengers who did not (0) / did (1) survive.
Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u"各乘客等级的获救情况")
plt.xlabel(u"乘客等级")
plt.ylabel(u"人数")
plt.show()
# Survival counts per port of embarkation.
fig = plt.figure()
fig.set(alpha=0.2) # alpha setting of the figure
Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
df.plot(kind='bar', stacked=True)
plt.title(u"各登录港口乘客的获救情况")
plt.xlabel(u"登录港口")
plt.ylabel(u"人数")
plt.show()
# Survival counts per sex.
fig = plt.figure()
fig.set(alpha=0.2) # alpha setting of the figure
# Here the counted column is Survived and the two frame columns are the sexes.
Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df=pd.DataFrame({u'男性':Survived_m, u'女性':Survived_f})
df.plot(kind='bar', stacked=True)
plt.title(u"按性别看获救情况")
plt.xlabel(u"性别")
plt.ylabel(u"人数")
plt.show()
# Next: survival by sex within the higher (1st/2nd) and lower (3rd) classes,
# drawn as four bar charts side by side.
fig=plt.figure()
fig.set(alpha=0.65) # figure transparency; the exact value does not matter
plt.title(u"根据舱等级和性别的获救情况")
ax1=fig.add_subplot(141)
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass != 3].value_counts().plot(kind='bar', label="female highclass", color='#FA2479')
# NOTE(review): the tick labels are hard-coded while the bars follow the
# order returned by value_counts(); this panel lists them the other way
# round from the three panels below - confirm each matches the data.
ax1.set_xticklabels([u"获救", u"未获救"], rotation=0)
ax1.legend([u"女性/高级舱"], loc='best')
ax2=fig.add_subplot(142, sharey=ax1)
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='female, low class', color='pink')
ax2.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"女性/低级舱"], loc='best')
ax3=fig.add_subplot(143, sharey=ax1)
data_train.Survived[data_train.Sex == 'male'][data_train.Pclass != 3].value_counts().plot(kind='bar', label='male, high class',color='lightblue')
ax3.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"男性/高级舱"], loc='best')
ax4=fig.add_subplot(144, sharey=ax1)
data_train.Survived[data_train.Sex == 'male'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='male low class', color='steelblue')
ax4.set_xticklabels([u"未获救", u"获救"], rotation=0)
plt.legend([u"男性/低级舱"], loc='best')
plt.show()
g = data_train.groupby(['SibSp','Survived']) # grouped statistics over two keys (multi-level grouping)
df = pd.DataFrame(g.count()['PassengerId']) # PassengerId count per (SibSp, Survived) group
data_train.Cabin.value_counts() # value_counts counts per distinct value, whereas count gives one overall total
# The Cabin counts are very scattered: most cabin values occur only once, so
# using them as a category feature is unlikely to help.
# Instead, look at how the mere presence or absence of a Cabin value relates to survival.
fig = plt.figure()
fig.set(alpha=0.2) # alpha setting of the figure
Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
df=pd.DataFrame({u'有':Survived_cabin, u'无':Survived_nocabin}).transpose() # built from a dict; survived/not x has/has-no cabin is a 2*2 table, transposed here so the plot has one bar per cabin group
df.plot(kind='bar', stacked=True)
plt.title(u"按Cabin有无看获救情况")
plt.xlabel(u"Cabin有无")
plt.ylabel(u"人数")
plt.show()
# Passengers with a Cabin record seem to survive slightly more often, so the value is first reduced to two classes (has cabin / no cabin) and added as a category feature later.
from sklearn.ensemble import RandomForestRegressor
### Fill in the missing Age values with a RandomForestRegressor.
def set_missing_ages(df):
    """Impute the missing ``Age`` values of ``df`` in place.

    A RandomForestRegressor is fitted on the rows whose Age is known, with
    Fare, Parch, SibSp and Pclass as features; its predictions replace the
    missing ages.

    Returns ``(df, rfr)``: the same, modified DataFrame and the fitted
    regressor, so the model can be reused on another data set.
    """
    # Fixed column order: Age (the target) first, then the numeric features.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    missing = age_df.Age.isnull()
    # Split into known-age and unknown-age rows.  .values is used instead of
    # DataFrame.as_matrix(), which newer pandas releases no longer provide.
    known_age = age_df[~missing].values
    unknown_age = age_df[missing].values
    # y is the target age, X the feature values.
    y = known_age[:, 0]
    X = known_age[:, 1:]
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict only when something is missing, so predict() is never handed
    # an array with zero rows.
    if len(unknown_age):
        predictedAges = rfr.predict(unknown_age[:, 1:])  # all columns except the first (Age)
        df.loc[missing, 'Age'] = predictedAges
    return df, rfr
def set_Cabin_type(df):
    """Reduce ``Cabin`` to a Yes/No flag, in place, and return ``df``.

    Rows that carry a cabin value get "Yes"; rows where it is missing get "No".
    """
    has_cabin = df.Cabin.notnull()
    df.loc[has_cabin, 'Cabin'] = "Yes"
    df.loc[~has_cabin, 'Cabin'] = "No"
    return df
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
data_train
# Logistic regression needs purely numeric inputs, so the categorical
# features are one-hot encoded first.
# Taking Cabin as the example: the single attribute is expanded into one
# indicator column per value ("Yes" / "No"); a row gets 1 in the column of
# its own value and 0 in the other one.
# pandas' get_dummies does this; the results are concatenated onto data_train below.
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')
df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)  # pandas joins frames with concat (numpy has concatenate)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)  # drop removes the columns; inplace changes df itself
df
# Next, scaling: features with a wide value range are standardised, which
# speeds up the convergence of logistic regression.
import sklearn.preprocessing as preprocessing
# One scaler per feature, fitted on the training data only.  Sharing a single
# scaler and calling fit_transform again later would re-fit it on whatever
# data it is given.  The input is a one-column frame because the scaler
# works on 2-D data.
age_scale_param = preprocessing.StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale_param.transform(df[['Age']]).ravel()
fare_scale_param = preprocessing.StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale_param.transform(df[['Fare']]).ravel()
df
# Take the feature columns that are needed, convert them to numpy and fit
# scikit-learn's LogisticRegression.
from sklearn import linear_model
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')  # filter(regex=...) selects columns by regular expression
# .values replaces the deprecated as_matrix().
train_np = train_df.values
# y is the survival outcome
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# fit the logistic regression
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(X, y)
clf
X.shape
# The test set goes through the same steps as the training set.
data_test = pd.read_csv("test.csv")
data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0
# First fill the missing ages with the RandomForestRegressor fitted on the training set.
tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Kept under its own name so the training matrix X is not overwritten.
X_null_age = null_age[:, 1:]
predictedAges = rfr.predict(X_null_age)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges
data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')
df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
# transform() only: the test set is scaled with the mean and variance
# learned from the training set.
df_test['Age_scaled'] = age_scale_param.transform(df_test[['Age']]).ravel()
df_test['Fare_scaled'] = fare_scale_param.transform(df_test[['Fare']]).ravel()
df_test
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)
pd.read_csv("logistic_regression_predictions.csv")
import numpy as np
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
# Get the training and cross-validation scores from sklearn's learning_curve
# and draw the learning curve with matplotlib.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Draw the learning curve of a model on the given data.

    Parameters
    ----------
    estimator : the classifier to evaluate.
    title : title of the figure.
    X : input features (numpy array).
    y : target vector.
    ylim : optional (ymin, ymax) tuple with the limits of the y axis.
    cv : cross-validation setting, passed straight to learning_curve.
    n_jobs : number of parallel jobs (default 1).
    train_sizes : training-set sizes, passed straight to learning_curve.
    verbose : verbosity, passed straight to learning_curve.
    plot : when False the figure is skipped and only the values are returned.

    Returns
    -------
    (midpoint, diff), both computed at the last training size from the upper
    edge of the training band (mean + std) and the lower edge of the
    cross-validation band (mean - std): their average and their difference.
    """
    # The sizes handed in are replaced by the ones learning_curve returns.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    fit_mean, fit_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    cv_mean, cv_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)
    fit_upper = fit_mean + fit_std
    cv_lower = cv_mean - cv_std

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)  # the star unpacks the tuple into two arguments
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        # gca() returns the current axes; the y axis is flipped here and
        # flipped again after drawing.
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes, fit_mean - fit_std, fit_upper,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, cv_lower, cv_mean + cv_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, fit_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(train_sizes, cv_mean, 'o-', color="r", label=u"交叉验证集上得分")

        plt.legend(loc="best")

        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    midpoint = (fit_upper[-1] + cv_lower[-1]) / 2
    diff = fit_upper[-1] - cv_lower[-1]
    return midpoint, diff
# The training features are taken from train_np, so the call does not depend
# on whatever the scratch variable X currently holds; y holds the training labels.
plot_learning_curve(clf, u"学习曲线", train_np[:, 1:], y)
# Feature names (every column after Survived) next to the fitted coefficients.
pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})
from sklearn import cross_validation
# Quick look at the cross-validated scores.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# .values replaces the deprecated as_matrix().
X = all_data.values[:,1:]
y = all_data.values[:,0]
print(cross_validation.cross_val_score(clf, X, y, cv=5))
# Split the data: 70% for fitting, 30% held out.
split_train, split_cv = cross_validation.train_test_split(df, test_size=0.3, random_state=0)
train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# Build the model.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(train_df.values[:,1:], train_df.values[:,0])
# Predict the held-out part.
cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(cv_df.values[:,1:])
# Held-out rows that were predicted wrongly.  (A trailing .drop() without
# arguments used to be called here, which only raises.)
split_cv[ predictions != cv_df.values[:,0] ]
# Look up the wrongly predicted cases in the original data frame.
#split_cv['PredictResult'] = predictions
origin_data_train = pd.read_csv("Train.csv")
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(split_cv[predictions != cv_df.values[:,0]]['PassengerId'].values)]  # isin keeps the rows whose PassengerId is among the bad ones
bad_cases
data_train[data_train['Name'].str.contains("Major")]
# Reload the raw training data and add a combined feature: Sex and Pclass
# joined with an underscore.
data_train = pd.read_csv("Train.csv")
data_train['Sex_Pclass'] = data_train.Sex + "_" + data_train.Pclass.map(str)
from sklearn.ensemble import RandomForestRegressor
### Fill in the missing Age values with a RandomForestRegressor.
def set_missing_ages(df):
    """Impute the missing ``Age`` values of ``df`` in place.

    A RandomForestRegressor is fitted on the rows whose Age is known, with
    Fare, Parch, SibSp and Pclass as features; its predictions replace the
    missing ages.

    Returns ``(df, rfr)``: the same, modified DataFrame and the fitted
    regressor, so the model can be reused on another data set.
    """
    # Fixed column order: Age (the target) first, then the numeric features.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    missing = age_df.Age.isnull()
    # Split into known-age and unknown-age rows.  .values is used instead of
    # DataFrame.as_matrix(), which newer pandas releases no longer provide.
    known_age = age_df[~missing].values
    unknown_age = age_df[missing].values
    # y is the target age, X the feature values.
    y = known_age[:, 0]
    X = known_age[:, 1:]
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict only when something is missing, so predict() is never handed
    # an array with zero rows.
    if len(unknown_age):
        predictedAges = rfr.predict(unknown_age[:, 1:])  # all columns except the first (Age)
        df.loc[missing, 'Age'] = predictedAges
    return df, rfr
def set_Cabin_type(df):
    """Reduce ``Cabin`` to a Yes/No flag, in place, and return ``df``.

    Rows that carry a cabin value get "Yes"; rows where it is missing get "No".
    """
    has_cabin = df.Cabin.notnull()
    df.loc[has_cabin, 'Cabin'] = "Yes"
    df.loc[~has_cabin, 'Cabin'] = "No"
    return df
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
# One-hot encode the categorical features, including the combined Sex_Pclass one.
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix= 'Pclass')
dummies_Sex_Pclass = pd.get_dummies(data_train['Sex_Pclass'], prefix= 'Sex_Pclass')  # the newly added feature
df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)
import sklearn.preprocessing as preprocessing
# One scaler per feature, fitted on the training data only.  Sharing a single
# scaler and calling fit_transform again later would re-fit it on whatever
# data it is given.  The input is a one-column frame because the scaler
# works on 2-D data.
age_scale_param = preprocessing.StandardScaler().fit(df[['Age']])
df['Age_scaled'] = age_scale_param.transform(df[['Age']]).ravel()
fare_scale_param = preprocessing.StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scale_param.transform(df[['Fare']]).ravel()
from sklearn import linear_model
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')
# .values replaces the deprecated as_matrix().
train_np = train_df.values
# y is the survival outcome
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# fit the logistic regression
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
clf.fit(X, y)
clf
data_test = pd.read_csv("test.csv")
data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0
data_test['Sex_Pclass'] = data_test.Sex + "_" + data_test.Pclass.map(str)
# The test data gets the same feature transformations as the training data.
# First fill the missing ages with the RandomForestRegressor fitted on the training set.
tmp_df = data_test[['Age','Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Kept under its own name so the training matrix X is not overwritten.
X_null_age = null_age[:, 1:]
predictedAges = rfr.predict(X_null_age)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges
data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix= 'Pclass')
dummies_Sex_Pclass = pd.get_dummies(data_test['Sex_Pclass'], prefix= 'Sex_Pclass')
df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)
# transform() only: the test set is scaled with the mean and variance
# learned from the training set.
df_test['Age_scaled'] = age_scale_param.transform(df_test[['Age']]).ravel()
df_test['Fare_scaled'] = fare_scale_param.transform(df_test[['Fare']]).ravel()
df_test
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].values, 'Survived':predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions2.csv", index=False)
from sklearn.ensemble import BaggingRegressor
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
# .values replaces the deprecated as_matrix().
train_np = train_df.values
# y is the survival outcome
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# Fit a bagging ensemble of logistic regressions.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)  # ten estimators of the same model, max_samples=0.8 with bootstrap sampling
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
# The regressor averages the 0/1 predictions of its members, so each value
# is the share of estimators voting "survived".  Casting that share straight
# to int would keep a 1 only for a unanimous vote; a strict majority
# (> 0.5) is used instead.
survived = (predictions > 0.5).astype(np.int32)
result = pd.DataFrame({'PassengerId':data_test['PassengerId'].values, 'Survived':survived})
# NOTE(review): the output path is specific to the author's machine.
result.to_csv("/Users/MLS/Downloads/logistic_regression_predictions2.csv", index=False)
用两个分类器
import numpy as np
import pandas as pd
from pandas import DataFrame
from patsy import dmatrices #用于生成设计矩阵
import string
from operator import itemgetter
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.externals import joblib #持久化模块
## Configuration parameters
train_file="train.csv"
MODEL_PATH="./"
test_file="test.csv"
SUBMISSION_PATH="./"
seed= 0
# Single-argument print(...) gives the same "train.csv 0" line on Python 2
# and 3 and matches the print(...) calls used by report() below.
print("%s %s" % (train_file, seed))
# Print the scores.
def report(grid_scores, n_top=3):
    """Print the ``n_top`` best entries of ``grid_scores``.

    Entries are ranked by their element at index 1, highest first.  For each
    one the rank, the mean validation score, the standard deviation of its
    cross-validation scores and its parameters are printed, followed by an
    empty line.
    """
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")
# Data cleaning and preparation.
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs in ``big_string``.

    The candidates are tried in the order given.  When none of them occurs,
    ``big_string`` is printed and ``np.nan`` is returned.
    """
    for substring in substrings:
        # str.find returns -1 when the substring is absent.  The method form
        # replaces the string.find() module function, which only exists on
        # Python 2.
        if big_string.find(substring) != -1:
            return substring
    print(big_string)
    return np.nan
le = preprocessing.LabelEncoder()  # label encoder, reused for every encoded column below
enc=preprocessing.OneHotEncoder()  # one-hot encoder


def clean_and_munge_data(df):
    """Clean the Titanic frame and derive the model features.

    ``df`` itself is modified (columns are added and overwritten); the
    returned frame is that data without PassengerId, Name, Age and Cabin.

    Steps: fares of 0 are treated as missing and filled with the median fare
    of the passenger's class; a Title is extracted from Name and reduced to
    Mr / Mrs / Miss / Master; missing ages are filled with the mean age of
    the title (AgeFill) and binned (AgeCat); family and fare-per-person
    features are added; Sex, Ticket, Title, HighLow, AgeCat and Embarked are
    label-encoded as floats with the module-level ``le``.
    """
    # Missing values: a fare of 0 is turned into NaN so it is filled further down.
    df.Fare = df.Fare.map(lambda x: np.nan if x==0 else x)
    # Derive the Title field from the name.
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                'Don', 'Jonkheer']
    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Reduce the special titles to mr, mrs, miss and master.
    def replace_titles(x):
        title=x['Title']
        if title in ['Mr','Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Master']:
            return 'Master'
        elif title in ['Countess', 'Mme','Mrs']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms','Miss']:
            return 'Miss'
        elif title =='Dr':
            # Sex is stored in lower case (see the 'female'/'male' mapping
            # below); comparing with 'Male' never matched, so every Dr
            # became 'Mrs'.
            if x['Sex']=='male':
                return 'Mr'
            else:
                return 'Mrs'
        elif title =='' or pd.isnull(title):
            # substrings_in_string reports "no title found" as NaN, not as an
            # empty string, so NaN has to reach this branch as well.
            if x['Sex']=='male':
                return 'Master'
            else:
                return 'Miss'
        else:
            return title

    df['Title']=df.apply(replace_titles, axis=1)  # apply with axis=1 calls replace_titles once per row

    # Family features.
    df['Family_Size']=df['SibSp']+df['Parch']
    df['Family']=df['SibSp']*df['Parch']

    # Fill missing fares with the median fare of the passenger's class.
    for pclass in (1, 2, 3):
        class_fares = df[df['Pclass'] == pclass]['Fare'].dropna()
        df.loc[(df.Fare.isnull()) & (df.Pclass == pclass), 'Fare'] = np.median(class_fares)

    df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)  # map takes a dict

    # Fill missing ages with the mean age of passengers carrying the same title.
    df['AgeFill']=df['Age']
    for title in ('Miss', 'Mrs', 'Mr', 'Master'):
        mean_age = np.average(df[df['Title'] == title]['Age'].dropna())
        df.loc[(df.Age.isnull()) & (df.Title == title), 'AgeFill'] = mean_age

    # Age bins.  The column starts as object dtype so the string labels can
    # be written into it.
    df['AgeCat']=df['AgeFill'].astype(object)
    df.loc[ (df.AgeFill<=10) ,'AgeCat'] = 'child'
    df.loc[ (df.AgeFill>60),'AgeCat'] = 'aged'
    df.loc[ (df.AgeFill>10) & (df.AgeFill <=30) ,'AgeCat'] = 'adult'
    df.loc[ (df.AgeFill>30) & (df.AgeFill <=60) ,'AgeCat'] = 'senior'

    df.Embarked = df.Embarked.fillna('S')

    # Cabin flag: 0.5 where the cabin is missing, 1.5 where it is known.  The
    # mask is evaluated once; testing isnull() again after the first
    # assignment matched every row, so all rows ended up as 1.5.
    has_cabin = df.Cabin.notnull()
    df['Cabin'] = np.where(has_cabin, 1.5, 0.5)

    df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)

    # Age times class
    df['AgeClass']=df['AgeFill']*df['Pclass']
    df['ClassFare']=df['Pclass']*df['Fare_Per_Person']

    df['HighLow']=df['Pclass'].astype(object)
    df.loc[ (df.Fare_Per_Person<8) ,'HighLow'] = 'Low'
    df.loc[ (df.Fare_Per_Person>=8) ,'HighLow'] = 'High'

    # Label-encode the remaining text columns as floats.  float is the type
    # np.float used to alias.
    for column in ('Sex', 'Ticket', 'Title', 'HighLow', 'AgeCat', 'Embarked'):
        df[column] = le.fit_transform(df[column]).astype(float)

    df = df.drop(['PassengerId','Name','Age','Cabin'], axis=1)  # remove PassengerId, Name, Age and Cabin

    return df
#读取数据
traindf=pd.read_csv(train_file)
##清洗数据
df=clean_and_munge_data(traindf)
########################################formula################################
formula_ml='Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size' #这一部重要,需要研究
y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe') # 生成矩阵,根据参数见的关系生成。参数之间相关性。
y_train = np.asarray(y_train).ravel()
print y_train.shape,x_train.shape
##选择训练和测试集
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.2,random_state=seed)
#初始化分类器
clf=RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=5, min_samples_split=1,
min_samples_leaf=1, max_features='auto', bootstrap=False, oob_score=False, n_jobs=1, random_state=seed,
verbose=0)
### Grid search for the best parameters.
# The grid is intentionally empty here, so the search only cross-validates
# the classifier exactly as configured above.
param_grid = dict()
## Build the classification pipeline (a single step: the classifier).
pipeline = Pipeline([('clf', clf)])
# The CV splitter is nested inside GridSearchCV and is built from Y_train.
# Everything sits inside parentheses, so no backslash continuations are
# needed; the original "\ #comment" continuation was a SyntaxError.
# The explicit indices=None / n_iterations=None keywords were dropped: they
# only restated defaults and no longer exist in later scikit-learn releases.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    verbose=3,
    scoring='accuracy',
    cv=StratifiedShuffleSplit(Y_train, n_iter=10, test_size=0.2,
                              train_size=None, random_state=seed),
).fit(X_train, Y_train)
# 对结果打分
print("Best score: %0.3f" % grid_search.best_score_) #注意,best_score_
print(grid_search.best_estimator_)
report(grid_search.grid_scores_)
print('-----grid search end------------')
print ('on all train set')
scores = cross_val_score(grid_search.best_estimator_, x_train, y_train,cv=3,scoring='accuracy') #全量, 注意best_estimator
print scores.mean(),scores
print ('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test,cv=3,scoring='accuracy')
print scores.mean(),scores
# 对结果打分
print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train) ))
print('test data')
print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test) ))
model_file=MODEL_PATH+'model-rf.pkl'
joblib.dump(grid_search.best_estimator_, model_file)
Kaggle自行车租赁预测比赛 https://www.kaggle.com/c/bike-sharing-demand
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df_train = pd.read_csv('kaggle_bike_competition_train.csv',header = 0)
df_train.head(10)
df_train.dtypes
#让它告诉我们形状
df_train.shape
df_train.count()
type(df_train.datetime)
# Split month, day-of-week and hour out of the timestamp into three columns.
# The datetime column is parsed once and the resulting index is reused.
datetime_index = pd.DatetimeIndex(df_train.datetime)
df_train['month'] = datetime_index.month
df_train['day'] = datetime_index.dayofweek  # note: day of week, not day of month
df_train['hour'] = datetime_index.hour
# Keep a backup of the full frame, just in case. An explicit copy is taken so
# the backup cannot be affected by later in-place changes to df_train.
df_train_origin = df_train.copy()
# Drop the fields that are not used as features.
df_train = df_train.drop(['datetime', 'casual', 'registered'], axis=1)
# 看一眼
df_train.head(5)
df_train_target = df_train['count'].values #注意后面加了value
df_train_data = df_train.drop(['count'],axis = 1).values
print 'df_train_data shape is ', df_train_data.shape
print 'df_train_target shape is ', df_train_target.shape
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.learning_curve import learning_curve
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import explained_variance_score
# 总得切分一下数据咯(训练集和测试集)
cv = cross_validation.ShuffleSplit(len(df_train_data), n_iter=3, test_size=0.2, # 注意在此处使用的是len,最终使用的是索引
random_state=0)
# 各种模型来一圈
print "岭回归"
for train, test in cv:
svc = linear_model.Ridge().fit(df_train_data[train], df_train_target[train])
print("train score: {0:.3f}, test score: {1:.3f}\n".format(
svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))
print "支持向量回归/SVR(kernel='rbf',C=10,gamma=.001)"
for train, test in cv:
svc = svm.SVR(kernel ='rbf', C = 10, gamma = .001).fit(df_train_data[train], df_train_target[train])
print("train score: {0:.3f}, test score: {1:.3f}\n".format(
svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))
print "随机森林回归/Random Forest(n_estimators = 100)"
for train, test in cv:
svc = RandomForestRegressor(n_estimators = 100).fit(df_train_data[train], df_train_target[train])
print("train score: {0:.3f}, test score: {1:.3f}\n".format(
svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))
X = df_train_data
y = df_train_target
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
X, y, test_size=0.2, random_state=0)
tuned_parameters = [{'n_estimators':[10,100,500]}]
scores = ['r2']
for score in scores:
print score
clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring=score)
clf.fit(X_train, y_train)
print("别!喝!咖!啡!了!最佳参数找到了亲!!:")
print ""
#best_estimator_ returns the best estimator chosen by the search
print(clf.best_estimator_)
print ""
print("得分分别是:")
print ""
#grid_scores_的返回值:
# * a dict of parameter settings
# * the mean score over the cross-validation folds
# * the list of scores for each fold
for params, mean_score, scores in clf.grid_scores_: # grid_scores_里面只有测试集的分数?
print("%0.3f (+/-%0.03f) for %r"
% (mean_score, scores.std() / 2, params))
print ""
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for an estimator.

    Parameters
    ----------
    estimator : estimator object handed to ``learning_curve``.
    title : title of the figure.
    X, y : data and targets handed to ``learning_curve``.
    ylim : optional sequence unpacked into ``plt.ylim`` (e.g. ``(ymin, ymax)``).
    cv : cross-validation setting forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    The ``plt`` module, so the caller can call ``show()`` on the result.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (legend label, colour, mean per size, std per size).
    curves = [
        ("Training score", "r",
         np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1)),
        ("Cross-validation score", "g",
         np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)),
    ]

    plt.grid()
    # Shaded band of +/- one standard deviation around each mean curve.
    for _, colour, mean, std in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    # The mean curves themselves, drawn after both bands.
    for label, colour, mean, _ in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)
    plt.legend(loc="best")
    return plt
title = "Learning Curves (Random Forest, n_estimators = 100)"
cv = cross_validation.ShuffleSplit(df_train_data.shape[0], n_iter=10,test_size=0.2, random_state=0)
estimator = RandomForestRegressor(n_estimators = 100)
plot_learning_curve(estimator, title, X, y, (0.0, 1.01), cv=cv, n_jobs=4)
plt.show()
# 尝试一下缓解过拟合,当然,未必成功
print "随机森林回归/Random Forest(n_estimators=200, max_features=0.6, max_depth=15)" # 这里调高了n_estimators,max_ 的数量,在Random Forest里面降低模型复杂度?
for train, test in cv:
svc = RandomForestRegressor(n_estimators = 200, max_features=0.6, max_depth=15).fit(df_train_data[train], df_train_target[train])
print("train score: {0:.3f}, test score: {1:.3f}\n".format(
svc.score(df_train_data[train], df_train_target[train]), svc.score(df_train_data[test], df_train_target[test])))
# 看你们自己的咯
df_train_registered = df_train_origin.drop(['datetime','casual','count'], axis = 1)
df_train_casual = df_train_origin.drop(['datetime','count','registered'], axis = 1)
df_train_registered.head()
# 风速
df_train_origin.groupby('windspeed').mean().plot(y='count', marker='o') #注意groupby分组统计后直接作图
plt.show()
# 湿度
df_train_origin.groupby('humidity').mean().plot(y='count', marker='o')
plt.show()
#温度湿度变化
df_train_origin.plot(x='temp', y='humidity', kind='scatter') #直接作图
plt.show()
# scatter一下各个维度
fig, axs = plt.subplots(2, 3, sharey=True)
df_train_origin.plot(kind='scatter', x='temp', y='count', ax=axs[0, 0], figsize=(16, 8), color='magenta')
df_train_origin.plot(kind='scatter', x='atemp', y='count', ax=axs[0, 1], color='cyan')
df_train_origin.plot(kind='scatter', x='humidity', y='count', ax=axs[0, 2], color='red')
df_train_origin.plot(kind='scatter', x='windspeed', y='count', ax=axs[1, 0], color='yellow')
df_train_origin.plot(kind='scatter', x='month', y='count', ax=axs[1, 1], color='blue')
df_train_origin.plot(kind='scatter', x='hour', y='count', ax=axs[1, 2], color='green')
sns.pairplot(df_train_origin[["temp", "month", "humidity", "count"]], hue="count") # 注意seabosn中的pairplot 画多个变量之间的关系
corr = df_train_origin[['temp','weather','windspeed','day', 'month', 'hour','count']].corr() # corr计算各特征变量之间的关联度
corr
plt.figure()
plt.matshow(corr) # 显示相关性图,matshow
plt.colorbar() # 颜色par
plt.show()
特征工程 数据集来源于Data Hackathon 3.x
import pandas as pd
import numpy as np
%matplotlib inline
#载入数据:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
train.dtypes
train.head(5)
#合成一个总的data
train['source']= 'train'
test['source'] = 'test'
data=pd.concat([train, test],ignore_index=True) # 合成的使用用pandas的concate 或python的。。。
data.shape
data.apply(lambda x: sum(x.isnull())) # 注意用此方式查看缺省值
var = ['Gender','Salary_Account','Mobile_Verified','Var1','Filled_Form','Device_Type','Var2','Source']
for v in var:
print '\n%s这一列数据的不同取值和出现的次数\n'%v
print data[v].value_counts()
len(data['City'].unique()) # 注意unique的使用
data.drop('City',axis=1,inplace=True)
data['DOB'].head()
#创建一个年龄的字段Age
data['Age'] = data['DOB'].apply(lambda x: 115 - int(x[-2:]))
data['Age'].head()
#把原始的DOB字段去掉:
data.drop('DOB',axis=1,inplace=True)
data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')
#好像缺失值比较多,干脆就开一个新的字段,表明是缺失值还是不是缺失值
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data[['EMI_Loan_Submitted','EMI_Loan_Submitted_Missing']].head(10)
#原始那一列就可以不要了
data.drop('EMI_Loan_Submitted',axis=1,inplace=True)
len(data['Employer_Name'].value_counts())
#丢掉
data.drop('Employer_Name',axis=1,inplace=True)
data.boxplot(column='Existing_EMI',return_type='axes')
data['Existing_EMI'].describe()
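# Few values are missing here; fill them with the constant 0 (note: this is a constant fill, not the column mean)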
data['Existing_EMI'].fillna(0, inplace=True)
data.boxplot(column=['Interest_Rate'],return_type='axes')
#缺省值太多,也造一个字段,表示有无
data['Interest_Rate_Missing'] = data['Interest_Rate'].apply(lambda x: 1 if pd.isnull(x) else 0) #造一个有无的字段
print data[['Interest_Rate','Interest_Rate_Missing']].head(10)
data.drop('Interest_Rate',axis=1,inplace=True)
#找中位数去填补缺省值(因为缺省的不多)
data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median(),inplace=True)
data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median(),inplace=True)
# 缺省值太多。。。是否缺省。。。
data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122','S133'] else x)
data['Source'].value_counts()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() # 数值编码, 原来为object类型,转化为int类型
var_to_encode = ['Device_Type','Filled_Form','Gender','Var1','Var2','Mobile_Verified','Source']
for col in var_to_encode:
data[col] = le.fit_transform(data[col])
data = pd.get_dummies(data, columns=var_to_encode) #类别型的One-Hot 编码, 此处先把类别行的用LabelEncoder编码为数字,然后在转化为one_hot编码,可以直接one_hot,只是起的列名字不同而已
data.columns
train = data.loc[data['source']=='train']
test = data.loc[data['source']=='test']
XGBoost模型调优
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
train = pd.read_csv('train_modified.csv')
test = pd.read_csv('test_modified.csv')
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4 #注意此处, 默认设置图形大小
train = pd.read_csv('train_modified.csv')
test = pd.read_csv('test_modified.csv')
train.shape, test.shape
target='Disbursed'
IDcol = 'ID'
train['Disbursed'].value_counts()
#test_results = pd.read_csv('test_results.csv')
def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost sklearn-style classifier and report training-set metrics.

    Parameters
    ----------
    alg : classifier exposing ``get_xgb_params``, ``get_params``,
        ``set_params``, ``fit``, ``predict``, ``predict_proba`` and
        ``booster`` (an ``XGBClassifier`` in these notes).
    dtrain : training DataFrame containing the ``predictors`` columns and the
        label column named by the module-level ``target``.
    dtest : not used by this function; kept so existing calls keep working.
    predictors : list of feature column names.
    useTrainCV : when true, run ``xgb.cv`` first and set ``n_estimators`` to
        the number of rows in the CV result.
    cv_folds : number of folds for ``xgb.cv``.
    early_stopping_rounds : early-stopping patience for ``xgb.cv``.

    Side effects: mutates ``alg`` (parameters and fitted state), prints the
    training accuracy and AUC, and draws a feature-importance bar chart.
    """
    # Read the label column once through `target`, instead of mixing `target`
    # with a hard-coded 'Disbursed' as the original did.
    labels = dtrain[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=labels.values)
        # NOTE(review): show_progress is the keyword of older xgboost
        # releases; confirm against the installed version.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds, show_progress=False)
        # One row per boosting round kept by the CV run.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the model.
    alg.fit(dtrain[predictors], labels, eval_metric='auc')

    # Predict on the training set.
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Report results. Single-argument print(...) behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print("\n关于现在这个模型")
    print("准确率 : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC 得分 (训练集): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = XGBClassifier(
learning_rate =0.1,
n_estimators=1000,
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread=4,
scale_pos_weight=1,
seed=27)
modelfit(xgb1, train, test, predictors)
#对subsample 和 max_features 用grid search查找最好的参数
param_test1 = {
'max_depth':range(3,10,2),
'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
# 对于max_depth和min_child_weight查找最好的参数
param_test2 = {
'max_depth':[4,5,6],
'min_child_weight':[4,5,6]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(train[predictors],train[target])
#交叉验证对min_child_weight寻找最合适的参数
param_test2b = {
'min_child_weight':[6,8,10,12]
}
gsearch2b = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=4,
min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test2b, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2b.fit(train[predictors],train[target])
#Grid seach选择合适的gamma
param_test3 = {
'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])
predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb2 = XGBClassifier(
learning_rate =0.1,
n_estimators=1000,
max_depth=4,
min_child_weight=6,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread=4,
scale_pos_weight=1,
seed=27)
modelfit(xgb2, train, test, predictors)
#对subsample 和 colsample_bytree用grid search寻找最合适的参数
param_test4 = {
'subsample':[i/10.0 for i in range(6,10)],
'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(train[predictors],train[target])
# 同上
param_test5 = {
'subsample':[i/100.0 for i in range(75,90,5)],
'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])
#对reg_alpha用grid search寻找最合适的参数
param_test6 = {
'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch6.fit(train[predictors],train[target])
# 换一组参数对reg_alpha用grid search寻找最合适的参数
param_test7 = {
'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27),
param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch7.fit(train[predictors],train[target])
xgb3 = XGBClassifier(
learning_rate =0.1,
n_estimators=1000,
max_depth=4,
min_child_weight=6,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
reg_alpha=0.005,
objective= 'binary:logistic',
nthread=4,
scale_pos_weight=1,
seed=27)
modelfit(xgb3, train, test, predictors)
第二课
房价预测案例
import numpy as np
import pandas as pd
import xgboost
train_df = pd.read_csv('../input/train.csv', index_col=0) #注意../ 代表上一个目录?
test_df = pd.read_csv('../input/test.csv', index_col=0)
%matplotlib inline
prices = pd.DataFrame({"price":train_df["SalePrice"], "log(price + 1)":np.log1p(train_df["SalePrice"])}) # 注意此处使用了lo1p,它是log(1+X) 防止X为零的一个类似拉普拉斯平滑,log1p()就需要expm1(); 回归的时候如果初始数据不是正态分布,需要做处理, 分类的时候没必要
prices.hist()
y_train = np.log1p(train_df.pop('SalePrice'))
all_df = pd.concat((train_df, test_df), axis=0) # pandas 里面的concat 合并
all_df.shape
all_df['MSSubClass'].dtypes
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str) # 转换为astype
all_df['MSSubClass'].value_counts()
pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head()
all_dummy_df = pd.get_dummies(all_df) #把所有的数据进行了one-hot-encode
all_dummy_df.head()
all_dummy_df.isnull().sum().sort_values(ascending=False).head(10) #缺失值, sum , sort_values
mean_cols = all_dummy_df.mean()
mean_cols.head(10)
all_dummy_df = all_dummy_df.fillna(mean_cols)
all_dummy_df.isnull().sum().sum()
numeric_cols = all_df.columns[all_df.dtypes != 'object'] #判断对那些列是numerical类型,即不是对象类型的, 注意此处使用的是dtypes!='object', 注意取出是个列表
numeric_cols
numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean() #计算出所有数值型数字的标准列
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std # 标准化
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
X_train = dummy_train_df.values #此处是values 注意DataFrame 转化为narray的方式
X_test = dummy_test_df.values
alphas = np.logspace(-3, 2, 50)
test_scores = []
for alpha in alphas:
clf = Ridge(alpha)
test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error')) #注意此处有个负号, 和scoring 的选择有关
test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error");
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for max_feat in max_features:
clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
test_scores.append(np.mean(test_score))
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error");
ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)
y_ridge = np.expm1(ridge.predict(X_test)) #注意前面用了log(x+1),此处我们用的expm1是反过程
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2 #模型融合 取平均
submission_df = pd.DataFrame(data= {'Id' : test_df.index, 'SalePrice': y_final})
房价预测案例(进阶版)
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]
dummy_train_df.shape, dummy_test_df.shape
X_train = dummy_train_df.values
X_test = dummy_test_df.values
from sklearn.linear_model import Ridge
ridge = Ridge(15)
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params: #此处bagging 用的是同一个模型下面不同分类器的组合
clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("n_estimator vs CV Error");
params = [10, 15, 20, 25, 30, 40, 50, 60, 70, 100]
test_scores = []
for param in params:
clf = BaggingRegressor(n_estimators=param)
test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
test_scores.append(np.mean(test_score))
from sklearn.ensemble import AdaBoostRegressor
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = []
for param in params:
    # Boost the ridge model. The original cell imported AdaBoostRegressor but
    # then re-ran BaggingRegressor, repeating the bagging experiment above.
    clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)
    # scoring returns negative MSE, hence the sign flip before the square root.
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
from xgboost import XGBRegressor
params = [1,2,3,4,5,6]
test_scores = []
for param in params:
clf = XGBRegressor(max_depth=param)
test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
test_scores.append(np.mean(test_score))
用每日新闻预测金融市场变化 标准版
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from datetime import date
data = pd.read_csv('../input/Combined_News_DJIA.csv')
data["combined_news"] = data.filter(regex=("Top.*")).apply(lambda x: ''.join(str(x.values)), axis=1) # filter() keeps the columns whose name matches "Top.*"; apply(..., axis=1) then calls the lambda once per ROW, so x is a Series holding that row's values
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
feature_extraction = TfidfVectorizer()
X_train = feature_extraction.fit_transform(train["combined_news"].values)
X_test = feature_extraction.transform(test["combined_news"].values)
y_train = train["Label"].values
y_test = test["Label"].values
clf = SVC(probability=True, kernel='rbf')
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:,1])))
进阶版
X_train = train["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()
X_test = test["combined_news"].str.lower().str.replace('"', '').str.replace("'", '').str.split()
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re
def hasNumbers(inputString):
    """Return True when the string contains at least one digit, else False."""
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def check(word):
    """Decide whether a token is kept.

    Returns True when the word should be kept, and False when it should be
    removed: that is, when it is in the ``stop`` collection or contains a
    digit according to ``hasNumbers``.
    """
    rejected = word in stop or hasNumbers(word)
    return not rejected
X_train = X_train.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
X_test = X_test.apply(lambda x: [wordnet_lemmatizer.lemmatize(item) for item in x if check(item)])
print(X_test[1611])
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))
print(X_test[1611])
feature_extraction = TfidfVectorizer(lowercase=False)
X_train = feature_extraction.fit_transform(X_train.values)
X_test = feature_extraction.transform(X_test.values)
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions[:,1])))
第三课
Click-Through Rate Prediction
第四课
关键词搜索
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
df_train = pd.read_csv('../input/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('../input/test.csv', encoding="ISO-8859-1")
df_desc = pd.read_csv('../input/product_descriptions.csv')
df_train.head()
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_desc, how='left', on='product_uid')
stemmer = SnowballStemmer('english')
def str_stemmer(s):
    """Lower-case ``s``, split it on whitespace, stem every token with the
    module-level ``stemmer`` and join the stems back with single spaces."""
    stems = (stemmer.stem(token) for token in s.lower().split())
    return " ".join(stems)
def str_common_word(str1, str2):
    """Count the whitespace-separated words of ``str1`` that occur in ``str2``.

    Matching is by substring, so a word also counts when it appears inside a
    longer word of ``str2``. Repeated words in ``str1`` are counted each time.
    """
    return sum(1 for word in str1.split() if word in str2)
关键词搜索(进阶版)
df_all = pd.merge(df_all, df_desc, how='left', on='product_uid')
stemmer = SnowballStemmer('english')
def str_stemmer(s):
    """Lower-case ``s``, split it on whitespace, stem every token with the
    module-level ``stemmer`` and join the stems back with single spaces."""
    stems = (stemmer.stem(token) for token in s.lower().split())
    return " ".join(stems)
def str_common_word(str1, str2):
    """Count the whitespace-separated words of ``str1`` that occur in ``str2``.

    Matching is by substring, so a word also counts when it appears inside a
    longer word of ``str2``. Repeated words in ``str1`` are counted each time.
    """
    return sum(1 for word in str1.split() if word in str2)
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))
import Levenshtein
Levenshtein.ratio('hello', 'hello world')
df_all['dist_in_title'] = df_all.apply(lambda x:Levenshtein.ratio(x['search_term'],x['product_title']), axis=1)
df_all['dist_in_desc'] = df_all.apply(lambda x:Levenshtein.ratio(x['search_term'],x['product_description']), axis=1)
df_all['all_texts']=df_all['product_title'] + ' . ' + df_all['product_description'] + ' . '
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in df_all['all_texts'].values)
print(dictionary)
class MyCorpus(object):
    """Iterable over ``df_all['all_texts']`` that yields one bag-of-words
    document at a time (via ``dictionary.doc2bow``) instead of building the
    whole list up front."""

    def __iter__(self):
        for text in df_all['all_texts'].values:
            tokens = list(tokenize(text, errors='ignore'))
            yield dictionary.doc2bow(tokens)
# 这里这么折腾一下,仅仅是为了内存friendly。面对大量corpus数据时,你直接存成一个list,会使得整个运行变得很慢。
# 所以我们搞成这样,一次只输出一组。但本质上依旧长得跟 [['sentence', '1'], ['sentence', '2'], ...]一样
corpus = MyCorpus()
from gensim.models.tfidfmodel import TfidfModel
tfidf = TfidfModel(corpus)
tfidf[dictionary.doc2bow(list(tokenize('hello world, good morning', errors='ignore')))]
from gensim.similarities import MatrixSimilarity
# 先把刚刚那句话包装成一个方法
def to_tfidf(text):
    """Tokenize ``text``, convert it to a bag of words with the module-level
    ``dictionary`` and return its representation under the ``tfidf`` model."""
    bag_of_words = dictionary.doc2bow(list(tokenize(text, errors='ignore')))
    return tfidf[bag_of_words]
# 然后,我们创造一个cosine similarity的比较方法
def cos_sim(text1, text2):
    """Return, as a plain float, the similarity of two texts computed by a
    gensim ``MatrixSimilarity`` index over their TF-IDF representations."""
    reference = to_tfidf(text1)
    query = to_tfidf(text2)
    # Index holding only the first text; querying it with the second yields
    # an array with a single entry.
    index = MatrixSimilarity([reference], num_features=len(dictionary))
    # Only the single similarity value is wanted, so cast it to float.
    return float(index[query][0])
text1 = 'hello world'
text2 = 'hello from the other side'
cos_sim(text1, text2)
df_all['tfidf_cos_sim_in_title'] = df_all.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)
df_all['tfidf_cos_sim_in_title'][:5]
df_all['tfidf_cos_sim_in_desc'] = df_all.apply(lambda x: cos_sim(x['search_term'], x['product_description']), axis=1)
import nltk
# nltk也是自带一个强大的句子分割器。
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer.tokenize(df_all['all_texts'].values[0])
sentences = [tokenizer.tokenize(x) for x in df_all['all_texts'].values]
sentences = [y for x in sentences for y in x]
from nltk.tokenize import word_tokenize
w2v_corpus = [word_tokenize(x) for x in sentences]
from gensim.models.word2vec import Word2Vec
model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)
# 先拿到全部的vocabulary
vocab = model.vocab
# 得到任意text的vector
def get_vector(text):
    """Return the mean word2vec vector of the in-vocabulary tokens of ``text``.

    Tokens are produced by ``word_tokenize``; tokens missing from ``vocab``
    are ignored. When no token is in the vocabulary, the zero vector is
    returned instead of dividing by zero, which previously produced an
    all-NaN vector.
    """
    # The length 128 is hard-coded here; the model above is trained with
    # size=128.
    total = np.zeros([128])
    count = 0
    for word in word_tokenize(text):
        if word in vocab:
            total += model[word]
            count += 1
    if count == 0:
        # Nothing to average: avoid 0/0.
        return total
    return total / count
def w2v_cos_sim(text1, text2):
    """Return the cosine similarity of the averaged word vectors of two texts.

    NOTE(review): the two lines below call ``w2v_cos_sim``, but no definition
    is visible in this part of the notes, so they would raise NameError. This
    definition is a reconstruction inferred from the name and the
    ``*_cos_sim_*`` column names; confirm it against the original notebook.

    Returns 0.0 when either vector has a zero or undefined (NaN) norm.
    """
    vec1 = get_vector(text1)
    vec2 = get_vector(text2)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # `not denom > 0` is also true when denom is NaN.
    if not denom > 0:
        return 0.0
    return float(np.dot(vec1, vec2) / denom)


df_all['w2v_cos_sim_in_title'] = df_all.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
df_all['w2v_cos_sim_in_desc'] = df_all.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)
# Drop the raw text columns now that the numeric features have been derived.
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'all_texts'], axis=1)
# df_all was built with pd.concat(..., ignore_index=True), train rows first,
# so its labels run 0..len(df_all)-1. Looking rows up by df_test.index (which
# also starts at 0) would return training rows, so split by position instead.
# NOTE(review): assumes the left merges kept row order and row count - confirm.
n_train = len(df_train)
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]
test_ids = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
第七课
便利店销量预测
import pandas as pd
import datetime
import csv
import numpy as np
import os
import scipy as sp
import xgboost as xgb
import itertools
import operator
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import TransformerMixin
from sklearn import cross_validation
from matplotlib import pylab as plt
plot = True
goal = 'Sales'
myid = 'Id'
定义一些变换和评判准则
def ToWeight(y):
    """Return an array of weights ``1 / y**2``, with weight 0 where ``y`` is 0.

    The result is a float array with the same shape as ``y``.
    """
    weights = np.zeros(y.shape, dtype=float)
    nonzero = y != 0
    # Only the non-zero entries are inverted, so no division by zero occurs.
    weights[nonzero] = 1.0 / np.square(y[nonzero])
    return weights
def rmspe(yhat, y):
    """Return the root mean squared percentage error of ``yhat`` against ``y``.

    Each squared error is weighted by ``ToWeight(y)`` (``1 / y**2``, or 0
    where ``y`` is 0) before averaging.
    """
    weighted_sq_err = ToWeight(y) * (y - yhat) ** 2
    return np.sqrt(np.mean(weighted_sq_err))
def rmspe_xg(yhat, y):
    """RMSPE evaluation function returning a ``("rmspe", value)`` pair.

    ``y`` must expose ``get_label()``. Both the labels and ``yhat`` are mapped
    through ``exp(v) - 1`` (the inverse of a ``log(v + 1)`` transform) before
    the error is computed.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    weights = ToWeight(actual)
    score = np.sqrt(np.mean(weights * (actual - predicted) ** 2))
    return "rmspe", score
store = pd.read_csv('./data/store.csv')
store.head()
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
def load_data():
    """Load the store, train and test CSV files and classify the features.

    Reads ``./data/store.csv``, ``./data/train.csv`` and ``./data/test.csv``,
    and left-joins the store table onto train and test by ``Store``.

    Returns
    -------
    tuple ``(train, test, features, features_non_numeric)`` where ``features``
    is the list of test columns and ``features_non_numeric`` the subset whose
    dtype is not one of the listed integer/float dtypes.
    """
    store = pd.read_csv('./data/store.csv')
    # Force StateHoliday to be read as a string. The builtin `str` replaces
    # `pd.np.string_`: the `pd.np` alias is deprecated and removed in current
    # pandas, and `str` denotes the same dtype under Python 2.
    holiday_dtype = {'StateHoliday': str}
    train_org = pd.read_csv('./data/train.csv', dtype=holiday_dtype)
    test_org = pd.read_csv('./data/test.csv', dtype=holiday_dtype)
    train = pd.merge(train_org, store, on='Store', how='left')
    test = pd.merge(test_org, store, on='Store', how='left')
    features = test.columns.tolist()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features_numeric = test.select_dtypes(include=numerics).columns.tolist()
    features_non_numeric = [f for f in features if f not in features_numeric]
    return (train, test, features, features_non_numeric)
数据与特征处理
def process_data(train,test,features,features_non_numeric):
"""
Feature engineering and selection.
"""
# # FEATURE ENGINEERING
train = train[train['Sales'] > 0]
for data in [train,test]:
# year month day
data['year'] = data.Date.apply(lambda x: x.split('-')[0])
data['year'] = data['year'].astype(float)
data['month'] = data.Date.apply(lambda x: x.split('-')[1])
data['month'] = data['month'].astype(float)
data['day'] = data.Date.apply(lambda x: x.split('-')[2])
data['day'] = data['day'].astype(float)
# promo interval "Jan,Apr,Jul,Oct"
data['promojan'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jan" in x else 0)
data['promofeb'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Feb" in x else 0)
data['promomar'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Mar" in x else 0)
data['promoapr'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Apr" in x else 0)
data['promomay'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "May" in x else 0)
data['promojun'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jun" in x else 0)
data['promojul'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jul" in x else 0)
data['promoaug'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Aug" in x else 0)
data['promosep'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Sep" in x else 0)
data['promooct'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Oct" in x else 0)
data['promonov'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Nov" in x else 0)
data['promodec'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Dec" in x else 0)
# # Features set.
noisy_features = [myid,'Date']
features = [c for c in features if c not in noisy_features]
features_non_numeric = [c for c in features_non_numeric if c not in noisy_features]
features.extend(['year','month','day'])
# Fill NA
class DataFrameImputer(TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype object are filled with their most frequent value;
    columns of any other dtype are filled with their mean.

    Adapted from
    http://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
    """

    def __init__(self):
        """Create an unfitted imputer; ``fill`` is set by :meth:`fit`."""

    @staticmethod
    def _fill_value(column):
        """Return the replacement used for missing entries of one column."""
        if column.dtype == np.dtype('O'):
            counts = column.value_counts()
            # value_counts() skips missing entries, so a column without any
            # observed value has no mode; leave it as NaN instead of raising
            # IndexError on counts.index[0].
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def fit(self, X, y=None):
        """Compute one fill value per column of ``X`` and return ``self``."""
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the fitted values."""
        return X.fillna(self.fill)
train = DataFrameImputer().fit_transform(train)
test = DataFrameImputer().fit_transform(test)
# Pre-processing non-numberic values
le = LabelEncoder()
for col in features_non_numeric:
le.fit(list(train[col])+list(test[col]))
train[col] = le.transform(train[col])
test[col] = le.transform(test[col])
# LR和神经网络这种模型都对输入数据的幅度极度敏感,请先做归一化操作
scaler = StandardScaler()
for col in set(features) - set(features_non_numeric) - \
set([]): # TODO: add what not to scale
scaler.fit(list(train[col])+list(test[col]))
train[col] = scaler.transform(train[col])
test[col] = scaler.transform(test[col])
return (train,test,features,features_non_numeric)
训练与分析
def XGB_native(train, test, features, features_non_numeric):
    """Train an XGBoost regressor with the native API and export predictions.

    The target column ``goal`` is modelled as ``log(goal + 1)`` and the
    predictions are mapped back with ``exp(pred) - 1``.  The names ``goal``,
    ``myid``, ``plot``, ``rmspe`` and ``rmspe_xg`` are not defined in this
    function; they are read from module scope.

    Parameters
    ----------
    train : DataFrame containing ``features`` and the ``goal`` column.
    test : DataFrame containing ``features`` and the ``myid`` column.
    features : list of column names fed to the model.
    features_non_numeric : not used here; kept so call sites stay unchanged.

    Side effects
    ------------
    Prints the run configuration and the hold-out error, writes the
    submission CSV under ``./result/`` and, when ``plot`` is truthy, writes
    ``xgb.fmap`` and a feature-importance PNG to the working directory.
    """
    depth = 13
    eta = 0.01
    ntrees = 8000
    mcw = 3
    params = {"objective": "reg:linear",
              "booster": "gbtree",
              "eta": eta,
              "max_depth": depth,
              "min_child_weight": mcw,
              "subsample": 0.9,
              "colsample_bytree": 0.7,
              "silent": 1
              }
    print("Running with params: " + str(params))
    print("Running with ntrees: " + str(ntrees))
    print("Running with features: " + str(features))

    # Train model with a local hold-out split.
    tsize = 0.05
    X_train, X_test = cross_validation.train_test_split(train, test_size=tsize)
    dtrain = xgb.DMatrix(X_train[features], np.log(X_train[goal] + 1))
    dvalid = xgb.DMatrix(X_test[features], np.log(X_test[goal] + 1))
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params, dtrain, ntrees, evals=watchlist,
                    early_stopping_rounds=100, feval=rmspe_xg,
                    verbose_eval=True)

    # Negative log-space predictions are clipped to 0 before inverting.
    train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
    train_probs[train_probs < 0] = 0
    error = rmspe(np.exp(train_probs) - 1, X_test[goal].values)
    print(error)

    # Predict and export.
    test_probs = gbm.predict(xgb.DMatrix(test[features]))
    test_probs[test_probs < 0] = 0
    submission = pd.DataFrame({myid: test[myid], goal: np.exp(test_probs) - 1})
    if not os.path.exists('result/'):
        os.makedirs('result/')
    run_tag = (str(depth), str(eta), str(ntrees), str(mcw), str(tsize))
    submission.to_csv("./result/dat-xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.csv" % run_tag, index=False)

    # Feature importance.
    if plot:
        # The context manager closes the feature map even if a write fails.
        with open('xgb.fmap', 'w') as outfile:
            for i, feat in enumerate(features):
                outfile.write('{0}\t{1}\tq\n'.format(i, feat))
        importance = gbm.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1))
        df = pd.DataFrame(importance, columns=['feature', 'fscore'])
        # Express every score as a share of the total.
        df['fscore'] = df['fscore'] / df['fscore'].sum()

        plt.figure()
        df.plot()
        df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        plt.gcf().savefig('Feature_Importance_xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.png' % run_tag)
# Driver: load the raw data, engineer features, then fit and export with XGBoost.
print("=> 载入数据中...")
train, test, features, features_non_numeric = load_data()

print("=> 处理数据与特征工程...")
train, test, features, features_non_numeric = process_data(
    train, test, features, features_non_numeric)

print("=> 使用XGBoost建模...")
XGB_native(train, test, features, features_non_numeric)
Kaggle event 推荐比赛
from __future__ import division
import itertools
import cPickle
import datetime
import hashlib
import locale
import numpy as np
import pycountry
import scipy.io as sio
import scipy.sparse as ss
import scipy.spatial.distance as ssd
from collections import defaultdict
from sklearn.preprocessing import normalize
class DataCleaner:
    """Convert raw string fields into numeric ids, buckets or floats.

    The id maps are ``defaultdict(int)`` instances, so any value that is not
    in a map resolves to 0.
    """

    # Separator between the parts of a free-text location.  The slice offset
    # used by getCountryId has always been 2, which only fits a two-character
    # separator.  NOTE(review): presumably locations look like
    # "City  Country" with two spaces -- confirm against users.csv.
    _LOCATION_SEP = "  "

    def __init__(self):
        # Locale name -> 1-based id.
        self.localeIdMap = defaultdict(int)
        for idx, alias in enumerate(locale.locale_alias):
            self.localeIdMap[alias] = idx + 1

        # Country name -> 1-based id.
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for idx, country in enumerate(pycountry.countries):
            lowered = country.name.lower()
            self.countryIdMap[lowered] = idx + 1
            # NOTE(review): these literal names decide whether US/CA
            # subdivisions are registered at all; confirm that pycountry
            # really reports the names "usa" and "canada".
            if lowered == "usa":
                ctryIdx["US"] = idx
            if lowered == "canada":
                ctryIdx["CA"] = idx
        # Map each subdivision name of those countries to the country's id.
        for cc in ctryIdx.keys():
            for sub in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[sub.name.lower()] = ctryIdx[cc] + 1

        # Gender string -> id.
        self.genderIdMap = defaultdict(int, {"male": 1, "female": 2})

    def getLocaleId(self, locstr):
        """Return the id of a locale string (case-insensitive), 0 if unknown."""
        return self.localeIdMap[locstr.lower()]

    def getGenderId(self, genderStr):
        """Return 1 for "male", 2 for "female" and 0 for anything else."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """Return year and month of an ISO timestamp concatenated as a string.

        The month is not zero-padded, e.g. "2012-03-..." gives "20123".
        """
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join([str(dttm.year), str(dttm.month)])

    def getCountryId(self, location):
        """Return the id of the last location component, or 0.

        0 is returned when ``location`` is not a string, is blank, contains
        no separator, or names an unknown country.
        """
        sep = self._LOCATION_SEP
        if not isinstance(location, str) or len(location.strip()) == 0:
            return 0
        cut = location.rfind(sep)
        if cut == -1:
            return 0
        # Skip exactly the separator; a mismatch between the searched string
        # and the offset used to drop the first letter of the country.
        return self.countryIdMap[location[cut + len(sep):].lower()]

    def getBirthYearInt(self, birthYear):
        """Return the birth year as int; 0 for "None" or unparsable input."""
        try:
            return 0 if birthYear == "None" else int(birthYear)
        except (TypeError, ValueError):
            return 0

    def getTimezoneInt(self, timezone):
        """Return the timezone offset as int; 0 for unparsable input."""
        try:
            return int(timezone)
        except (TypeError, ValueError):
            return 0

    def getFeatureHash(self, value):
        """Return a 16-bit bucket derived from SHA-224, or -1 for blank input."""
        if len(value.strip()) == 0:
            return -1
        # hashlib needs bytes; text is encoded so the method also accepts
        # unicode strings.  Byte strings are hashed unchanged.
        if not isinstance(value, bytes):
            value = value.encode("utf-8")
        return int(hashlib.sha224(value).hexdigest()[0:4], 16)

    def getFloatValue(self, value):
        """Return ``float(value)``, or 0.0 for blank input."""
        if len(value.strip()) == 0:
            return 0.0
        return float(value)
2.处理user和event关联数据
class ProgramEntities:
    """Index the users and events that occur in train.csv / test.csv.

    Only users and events present in those two files matter downstream, so
    nothing else is indexed.  Building an instance

    * assigns every user and every event a matrix index (``userIndex`` /
      ``eventIndex``, pickled to ``PE_userIndex.pkl`` / ``PE_eventIndex.pkl``);
    * fills ``userEventScores`` from train.csv with column 4 minus column 5
      and writes it to ``PE_userEventScores``;
    * collects ``uniqueUserPairs`` (users that share an event) and
      ``uniqueEventPairs`` (events that share a user) so later similarity
      computations can skip unrelated pairs.
    """

    def __init__(self):
        # Gather every user and event plus their mutual associations.
        uniqueUsers = set()
        uniqueEvents = set()
        eventsForUser = defaultdict(set)
        usersForEvent = defaultdict(set)
        for filename in ["train.csv", "test.csv"]:
            with open(filename, 'rb') as f:
                f.readline()  # skip header
                for line in f:
                    cols = line.strip().split(",")
                    user, event = cols[0], cols[1]
                    uniqueUsers.add(user)
                    uniqueEvents.add(event)
                    eventsForUser[user].add(event)
                    usersForEvent[event].add(user)

        self.userEventScores = ss.dok_matrix((len(uniqueUsers), len(uniqueEvents)))
        self.userIndex = dict((u, i) for i, u in enumerate(uniqueUsers))
        self.eventIndex = dict((e, i) for i, e in enumerate(uniqueEvents))

        # Score = column 4 minus column 5 of each training row.
        with open("train.csv", 'rb') as ftrain:
            ftrain.readline()  # skip header
            for line in ftrain:
                cols = line.strip().split(",")
                i = self.userIndex[cols[0]]
                j = self.eventIndex[cols[1]]
                self.userEventScores[i, j] = int(cols[4]) - int(cols[5])
        sio.mmwrite("PE_userEventScores", self.userEventScores)

        # Related users are pairs that acted on at least one common event;
        # related events are pairs touched by at least one common user.
        # combinations() yields nothing for fewer than two members, so no
        # size guard is needed.  The former ``len(...) > 2`` guard skipped
        # the single pair formed by exactly two members.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            self.uniqueUserPairs.update(
                itertools.combinations(usersForEvent[event], 2))
        for user in uniqueUsers:
            self.uniqueEventPairs.update(
                itertools.combinations(eventsForUser[user], 2))

        with open("PE_userIndex.pkl", 'wb') as fout:
            cPickle.dump(self.userIndex, fout)
        with open("PE_eventIndex.pkl", 'wb') as fout:
            cPickle.dump(self.eventIndex, fout)
3.用户与用户相似度矩阵
class Users:
    """Build the user feature matrix and the user/user similarity matrix.

    Parameters
    ----------
    programEntities : object exposing ``userIndex`` (user id -> row index)
        and ``uniqueUserPairs`` (iterable of user-id pairs).
    sim : callable taking two row vectors and returning a number; defaults to
        ``scipy.spatial.distance.correlation``.  The values it returns are
        stored as-is in ``userSimMatrix``.

    Side effects
    ------------
    Reads ``users.csv`` and writes ``US_userMatrix`` and ``US_userSimMatrix``.
    """

    def __init__(self, programEntities, sim=ssd.correlation):
        cleaner = DataCleaner()
        userIndex = programEntities.userIndex
        nusers = len(userIndex)
        with open("users.csv", 'rb') as fin:
            colnames = fin.readline().strip().split(",")
            self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))
            for line in fin:
                cols = line.strip().split(",")
                # Only users that were indexed from train/test are kept.
                if cols[0] in userIndex:
                    i = userIndex[cols[0]]
                    self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
                    self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
                    self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
                    self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
                    self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
                    self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])

        # Normalise the user matrix.
        self.userMatrix = normalize(self.userMatrix, norm="l1", axis=0, copy=False)
        sio.mmwrite("US_userMatrix", self.userMatrix)

        # Similarity matrix: 1.0 on the diagonal, sim() for every related pair.
        self.userSimMatrix = ss.dok_matrix((nusers, nusers))
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
        for u1, u2 in programEntities.uniqueUserPairs:
            i = userIndex[u1]
            j = userIndex[u2]
            # A pair may appear in both orders; compute it once and mirror it.
            if (i, j) not in self.userSimMatrix:
                usim = sim(self.userMatrix.getrow(i).todense(),
                           self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite("US_userSimMatrix", self.userSimMatrix)
4.用户社交关系挖掘
class UserFriends:
    """Derive social features from each user's friend list.

    The idea is simple:
    1) a user with more friends may be more outgoing and more likely to
       attend events;
    2) if a user's friends attend an event, the user may follow them.

    Parameters
    ----------
    programEntities : object exposing ``userIndex`` and ``userEventScores``.

    Side effects
    ------------
    Reads ``user_friends.csv``, prints progress every 200 lines and writes
    ``UF_numFriends`` and ``UF_userFriends``.
    """

    @staticmethod
    def _parseFriends(field):
        """Split a space-separated id list; a blank field gives an empty list."""
        # "".split(" ") would return [''] and count a user without friends
        # as having one.
        return field.split()

    def __init__(self, programEntities):
        userIndex = programEntities.userIndex
        nusers = len(userIndex)
        self.numFriends = np.zeros((nusers))
        self.userFriends = ss.dok_matrix((nusers, nusers))
        with open("user_friends.csv", 'rb') as fin:
            fin.readline()  # skip header
            for ln, line in enumerate(fin):
                if ln % 200 == 0:
                    print("Loading line:  %d" % ln)
                cols = line.strip().split(",")
                user = cols[0]
                if user not in userIndex:
                    continue
                friends = self._parseFriends(cols[1])
                i = userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend not in userIndex:
                        continue
                    j = userIndex[friend]
                    # The objective of this score is to infer the degree to
                    # and direction in which this friend will influence the
                    # user's decision, so the friend's user/event scores are
                    # averaged over all events.
                    eventsForUser = programEntities.userEventScores.getrow(j).todense()
                    score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                    self.userFriends[i, j] += score
                    self.userFriends[j, i] += score

        # Normalise the friend counts; with no friends at all the vector is
        # left as zeros instead of dividing by zero.
        sumNumFriends = self.numFriends.sum(axis=0)
        if sumNumFriends:
            self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite("UF_numFriends", np.matrix(self.numFriends))
        self.userFriends = normalize(self.userFriends, norm="l1", axis=0, copy=False)
        sio.mmwrite("UF_userFriends", self.userFriends)
5.构造event和event相似度数据
class Events:
    """Build event feature matrices and two event/event similarity matrices.

    Two similarities are stored for every related event pair:
    1) ``eventPropSim`` -- ``psim`` applied to the property rows
       (start time, city, state, zip, country, lat, lon);
    2) ``eventContSim`` -- ``csim`` applied to the content rows
       (columns 9..108 of events.csv).

    Parameters
    ----------
    programEntities : object exposing ``eventIndex`` and ``uniqueEventPairs``.
    psim, csim : callables taking two row vectors and returning a number.

    Side effects
    ------------
    Reads ``events.csv`` and writes ``EV_eventPropMatrix``,
    ``EV_eventContMatrix``, ``EV_eventPropSim`` and ``EV_eventContSim``.
    """

    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        eventIndex = programEntities.eventIndex
        nevents = len(eventIndex)
        self.eventPropMatrix = ss.dok_matrix((nevents, 7))
        self.eventContMatrix = ss.dok_matrix((nevents, 100))
        with open("events.csv", 'rb') as fin:
            fin.readline()  # skip header
            # Stream the file instead of loading every line with readlines().
            for line in fin:
                cols = line.strip().split(",")
                eventId = cols[0]
                if eventId not in eventIndex:
                    continue
                i = eventIndex[eventId]
                self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])  # start_time
                self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])  # city
                self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])  # state
                self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])  # zip
                self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])  # country
                self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])  # lat
                self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])  # lon
                for j in range(9, 109):
                    self.eventContMatrix[i, j - 9] = cols[j]

        self.eventPropMatrix = normalize(self.eventPropMatrix,
                                         norm="l1", axis=0, copy=False)
        sio.mmwrite("EV_eventPropMatrix", self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix,
                                         norm="l1", axis=0, copy=False)
        sio.mmwrite("EV_eventContMatrix", self.eventContMatrix)

        # Calculate similarity between event pairs based on the two matrices.
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))
        for e1, e2 in programEntities.uniqueEventPairs:
            i = eventIndex[e1]
            j = eventIndex[e2]
            if (i, j) not in self.eventPropSim:
                epsim = psim(self.eventPropMatrix.getrow(i).todense(),
                             self.eventPropMatrix.getrow(j).todense())
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            if (i, j) not in self.eventContSim:
                ecsim = csim(self.eventContMatrix.getrow(i).todense(),
                             self.eventContMatrix.getrow(j).todense())
                # Store the content similarity itself; the property
                # similarity used to be written here by mistake.
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
        sio.mmwrite("EV_eventPropSim", self.eventPropSim)
        sio.mmwrite("EV_eventContSim", self.eventContSim)
6.活跃度/event热度 数据
class EventAttendees():
    """Compute a popularity value per event from its attendee lists.

    Popularity is the number of ids in column 1 minus the number of ids in
    column 4 of ``event_attendees.csv``, L1-normalised over all events.
    NOTE(review): presumably columns 1 and 4 are the "yes" and "no" lists --
    confirm against the file header.

    Parameters
    ----------
    programEvents : object exposing ``eventIndex`` (event id -> row index).

    Side effects
    ------------
    Reads ``event_attendees.csv`` and writes ``EA_eventPopularity``.
    """

    @staticmethod
    def _countIds(field):
        """Count the space-separated ids in ``field``; a blank field counts 0."""
        # len("".split(" ")) is 1, which used to count an empty list as one
        # attendee.
        return len(field.split())

    def __init__(self, programEvents):
        eventIndex = programEvents.eventIndex
        nevents = len(eventIndex)
        self.eventPopularity = ss.dok_matrix((nevents, 1))
        with open("event_attendees.csv", 'rb') as f:
            f.readline()  # skip header
            for line in f:
                cols = line.strip().split(",")
                eventId = cols[0]
                if eventId in eventIndex:
                    i = eventIndex[eventId]
                    self.eventPopularity[i, 0] = \
                        self._countIds(cols[1]) - self._countIds(cols[4])
        self.eventPopularity = normalize(self.eventPopularity, norm="l1",
                                         axis=0, copy=False)
        sio.mmwrite("EA_eventPopularity", self.eventPopularity)
7.串起所有的数据处理和准备流程
def data_prepare():
    """Run all five preparation stages in order.

    Every stage is a class whose constructor does the work; the instances of
    stages 2-5 are discarded, only the shared entity index is kept around.
    """
    print("第1步:统计user和event相关信息...")
    entities = ProgramEntities()
    print("第1步完成...\n")

    # (start banner, stage constructor, finish banner) for the stages that
    # consume the entity index built above.
    stages = (
        ("第2步:计算用户相似度信息,并用矩阵形式存储...", Users, "第2步完成...\n"),
        ("第3步:计算用户社交关系信息,并存储...", UserFriends, "第3步完成...\n"),
        ("第4步:计算event相似度信息,并用矩阵形式存储...", Events, "第4步完成...\n"),
        ("第5步:计算event热度信息...", EventAttendees, "第5步完成...\n"),
    )
    for banner, stage, done in stages:
        print(banner)
        stage(entities)
        print(done)
# Run the whole data-preparation pipeline.
data_prepare()
8.构建特征
# 这是构建特征部分
from __future__ import division
import cPickle
import numpy as np
import scipy.io as sio
class DataRewriter:
    """Turn the prepared matrices into per-row features for the classifier."""

    def __init__(self):
        """Load the index pickles and matrices written by the preparation step."""
        with open("PE_userIndex.pkl", 'rb') as fin:
            self.userIndex = cPickle.load(fin)
        with open("PE_eventIndex.pkl", 'rb') as fin:
            self.eventIndex = cPickle.load(fin)
        self.userEventScores = sio.mmread("PE_userEventScores").todense()
        self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()
        self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
        self.eventContSim = sio.mmread("EV_eventContSim").todense()
        self.numFriends = sio.mmread("UF_numFriends")
        self.userFriends = sio.mmread("UF_userFriends").todense()
        self.eventPopularity = sio.mmread("EA_eventPopularity").todense()

    def userReco(self, userId, eventId):
        """
        Score an event for a user with user-based collaborative filtering.

        Pseudo code of the idea:
        for item i
          for every other user v that has a preference for i
            compute similarity s between u and v
            incorporate v's preference for i weighted by s into running average
        return top items ranked by weighted average

        Returns the similarity-weighted sum of all users' scores for the
        event minus the user's own score, or 0 on IndexError.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        vs = self.userEventScores[:, j]
        sims = self.userSimMatrix[i, :]
        prod = sims * vs
        try:
            return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            return 0

    def eventReco(self, userId, eventId):
        """
        Score an event for a user with item-based collaborative filtering.

        Pseudo code of the idea:
        for item i
          for every item j that u has a preference for
            compute similarity s between i and j
            add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average

        Returns ``(pscore, cscore)``: the score based on event-property
        similarity and the one based on event-content similarity.  A score
        stays 0 when its lookup raises IndexError.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        pprod = js * self.eventPropSim[:, j]
        cprod = js * self.eventContSim[:, j]
        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        try:
            cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
        return pscore, cscore

    def userPop(self, userId):
        """
        Return the user's normalised friend count, 0 if it cannot be looked up.

        Rationale: a user with many friends may be more inclined to attend
        social events.
        """
        if userId not in self.userIndex:
            return 0
        i = self.userIndex[userId]
        try:
            return self.numFriends[0, i]
        except IndexError:
            return 0

    def friendInfluence(self, userId):
        """
        Return the mean of the user's row of the friend-influence matrix.

        Rationale: if the user's friends are keen on attending events, that
        may rub off on the user.
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        # Sum the entire row.  Summing a single-row matrix over axis 0 leaves
        # it unchanged, so the previous ``.sum(axis=0)...[0, 0]`` returned
        # only the entry for the user at index 0.
        return self.userFriends[i, :].sum() / nusers

    def eventPop(self, eventId):
        """
        Return the stored popularity value of the event.
        """
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]

    def rewriteData(self, start=1, train=True, header=True):
        """
        Write one feature row per input row to ``data_train.csv`` / ``data_test.csv``.

        Combines the user-based and item-based collaborative-filtering scores
        with the popularity and influence values into the data set used by
        the classifier.

        start  : 1-based number of the first input line to process; pass 2 to
                 skip a header line in the input file.
        train  : read ``train.csv`` and append the two label columns when
                 true, otherwise read ``test.csv``.
        header : write a header line to the output file.
        """
        fn = "train.csv" if train else "test.csv"
        with open(fn, 'rb') as fin, open("data_" + fn, 'wb') as fout:
            # Write output header.
            if header:
                ocolnames = ["invited", "user_reco", "evt_p_reco",
                             "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
                if train:
                    ocolnames.append("interested")
                    ocolnames.append("not_interested")
                fout.write(",".join(ocolnames) + "\n")
            for ln, line in enumerate(fin, 1):
                if ln < start:
                    continue
                cols = line.strip().split(",")
                userId = cols[0]
                eventId = cols[1]
                invited = cols[2]
                if ln % 500 == 0:
                    print("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
                user_reco = self.userReco(userId, eventId)
                evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
                user_pop = self.userPop(userId)
                frnd_infl = self.friendInfluence(userId)
                evt_pop = self.eventPop(eventId)
                ocols = [invited, user_reco, evt_p_reco,
                         evt_c_reco, user_pop, frnd_infl, evt_pop]
                if train:
                    ocols.append(cols[4])  # interested
                    ocols.append(cols[5])  # not_interested
                fout.write(",".join(map(str, ocols)) + "\n")

    def rewriteTrainingSet(self):
        """Rewrite train.csv, skipping its header line."""
        # Keyword arguments are required: a positional bool binds to
        # ``start``, not ``train``.
        self.rewriteData(start=2, train=True)

    def rewriteTestSet(self):
        """Rewrite test.csv, skipping its header line."""
        self.rewriteData(start=2, train=False)
# If this module is compiled with Cython the class ends up in a .so file;
# the driver code below (plus the commented-out import) then has to live in
# a separate .py file, which is the one to run.
#import CRegressionData as rd
dr = DataRewriter()
# Training data first, then prediction data; line 1 of each input is skipped.
for banner, is_train in (("生成训练数据...\n", True), ("生成预测数据...\n", False)):
    print(banner)
    dr.rewriteData(train=is_train, start=2, header=True)
9.建模与预测
# 建模与预测
from __future__ import division
import math
import numpy as np
import pandas as pd
from sklearn.cross_validation import KFold
from sklearn.linear_model import SGDClassifier
def train():
    """
    Fit a classifier on the features in data_train.csv.

    The target is the ``interested`` column: 1 (interested) or
    0 (not interested).  Returns the fitted SGDClassifier.
    """
    feature_names = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                     "user_pop", "frnd_infl", "evt_pop"]
    frame = pd.read_csv("data_train.csv")
    design = np.matrix(pd.DataFrame(frame, index=None, columns=feature_names))
    labels = np.array(frame.interested)
    model = SGDClassifier(loss="log", penalty="l2")
    model.fit(design, labels)
    return model
def validate():
    """
    Run 10-fold cross-validation and print each fold's accuracy and the mean.
    """
    feature_names = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                     "user_pop", "frnd_infl", "evt_pop"]
    frame = pd.read_csv("data_train.csv")
    X = np.matrix(pd.DataFrame(frame, index=None, columns=feature_names))
    y = np.array(frame.interested)
    fold_scores = []
    for fold_no, (fit_idx, held_idx) in enumerate(KFold(len(frame), 10)):
        X_fit, X_held = X[fit_idx], X[held_idx]
        y_fit, y_held = y[fit_idx], y[held_idx]
        model = SGDClassifier(loss="log", penalty="l2")
        model.fit(X_fit, y_fit)
        n_held = len(y_held)
        # Score the held-out fold one row at a time.
        hits = sum(1 for k in range(0, n_held)
                   if model.predict(X_held[k, :]) == y_held[k])
        fold_accuracy = hits / n_held
        print("accuracy (run %d): %f" % (fold_no, fold_accuracy))
        fold_scores.append(fold_accuracy)
    print("Average accuracy %s" % (sum(fold_scores) / len(fold_scores)))
def test(clf):
    """
    Predict every row of data_test.csv and write the results to result.csv.

    clf : fitted classifier providing ``predict`` and ``decision_function``.

    Each output row holds the user and event ids taken from test.csv, the
    predicted class ("outcome") and the decision-function value ("dist").
    """
    origTestDf = pd.read_csv("test.csv")
    users = origTestDf.user
    events = origTestDf.event
    testDf = pd.read_csv("data_test.csv")
    nrows = len(testDf)
    Xp = np.matrix(testDf)
    yp = np.zeros((nrows, 2))
    # The context manager closes result.csv even if a prediction fails.
    with open("result.csv", 'wb') as fout:
        fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
        for i in range(0, nrows):
            xp = Xp[i, :]
            # Both calls return one-element arrays for a single row; take the
            # scalar explicitly instead of relying on implicit conversion.
            yp[i, 0] = clf.predict(xp)[0]
            yp[i, 1] = clf.decision_function(xp)[0]
            fout.write(",".join(map(str,
                       [users[i], events[i], yp[i, 0], yp[i, 1]])) + "\n")
# Fit the classifier on data_train.csv, then write predictions for
# data_test.csv to result.csv.
clf = train()
test(clf)
10.生成要提交的文件
# 处理成提交结果的格式
from __future__ import division
import pandas as pd
def byDist(x, y):
    """cmp-style comparator ordering tuples by descending value at index 1.

    Returns 1 when ``y[1] > x[1]``, -1 when ``y[1] < x[1]`` and 0 when they
    are equal.  Only the sign of the difference is used: truncating the raw
    difference with ``int()`` made any two values less than 1 apart compare
    as equal.
    """
    return (y[1] > x[1]) - (y[1] < x[1])
def generate_submition_file():
    """Write final_result.csv: one line per user with the ranked event list.

    Reads result.csv, groups its rows by ``user`` and, for each user, lists
    the events ordered by descending ``dist``.
    """
    resultDf = pd.read_csv("result.csv")
    # The context manager closes the output even if a group fails to format.
    with open("final_result.csv", 'wb') as fout:
        fout.write(",".join(["User", "Events"]) + "\n")
        # Group the user/event rows by user.
        for name, group in resultDf.groupby("user"):
            user = str(name)
            tuples = zip(list(group.event), list(group.dist), list(group.outcome))
            # Sort on the dist value itself, largest first.  A comparator
            # based on int(difference) treats values less than 1 apart as
            # equal and leaves them unsorted.
            ranked = sorted(tuples, key=lambda t: t[1], reverse=True)
            events = "\"" + str([t[0] for t in ranked]) + "\""
            fout.write(",".join([user, events]) + "\n")
# Produce final_result.csv from result.csv.
generate_submition_file()