根据天气等因素对自行车租赁数量进行预测
利用xgboost进行预测
import csv
from math import log, exp
import numpy
__author__ = 'Whiker'
__mtime__ = '2016/5/26'
from datetime import datetime
import pandas
import xgboost
def parse_time(data):
date = datetime.strptime(data, "%Y-%m-%d %H:%M:%S")
hour = date.hour
day = date.day
month = date.month
dow = date.weekday()
year = date.year
return year, month, day, hour, dow
def mylog(data):
data = int(data) + 1
return log(data)
dataset = pandas.read_csv("input/train.csv")
testset = pandas.read_csv("input/test.csv")
labelData = dataset['casual'].apply(func=mylog)
labelData2 = dataset['registered'].apply(func=mylog)
myIndex = testset['datetime']
feature1 = ['atemp', 'temp', 'hour', 'humidity', 'windspeed',
'month', 'dow', 'workingday', 'holiday', 'year', 'weather']
# datetime
dataset['year'], dataset['month'], dataset['day'], dataset['hour'], dataset['dow'] = zip(
*dataset['datetime'].apply(func=parse_time))
testset['year'], testset['month'], testset['day'], testset['hour'], testset['dow'] = zip(
*testset['datetime'].apply(func=parse_time))
trainData = dataset[feature1].iloc[:, :].values
testData = testset[feature1].iloc[:, :].values
offset = 6000
xgtrain = xgboost.DMatrix(trainData[:offset, :], label=labelData[:offset])
xgeval = xgboost.DMatrix(trainData[offset:, :], label=labelData[offset:])
xgtest = xgboost.DMatrix(testData)
watchlist = [(xgtrain, 'train'), (xgeval, 'val')]
params = {"max_depth": 6, "tree_num": 1000, "silent": 1, "shrinkage": 0.1}
xgModel = xgboost.train(list(params.items()), xgtrain, 450, watchlist, early_stopping_rounds=100)
# preds = numpy.column_stack((myIndex, xgModel.predict(xgtest, ntree_limit=xgModel.best_iteration))).tolist()
preds = xgModel.predict(xgtest, ntree_limit=xgModel.best_iteration).tolist()
preds = [exp(i) - 1 for i in preds]
# registered ================================================
feature2 = ['hour', 'humidity', 'atemp', 'temp', 'windspeed',
'month', 'dow', 'workingday', 'holiday', 'year', 'weather']
trainData = dataset[list(feature2)].iloc[:, :].values
testData = testset[list(feature2)].iloc[:, :].values
xgtrain = xgboost.DMatrix(trainData[:offset, :], label=labelData2[:offset])
xgeval = xgboost.DMatrix(trainData[offset:, :], label=labelData2[offset:])
xgtest = xgboost.DMatrix(testData)
watchlist = [(xgtrain, 'train'), (xgeval, 'val')]
params = {"max_depth": 6, "tree_num": 1000, "silent": 1, "shrinkage": 0.1}
xgModel = xgboost.train(list(params.items()), xgtrain, 450, watchlist, early_stopping_rounds=100)
preds2 = xgModel.predict(xgtest, ntree_limit=xgModel.best_iteration).tolist()
preds2 = [exp(i) - 1 for i in preds2]
# ==================================================================
preds = numpy.column_stack((myIndex, map(lambda x, y: x + y, preds, preds2))).tolist()
with open("result/sub_xgb_linear.csv", "w") as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerow(["datetime", "count"])
writer.writerows(preds)
import csv
from math import log, exp
import numpy
from sklearn.ensemble import RandomForestRegressor
__author__ = 'Whiker'
__mtime__ = '2016/5/26'
from datetime import datetime
import pandas
def parse_time(data):
date = datetime.strptime(data, "%Y-%m-%d %H:%M:%S")
hour = date.hour
day = date.day
month = date.month
dow = date.weekday()
year = date.year
return year, month, day, hour, dow
def mylog(data):
data = int(data) + 1
return log(data)
dataset = pandas.read_csv("input/train.csv")
testset = pandas.read_csv("input/test.csv")
labelData = dataset['casual'].apply(func=mylog).values
labelData2 = dataset['registered'].apply(func=mylog).values
myIndex = testset['datetime']
feature1 = ['atemp', 'temp', 'hour', 'humidity', 'windspeed',
'month', 'dow', 'workingday', 'holiday', 'year', 'weather']
# datetime
dataset['year'], dataset['month'], dataset['day'], dataset['hour'], dataset['dow'] = zip(
*dataset['datetime'].apply(func=parse_time))
testset['year'], testset['month'], testset['day'], testset['hour'], testset['dow'] = zip(
*testset['datetime'].apply(func=parse_time))
trainData = dataset[feature1].iloc[:, :].values
testData = testset[feature1].iloc[:, :].values
rfModel = RandomForestRegressor(n_estimators=100)
rfModel.fit(trainData, labelData)
preds = rfModel.predict(testData).tolist()
preds = [exp(i) - 1 for i in preds]
# registered ================================================
feature2 = ['hour', 'humidity', 'atemp', 'temp', 'windspeed',
'month', 'dow', 'workingday', 'holiday', 'year', 'weather']
trainData = dataset[list(feature2)].iloc[:, :].values
testData = testset[list(feature2)].iloc[:, :].values
rfModel = RandomForestRegressor(n_estimators=100)
rfModel.fit(trainData, labelData2)
preds2 = rfModel.predict(testData).tolist()
preds2 = [exp(i) - 1 for i in preds2]
# ==================================================================
preds = numpy.column_stack((myIndex, map(lambda x, y: x + y, preds, preds2))).tolist()
with open("result/sub_rf.csv", "w") as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerow(["datetime", "count"])
writer.writerows(preds)
最后评分0.46左右