{
"cells": [
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
"
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
"
" \n",
"
\n","
\n","
instant\n","
dteday\n","
season\n","
yr\n","
mnth\n","
holiday\n","
weekday\n","
workingday\n","
weathersit\n","
temp\n","
atemp\n","
hum\n","
windspeed\n","
casual\n","
registered\n","
cnt\n","
\n","
\n","
\n","
\n","
0\n","
1\n","
2011-01-01\n","
1\n","
0\n","
1\n","
0\n","
6\n","
0\n","
2\n","
0.344167\n","
0.363625\n","
0.805833\n","
0.160446\n","
331\n","
654\n","
985\n","
\n","
\n","
1\n","
2\n","
2011-01-02\n","
1\n","
0\n","
1\n","
0\n","
0\n","
0\n","
2\n","
0.363478\n","
0.353739\n","
0.696087\n","
0.248539\n","
131\n","
670\n","
801\n","
\n","
\n","
2\n","
3\n","
2011-01-03\n","
1\n","
0\n","
1\n","
0\n","
1\n","
1\n","
1\n","
0.196364\n","
0.189405\n","
0.437273\n","
0.248309\n","
120\n","
1229\n","
1349\n","
\n","
\n","
3\n","
4\n","
2011-01-04\n","
1\n","
0\n","
1\n","
0\n","
2\n","
1\n","
1\n","
0.200000\n","
0.212122\n","
0.590435\n","
0.160296\n","
108\n","
1454\n","
1562\n","
\n","
\n","
4\n","
5\n","
2011-01-05\n","
1\n","
0\n","
1\n","
0\n","
3\n","
1\n","
1\n","
0.226957\n","
0.229270\n","
0.436957\n","
0.186900\n","
82\n","
1518\n","
1600\n","
\n","
\n","
\n","
],
"text/plain": [
" instant dteday season yr mnth holiday weekday workingday \\\n",
"0 1 2011-01-01 1 0 1 0 6 0 \n",
"1 2 2011-01-02 1 0 1 0 0 0 \n",
"2 3 2011-01-03 1 0 1 0 1 1 \n",
"3 4 2011-01-04 1 0 1 0 2 1 \n",
"4 5 2011-01-05 1 0 1 0 3 1 \n",
"\n",
" weathersit temp atemp hum windspeed casual registered \\\n",
"0 2 0.344167 0.363625 0.805833 0.160446 331 654 \n",
"1 2 0.363478 0.353739 0.696087 0.248539 131 670 \n",
"2 1 0.196364 0.189405 0.437273 0.248309 120 1229 \n",
"3 1 0.200000 0.212122 0.590435 0.160296 108 1454 \n",
"4 1 0.226957 0.229270 0.436957 0.186900 82 1518 \n",
"\n",
" cnt \n",
"0 985 \n",
"1 801 \n",
"2 1349 \n",
"3 1562 \n",
"4 1600 "
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the raw daily records; the feature-engineering cells below read their columns from `train`.\n",
"train = pd.read_csv(\"day.csv\")\n",
"train.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" mnth_1 mnth_2 mnth_3 mnth_4 mnth_5 mnth_6 mnth_7 mnth_8 mnth_9 \\\n",
"0 1 0 0 0 0 0 0 0 0 \n",
"1 1 0 0 0 0 0 0 0 0 \n",
"2 1 0 0 0 0 0 0 0 0 \n",
"3 1 0 0 0 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 0 0 \n",
"\n",
" mnth_10 ... weathersit_1 weathersit_2 weathersit_3 weekday_0 \\\n",
"0 0 ... 0 1 0 0 \n",
"1 0 ... 0 1 0 1 \n",
"2 0 ... 1 0 0 0 \n",
"3 0 ... 1 0 0 0 \n",
"4 0 ... 1 0 0 0 \n",
"\n",
" weekday_1 weekday_2 weekday_3 weekday_4 weekday_5 weekday_6 \n",
"0 0 0 0 0 0 1 \n",
"1 0 0 0 0 0 0 \n",
"2 1 0 0 0 0 0 \n",
"3 0 1 0 0 0 0 \n",
"4 0 0 1 0 0 0 \n",
"\n",
"[5 rows x 22 columns]\n"
]
}
],
"source": [
"# One-hot encode the integer-coded categorical columns.\n",
"categorical_features = ['mnth','weathersit','weekday']\n",
"\n",
"# Cast a copy of the selected columns to object so get_dummies expands every code into its own\n",
"# indicator column; `train` itself keeps its original dtypes, so re-running this cell is side-effect free.\n",
"x_train_cat = pd.get_dummies(train[categorical_features].astype('object'))\n",
"\n",
"# NOTE(review): x_train_cat is not referenced by the cell that writes FE_day.csv - confirm that is intended.\n",
"# A bare last expression gives the rich table display and is valid on both Python 2 and Python 3 kernels.\n",
"x_train_cat.head()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
"
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
"
" \n",
"
\n","
\n","
temp\n","
atemp\n","
hum\n","
windspeed\n","
\n","
\n","
\n","
\n","
0\n","
0.355170\n","
0.373517\n","
0.828620\n","
0.284606\n","
\n","
\n","
1\n","
0.379232\n","
0.360541\n","
0.715771\n","
0.466215\n","
\n","
\n","
2\n","
0.171000\n","
0.144830\n","
0.449638\n","
0.465740\n","
\n","
\n","
3\n","
0.175530\n","
0.174649\n","
0.607131\n","
0.284297\n","
\n","
\n","
4\n","
0.209120\n","
0.197158\n","
0.449313\n","
0.339143\n","
\n","
\n","
\n","
],
"text/plain": [
" temp atemp hum windspeed\n",
"0 0.355170 0.373517 0.828620 0.284606\n",
"1 0.379232 0.360541 0.715771 0.466215\n",
"2 0.171000 0.144830 0.449638 0.465740\n",
"3 0.175530 0.174649 0.607131 0.284297\n",
"4 0.209120 0.197158 0.449313 0.339143"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Rescale the continuous weather columns with a MinMaxScaler (default settings) fitted on all rows of `train`.\n",
"numerical_features = ['temp','atemp','hum','windspeed']\n",
"mn_x = MinMaxScaler()\n",
"temp = mn_x.fit_transform(train[numerical_features])\n",
"\n",
"# fit_transform returns a plain array, so wrap it back into a frame aligned with train's index.\n",
"x_train_num = pd.DataFrame(temp, index=train.index, columns=numerical_features)\n",
"x_train_num.head()"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"# Build the modelling table in column order: instant, scaled numeric features, holiday, yr, cnt.\n",
"# NOTE(review): the one-hot frame x_train_cat built earlier is not part of this table - confirm that is intended.\n",
"x_train = pd.concat([x_train_num, train['holiday']], axis=1)\n",
"FE_train = pd.concat([train['instant'], x_train, train[['yr', 'cnt']]], axis=1)\n",
"FE_train.to_csv('FE_day.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(365, 8)\n",
"(366, 8)\n"
]
}
],
"source": [
"# Reload the engineered table and split it by year: rows with yr == 0 form the training set,\n",
"# rows with yr == 1 form the test set.\n",
"data = pd.read_csv('FE_day.csv')\n",
"train_data = data[data['yr'] == 0]\n",
"test_data = data[data['yr'] == 1]\n",
"\n",
"# Write both splits to disk without the index column.\n",
"train_data.to_csv('train_data.csv', index=False)\n",
"test_data.to_csv('test_data.csv', index=False)\n",
"\n",
"# print(...) with a single argument runs on both Python 2 and Python 3 kernels.\n",
"print(train_data.shape)\n",
"print(test_data.shape)"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.706085466961 0.325969065896\n"
]
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import r2_score\n",
"\n",
"# Select features by name rather than by position: every column except the target.\n",
"# With the FE_day.csv layout written above this is the same set as the first seven columns,\n",
"# but it no longer breaks silently if columns are added or reordered.\n",
"# NOTE(review): this keeps `instant` (appears to be a running row counter) and `yr` (constant inside\n",
"# each split, because the split is made on yr) as regressors - confirm that is intended.\n",
"target = 'cnt'\n",
"x_train = train_data.drop(target, axis=1)\n",
"y_train = train_data[target]\n",
"x_test = test_data.drop(target, axis=1)\n",
"y_test = test_data[target]\n",
"\n",
"# Fit ordinary least squares on the training split and predict on both splits.\n",
"lr = LinearRegression()\n",
"lr.fit(x_train, y_train)\n",
"y_train_pred_lr = lr.predict(x_train)\n",
"y_test_pred_lr = lr.predict(x_test)\n",
"\n",
"# Train R^2 first, then test R^2; formatted into one string so the call works on Python 2 and Python 3.\n",
"print('%s %s' % (r2_score(y_train, y_train_pred_lr), r2_score(y_test, y_test_pred_lr)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
"
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
"
" \n",
"
\n","
\n","
mnth_1\n","
mnth_2\n","
mnth_3\n","
mnth_4\n","
mnth_5\n","
mnth_6\n","
mnth_7\n","
mnth_8\n","
mnth_9\n","
mnth_10\n","
...\n","
weathersit_1\n","
weathersit_2\n","
weathersit_3\n","
weekday_0\n","
weekday_1\n","
weekday_2\n","
weekday_3\n","
weekday_4\n","
weekday_5\n","
weekday_6\n","
\n","
\n","
\n","
\n","
0\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
...\n","
0\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
1\n","
\n","
\n","
1\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
...\n","
0\n","
1\n","
0\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
\n","
\n","
2\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
...\n","
1\n","
0\n","
0\n","
0\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
\n","
\n","
3\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
...\n","
1\n","
0\n","
0\n","
0\n","
0\n","
1\n","
0\n","
0\n","
0\n","
0\n","
\n","
\n","
4\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
0\n","
...\n","
1\n","
0\n","
0\n","
0\n","
0\n","
0\n","
1\n","
0\n","
0\n","
0\n","
\n","
\n","
\n","
5 rows × 22 columns
\n","
],
"text/plain": [
" mnth_1 mnth_2 mnth_3 mnth_4 mnth_5 mnth_6 mnth_7 mnth_8 mnth_9 \\\n",
"0 1 0 0 0 0 0 0 0 0 \n",
"1 1 0 0 0 0 0 0 0 0 \n",
"2 1 0 0 0 0 0 0 0 0 \n",
"3 1 0 0 0 0 0 0 0 0 \n",
"4 1 0 0 0 0 0 0 0 0 \n",
"\n",
" mnth_10 ... weathersit_1 weathersit_2 weathersit_3 weekday_0 \\\n",
"0 0 ... 0 1 0 0 \n",
"1 0 ... 0 1 0 1 \n",
"2 0 ... 1 0 0 0 \n",
"3 0 ... 1 0 0 0 \n",
"4 0 ... 1 0 0 0 \n",
"\n",
" weekday_1 weekday_2 weekday_3 weekday_4 weekday_5 weekday_6 \n",
"0 0 0 0 0 0 1 \n",
"1 0 0 0 0 0 0 \n",
"2 1 0 0 0 0 0 \n",
"3 0 1 0 0 0 0 \n",
"4 0 0 1 0 0 0 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['temp' 'hum' 'windspeed'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[1;31m\u001b[0m",
"\u001b[1;31mKeyError\u001b[0mTraceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mmn_x\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMinMaxScaler\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mnumerical_features\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'temp'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'hum'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'windspeed'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mtemp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmn_x\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0mx_train_num\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtemp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m \u001b[1;33m=\u001b[0m\u001b[0mtrain\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\Users\\62744\\Anaconda2\\lib\\site-packages\\pandas\\core\\frame.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 1956\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1957\u001b[0m \u001b[1;31m# either boolean or fancy integer index\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1958\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1959\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1960\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_frame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\Users\\62744\\Anaconda2\\lib\\site-packages\\pandas\\core\\frame.pyc\u001b[0m in \u001b[0;36m_getitem_array\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 2000\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconvert\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2001\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2002\u001b[1;33m \u001b[0mindexer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_convert_to_indexer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2003\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconvert\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2004\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mC:\\Users\\62744\\Anaconda2\\lib\\site-packages\\pandas\\core\\indexing.pyc\u001b[0m in \u001b[0;36m_convert_to_indexer\u001b[1;34m(self, obj, axis, is_setter)\u001b[0m\n\u001b[0;32m 1229\u001b[0m \u001b[0mmask\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcheck\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1230\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmask\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0many\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1231\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'%s not in index'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mobjarr\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1232\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1233\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_values_from_object\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mKeyError\u001b[0m: \"['temp' 'hum' 'windspeed'] not in index\""
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plt and sns are not imported anywhere else in this notebook, so import them here;\n",
"# without these lines the cell raises NameError on a fresh kernel.\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Histogram (no KDE curve) of the `holiday` column from the reloaded FE_day.csv table.\n",
"fig = plt.figure()\n",
"sns.distplot(data.holiday.values, bins=30, kde=False)\n",
"plt.xlabel('holiday', fontsize=12)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
一键复制
编辑
Web IDE
原始数据
标准视图
历史