参考:top 2% based on CatBoostClassifier
导入库与数据
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import catboost
import gensim
# Read the raw training and test CSV files (training file first) into DataFrames.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\kaggle\\sf-crime\\train.csv",
        "C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\kaggle\\sf-crime\\test.csv",
    )
)
特征处理
def transformTimeDataset(dataset):
    """Derive calendar features from the ``Dates`` column.

    ``dataset['Dates']`` is parsed with ``pd.to_datetime`` and the columns
    ``Date``, ``n_days``, ``Year``, ``DayOfWeek``, ``WeekOfYear``, ``Month``
    and ``Hour`` are added (or overwritten if they already exist).

    Args:
        dataset: DataFrame with a ``Dates`` column that ``pd.to_datetime``
            can parse.

    Returns:
        The same DataFrame object; it is modified in place and returned.
    """
    dataset['Dates'] = pd.to_datetime(dataset['Dates'])
    stamps = dataset['Dates'].dt

    dataset['Date'] = stamps.date

    # Whole days elapsed since the earliest calendar date in *this* frame.
    # NOTE(review): the origin is per-frame, so two frames whose earliest
    # dates differ get n_days values on different scales.
    midnight = stamps.normalize()
    dataset['n_days'] = (midnight - midnight.min()).dt.days

    dataset['Year'] = stamps.year
    dataset['DayOfWeek'] = stamps.dayofweek  # Monday == 0
    # `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week`
    # yields the same ISO week number.  Cast from nullable UInt32 to a plain
    # integer column, which is what `weekofyear` used to return.
    dataset['WeekOfYear'] = stamps.isocalendar().week.astype('int64')
    dataset['Month'] = stamps.month
    dataset['Hour'] = stamps.hour
    return dataset
# Add the calendar features to both frames (training frame first, then test).
data_train, data_test = (
    transformTimeDataset(frame) for frame in (data_train, data_test)
)
def transformdGeoDataset(dataset):
    """Add a ``Block`` flag and one-hot encode ``PdDistrict``.

    ``Block`` is 1 when ``Address`` contains the text "block" (matched
    case-insensitively) and 0 otherwise, including for missing addresses.
    ``PdDistrict`` is replaced by ``PdDistrict_<value>`` indicator columns;
    the first category in sorted order is dropped (``drop_first=True``).

    Args:
        dataset: DataFrame with ``Address`` and ``PdDistrict`` columns.

    Returns:
        A new DataFrame with the encoded columns.  The ``Block`` column is
        also written onto the input frame as a side effect.
    """
    # na=False maps missing addresses straight to False, so no per-row
    # Python callback is needed to turn the result into 0/1.
    has_block = dataset['Address'].str.contains('block', case=False, na=False)
    dataset['Block'] = has_block.astype('int64')

    # pandas >= 2.0 emits boolean dummies by default; pin the dtype so the
    # indicators stay 0/1 integers on every pandas version.
    # NOTE(review): the indicator columns depend on the categories present in
    # the frame passed in, so frames with different district sets will end up
    # with different columns.
    dataset = pd.get_dummies(
        data=dataset, columns=['PdDistrict'], drop_first=True, dtype='uint8'
    )
    return dataset
# Add the Block flag and district indicator columns to both frames
# (training frame first, then test).
data_train, data_test = (
    transformdGeoDataset(frame) for frame in (data_train, data_test)
)

# Remove the raw columns that are not passed on as features.  "Descript" and
# "Resolution" are dropped from the training frame only (presumably they are
# absent from the test file -- confirm against the data).
data_train = data_train.drop(
    columns=["Descript", "Resolution", "Address", "Dates", "Date"]
)
data_test = data_test.drop(columns=["Address", "Dates", "Date"])
from sklearn.preprocessing import LabelEncoder
# Replace the Category labels with integer codes; `le` keeps the fitted
# label-to-code mapping so it can be reused later.
le = LabelEncoder()
data_train["Category"] = le.fit_transform(data_train["Category"])
设置特征与目标
# Separate the encoded target column from the feature columns, then preview
# the first rows of the feature matrix.
y = data_train['Category']
X = data_train.drop(columns="Category")
print(X.head())
DayOfWeek X Y n_days Year WeekOfYear Month Hour Block PdDistrict_CENTRAL PdDistrict_INGLESIDE PdDistrict_MISSION PdDistrict_NORTHERN PdDistrict_PARK PdDistrict_RICHMOND PdDistrict_SOUTHERN PdDistrict_TARAVAL PdDistrict_TENDERLOIN
0 2 -122.425892 37.774599 4510 2015 20 5 23 0 0 0 0 1 0 0 0 0 0
1 2 -122.425892 37.774599 4510 2015 20 5 23 0 0 0 0 1 0 0 0 0 0
2 2 -122.424363 37.800414 4510 2015 20 5 23 0 0 0 0 1 0 0 0 0 0
3 2 -122.426995 37.800873 4510 2015 20 5 23 1 0 0 0 1 0 0 0 0 0
4 2 -122.438738 37.771541 4510 2015 20 5 23 1 0 0 0 0 1 0 0 0 0
先把处理好的训练数据保存为 CSV 文件(节省后续重复处理的时间)
# Write the processed training frame (including its index) to disk as CSV.
data_train = pd.DataFrame(data=data_train)
data_train.to_csv(
    path_or_buf="C:\\Users\\Nihil\\Documents\\pythonlearn\\data\\Results\\Crimedatatrain.csv"
)