民宿价格预测分析(和鲸社区)

比赛介绍

本次新人赛的详细介绍见链接:https://www.heywhale.com/home/competition/605c426d21e3f6003b56a920

处理过程

导入需要的包

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier # knn
from sklearn.preprocessing import MinMaxScaler,StandardScaler # min-max scaling and standardization
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

读取数据

# Load the labelled training set and the unlabelled test set from ./data.
train = pd.read_csv(filepath_or_buffer="./data/训练集.csv")
# NOTE(review): this file name contains a space before ".csv" — confirm it
# matches the file on disk.
test = pd.read_csv(filepath_or_buffer="./data/测试集 .csv")

处理数据

# Feature columns used for both the training and the test set.
feature_cols = ["便利设施","洗手间数量","床的数量","床的类型","卧室数量","所在城市","房产类型","房型"]

# Training data. `.copy()` gives independent frames, so the column assignments
# below never write into (or warn about) a slice of `train` / `test`.
X = train.loc[:, feature_cols].copy()
# The last column of the training set is used as the target.
y = train.iloc[:, -1]
# Test data
X_test = test.loc[:, feature_cols].copy()

# Fill missing counts with the median of the rows where 床的类型 == 4, computed
# separately inside each frame. The filled columns do not influence the query,
# so the reference rows can be selected once per frame.
for frame in (X_test, X):
    reference_rows = frame.query("床的类型==4")
    for col in ("洗手间数量", "床的数量", "卧室数量"):
        frame[col] = frame[col].fillna(reference_rows[col].median())

# Number of amenities = number of comma-separated items between the braces.
# NOTE(review): an empty "{}" still counts as 1, because "".split(',') == [''].
X["便利设施数量"] = X["便利设施"].apply(lambda x: len(x.lstrip('{').rstrip('}').split(',')))
X_test["便利设施数量"] = X_test["便利设施"].apply(lambda x: len(x.lstrip('{').rstrip('}').split(',')))
X_test = X_test.drop("便利设施", axis=1)
X = X.drop("便利设施", axis=1)

# Memory compression: store the count columns as int8.
# NOTE(review): the cast truncates any fractional values — confirm that is intended.
for col in ("床的数量", "卧室数量", "洗手间数量"):
    X[col] = X[col].astype("int8")
    X_test[col] = X_test[col].astype("int8")

# Map the amenity count onto ranges; the same bins are applied to both frames.
bins = [0,10,20,30,40,50,60,70,80,90]
labels = ["1-10","10-20","20-30","30-40","40-50","50-60","60-70","70-80","80-90"]
X["便利设施数量范围"] = pd.cut(X["便利设施数量"], bins, labels=labels)
X_test["便利设施数量范围"] = pd.cut(X_test["便利设施数量"], bins, labels=labels)
X_test = X_test.drop("便利设施数量", axis=1)
X = X.drop("便利设施数量", axis=1)

# One encoder, fitted on the full label list, is shared by the training and
# test sets. Fitting a separate encoder per frame (as before) could assign
# different integers to the same range whenever the frames do not contain the
# same set of ranges. `LabelEncoder` is imported directly at the top of the
# file; there is no `preprocessing` name in scope.
leCrime1 = LabelEncoder().fit(labels)
crime1 = leCrime1.transform(X["便利设施数量范围"])
X["便利设施数量范围"] = crime1
X_test["便利设施数量范围"] = leCrime1.transform(X_test["便利设施数量范围"])

# Discretise the target into ranges and encode them as integer class ids so
# that the classifiers below can be used. Values outside (-1, 110] become NaN
# in pd.cut.
bins = [-1,20,30,40,50,60,70,80,90,100,110]
labels = ["0-20","20-30","30-40","40-50","50-60","60-70","70-80","80-90","90-100","100-110"]
y = pd.cut(y, bins, labels=labels)
leCrime2 = LabelEncoder()
crime2 = leCrime2.fit_transform(y)
y = crime2

模型预测

# Hold out 30% of the labelled data for validation.
xtr, xte, ytr, yte = train_test_split(X, y, test_size=0.3)

# Min-max scaling: the scaler is fitted on the training split only and then
# applied to both splits.
mm = MinMaxScaler()
mm.fit(xtr)
mm_train = mm.transform(xtr)
mm_test = mm.transform(xte)

# 10-nearest-neighbours classifier on the min-max scaled features.
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(mm_train, ytr)

# (training accuracy, validation accuracy)
(
    knn_model.score(mm_train, ytr.astype("int")),
    knn_model.score(mm_test, yte),
)

# Standardisation: the scaler is fitted on the training split only and then
# applied to the validation split and the test set.
std_ = StandardScaler()
std_train = std_.fit_transform(xtr)
std_test = std_.transform(xte)

# 10-nearest-neighbours classifier on the standardised features
# (this rebinds `knn_model`, replacing the min-max variant).
knn_model = KNeighborsClassifier(10).fit(std_train, ytr.astype("int"))

# (training accuracy, validation accuracy)
knn_model.score(std_train, ytr.astype("int")), knn_model.score(std_test, yte)

# The model was trained on standardised features, so the test set must go
# through the same fitted scaler before prediction; passing the raw X_test
# would compute distances on a different scale.
knn_model.predict(std_.transform(X_test))

# Decision tree, tuned with a cross-validated grid search.
dtree = DecisionTreeClassifier()
# `min_impurity_split` was removed in scikit-learn 1.0; `min_impurity_decrease`
# is its supported replacement. The two are not numerically equivalent (the new
# parameter thresholds the weighted impurity *decrease* of a split), so the
# values below are starting points to be tuned, not a translation of the old
# grid.
param_grid = dict(
    max_depth=range(2, 7),
    max_leaf_nodes=range(5, 15),
    min_impurity_decrease=[0.0, 0.001, 0.01],
)
gc_model = GridSearchCV(dtree, param_grid, cv=20, n_jobs=4).fit(xtr, ytr)

# Best mean cross-validated score and the parameters that achieved it.
gc_model.best_score_
gc_model.best_params_
# Accuracy of the refitted best estimator on the training and validation splits.
gc_model.best_estimator_.score(xtr, ytr)
gc_model.best_estimator_.score(xte, yte)
# Predicted class ids for the test set (trees are fitted on unscaled features).
gc_model.best_estimator_.predict(X_test)

# Random forest, tuned with a cross-validated grid search.
rfc = RandomForestClassifier()
# `min_impurity_split` was removed in scikit-learn 1.0; `min_impurity_decrease`
# is its supported replacement. The two are not numerically equivalent, so the
# values below are starting points to be tuned, not a translation of the old
# grid.
param_grid = dict(
    n_estimators=[100, 150, 200],
    max_features=[0.7, 0.8],
    max_depth=range(4, 8),
    max_leaf_nodes=range(5, 10),
    min_impurity_decrease=[0.0, 0.001, 0.005, 0.01],
)
gc_model3 = GridSearchCV(rfc, param_grid, cv=3, n_jobs=6).fit(xtr, ytr)

# Best mean cross-validated score.
gc_model3.best_score_
# (training accuracy, validation accuracy) of the refitted best estimator.
gc_model3.best_estimator_.score(xtr, ytr), gc_model3.best_estimator_.score(xte, yte)
  • 1
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值