k近邻案例(二)

第一步:导入模块

# 导入Pandas进行数据处理
import pandas as pd
# 导入机器学习库中的K近邻回归模型
from sklearn.neighbors import KNeighborsRegressor
# 导入机器学习库中的均方误差(MSE)评估函数,用于衡量回归模型的预测误差
from sklearn.metrics import mean_squared_error

第二步:导入数据及数据预处理

# Load the listings table from the CSV file on disk.
data_listings = pd.read_csv(filepath_or_buffer='/root/zhj/python3/code/data/listings.csv')
# Print the first five rows as a quick look at the raw data.
print(data_listings.head(5))
        id                           listing_url       scrape_id last_scraped  \
0  7087327  https://www.airbnb.com/rooms/7087327  20151002231825   2015-10-03   
1   975833   https://www.airbnb.com/rooms/975833  20151002231825   2015-10-03   
2  8249488  https://www.airbnb.com/rooms/8249488  20151002231825   2015-10-03   
3  8409022  https://www.airbnb.com/rooms/8409022  20151002231825   2015-10-03   
4  8411173  https://www.airbnb.com/rooms/8411173  20151002231825   2015-10-03   

                                 name  \
0  Historic DC Condo-Walk to Capitol!   
1     Spacious Capitol Hill Townhouse   
2    Spacious/private room for single   
3    A wonderful bedroom with library   
4              Downtown Silver Spring   

                                             summary  \
0  Professional pictures coming soon! Welcome to ...   
1                                                NaN   
2  This is an ideal room for a single traveler th...   
3  Prime location right on the Potomac River in W...   
4  Hi travellers! I live in this peaceful spot, b...   

                                               space  \
0                                                NaN   
1  Beautifully renovated Capitol Hill townhouse. ...   
2                                                NaN   
3                                                NaN   
4  This is a 750 sq ft 1 bedroom 1 bathroom.  Whi...   

                                         description experiences_offered  \
0  Professional pictures coming soon! Welcome to ...                none   
1  Beautifully renovated Capitol Hill townhouse. ...                none   
2  This is an ideal room for a single traveler th...                none   
3  Prime location right on the Potomac River in W...                none   
4  Hi travellers! I live in this peaceful spot, b...                none   

                               neighborhood_overview        ...         \
0                                                NaN        ...          
1                                                NaN        ...          
2                                                NaN        ...          
3                                                NaN        ...          
4  Silver Spring is booming.  You can walk to a n...        ...          

  review_scores_value requires_license license  \
0                 NaN                f     NaN   
1                 9.0                f     NaN   
2                 NaN                f     NaN   
3                 NaN                f     NaN   
4                 NaN                f     NaN   

                 jurisdiction_names instant_bookable cancellation_policy  \
0  DISTRICT OF COLUMBIA, WASHINGTON                f            flexible   
1  DISTRICT OF COLUMBIA, WASHINGTON                f              strict   
2                               NaN                f            flexible   
3  DISTRICT OF COLUMBIA, WASHINGTON                f            flexible   
4                               NaN                f            flexible   

   require_guest_profile_picture require_guest_phone_verification  \
0                              f                                f   
1                              f                                f   
2                              f                                f   
3                              f                                f   
4                              f                                f   

  calculated_host_listings_count reviews_per_month  
0                             18               NaN  
1                              1              2.11  
2                              1              1.00  
3                              1               NaN  
4                              1               NaN  

[5 rows x 92 columns]

问题:上面显示的数据列很多(共 92 列),而且比较杂乱。
解决方法:只保留主要特征,剔除重复、冗余的特征。

# Columns kept for modelling; every other column of the raw table is dropped.
#   accommodates      - number of guests the listing can host
#   bedrooms          - number of bedrooms
#   bathrooms         - number of bathrooms
#   beds              - number of beds
#   price             - price per night
#   minimum_nights    - minimum number of nights a guest must book
#   maximum_nights    - maximum number of nights a guest may book
#   number_of_reviews - number of reviews
features = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'price',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
]
# Keep only the selected columns and overwrite the original variable.
# An explicit copy is taken so that later column assignments write to an
# independent frame instead of a slice of the raw table (this avoids
# pandas' SettingWithCopyWarning).
data_listings = data_listings[features].copy()
# Preview the reduced table.
data_listings.head()
   accommodates  bedrooms  bathrooms  beds    price  minimum_nights  maximum_nights  number_of_reviews
0             4       1.0        1.0   2.0  $160.00               1            1125                  0
1             6       3.0        3.0   3.0  $350.00               2              30                 65
2             1       1.0        2.0   1.0   $50.00               2            1125                  1
3             2       1.0        1.0   1.0   $95.00               1            1125                  0
4             4       1.0        1.0   1.0   $50.00               7            1125                  0

问题:上面的数据中,price 列是带有 "$" 和千位分隔符 "," 的字符串,与其他数值列的数据类型不一致。
解决办法:去掉 "$" 和 ",",并把 price 列转换为浮点数。

# Strip the currency symbol and the thousands separators from the price text,
# then convert the column to float.
# The pattern is a raw string, so "$" needs no backslash escape inside the
# character class, and regex=True is passed explicitly: pandas >= 2.0 treats
# the pattern as a literal string by default, which would leave "$160.00"
# untouched and make astype(float) fail.
# assign() returns a new frame, so nothing is written into a slice.
data_listings = data_listings.assign(
    price=data_listings['price'].str.replace(r'[$,]', '', regex=True).astype(float)
)
data_listings.head()

   accommodates  bedrooms  bathrooms  beds  price  minimum_nights  maximum_nights  number_of_reviews
0             4       1.0        1.0   2.0  160.0               1            1125                  0
1             6       3.0        3.0   3.0  350.0               2              30                 65
2             1       1.0        2.0   1.0   50.0               2            1125                  1
3             2       1.0        1.0   1.0   95.0               1            1125                  0
4             4       1.0        1.0   1.0   50.0               7            1125                  0

第三步:创建训练集和测试集

# Discard every row that contains a missing value in any column.
data_listings = data_listings.dropna(axis=0, how='any')
# NOTE: plain assignment - data_toll is a second name for the same DataFrame,
# not an independent backup.
data_toll = data_listings
# Positional split at row 2792: the rows before it form the training set and
# the rows from it onward form the test set. Rows are taken in their existing
# order; no shuffling happens here.
data_train_df, data_test_df = data_toll.iloc[:2792].copy(), data_toll.iloc[2792:].copy()

创建K近邻模型

# Columns used as model inputs for both the training and the test rows.
# NOTE(review): 'price' is in this list and is also the target passed to
# fit(), so the model is given the value it is asked to predict and the error
# measured on the test rows is optimistic. Removing 'price' would shrink the
# input width from 8 to 7, so every predict() call on this model would have to
# be updated at the same time.
cols = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'price',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
]
# k = 10 neighbours; all other parameters keep their defaults.
knn = KNeighborsRegressor(n_neighbors=10)
# Fit on the selected columns of the training rows, with their price as target.
knn.fit(X=data_train_df[cols], y=data_train_df['price'])
# Predict a price for every row of the test set.
two_features_predictions = knn.predict(X=data_test_df[cols])

第四步:创建模型效果验证

# Mean squared error between the true test prices and the predictions.
two_features_mse = mean_squared_error(
    y_true=data_test_df['price'],
    y_pred=two_features_predictions,
)
# The square root of the MSE gives the RMSE.
two_features_rmse = two_features_mse ** 0.5
# Print the validation result; use it to tune the number of neighbours.
print(two_features_rmse)
39.3117233496

第五步:调用模型设置实际值进行预测

# Passes a single sample with 7 values although the model was fitted on the
# 8 columns listed in cols, so this call raises
# "ValueError: query data dimension must match training data dimension".
print(knn.predict([[6,3,3,3,1,30,0]]))
---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-25-ae490c4d4c23> in <module>()
----> 1 print(knn.predict([[6,3,3,3,1,30,0]]))


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
    142         X = check_array(X, accept_sparse='csr')
    143 
--> 144         neigh_dist, neigh_ind = self.kneighbors(X)
    145 
    146         weights = _get_weights(neigh_dist, self.weights)


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/neighbors/base.py in kneighbors(self, X, n_neighbors, return_distance)
    383                 delayed(self._tree.query, check_pickle=False)(
    384                     X[s], n_neighbors, return_distance)
--> 385                 for s in gen_even_slices(X.shape[0], n_jobs)
    386             )
    387             if return_distance:


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):


~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):


sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.kd_tree.BinaryTree.query()


ValueError: query data dimension must match training data dimension

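问题:数据维度不一致(预测时只传入了 7 个值,而模型是用 8 列训练的)
解决办法:训练时 cols 中共有 8 列(其中也包含目标列 price),因此预测时必须按 cols 的顺序传入 8 个值。注意:把 price 同时作为特征和目标会造成数据泄漏;更合理的做法是从 cols 中去掉 price,此时预测只需传入其余 7 个特征值。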

# One sample with 8 values, in the column order the model was fitted on:
# accommodates, bedrooms, bathrooms, beds, price,
# minimum_nights, maximum_nights, number_of_reviews.
print(knn.predict([
    [6, 3, 3, 3, 1, 100, 30, 0],
]))
[ 34.3]
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值