第一步:导入模块
# 导入Pandas进行数据处理
import pandas as pd
# 导入机器学习库中的K近邻回归模型
from sklearn.neighbors import KNeighborsRegressor
# 导入均方误差(MSE)评估函数,用于衡量回归模型的预测误差
from sklearn.metrics import mean_squared_error
第二步:导入数据及数据预处理
# Load the listings CSV into a DataFrame.
listings_csv = '/root/zhj/python3/code/data/listings.csv'
data_listings = pd.read_csv(listings_csv)
# Preview the first rows to see which columns are available.
preview = data_listings.head()
print(preview)
id listing_url scrape_id last_scraped \
0 7087327 https://www.airbnb.com/rooms/7087327 20151002231825 2015-10-03
1 975833 https://www.airbnb.com/rooms/975833 20151002231825 2015-10-03
2 8249488 https://www.airbnb.com/rooms/8249488 20151002231825 2015-10-03
3 8409022 https://www.airbnb.com/rooms/8409022 20151002231825 2015-10-03
4 8411173 https://www.airbnb.com/rooms/8411173 20151002231825 2015-10-03
name \
0 Historic DC Condo-Walk to Capitol!
1 Spacious Capitol Hill Townhouse
2 Spacious/private room for single
3 A wonderful bedroom with library
4 Downtown Silver Spring
summary \
0 Professional pictures coming soon! Welcome to ...
1 NaN
2 This is an ideal room for a single traveler th...
3 Prime location right on the Potomac River in W...
4 Hi travellers! I live in this peaceful spot, b...
space \
0 NaN
1 Beautifully renovated Capitol Hill townhouse. ...
2 NaN
3 NaN
4 This is a 750 sq ft 1 bedroom 1 bathroom. Whi...
description experiences_offered \
0 Professional pictures coming soon! Welcome to ... none
1 Beautifully renovated Capitol Hill townhouse. ... none
2 This is an ideal room for a single traveler th... none
3 Prime location right on the Potomac River in W... none
4 Hi travellers! I live in this peaceful spot, b... none
neighborhood_overview ... \
0 NaN ...
1 NaN ...
2 NaN ...
3 NaN ...
4 Silver Spring is booming. You can walk to a n... ...
review_scores_value requires_license license \
0 NaN f NaN
1 9.0 f NaN
2 NaN f NaN
3 NaN f NaN
4 NaN f NaN
jurisdiction_names instant_bookable cancellation_policy \
0 DISTRICT OF COLUMBIA, WASHINGTON f flexible
1 DISTRICT OF COLUMBIA, WASHINGTON f strict
2 NaN f flexible
3 DISTRICT OF COLUMBIA, WASHINGTON f flexible
4 NaN f flexible
require_guest_profile_picture require_guest_phone_verification \
0 f f
1 f f
2 f f
3 f f
4 f f
calculated_host_listings_count reviews_per_month
0 18 NaN
1 1 2.11
2 1 1.00
3 1 NaN
4 1 NaN
[5 rows x 92 columns]
问题:上面的数据共有 92 列,内容繁多且杂乱,不便于分析。
解决方法:只保留主要特征,剔除重复、冗余的特征
# Columns kept for modelling:
#   accommodates      - number of guests the listing can host
#   bedrooms          - number of bedrooms
#   bathrooms         - number of bathrooms
#   beds              - number of beds
#   price             - price per night
#   minimum_nights    - minimum number of nights a guest must book
#   maximum_nights    - maximum number of nights a guest may book
#   number_of_reviews - number of reviews
features = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'price',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
]
# Replace the full table with just the selected columns. An explicit copy is
# taken so that later column assignments operate on an independent frame
# instead of triggering pandas' SettingWithCopyWarning.
data_listings = data_listings[features].copy()
# Inspect the reduced table.
data_listings.head()
accommodates | bedrooms | bathrooms | beds | price | minimum_nights | maximum_nights | number_of_reviews | |
---|---|---|---|---|---|---|---|---|
0 | 4 | 1.0 | 1.0 | 2.0 | $160.00 | 1 | 1125 | 0 |
1 | 6 | 3.0 | 3.0 | 3.0 | $350.00 | 2 | 30 | 65 |
2 | 1 | 1.0 | 2.0 | 1.0 | $50.00 | 2 | 1125 | 1 |
3 | 2 | 1.0 | 1.0 | 1.0 | $95.00 | 1 | 1125 | 0 |
4 | 4 | 1.0 | 1.0 | 1.0 | $50.00 | 7 | 1125 | 0 |
问题:上面的数据中,price 列是带有"$"符号的字符串(如 $160.00),而其他列都是数值,数据类型不统一
解决办法:去掉 price 中的"$"和千位分隔符",",并将其转换为浮点数
# Strip the currency symbol and thousands separators from the price strings,
# then convert the column to float.
# The pattern is a regular expression, so it is written as a raw string and
# regex=True is passed explicitly: pandas 2.0+ treats the pattern as a literal
# by default, which would leave the "$" in place and make astype(float) fail.
# (The regex keyword requires pandas >= 0.23.)
data_listings['price'] = (
    data_listings['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)
data_listings.head()
accommodates | bedrooms | bathrooms | beds | price | minimum_nights | maximum_nights | number_of_reviews | |
---|---|---|---|---|---|---|---|---|
0 | 4 | 1.0 | 1.0 | 2.0 | 160.0 | 1 | 1125 | 0 |
1 | 6 | 3.0 | 3.0 | 3.0 | 350.0 | 2 | 30 | 65 |
2 | 1 | 1.0 | 2.0 | 1.0 | 50.0 | 2 | 1125 | 1 |
3 | 2 | 1.0 | 1.0 | 1.0 | 95.0 | 1 | 1125 | 0 |
4 | 4 | 1.0 | 1.0 | 1.0 | 50.0 | 7 | 1125 | 0 |
第三步:创建训练集和测试集
# Drop every row that has a missing value in any of the remaining columns.
data_listings = data_listings.dropna()
# Bind a second name to the cleaned frame (this is an alias, not a deep copy).
data_toll = data_listings
# Positional split: the first 2792 rows form the training set and all
# remaining rows form the test set.
split_row = 2792
data_train_df = data_toll.copy().iloc[:split_row]
data_test_df = data_toll.copy().iloc[split_row:]
创建K近邻模型
# Columns used as model inputs for both the training and the test set.
# NOTE(review): this list contains 'price', which is also the target passed
# to fit() below, so the regressor receives the value it is asked to predict
# as one of its inputs. Confirm whether that is intended before relying on the
# reported error.
cols = [
    'accommodates', 'bedrooms', 'bathrooms', 'beds',
    'price', 'minimum_nights', 'maximum_nights', 'number_of_reviews',
]
# K-nearest-neighbours regressor with K = 10; every other setting keeps its
# default.
knn = KNeighborsRegressor(n_neighbors=10)
# Fit on the training rows: inputs are the columns above, target is the price.
train_inputs = data_train_df[cols]
train_target = data_train_df['price']
knn.fit(train_inputs, train_target)
# Predict a price for every row of the test set.
two_features_predictions = knn.predict(data_test_df[cols])
第四步:创建模型效果验证
# Mean squared error between the actual test prices and the predictions.
two_features_mse = mean_squared_error(
    y_true=data_test_df['price'],
    y_pred=two_features_predictions,
)
# RMSE is the square root of the MSE.
two_features_rmse = two_features_mse ** 0.5
# Print the RMSE; use it to judge the model and to tune the neighbour count.
print(two_features_rmse)
39.3117233496
第五步:调用模型设置实际值进行预测
# Predict for a single hand-written listing. This row has 7 values, whereas
# the model was fitted on the 8 columns in `cols`, so the call raises
# ValueError (query dimension does not match the training dimension).
print(knn.predict([[6,3,3,3,1,30,0]]))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-25-ae490c4d4c23> in <module>()
----> 1 print(knn.predict([[6,3,3,3,1,30,0]]))
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
142 X = check_array(X, accept_sparse='csr')
143
--> 144 neigh_dist, neigh_ind = self.kneighbors(X)
145
146 weights = _get_weights(neigh_dist, self.weights)
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/neighbors/base.py in kneighbors(self, X, n_neighbors, return_distance)
383 delayed(self._tree.query, check_pickle=False)(
384 X[s], n_neighbors, return_distance)
--> 385 for s in gen_even_slices(X.shape[0], n_jobs)
386 )
387 if return_distance:
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/zhj/python3/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
sklearn/neighbors/binary_tree.pxi in sklearn.neighbors.kd_tree.BinaryTree.query()
ValueError: query data dimension must match training data dimension
问题:数据维度不一致——模型是用 cols 中的 8 列训练的,而上面的预测只传入了 7 个值
解决办法:训练时使用了 8 个特征(其中包含 price),预测时也必须按 cols 的顺序传入 8 个值
# One query row with 8 values, read positionally in the order of `cols`:
# accommodates=6, bedrooms=3, bathrooms=3, beds=3, price=1,
# minimum_nights=100, maximum_nights=30, number_of_reviews=0.
# NOTE(review): the fifth value lands in the 'price' column and 100 lands in
# 'minimum_nights'; confirm these are the intended positions.
query_rows = [[6, 3, 3, 3, 1, 100, 30, 0]]
print(knn.predict(query_rows))
[ 34.3]