数据集
青岛市市北区智慧城区:http://www.smartcbd.cn/resource/list/index/id/7.html
房地产>房地产>房地产交易数据,数据量共80页3200条
python代码
# from sklearn import datasets, preprocessing
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import r2_score
# from sklearn.datasets import load_iris
# iris = load_iris()
# data = iris.data
# target= iris.target
# # print(data)
# # print(target)
# from sklearn import svm
# clf = svm.SVC(gamma=0.001, C=100.)
# clf.fit(data[:-2], target[:-2])
# print(clf.predict(data[-1:]))
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re
#从csv文件中获得每平米单价
price_per_1m3 = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=8,encoding='utf-8'))
# area = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\1-20.csv",dtype=str,delimiter=',',skiprows=1,usecols=2,encoding='utf-8'))
# area_uni = np.unique(area)
price_per_1m3_num = np.array([])
# print(price_per_1m3)
#把每平米单价的字符串数组转换为int型数组
for i in range(len(price_per_1m3)):
#print(price_per_1m3[i])
temp = re.findall(r'-?\d+\.?\d*', price_per_1m3[i])
#print(temp[0])
price_per_1m3_num = np.append(price_per_1m3_num, int(temp[0]))
# # price_per_1m3_uni = np.unique(price_per_1m3)
# print(price_per_1m3_num)
#print(price_per_1m3)
area = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=2,encoding='utf-8'))
area_num = np.array([])
# print(price_per_1m3)
#把每平米单价的字符串数组转换为int型数组
for i in range(len(area)):
#print(price_per_1m3[i])
temp = re.findall(r'-?\d+\.?\d*', area[i])
#print(temp[0])
area_num = np.append(area_num, float(temp[0]))
# year_situation = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=5,encoding='utf-8'))
# print(year_situation)
#从csv文件中获得户型
house_type = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=1,encoding='utf-8'))
encoder_housetype = LabelEncoder()
house_type_encoder = encoder_housetype.fit_transform(house_type)
house_type_decoder = list(encoder_housetype.inverse_transform([0]))
# print(house_type_encoder)
# print(house_type)
decoration = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=4,encoding='utf-8'))
encoder_decoration = LabelEncoder()
decoration_encoder = encoder_decoration.fit_transform(decoration)
decoration_decoder = list(encoder_decoration.inverse_transform([0]))
building_situation = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=7,encoding='utf-8'))
building_situation_year = np.array([])
for i, j in enumerate(building_situation):
building_situation[i] = building_situation[i][-8:-4]
# print(building_situation)
for i in range(len(building_situation)):
#print(price_per_1m3[i])
try:
temp = re.findall(r'\d\d\d\d', building_situation[i])
# print(temp)
# print(int(temp))
building_situation_year = np.append(building_situation_year, int(temp[0]))
# print(building_situation_year)
except Exception as e :
temp = 0
building_situation_year = np.append(building_situation_year, temp)
print(building_situation_year)
for i, j in enumerate(building_situation_year):
if(j<=2022 and j>2022):
building_situation_year[i] = 2012
elif(j<=2012 and j>2002):
building_situation_year[i] = 1602
elif(j<=2002 and j>1992):
building_situation_year[i] = 1392
elif(j == 0):
building_situation_year[i] = 400
else:
building_situation_year[i] = 1200
# print(building_situation_year)
# encoder_building_situation = LabelEncoder()
# building_situation_encoder = encoder_building_situation.fit_transform(building_situation)
# building_situation_decoder = list(encoder_building_situation.inverse_transform([12]))
# # print(building_situation_encoder)
# print(building_situation_decoder)
# print(decoration)
# print(decoration_encoder)
dataset = np.array(list(zip(area_num, house_type_encoder, price_per_1m3_num, decoration_encoder)),dtype=int)
ylabel = building_situation_year
print(ylabel)
#加载红酒数据集
from sklearn.datasets import load_wine
#KNN分类算法
from sklearn.neighbors import KNeighborsClassifier
#分割训练集与测试集
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(dataset,ylabel,test_size=0.2,random_state=0)
# print(y_train)
KNN=KNeighborsClassifier(n_neighbors=7)
KNN.fit(X_train,y_train)
score=KNN.score(X_test,y_test)
print('SKLEARN中KNN算法模型准确率为:'+str(score))
import matplotlib.pylab as pyb
# pyb_x = building_situation_encoder
# pyb.scatter(price_per_1m3_num, house_type_encoder,c = decoration_encoder) #以类别区分颜色
pyb.scatter(price_per_1m3_num, area_num,c = building_situation_year, s=1.2) #以类别区分颜色
pyb.show()
# input_house_type = input("精装?简装?毛胚?其他?:")
#input_house_type_arr = np.array([input_house_type], dtype=str)
# print(input_house_type_arr)
#input_encoder_decoration = encoder_decoration.fit_transform(input_house_type_arr)
# print(input_encoder_decoration)
# input_price_per_1m3_num = eval(input("均价(例15800元/平输入15800):"))
# predict_result=KNN.predict([[input_encoder_decoration[0], input_price_per_1m3_num]])
# print(encoder_housetype.inverse_transform(predict_result)[0])
效果
SKLEARN中KNN算法模型准确率为:0.3625