【统计学习】记录一次用SKLEARN的失败机器学习

a9c93f2300

已于 2022-12-31 00:07:18 修改

阅读量468

点赞数

分类专栏：算法習作、机器学习　文章标签：sklearn、python

于 2022-11-10 15:35:02 首次发布

本文链接：https://blog.csdn.net/m0_50939789/article/details/127790003

版权

算法習作同时被 2 个专栏收录

23 篇文章 4 订阅

订阅专栏

机器学习

1 篇文章 0 订阅

订阅专栏

数据集

青岛市市北区智慧城区：http://www.smartcbd.cn/resource/list/index/id/7.html
房地产>房地产>房地产交易数据，数据量共80页3200条

python代码

# from sklearn import datasets, preprocessing
# from sklearn.model_selection import train_test_split  
# from sklearn.linear_model import LinearRegression  
# from sklearn.metrics import r2_score

# from sklearn.datasets import load_iris  

# iris = load_iris()
# data = iris.data
# target= iris.target
# # print(data)
# # print(target)

# from sklearn import svm
# clf = svm.SVC(gamma=0.001, C=100.)
# clf.fit(data[:-2], target[:-2])
# print(clf.predict(data[-1:]))
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re

# Read the per-square-metre price column (CSV column index 8) as raw strings.
# The path is a raw string literal so the backslashes of the Windows path are
# never interpreted as (invalid) escape sequences such as "\P" or "\M".
price_per_1m3 = np.loadtxt(
    r"D:\Project\PythonProject\MachineLearningProject\origin_dataset.csv",
    dtype=str,
    delimiter=',',
    skiprows=1,
    usecols=8,
    encoding='utf-8',
    ndmin=1,  # stay 1-D even when the file holds a single data row
)

# Pull the first number out of every price string and keep its integer part.
# The pattern also matches decimals, so the text goes through float() before
# int(); int("123.5") would raise ValueError.
_price_number = re.compile(r'-?\d+\.?\d*')
_price_values = []
for _price_row, _price_text in enumerate(price_per_1m3):
    _price_match = _price_number.search(_price_text)
    if _price_match is None:
        # Fail loudly with the offending row instead of a bare IndexError.
        raise ValueError(
            "no numeric price in data row {}: {!r}".format(_price_row, _price_text)
        )
    _price_values.append(int(float(_price_match.group())))

# Whole-number prices stored as float64, the dtype the original
# np.append-based loop produced; built once instead of re-allocating per row.
price_per_1m3_num = np.array(_price_values, dtype=float)

# Read the floor-area column (CSV column index 2) as raw strings.
# The path is a raw string literal so the backslashes of the Windows path are
# never interpreted as (invalid) escape sequences such as "\P" or "\M".
area = np.loadtxt(
    r"D:\Project\PythonProject\MachineLearningProject\origin_dataset.csv",
    dtype=str,
    delimiter=',',
    skiprows=1,
    usecols=2,
    encoding='utf-8',
    ndmin=1,  # stay 1-D even when the file holds a single data row
)

# Convert the area strings to floats by taking the first number in each one.
_area_number = re.compile(r'-?\d+\.?\d*')
_area_values = []
for _area_row, _area_text in enumerate(area):
    _area_match = _area_number.search(_area_text)
    if _area_match is None:
        # Fail loudly with the offending row instead of a bare IndexError.
        raise ValueError(
            "no numeric area in data row {}: {!r}".format(_area_row, _area_text)
        )
    _area_values.append(float(_area_match.group()))

# Build the float64 array once instead of re-allocating it on every row.
area_num = np.array(_area_values, dtype=float)

# year_situation = np.array(np.loadtxt("D:\Project\PythonProject\MachineLearningProject\\origin_dataset.csv",dtype=str,delimiter=',',skiprows=1,usecols=5,encoding='utf-8'))
# print(year_situation)


# Read the house-type column (CSV column index 1) as raw strings.
# The path is a raw string literal so the backslashes of the Windows path are
# never interpreted as (invalid) escape sequences such as "\P" or "\M".
house_type = np.loadtxt(
    r"D:\Project\PythonProject\MachineLearningProject\origin_dataset.csv",
    dtype=str,
    delimiter=',',
    skiprows=1,
    usecols=1,
    encoding='utf-8',
    ndmin=1,  # LabelEncoder needs a 1-D array, even for a single data row
)

# Map every distinct house-type string to an integer code.
encoder_housetype = LabelEncoder()
house_type_encoder = encoder_housetype.fit_transform(house_type)

# The original string behind code 0, kept for inspection.
house_type_decoder = list(encoder_housetype.inverse_transform([0]))

# Read the decoration column (CSV column index 4) as raw strings.
# The path is a raw string literal so the backslashes of the Windows path are
# never interpreted as (invalid) escape sequences such as "\P" or "\M".
decoration = np.loadtxt(
    r"D:\Project\PythonProject\MachineLearningProject\origin_dataset.csv",
    dtype=str,
    delimiter=',',
    skiprows=1,
    usecols=4,
    encoding='utf-8',
    ndmin=1,  # LabelEncoder needs a 1-D array, even for a single data row
)

# Map every distinct decoration string to an integer code.
encoder_decoration = LabelEncoder()
decoration_encoder = encoder_decoration.fit_transform(decoration)

# The original string behind code 0, kept for inspection.
decoration_decoder = list(encoder_decoration.inverse_transform([0]))

def _bucket_build_year(year):
    """Map a construction year (0 = unknown) to its class label.

    The label values are the ones the original script used. The newest
    bucket originally tested ``j <= 2022 and j > 2022``, which can never be
    true, so every 2013-2022 building fell through to the catch-all label
    1200. The lower bound is now 2012, in line with the other decade buckets.
    """
    if 2012 < year <= 2022:
        return 2012
    if 2002 < year <= 2012:
        return 1602
    if 1992 < year <= 2002:
        return 1392
    if year == 0:
        return 400
    return 1200


# Read the building-description column (CSV column index 7) as raw strings.
# The path is a raw string literal so the backslashes of the Windows path are
# never interpreted as (invalid) escape sequences such as "\P" or "\M".
building_situation = np.loadtxt(
    r"D:\Project\PythonProject\MachineLearningProject\origin_dataset.csv",
    dtype=str,
    delimiter=',',
    skiprows=1,
    usecols=7,
    encoding='utf-8',
    ndmin=1,  # stay 1-D even when the file holds a single data row
)

# Keep only characters [-8:-4] of every description, in place, exactly as the
# original loop did; the four-digit year is searched for inside that slice.
building_situation[:] = [text[-8:-4] for text in building_situation]

# Parse the year from each slice; rows without four consecutive digits get 0.
_year_pattern = re.compile(r'\d\d\d\d')
_raw_years = []
for _year_text in building_situation:
    _year_match = _year_pattern.search(_year_text)
    _raw_years.append(int(_year_match.group()) if _year_match else 0)

# float64 matches the dtype the original np.append-based loop produced.
building_situation_year = np.array(_raw_years, dtype=float)
print(building_situation_year)

# Replace every raw year with its bucket label.
building_situation_year = np.array(
    [_bucket_build_year(year) for year in building_situation_year],
    dtype=float,
)


# print(building_situation_year)
# encoder_building_situation = LabelEncoder()
# building_situation_encoder = encoder_building_situation.fit_transform(building_situation)
# building_situation_decoder = list(encoder_building_situation.inverse_transform([12]))
# # print(building_situation_encoder)
# print(building_situation_decoder)

# print(decoration)
# print(decoration_encoder)

# One feature row per listing: (area, house-type code, unit price,
# decoration code). dtype=int drops the fractional part of the float columns.
feature_rows = list(
    zip(area_num, house_type_encoder, price_per_1m3_num, decoration_encoder)
)
dataset = np.array(feature_rows, dtype=int)

# The bucketed construction year is the classification target.
ylabel = building_situation_year
print(ylabel)




# Imports stay where the original script placed them.
# NOTE: load_wine is imported but not used by any visible code.
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=0 makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    ylabel,
    test_size=0.2,
    random_state=0,
)

# Fit a 7-nearest-neighbour classifier and score it on the held-out rows.
KNN = KNeighborsClassifier(n_neighbors=7)
KNN.fit(X_train, y_train)
score = KNN.score(X_test, y_test)
print('SKLEARN中KNN算法模型准确率为：', score, sep='')

import matplotlib.pylab as pyb

# Scatter unit price (x) against floor area (y), coloured by the
# construction-year label, with small markers.
pyb.scatter(
    x=price_per_1m3_num,
    y=area_num,
    c=building_situation_year,
    s=1.2,
)
pyb.show()

# input_house_type = input("精装？简装？毛胚？其他？：")
#input_house_type_arr = np.array([input_house_type], dtype=str)
# print(input_house_type_arr)
#input_encoder_decoration = encoder_decoration.fit_transform(input_house_type_arr)
# print(input_encoder_decoration)

# input_price_per_1m3_num = eval(input("均价（例15800元/平输入15800）："))

# predict_result=KNN.predict([[input_encoder_decoration[0], input_price_per_1m3_num]])
# print(encoder_housetype.inverse_transform(predict_result)[0])