机器学习 实验一 杭州二手房价预测
一、实验环境
PC机,Python
二、代码
#%%
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
#%%
# Load the training and test sets from the working directory.
train_data = pd.read_csv('train.csv')
#%%
train_data
#%%
test_data = pd.read_csv('test.csv')
#%%
test_data
#%%
# Stack train and test rows (train first) so that every later preprocessing
# step is applied to both sets identically. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used instead; the resulting frame is the same.
full = pd.concat([train_data, test_data], ignore_index=True)
#%%
full.columns
#%%
full.head()
#%%
full.info()
#%%
full['单价'].unique()
#%%
full['楼型'].unique()
#%%
# Remove the columns that are not used as model features.
full.drop(['Unnamed: 0', '简介', '单价'], axis=1, inplace=True)
full
#%%
# room=[]
# living_room=[]
# for i in full['规格']:
# if(i=='车位'):
# room.append(0)
# living_room.append(0)
# else:
# for j in range(len(i)):
# if i[j]=='室':
# b=j
# room.append(i[:b])
# living_room.append(i[b+1:b+2])
# room
# len(room)
#%%
# len(living_room)
#%%
# renovation=[]
# for i in full['装修']:
# if(i=='精装'):
# renovation.append(0)
# elif(i=='简装'):
# renovation.append(1)
# elif(i=='毛坯'):
# renovation.append(2)
# elif(i=='其他'):
# renovation.append(3)
# elif(i=="板塔结合"):
# renovation.append(4)
# elif(i=='板楼'):
# renovation.append(5)
# renovation
# len(renovation)
#%%
# full['装修'].unique()
#%%
# building_type=[]
# for i in full['楼型']:
# if(i=='板塔结合'):
# building_type.append(0)
# elif(i=='板楼'):
# building_type.append(1)
# elif(i=='塔楼'):
# building_type.append(2)
# elif(i=='暂无数据'):
# building_type.append(3)
# elif(i=='平房'):
# building_type.append(4)
# else:
# building_type.append(np.random.randint(0,5))
# building_type
# len(building_type)
#%%
# full['室']=room
# full['厅']=living_room
# # full['装修']=renovation
# # full['楼型']=building_type
# full.drop(['规格'],axis=1, inplace=True)
#%%
# Drop the last character of every '总价' string (presumably a one-character
# unit suffix). The vectorized .str accessor replaces the per-row chained
# assignment `full[col][i] = ...`, which is slow and is not guaranteed to
# write through to `full`. Entries that are not strings (e.g. missing
# prices) come out as NaN.
full['总价'] = full['总价'].str[:-1]
full['总价']
#%%
# Drop the last two characters of every '大小' string (presumably a
# two-character unit suffix).
full['大小'] = full['大小'].str[:-2]
full['大小']
#%%
full.info()
#%%
# Convert the cleaned text columns to numbers.
full['大小'] = full['大小'].astype(float)
full['总价'] = full['总价'].astype(float)
# full['室']=full['室'].astype(float)
# full['厅']=full['厅'].astype(float)
#%%
full.info()
#%%
full
#%%
# One-hot encode every object-dtype column of `full`.
# Names of the columns whose dtype is "object".
object_type = [col for col in full.columns if full[col].dtype == "object"]

# One indicator frame per object column, each prefixed with its column name.
indicator_frames = [pd.get_dummies(full[col], prefix=col) for col in object_type]

# Keep the remaining columns first, then append the indicator columns in the
# same order in which the object columns were found.
full_dummied_object = pd.concat(
    [full.drop(object_type, axis=1), *indicator_frames],
    axis=1,
)
full_dummied_object.head()
full = full_dummied_object
#%%
full
#%%
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
lr = linear_model.LinearRegression()

# The training rows are the leading rows of `full`. Take their count from
# the frame currently bound to `train_data` instead of hard-coding 21318, so
# the split still matches if train.csv changes size.
n_train = len(train_data)
train_data = full[:n_train]

# Target: total price. Features: every other column.
source_y = train_data['总价']
train_copy = train_data.drop(['总价'], axis=1)
source_x = train_copy
#%%
source_x
#%%
source_y
#%%
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Fit a linear regression on up to 100 random 80/20 splits and keep the model
# with the highest R^2 on its held-out 20%. Stop early once the best score
# exceeds 0.8.
# R^2 can be zero or negative, so start from -inf: this guarantees that
# `modelbest` is bound after the first iteration.
# NOTE(review): the reported maximum is chosen on the same held-out data it
# is measured on, so it is an optimistic estimate of generalization.
maxscore = float('-inf')
for _ in range(100):
    train_X, test_X, train_y, test_y = train_test_split(
        source_x, source_y, train_size=0.8
    )
    model = LinearRegression()
    model.fit(train_X, train_y)
    # Score once per split and reuse the value below.
    score = model.score(test_X, test_y)
    print(score)
    if score > maxscore:
        maxscore = score
        modelbest = model
    if maxscore > 0.8:
        break
maxscore
#%%
# The test rows are everything in `full` after the training rows. The number
# of training rows is taken from the training feature matrix `source_x`
# rather than hard-coded as 21318.
test_data = full[len(source_x):]
# Drop the target column so the columns match those the model was fitted on.
test_copy = test_data.drop(['总价'], axis=1)
test_copy
#%%
# Predict the total price of every test row with the best model found.
result = modelbest.predict(test_copy)
result
三、实验结果与分析
1、猎豹平台提交结果:
2、一开始可以尝试对“地区”、“规格”、“大小”、“装修”、“楼型”、“总价”这几个特征进行处理,进行预测;为进一步提高正确率考虑逐步加入其他几个特征,并进行独热编码。