针对3-对链家二手房数据进行预处理,以及利用CNN进行多分类操作,针对CNN分类这块,我自己再来一次整理吧!
看看能不能提高精度,我自己试一试,可能结果不太好也说不定~~~
----------------------------华丽的分割线【进入学习模式】-------------------------------
本次主要是针对上一个博客的例子,进行了2步修改
(1)在完成数据处理之后,我将'cjlouceng'中未知的数据剔除(之前是直接将它设置为0)。
(2)重点是构建train训练集、validation验证集和test测试集这边,我是将每个类别分开选取的,这样能保证我的train训练集、validation验证集以及test测试集中是包含我的所有数据类别,不会出现有某个集合(训练集、验证集和测试集)中的样本类别不全,导致模型训练出错的问题。
# Load the data and select the columns used for the analysis
# (simple preprocessing: check missing values, drop duplicates).
import pandas as pd
import numpy as np
data=pd.read_csv(r'C:\Users\Administrator\lianjia.csv',encoding='utf-8') # read the raw Lianjia CSV
data=data[['cjxiaoqu','cjdanjia','cjmianji','cjlouceng']] # district, unit price, area, floor
(data.isnull()).sum()# no missing values (notebook-style display; no effect as a plain script)
(data.duplicated()).sum() # 5 duplicated rows (original comment said "missing values" — it counts duplicates)
data=data.drop_duplicates()
#data.info()
data.head(3)
# Feature cleaning plan:
# (1) convert cjdanjia (unit price) from yuan/m2 to 10k-yuan/m2
# (2) strip the unit suffix from cjmianji (area) and keep 1 decimal place
# (3) keep only a signed floor number in cjlouceng and drop "unknown" rows (done below)
# (4) encode cjxiaoqu (district) as integer labels 0,1,2,... (done below)

# (1) unit price: yuan/m2 -> 10k-yuan/m2, rounded to 2 decimals.
# Vectorized division replaces the original element-wise .map(lambda ...).
data=data.assign(cjdanjia=np.round(data.cjdanjia/10000,2))
# (2) area: sanity-check that every row carries the '平米' (square metre) unit,
# then strip it, cast to float and keep 1 decimal.
(~data.cjmianji.str.contains('平米')).sum()
# builtin float replaces np.float, which was removed in NumPy 1.24
data=data.assign(cjmianji=np.round(data.cjmianji.str.replace('平米','',regex=False).astype(float),1))
# (3) reduce cjlouceng to a bare (signed) floor number.
# Raw values look like '中楼层(共6层)', plus the special strings
# '地下室1'/'地下室3' (basement levels) and '未知' (unknown).
# NOTE: some patterns contain regex metacharacters, so regex=True is passed
# explicitly — pandas >= 1.4 defaults str.replace to regex=False.
for pattern in ['中','高','低','楼层',r'\(共',r'层\)']:
    data=data.assign(cjlouceng=data.cjlouceng.str.replace(pattern,'',regex=True))
# Map the special strings once per column. (The original looped over every
# row and re-ran the same column-wide replace thousands of times.)
data=data.assign(cjlouceng=data.cjlouceng
                 .str.replace('地下室1','-1',regex=False)  # basement level 1 -> -1
                 .str.replace('地下室3','-3',regex=False)  # basement level 3 -> -3
                 .str.replace('未知','0',regex=False))     # unknown -> 0, removed later
# builtin int replaces np.int, which was removed in NumPy 1.24
data=data.assign(cjlouceng=data.cjlouceng.astype(int))
#data.cjlouceng.unique()
# Drop the rows whose cjlouceng is 0 (the former '未知'/unknown floors).
data.info() # 2973 rows before the filter
# A direct boolean mask replaces the original round-trip of building a
# Python list of every non-zero value and calling isin() on it — same
# result, O(n) instead of O(n*m).
data=data[data.cjlouceng!=0]
data.info() # 2972 rows afterwards (exactly one unknown-floor row removed)
# (4) encode cjxiaoqu (district name) as integer labels 0..9
data.cjxiaoqu.unique() #['栖霞', '江宁', '秦淮', '浦口', '雨花台', '鼓楼', '建邺', '玄武', '溧水', '六合']
keys=['栖霞', '江宁', '秦淮', '浦口', '雨花台', '鼓楼', '建邺', '玄武', '溧水', '六合']
# Map each district name straight to its integer code. Series.map does one
# exact-match lookup per row, unlike the original loop of per-key
# str.replace calls; builtin int replaces np.int (removed in NumPy 1.24).
dic={name: code for code, name in enumerate(keys)}
data=data.assign(cjxiaoqu=data.cjxiaoqu.map(dic).astype(int))
# Classification task: predict the district (cjxiaoqu) from cjdanjia,
# cjmianji and cjlouceng with a 1-D CNN.
# features: cjdanjia, cjmianji, cjlouceng — labels: cjxiaoqu
import keras
import numpy as np
from keras import backend as K
K.set_image_data_format('channels_last')  # replaces the removed K.set_image_dim_ordering('tf')
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv1D, MaxPooling1D, BatchNormalization
# sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split
# Fix both RNG seeds so runs are reproducible
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed  # TF 1.x API; use tf.random.set_seed on TF 2
set_random_seed(1)
# Build train / validation / test sets.
# The split is done *per class* so every one of the three sets is guaranteed
# to contain samples of all 10 districts — a plain random split could miss a
# rare class (溧水 has only 13 rows) and break training.
num_classes=10
feature_cols=['cjdanjia','cjmianji','cjlouceng']
f_train_parts,f_val_parts,f_test_parts=[],[],[]
l_train_parts,l_val_parts,l_test_parts=[],[],[]
for cls in range(num_classes):
    subset=data[data.cjxiaoqu==cls]
    print(len(subset))  # per-class sample count (the original printed undefined names data0..data9)
    # reshape to (samples, 1, 3): Conv1D expects (batch, steps, channels)
    features=subset[feature_cols].values.reshape(-1,1,len(feature_cols))
    labels=keras.utils.to_categorical(subset[['cjxiaoqu']],num_classes)  # one-hot labels
    # 70% train / 10% validation / 20% test: first 0.2 to test, then 0.125 of the rest
    f_tr,f_te,l_tr,l_te=train_test_split(features,labels,test_size=0.2,random_state=13)
    f_tr,f_va,l_tr,l_va=train_test_split(f_tr,l_tr,test_size=0.125,random_state=13)
    f_train_parts.append(f_tr); l_train_parts.append(l_tr)
    f_val_parts.append(f_va);   l_val_parts.append(l_va)
    f_test_parts.append(f_te);  l_test_parts.append(l_te)
# Concatenate per-class pieces in class order (same order as the original).
f_train=np.vstack(f_train_parts)
l_train=np.vstack(l_train_parts)
f_validation=np.vstack(f_val_parts)
l_validation=np.vstack(l_val_parts)
f_test=np.vstack(f_test_parts)
l_test=np.vstack(l_test_parts)
## Build the model
model=Sequential()
# Kernel size 1 and pool size 1 are forced by having only 3 features on a
# single "time step" — the conv/pool layers are effectively dense layers.
model.add(Conv1D(4096,1,activation='relu',padding='same',input_shape=f_train.shape[1:]))
model.add(MaxPooling1D(1))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(4096,activation='relu'))
model.add(Dropout(0.5))  # regularization against this small data set
model.add(Dense(num_classes,activation='softmax'))
# RMSprop: the adaptive-learning-rate optimizer proposed by Geoff Hinton.
# The lowercase `rmsprop` factory was removed from later Keras releases;
# the RMSprop class exists in all versions.
opt=keras.optimizers.RMSprop(lr=0.001,decay=1e-6)
model.compile(loss='categorical_crossentropy',optimizer=opt,metrics=['accuracy'])
model.fit(f_train,l_train,epochs=50,validation_data=(f_validation,l_validation))
#model.save('1DCNN_max.h5')  # save the model for reuse
## Evaluate on the held-out test set
loss,accuracy=model.evaluate(f_test,l_test)
print(loss,accuracy)
最后精度为41.72% 【!!!效果还是不好啊!!!】
----------------------------华丽的分割线【退出学习模式】-------------------------------