研究多种数据处理方式(拆分,整合,哑变量,重编码,分段变量处理等)
测试SVM模型
示例:
研究房屋相关信息预测价格区间
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#1.读取数据库
import MySQLdb
import pandas as pd
import numpy as np
import pandas.io.sql as sql
conn = MySQLdb.connect(host='',
port=3306,charset='utf8',user='', passwd='', db='')
df = sql.read_sql('select * from test',conn)
conn.close()
#2.数据处理(分词+表达式)
#拆分detail成多个变量
detailframe = pd.DataFrame()
detailnarry = df['detail']
for d in detailnarry:
ds
=pd.DataFrame(d.split('/')).T
detailframe =
detailframe.append(ds)
detailframe.columns=['bnum','floor','allfloor','dir','years']
detailframe.index=detailnarry.index
#整合数据
prodf =
pd.merge(df.ix[:,[2,3,4,5,6,7,9,10]],detailframe,left_index=True,right_index=True)
#缺失值处理,删除有空值的样本
prodf =
prodf[(prodf.years<>"")&(prodf.bnum<>"")&(prodf.dir<>"")]
#哑变量处理(回归模型需要,有些模型可不做)
dumdir = pd.get_dummies(prodf['dir'],prefix='dir')
dumbn = pd.get_dummies(prodf['bnum'],prefix='bnum')
#保留数字,删除多余文字
import re
pattern = '\d+'
regx = re.compile(pattern)
r = lambda x: int(regx.findall(x)[0])
ss = prodf.ix[:,[3,4,5,9,10,12]].applymap(r)
fliterdf =
pd.merge(prodf.ix[:,[0,1,2,6,7,8,11]],ss,left_index=True,right_index=True)
fliterdf = fliterdf.join(dumdir).join(dumbn)
#print fliterdf[:10]
fliterdf.price = fliterdf['price'].astype(float)
fliterdf = fliterdf.dropna()
#3.描述统计
print fliterdf.describe()
print fliterdf['floor'].value_counts()
by_ad = fliterdf.groupby(['floor','dir'])
print by_ad.size().unstack().fillna(0)
import matplotlib.pyplot as plt
plt.scatter(fliterdf['sizes'],fliterdf['price'])
plt.show()
#离散化预测变量
plt.hist(fliterdf['price'],bins=50,color='k')
plt.show()
bins=[0,200,500,1000,2000,4000]
fliterdf['pricecats'] = pd.cut(fliterdf['price'],bins)
#中文文本变量重新编码
bnum_to_bnm = {u'4室0厅': '1',
u'4室1厅': '2', u'4室2厅': '3', u'4室3厅': '4', u'4室4厅': '5'}
fliterdf['bnm'] =
fliterdf['bnum'].map(bnum_to_bnm)
dir_to_drm = {u'北向': '1', u'西向':
'2', u'东向': '3', u'西北向': '4', u'东北向': '5',\
u'东西向': '6',u'西南向': '7',u'东南向': '8',u'南向':
'9',u'南北向': '10'}
fliterdf['drm'] =
fliterdf['dir'].map(dir_to_drm)
#4.建模
import sklearn.linear_model
import numpy as np
#建模,测试数据分组
sampleR=0.7
nsample = len(y)
sampleBoundary = int(nsample*sampleR)
#SVM(svc,rbf,poly三种方式)
import sklearn.svm as svm
x_c=fliterdf.ix[:,[7,10,11,29,30]]
y_c= fliterdf['pricecats']
train_xc = x_c.ix[shffleIdx[:sampleBoundary]]
test_xc = x_c.ix[shffleIdx[sampleBoundary:]]
train_yc = y_c[shffleIdx[:sampleBoundary]]
test_yc = y_c[shffleIdx[sampleBoundary:]]
#svc =
svm.SVC(kernel='linear',C=1.0).fit(train_xc,train_yc)
rbf_svc =
svm.SVC(kernel='rbf',gamma=0.7,C=1.0).fit(train_xc,train_yc)
#poly_svc =
svm.SVC(kernel='poly',degree=3,C=1.0).fit(train_xc,train_yc)
#预测数据,准确率计算
pre_yc = rbf_svc.predict(test_xc)
correctnum =0
test_yc.index = range(len(pre_yc))
for i in range(len(pre_yc)):
if
(pre_yc[i]==test_yc.iloc[i]):
correctnum =correctnum +1
accuracy = correctnum*1.0/len(pre_yc)
print accuracy
#结果存入数据库
conn = MySQLdb.connect(host='',
port=3306,charset='utf8',user='', passwd='', db='')
#sql.write_frame(res,"temp_test",conn)
cur = conn.cursor()
wsql = 'INSERT INTO temp_test(y) VALUES
(%s)'
for py in pre_y:
cur.execute(wsql,py)
conn.commit()
cur.close()
conn.close()