python数据分析模型选择_python学习-数据分析2(分类型数据处理+SVM建模)

研究多种数据处理方式(拆分,整合,哑变量,重编码,分段变量处理等)

测试SVM模型

示例:

研究房屋相关信息预测价格区间

#!/usr/bin/env python

# -*- coding:utf-8 -*-

#1.读取数据库

import MySQLdb

import pandas as pd

import numpy as np

import pandas.io.sql as sql

conn = MySQLdb.connect(host='',

port=3306,charset='utf8',user='', passwd='', db='')

df = sql.read_sql('select * from test',conn)

conn.close()

sg_trans.gif

#2.数据处理(分词+表达式)

#拆分detail成多个变量

detailframe = pd.DataFrame()

detailnarry = df['detail']

for d in detailnarry:

ds

=pd.DataFrame(d.split('/')).T

detailframe =

detailframe.append(ds)

detailframe.columns=['bnum','floor','allfloor','dir','years']

detailframe.index=detailnarry.index

#整合数据

prodf =

pd.merge(df.ix[:,[2,3,4,5,6,7,9,10]],detailframe,left_index=True,right_index=True)

#缺失值处理,删除有空值的样本

prodf =

prodf[(prodf.years<>"")&(prodf.bnum<>"")&(prodf.dir<>"")]

#哑变量处理(回归模型需要,有些模型可不做)

dumdir = pd.get_dummies(prodf['dir'],prefix='dir')

dumbn = pd.get_dummies(prodf['bnum'],prefix='bnum')

#保留数字,删除多余文字

import re

pattern = '\d+'

regx = re.compile(pattern)

r = lambda x: int(regx.findall(x)[0])

ss = prodf.ix[:,[3,4,5,9,10,12]].applymap(r)

fliterdf =

pd.merge(prodf.ix[:,[0,1,2,6,7,8,11]],ss,left_index=True,right_index=True)

fliterdf = fliterdf.join(dumdir).join(dumbn)

#print fliterdf[:10]

fliterdf.price = fliterdf['price'].astype(float)

fliterdf = fliterdf.dropna()

#3.描述统计

print fliterdf.describe()

print fliterdf['floor'].value_counts()

by_ad = fliterdf.groupby(['floor','dir'])

print by_ad.size().unstack().fillna(0)

import matplotlib.pyplot as plt

plt.scatter(fliterdf['sizes'],fliterdf['price'])

plt.show()

#离散化预测变量

plt.hist(fliterdf['price'],bins=50,color='k')

plt.show()

bins=[0,200,500,1000,2000,4000]

fliterdf['pricecats'] = pd.cut(fliterdf['price'],bins)

#中文文本变量重新编码

bnum_to_bnm = {u'4室0厅': '1',

u'4室1厅': '2', u'4室2厅': '3', u'4室3厅': '4', u'4室4厅': '5'}

fliterdf['bnm'] =

fliterdf['bnum'].map(bnum_to_bnm)

dir_to_drm = {u'北向': '1', u'西向':

'2', u'东向': '3', u'西北向': '4', u'东北向': '5',\

u'东西向': '6',u'西南向': '7',u'东南向': '8',u'南向':

'9',u'南北向': '10'}

fliterdf['drm'] =

fliterdf['dir'].map(dir_to_drm)

#4.建模

import sklearn.linear_model

import numpy as np

#建模,测试数据分组

sampleR=0.7

nsample = len(y)

sampleBoundary = int(nsample*sampleR)

#SVM(svc,rbf,poly三种方式)

import sklearn.svm as svm

x_c=fliterdf.ix[:,[7,10,11,29,30]]

y_c= fliterdf['pricecats']

train_xc = x_c.ix[shffleIdx[:sampleBoundary]]

test_xc = x_c.ix[shffleIdx[sampleBoundary:]]

train_yc = y_c[shffleIdx[:sampleBoundary]]

test_yc = y_c[shffleIdx[sampleBoundary:]]

#svc =

svm.SVC(kernel='linear',C=1.0).fit(train_xc,train_yc)

rbf_svc =

svm.SVC(kernel='rbf',gamma=0.7,C=1.0).fit(train_xc,train_yc)

#poly_svc =

svm.SVC(kernel='poly',degree=3,C=1.0).fit(train_xc,train_yc)

#预测数据,准确率计算

pre_yc = rbf_svc.predict(test_xc)

correctnum =0

test_yc.index = range(len(pre_yc))

for i in range(len(pre_yc)):

if

(pre_yc[i]==test_yc.iloc[i]):

correctnum =correctnum +1

accuracy = correctnum*1.0/len(pre_yc)

print accuracy

#结果存入数据库

conn = MySQLdb.connect(host='',

port=3306,charset='utf8',user='', passwd='', db='')

#sql.write_frame(res,"temp_test",conn)

cur = conn.cursor()

wsql = 'INSERT INTO temp_test(y) VALUES

(%s)'

for py in pre_y:

cur.execute(wsql,py)

conn.commit()

cur.close()

conn.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值