赞同来自:
IPython notebook source code:
import pandas as pd
# --- Load the raw IPO table and derive the model features -------------------
# The CSV is read with header=None, so the column names are supplied here.
# The U* columns are placeholders that this script never references.
f = '../../luckdraw/ipodata.csv'
data = pd.read_csv(
    f,
    header=None,
    names=[
        'Code', 'U1', 'U2', 'U3', 'U4', 'InitPrice', 'Limit', 'U7', 'U8',
        'Ratio', 'U10', 'TotalValue', 'Range', 'FinalPrice', 'GainPerUnit',
        'GainPerApp', 'U15', 'U16', 'U17',
    ],
)

# Drop rows whose Range is the literal '-' (they cannot be split into a
# low/high pair below). .copy() makes the filtered frame independent, so the
# column assignments that follow do not go through pandas' chained-assignment
# (SettingWithCopyWarning) path.
data = data[data.Range != '-'].copy()

# Range is a "low~high" string; split it into two integer columns.
RangeList = data['Range'].str.split("~")
data['EstLow'] = [int(row[0]) for row in RangeList]
data['EstHigh'] = [int(row[1]) for row in RangeList]

# 0/1 indicator columns derived from the numeric stock code.
# NOTE(review): IsCYB is set for codes below 100000 and IsZB for codes above
# 600000 -- confirm these are the intended code ranges for each flag.
data['IsCYB'] = 0
data.loc[data['Code'] < 100000, 'IsCYB'] = 1
data['IsZB'] = 0
data.loc[data['Code'] > 600000, 'IsZB'] = 1

# Strip the percent sign so the values can be parsed as numbers below.
data['ZQL'] = data['Ratio'].str.replace("%", "")
data['GainPerApp'] = data['GainPerApp'].str.replace("%", "")

# Convert to numeric; errors='coerce' turns unparseable entries into NaN
# instead of raising.
data['InitPrice'] = pd.to_numeric(data['InitPrice'], errors='coerce')
data['ZQL'] = pd.to_numeric(data['ZQL'], errors='coerce')
data['TotalValue'] = pd.to_numeric(data['TotalValue'], errors='coerce')
data['FinalPrice'] = pd.to_numeric(data['FinalPrice'], errors='coerce')
data['GainPerUnit'] = pd.to_numeric(data['GainPerUnit'], errors='coerce')
data['GainPerApp'] = pd.to_numeric(data['GainPerApp'], errors='coerce')

# Ratios relative to the initial price.
data['PriceGain'] = data['FinalPrice'].divide(data['InitPrice'])
data['EstLowGain'] = data['EstLow'].divide(data['InitPrice'])
data['EstHighGain'] = data['EstHigh'].divide(data['InitPrice'])

# Notebook cell boundary: this displayed a preview there; as a plain script
# statement it has no effect. (It was previously fused onto the next
# statement, which made the file a syntax error.)
data.head()

# Rows with a known FinalPrice, restricted to the columns of interest.
# 'Type' was previously requested here, but no such column is ever created
# (it is not in the read_csv names and is never assigned), so .loc raised
# KeyError on pandas >= 1.0; it has been removed from the selection.
data_rp = data.loc[
    pd.notnull(data['FinalPrice']),
    [
        'IsCYB', 'IsZB', 'InitPrice', 'Limit', 'ZQL', 'TotalValue', 'EstLow',
        'EstHigh', 'FinalPrice', 'GainPerApp', 'GainPerUnit', 'PriceGain',
        'EstLowGain', 'EstHighGain',
    ],
]
# train_test_split and cross_val_score live in sklearn.model_selection since
# scikit-learn 0.18; the old sklearn.cross_validation module was removed in
# 0.20. Prefer the current location and fall back only for old installs.
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:
    from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression

# Feature columns used both for fitting and for prediction. Defined once so
# the prediction input always has the same columns, in the same order, as the
# training matrix.
_FEATURE_COLUMNS = [
    'InitPrice', 'Limit', 'ZQL', 'TotalValue',
    'EstLow', 'EstHigh', 'EstLowGain', 'EstHighGain',
]

# Feature matrix and single-column target taken from the labelled rows.
data_raw = data_rp.loc[:, _FEATURE_COLUMNS]
data_finalprice = data_rp.loc[:, ['PriceGain']]

# Split with train_test_split's defaults; no random_state is set, so the
# partition differs from run to run.
data_raw_train, data_raw_test, data_finalprice_train, data_finalprice_test = (
    train_test_split(data_raw, data_finalprice)
)

# Fit an ordinary least-squares model of PriceGain on the training features.
lr = LinearRegression()
lr.fit(data_raw_train, data_finalprice_train)

# Predict PriceGain for the row(s) whose Code is 300580.
# NOTE(review): if that code is absent from `data`, `d` is empty and
# predict() will raise -- confirm the code is present in the input file.
d = data.loc[data['Code'] == 300580, _FEATURE_COLUMNS]
thisgain = lr.predict(d)