import pandas as pd
import numpy as np
# Small example frame: one integer column (x0) and two float columns (x1, x2).
# Float zeros are written as ``0.0``; the split form ``0 .`` is not a valid literal.
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.0],
    'x2': [-1.5, 0.0, 3.6, 1.3, -2.1],
})
data
x0 x1 x2 0 1 0.01 -1.5 1 2 -0.01 0.0 2 3 0.25 3.6 3 4 -4.10 1.3 4 5 0.00 -2.1
data. columns
Index(['x0', 'x1', 'x2'], dtype='object')
data. values
array([[ 1. , 0.01, -1.5 ],
[ 2. , -0.01, 0. ],
[ 3. , 0.25, 3.6 ],
[ 4. , -4.1 , 1.3 ],
[ 5. , 0. , -2.1 ]])
df2 = pd. DataFrame( data. values, columns= [ 'one' , 'two' , 'three' ] )
df2
one two three 0 1.0 0.01 -1.5 1 2.0 -0.01 0.0 2 3.0 0.25 3.6 3 4.0 -4.10 1.3 4 5.0 0.00 -2.1
df3 = data. copy( )
df3[ 'strings' ] = [ 'a' , 'b' , 'c' , 'd' , 'e' ]
df3
x0 x1 x2 strings 0 1 0.01 -1.5 a 1 2 -0.01 0.0 b 2 3 0.25 3.6 c 3 4 -4.10 1.3 d 4 5 0.00 -2.1 e
model_cols = [ 'x0' , 'x1' ]
data. loc[ : , model_cols] . values
array([[ 1. , 0.01],
[ 2. , -0.01],
[ 3. , 0.25],
[ 4. , -4.1 ],
[ 5. , 0. ]])
data[ 'category' ] = pd. Categorical( [ 'a' , 'b' , 'a' , 'a' , 'b' ] ,
categories= [ 'a' , 'b' ] )
data
x0 x1 x2 category 0 1 0.01 -1.5 a 1 2 -0.01 0.0 b 2 3 0.25 3.6 a 3 4 -4.10 1.3 a 4 5 0.00 -2.1 b
dummies = pd. get_dummies( data. category, prefix= 'category' )
data_with_dummies = data. drop( 'category' , axis= 1 ) . join( dummies)
data_with_dummies
x0 x1 x2 category_a category_b 0 1 0.01 -1.5 1 0 1 2 -0.01 0.0 0 1 2 3 0.25 3.6 1 0 3 4 -4.10 1.3 1 0 4 5 0.00 -2.1 0 1
# Regression example frame: predictors x0 and x1, response y.
# Float literals are written out in full (``0.0``, ``-2.0``) so the cell parses.
data = pd.DataFrame({
    'x0': [1, 2, 3, 4, 5],
    'x1': [0.01, -0.01, 0.25, -4.1, 0.0],
    'y': [-1.5, 0.0, 3.6, 1.3, -2.0],
})
data
x0 x1 y 0 1 0.01 -1.5 1 2 -0.01 0.0 2 3 0.25 3.6 3 4 -4.10 1.3 4 5 0.00 -2.0
import patsy
y, x = patsy. dmatrices( 'y ~ x0 + x1' , data)
y
DesignMatrix with shape (5, 1)
y
-1.5
0.0
3.6
1.3
-2.0
Terms:
'y' (column 0)
x
DesignMatrix with shape (5, 3)
Intercept x0 x1
1 1 0.01
1 2 -0.01
1 3 0.25
1 4 -4.10
1 5 0.00
Terms:
'Intercept' (column 0)
'x0' (column 1)
'x1' (column 2)
# Least-squares fit of y on the patsy design matrix x.
# rcond=None opts in to NumPy's newer default cutoff and avoids the
# FutureWarning that is emitted when rcond is omitted.
coef, resid, _, _ = np.linalg.lstsq(x, y, rcond=None)
C:\Anaconda\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: `rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.
To use the future default and silence this warning we advise to pass `rcond=None`, to keep using the old, explicitly pass `rcond=-1`.
"""Entry point for launching an IPython kernel.
coef
array([[ 0.31290976],
[-0.07910564],
[-0.26546384]])
y, x = patsy. dmatrices( 'y~x0 + np.log(np.abs(x1) + 1)' , data)
x
DesignMatrix with shape (5, 3)
Intercept x0 np.log(np.abs(x1) + 1)
1 1 0.00995
1 2 0.00995
1 3 0.22314
1 4 1.62924
1 5 0.00000
Terms:
'Intercept' (column 0)
'x0' (column 1)
'np.log(np.abs(x1) + 1)' (column 2)
y, x = patsy. dmatrices( 'y ~ standardize(x0) + center(x1)' , data)
x
DesignMatrix with shape (5, 3)
Intercept standardize(x0) center(x1)
1 -1.41421 0.78
1 -0.70711 0.76
1 0.00000 1.02
1 0.70711 -3.33
1 1.41421 0.77
Terms:
'Intercept' (column 0)
'standardize(x0)' (column 1)
'center(x1)' (column 2)
data = pd. DataFrame( { 'key1' : [ 'a' , 'a' , 'b' , 'b' , 'a' , 'b' , 'a' , 'b' ] ,
'key2' : [ 0 , 1 , 0 , 1 , 0 , 1 , 0 , 0 ] ,
'v1' : [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 ] ,
'v2' : [ - 1 , 0 , 2.5 , - 0.5 , 4.0 , - 1.2 , 0.2 , - 1.7 ]
} )
y, x = patsy. dmatrices( 'v2~key1' , data)
x
DesignMatrix with shape (8, 2)
Intercept key1[T.b]
1 0
1 0
1 1
1 1
1 0
1 1
1 0
1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
y, x = patsy. dmatrices( 'v2~key1 + 0' , data)
x
DesignMatrix with shape (8, 2)
key1[a] key1[b]
1 0
1 0
0 1
0 1
1 0
0 1
1 0
0 1
Terms:
'key1' (columns 0:2)
y, x = patsy. dmatrices( 'v2~C(key2)' , data)
x
DesignMatrix with shape (8, 2)
Intercept C(key2)[T.1]
1 0
1 1
1 0
1 1
1 0
1 1
1 0
1 0
Terms:
'Intercept' (column 0)
'C(key2)' (column 1)
data[ 'key2' ] = data[ 'key2' ] . map ( { 0 : 'zero' , 1 : 'one' } )
data
key1 key2 v1 v2 0 a zero 1 -1.0 1 a one 2 0.0 2 b zero 3 2.5 3 b one 4 -0.5 4 a zero 5 4.0 5 b one 6 -1.2 6 a zero 7 0.2 7 b zero 8 -1.7
y, x = patsy. dmatrices( 'v2~key1+key2' , data)
x
DesignMatrix with shape (8, 3)
Intercept key1[T.b] key2[T.zero]
1 0 1
1 0 0
1 1 1
1 1 0
1 0 1
1 1 0
1 0 1
1 1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
'key2' (column 2)
y, x = patsy. dmatrices( 'v2~key1+key2+key1:key2' , data)
x
DesignMatrix with shape (8, 4)
Intercept key1[T.b] key2[T.zero] key1[T.b]:key2[T.zero]
1 0 1 0
1 0 0 0
1 1 1 1
1 1 0 0
1 0 1 0
1 1 0 0
1 0 1 0
1 1 1 1
Terms:
'Intercept' (column 0)
'key1' (column 1)
'key2' (column 2)
'key1:key2' (column 3)
import statsmodels. api as sm
import statsmodels. formula. api as smf
def dnorm(mean, variance, size=1):
    """Draw normally distributed samples with the given mean and variance.

    Parameters
    ----------
    mean : float
        Value added to every scaled draw.
    variance : float
        Variance of the distribution; its square root scales the draws.
    size : int or sequence of int, default 1
        Output shape. A single integer (Python ``int`` or NumPy integer) is
        treated as the length of a one-dimensional result.

    Returns
    -------
    numpy.ndarray
        ``mean + sqrt(variance) * np.random.randn(*size)``; draws come from
        NumPy's global random state, so ``np.random.seed`` controls them.
    """
    # NumPy integers (e.g. np.int64) are not iterable either, so wrap them
    # exactly like a plain int before unpacking into randn.
    if isinstance(size, (int, np.integer)):
        size = (size,)
    return mean + np.sqrt(variance) * np.random.randn(*size)
np. random. seed( 12345 )
N = 100
x = np. c_[ dnorm( 0 , 0.4 , size= N) ,
dnorm( 0 , 0.6 , size= N) ,
dnorm( 0 , 0.2 , size= N) ]
eps = dnorm( 0 , 0.1 , size= N)
beta = [ 0.1 , 0.3 , 0.5 ]
y = np. dot( x, beta) + eps
x[ : 5 ]
array([[-0.12946849, -1.21275292, 0.50422488],
[ 0.30291036, -0.43574176, -0.25417986],
[-0.32852189, -0.02530153, 0.13835097],
[-0.35147471, -0.71960511, -0.25821463],
[ 1.2432688 , -0.37379916, -0.52262905]])
x_model = sm. add_constant( x)
x_model[ : 5 ]
array([[ 1. , -0.12946849, -1.21275292, 0.50422488],
[ 1. , 0.30291036, -0.43574176, -0.25417986],
[ 1. , -0.32852189, -0.02530153, 0.13835097],
[ 1. , -0.35147471, -0.71960511, -0.25821463],
[ 1. , 1.2432688 , -0.37379916, -0.52262905]])
# Fit ordinary least squares of y on the raw predictor matrix ``x``.
# Note that ``x`` is passed here, not ``x_model`` (the copy with a constant
# column added), so the fit has three coefficients and no intercept term.
model = sm. OLS( y, x)
results = model. fit( )
results. params
array([0.17826108, 0.22303962, 0.50095093])
print ( results. summary( ) )
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.430
Model: OLS Adj. R-squared: 0.413
Method: Least Squares F-statistic: 24.42
Date: Fri, 09 Nov 2018 Prob (F-statistic): 7.44e-12
Time: 11:17:59 Log-Likelihood: -34.305
No. Observations: 100 AIC: 74.61
Df Residuals: 97 BIC: 82.42
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
x1 0.1783 0.053 3.364 0.001 0.073 0.283
x2 0.2230 0.046 4.818 0.000 0.131 0.315
x3 0.5010 0.080 6.237 0.000 0.342 0.660
==============================================================================
Omnibus: 4.662 Durbin-Watson: 2.201
Prob(Omnibus): 0.097 Jarque-Bera (JB): 4.098
Skew: 0.481 Prob(JB): 0.129
Kurtosis: 3.243 Cond. No. 1.74
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Wrap the simulated predictors in a DataFrame so the formula API can refer to
# them by column name. The predictor matrix is bound to lowercase ``x``;
# uppercase ``X`` is never defined and would raise NameError.
data = pd.DataFrame(x, columns=['col0', 'col1', 'col2'])
data['y'] = y
data[ : 5 ]
col0 col1 col2 y 0 -0.129468 -1.212753 0.504225 0.427863 1 0.302910 -0.435742 -0.254180 -0.673480 2 -0.328522 -0.025302 0.138351 -0.090878 3 -0.351475 -0.719605 -0.258215 -0.489494 4 1.243269 -0.373799 -0.522629 -0.128941
results = smf. ols( 'y ~ col0 + col1 + col2' , data= data) . fit( )
results. params
Intercept 0.033559
col0 0.176149
col1 0.224826
col2 0.514808
dtype: float64
train = pd. read_csv( 'datasets/titanic/train.csv' )
test = pd. read_csv( 'datasets/titanic/test.csv' )
train[ : 4 ]
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
train. isnull( ) . sum ( )
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
test. isnull( ) . sum ( )
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 86
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 327
Embarked 0
dtype: int64
# Fill missing ages in both splits with the median age of the training set.
impute_value = train['Age'].median()
train = train.fillna({'Age': impute_value})
test = test.fillna({'Age': impute_value})

# Encode sex as a 0/1 indicator column (1 where Sex == 'female').
train['IsFemale'] = train['Sex'].eq('female').astype(int)
test['IsFemale'] = test['Sex'].eq('female').astype(int)

# Feature matrices and target vector as plain NumPy arrays.
predictors = ['Pclass', 'IsFemale', 'Age']
X_train, X_test = (frame[predictors].values for frame in (train, test))
Y_train = train['Survived'].values
X_train
array([[ 3., 0., 22.],
[ 1., 1., 38.],
[ 3., 1., 26.],
...,
[ 3., 1., 28.],
[ 1., 0., 26.],
[ 3., 0., 32.]])
Y_train
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], dtype=int64)
from sklearn. linear_model import LogisticRegression
model = LogisticRegression( )
model. fit( X_train, Y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)
y_predict = model. predict( X_test)
y_predict
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0],
dtype=int64)
from sklearn. linear_model import LogisticRegressionCV
# Cross-validated logistic regression searching a grid of 10 regularisation
# strengths. ``Cs`` is passed by keyword: it is the parameter the positional
# ``10`` was bound to, and recent scikit-learn releases require estimator
# constructor arguments to be given by keyword.
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(X_train, Y_train)
LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
fit_intercept=True, intercept_scaling=1.0, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)