import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
# Load the raw student records.
# NOTE(review): the path ends in ".csv", so it is parsed with read_csv rather
# than read_excel (which rejects plain-text files). sep=None makes pandas
# sniff the delimiter -- the salary columns use a decimal comma, so the file
# may well be ';'-separated. Pass an explicit sep= once the format is confirmed.
df = pd.read_csv(
    "C:/Users/Administrator/Desktop/students.csv",
    sep=None,
    engine="python",
)
# Bare expressions only display inside a notebook; print so a script shows them.
print(df.head(10))
print(df.columns)

# Columns excluded from the analysis.
drop_list = [
    "numero_DU",
    "id_inec_ci",
    "id_inec_ruc",
    "scholar_progr",
    "scholar_compo",
    "subject_study_1_DU",
    "subject_study_2_DU",
    "subject_study_3_DU",
    "subject_study_4_DU",
    "estado_DU",
    "place_birth_DU",
    "sector",
    "working_relation",
    "denominacion_carrera_DU",
    "area_DU",
    "subarea_DU",
    "nivel_formacion_DU",
    "type_title_DU",
    "tipo_fecha_DU",
    "clase_contribuyente_DU",
    "estado_personal_natural_DU",
    "estado_sociedad_DU",
    "pais_DU",
    "tipo_contribuyente_DU",
    "ubicacion_geogra_DU",
    "participacion_DU",
    "forma_DU",
    "empleados_DU",
    "ventas_totales_DU",
    "tamano_empresa_DU",
    "codigo_DU",
    "codigo_actividad_DU",
    "age_range_DU",
]
df.drop(drop_list, axis=1, inplace=True)
df.info()
print(df.columns)
# The salary bounds hold text with a decimal comma; swap the comma for a dot
# and convert both columns to numbers.
print(df["low_salary_range"])
for salary_col in ("low_salary_range", "high_salary_range"):
    # regex=False: treat "," as a literal character on every pandas version.
    df[salary_col] = pd.to_numeric(
        df[salary_col].str.replace(",", ".", regex=False)
    )
print(df["low_salary_range"])
df.info()
print(df["high_salary_range"])
# Replace missing values in each of these columns with that column's mean.
# Each assignment is kept on a single logical line: a dangling "=" at the end
# of a line is a syntax error.
for mean_filled_col in (
    "high_salary_range",
    "degree_year",
    "degree_month",
    "degree_day",
):
    # Series.mean() skips NaN, so the mean is taken over the observed values.
    df[mean_filled_col] = df[mean_filled_col].fillna(df[mean_filled_col].mean())
df.info()
print(df.describe().T)
# Toy example: standardize three samples to z-scores, (value - mean) / std.
x = [6, 7, 11, 1, 2, 3, 4]
y = [44, 55, 64, 76, 19, 31, 55]
z = [10089, 11231, 55356, 23231, 54576, 455423, 99086]


def _standardize(values):
    """Return the z-scores of *values* using NumPy's default (ddof=0) std."""
    # Convert explicitly instead of relying on list-minus-scalar coercion.
    arr = np.asarray(values, dtype=float)
    return (arr - arr.mean()) / arr.std()


x_1 = _standardize(x)
y_1 = _standardize(y)
z_1 = _standardize(z)
print(np.mean(x), np.mean(y), np.mean(z))
print("x:", x_1, "y:", y_1, "z:", z_1)
# Copy the study level into the column the mapping below reads from.
df["year_high_edu"] = df["type_study_DU"]

# Numeric value assigned to each study level (stored as years_high_edu).
# Levels that are not listed here map to NaN.
years_by_level = {
    "PREGRADO": 4,
    "MAESTRIA": 6,
    "DOCTORADO": 11,
    "POSDOCTORADO": 13,
    "ESPECIALIDADES MEDICAS": 8,
}
df["years_high_edu"] = df["year_high_edu"].map(years_by_level)
print(df["years_high_edu"])

# Natural log of the yearly salary.
df["log_year_salary"] = np.log(df["yearly_salary"])
print(df[["stud_age", "year_high_edu"]])

# NOTE(review): looks like a "potential experience" proxy
# (age - years of higher education - 6) -- confirm the intended definition.
df["experience"] = df["stud_age"] - df["years_high_edu"] - 6
from patsy import dmatrices
import statsmodels.formula.api as smf
import patsy
from sklearn.model_selection import train_test_split
# Defining y and X
# The "-1" term in the formula removes patsy's intercept column from X.
y, X = patsy.dmatrices(
    "log_year_salary ~ years_high_edu -1", data=df, return_type="dataframe"
)
y = np.ravel(y)  # flatten the response frame into a 1-D array
print(y.shape)
print(X.shape)

# Hold out 30% of the rows for testing. No random_state is passed, so the
# split (and every number derived from it) changes from run to run.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)
# Fit a linear regression on the training split.
lm = linear_model.LinearRegression()
model = lm.fit(Xtrain, ytrain)
print("R^2:", model.score(Xtrain, ytrain))  # scored on the training data
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

# Predict the held-out rows and show them alongside the true targets.
# Names are case-sensitive: the variables are `predictions` and `ytest`.
predictions = lm.predict(Xtest)
print(predictions.shape)
print(predictions)
print(ytest)