# 【小徐】python期末 (Xiao Xu: Python final-exam notebook)

import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# --- Load the raw student records and clean the numeric columns ------------
# The original notebook displayed intermediate values as the last expression
# of each cell; they are wrapped in print() here so they are still shown when
# the file is run as a plain script.
#
# NOTE(review): the path ends in ".csv" but is read with read_excel. If the
# file is really delimited text, pd.read_csv (with the right `sep`) is the
# correct reader -- confirm against the actual file before changing it.
df = pd.read_excel('C:/Users/Administrator/Desktop/students.csv')
print(df.head(10))

print(df.columns)

# Columns that are not used anywhere in the analysis below.
drop_list = [
    'numero_DU', 'id_inec_ci', 'id_inec_ruc', 'scholar_progr',
    'scholar_compo', 'subject_study_1_DU', 'subject_study_2_DU',
    'subject_study_3_DU', 'subject_study_4_DU', 'estado_DU',
    'place_birth_DU', 'sector', 'working_relation',
    'denominacion_carrera_DU', 'area_DU', 'subarea_DU',
    'nivel_formacion_DU', 'type_title_DU', 'tipo_fecha_DU',
    'clase_contribuyente_DU', 'estado_personal_natural_DU',
    'estado_sociedad_DU', 'pais_DU', 'tipo_contribuyente_DU',
    'ubicacion_geogra_DU', 'participacion_DU', 'forma_DU', 'empleados_DU',
    'ventas_totales_DU', 'tamano_empresa_DU', 'codigo_DU',
    'codigo_actividad_DU', 'age_range_DU',
]

df.drop(drop_list, axis=1, inplace=True)
df.info()

print(df.columns)

print(df.low_salary_range)

# The salary bounds are stored as text with a decimal comma; swap it for a
# decimal point so pd.to_numeric can parse the values.
df['low_salary_range'] = df['low_salary_range'].str.replace(',', '.', regex=False)

print(df.low_salary_range)

df['low_salary_range'] = pd.to_numeric(df['low_salary_range'])

df['high_salary_range'] = df['high_salary_range'].str.replace(',', '.', regex=False)

df['high_salary_range'] = pd.to_numeric(df['high_salary_range'])

print(df.low_salary_range)
df.info()

print(df.high_salary_range)

# Fill missing values with the column mean (Series.mean skips NaN).
# The original split the first assignment across two lines after "=",
# which is a syntax error; it is written as a single statement here.
df['high_salary_range'] = df['high_salary_range'].fillna(df['high_salary_range'].mean())

df['degree_year'] = df['degree_year'].fillna(df['degree_year'].mean())

df['degree_month'] = df['degree_month'].fillna(df['degree_month'].mean())

df['degree_day'] = df['degree_day'].fillna(df['degree_day'].mean())

df.info()

print(df.describe().T)

# --- Standardisation (z-score) demo on three small samples -----------------
def _standardize(values):
    """Return the z-scores of *values* as a NumPy array.

    Uses the population standard deviation (np.std default, ddof=0).
    """
    arr = np.asarray(values, dtype=float)
    return (arr - arr.mean()) / arr.std()


x = [6, 7, 11, 1, 2, 3, 4]
y = [44, 55, 64, 76, 19, 31, 55]
z = [10089, 11231, 55356, 23231, 54576, 455423, 99086]

# Convert explicitly to arrays instead of relying on ``list - np.float64``
# being coerced implicitly.
x_1 = _standardize(x)
y_1 = _standardize(y)
z_1 = _standardize(z)
print(np.mean(x), np.mean(y), np.mean(z))

print('x:', x_1, 'y:', y_1, 'z:', z_1)

# --- Derived features for the wage regression ------------------------------
# Copy of the degree-type label; kept as its own column because it is
# displayed next to stud_age below.
df['year_high_edu'] = df['type_study_DU']

# Years of higher education assigned to each degree type. Labels that are
# not in this mapping become NaN (Series.map behaviour).
df['years_high_edu'] = df['year_high_edu'].map({
    'PREGRADO': 4,
    'MAESTRIA': 6,
    'DOCTORADO': 11,
    'POSDOCTORADO': 13,
    'ESPECIALIDADES MEDICAS': 8,
})

print(df['years_high_edu'])

# Natural log of the yearly salary.
df['log_year_salary'] = df.yearly_salary.apply(np.log)

print(df[['stud_age', 'year_high_edu']])

# experience = age - years of higher education - 6
# NOTE(review): the constant 6 is presumably the school starting age -- confirm.
df['experience'] = df['stud_age'] - df['years_high_edu'] - 6

from patsy import dmatrices
import statsmodels.formula.api as smf
import patsy
from sklearn.model_selection import train_test_split

# --- Defining y and X -------------------------------------------------------
# Build the design matrices from a formula; "-1" removes patsy's intercept
# column, so X holds only years_high_edu.
y, X = patsy.dmatrices(
    "log_year_salary ~ years_high_edu -1", data=df, return_type="dataframe"
)
y = np.ravel(y)  # flatten the single-column response to 1-D
print(y.shape)
print(X.shape)

# Hold out 30% of the rows for testing. No random_state is set, so the split
# differs from run to run.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30)

print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

lm = linear_model.LinearRegression()

# fit() returns the estimator itself, so `model` and `lm` are the same object.
model = lm.fit(Xtrain, ytrain)

# Note: this R^2 is computed on the training split, not on the test split.
print("R^2:", model.score(Xtrain, ytrain))
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

predictions = lm.predict(Xtest)
print(predictions.shape)

# The original referenced `Predictions` and `Ytest`, which are undefined
# names (Python is case-sensitive); the variables are `predictions`/`ytest`.
print(predictions)

print(ytest)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值