[优达 机器学习入门]课程8:异常值

##带有异常值的回归斜率

from sklearn import linear_model
# Fit a linear regression on the training data (outliers still included).
# fit() returns the estimator itself, so construction and training are chained.
reg = linear_model.LinearRegression().fit(ages_train, net_worths_train)
# Show the learned slope coefficient(s).
print(reg.coef_)

##带有异常值的回归分数

# Report the fitted model's score on ages_test / net_worths_test
# (for scikit-learn regressors, score() is the R^2 coefficient of determination).
print(reg.score(ages_test, net_worths_test))


##清理后的斜率

#outlier_removal_regression.py
def outlierCleaner(predictions, ages, net_worths):
    """Discard the 10% of points with the largest residual error.

    Args:
        predictions: Model predictions, one per data point.
        ages: Input feature values, aligned with ``predictions``.
        net_worths: Observed target values, aligned with ``predictions``.
            ``net_worths`` and ``predictions`` must support element-wise
            subtraction (e.g. NumPy arrays).

    Returns:
        A list of ``(age, net_worth, error)`` tuples sorted by ascending
        absolute error, containing the ``floor(0.9 * n)`` best-fitting
        points (81 of 90, matching the previously hard-coded count).
    """
    errors = abs(net_worths - predictions)
    ranked = sorted(zip(ages, net_worths, errors), key=lambda row: row[2])
    # Integer arithmetic keeps exactly floor(90%) of the points regardless of
    # the dataset size, instead of assuming 90 training points.
    n_keep = len(ranked) * 9 // 10
    return ranked[:n_keep]


##清理后的分数

#训练——清理——训练
# Second training pass of train -> clean -> train: fit a fresh model on
# `ages` / `net_worths` (presumably the cleaned training data at this point).
# fit() returns the estimator itself, so construction and training are chained.
reg = linear_model.LinearRegression().fit(ages, net_worths)
# Slope after retraining, then the score on the test data.
print(reg.coef_)
print(reg.score(ages_test, net_worths_test))


##识别最大的安然异常值

#!/usr/bin/python

import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit


### Read in the data dictionary and convert it to a numpy array.
# NOTE: pickle can execute arbitrary code on load; only open trusted dataset files.
# The context manager closes the file handle as soon as the data is loaded.
with open("./tools/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)
print(data)
# Plot every (salary, bonus) pair; extreme points stand out visually.
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter( salary, bonus )

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

##还有更多异常值吗?

#!/usr/bin/python

import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit


### Read in the data dictionary and convert it to a numpy array.
# Pickle files must be opened in binary mode ("rb"); text mode fails on Python 3.
# NOTE: pickle can execute arbitrary code on load; only open trusted dataset files.
with open("../final_project/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# answer
# Drop the 'TOTAL' entry (the outlier found in the previous step); the default
# of 0 keeps pop() from raising KeyError if the key is absent.
data_dict.pop( 'TOTAL', 0 )

features = ["salary", "bonus"]
data = featureFormat(data_dict, features)


# Plot every remaining (salary, bonus) pair.
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter( salary, bonus )

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

##再识别两个异常值

#!/usr/bin/python

import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit


### Read in the data dictionary and convert it to a numpy array.
# Pickle files must be opened in binary mode ("rb"); text mode fails on Python 3.
# NOTE: pickle can execute arbitrary code on load; only open trusted dataset files.
with open("../final_project/final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# answer
# Drop the 'TOTAL' entry (the outlier found earlier); the default of 0 keeps
# pop() from raising KeyError if the key is absent.
data_dict.pop( 'TOTAL', 0 )

features = ["salary", "bonus"]
data = featureFormat(data_dict, features)


# Print the names of entries with a bonus above 5 million and a salary above
# 1 million. Records holding the string 'NaN' are skipped first, because
# comparing a string with a number raises TypeError on Python 3.
for item, record in data_dict.items():
    if record['bonus'] == 'NaN' or record['salary'] == 'NaN':
        continue
    if record['bonus'] > 5e6 and record['salary'] > 1e6:
        print(item)

# Plot every remaining (salary, bonus) pair.
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter( salary, bonus )

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值