## Regression slope with outliers
from sklearn import linear_model

# Build the linear model and fit it in one expression; fit() hands back the
# estimator itself, so `reg` ends up bound to the fitted model.
# NOTE(review): ages_train / net_worths_train are defined outside this
# snippet -- presumably the training split; confirm against the caller.
reg = linear_model.LinearRegression().fit(ages_train, net_worths_train)
print(reg.coef_)

## Regression score with outliers
# Score the fitted model on ages_test / net_worths_test (also defined
# outside this snippet).
print(reg.score(ages_test, net_worths_test))
## Slope after cleaning
#outlier_removal_regression.py
def outlierCleaner(predictions, ages, net_worths, max_points=81):
    """Keep the points whose predictions have the smallest absolute error.

    Every point becomes an ``(age, net_worth, error)`` tuple, where ``error``
    is ``abs(net_worth - prediction)``.  The tuples are sorted by ascending
    error and only the first ``max_points`` of them are returned.

    Args:
        predictions: Predicted net worths.  Must support element-wise
            subtraction from ``net_worths`` (e.g. a NumPy array of the same
            shape).
        ages: Ages, iterated in lockstep with ``net_worths``.
        net_worths: Observed net worths.
        max_points: Maximum number of tuples to return.  Defaults to 81,
            the value that used to be hard-coded in the slice.

    Returns:
        A list of at most ``max_points`` ``(age, net_worth, error)`` tuples,
        ordered from smallest to largest error.

    Raises:
        ValueError: If ``max_points`` is negative.
    """
    # A negative bound would silently drop points from the *end* of the
    # ranking via slice semantics, which is never what the caller wants.
    if max_points < 0:
        raise ValueError("max_points must be non-negative")

    errors = abs(net_worths - predictions)
    # sorted() is stable, so points with equal error keep their input order.
    ranked = sorted(zip(ages, net_worths, errors), key=lambda row: row[2])
    return ranked[:max_points]
## Score after cleaning
# Train -> clean -> retrain
# NOTE(review): `ages` / `net_worths` are defined outside this snippet --
# presumably the data left after outlier cleaning; confirm against the caller.
reg = linear_model.LinearRegression().fit(ages, net_worths)
# Slope first, then the score on ages_test / net_worths_test.
print(reg.coef_)
print(reg.score(ages_test, net_worths_test))
## Identify the biggest Enron outlier
#!/usr/bin/python
import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

### read in data dictionary, convert to numpy array
# NOTE(security): unpickling can execute arbitrary code; only load dataset
# files that come from a trusted source.
# The context manager closes the file handle as soon as the load finishes.
with open("./tools/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)
print(data)

# Plot each row as its own scatter point.  Presumably the columns follow the
# order of `features` (salary, bonus) -- confirm against featureFormat.
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter(salary, bonus)

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()
## Are there any more outliers?
#!/usr/bin/python
import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

### read in data dictionary, convert to numpy array
# NOTE(security): unpickling can execute arbitrary code; only load dataset
# files that come from a trusted source.
# Pickle data is binary, so the file is opened in "rb" mode (text mode fails
# under Python 3); the context manager closes the handle after loading.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

# answer
# Remove the 'TOTAL' entry before plotting; the default of 0 keeps pop() from
# raising KeyError when the key is absent.
data_dict.pop('TOTAL', 0)
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)

# Plot each row as its own scatter point.  Presumably the columns follow the
# order of `features` (salary, bonus) -- confirm against featureFormat.
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter(salary, bonus)

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()
## Identify two more outliers
#!/usr/bin/python
import pickle
import sys
import matplotlib.pyplot
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

### read in data dictionary, convert to numpy array
# NOTE(security): unpickling can execute arbitrary code; only load dataset
# files that come from a trusted source.
# Pickle data is binary, so the file is opened in "rb" mode (text mode fails
# under Python 3); the context manager closes the handle after loading.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

# answer
# Remove the 'TOTAL' entry before analysis; the default of 0 keeps pop() from
# raising KeyError when the key is absent.
data_dict.pop('TOTAL', 0)
features = ["salary", "bonus"]
data = featureFormat(data_dict, features)

# Print every key whose bonus exceeds 5e6 and whose salary exceeds 1e6.
for item in data_dict:
    record = data_dict[item]
    # Missing values are stored as the string 'NaN'; skip them so the numeric
    # comparisons below only ever see real numbers.
    if record['bonus'] != 'NaN' and record['salary'] != 'NaN':
        if record['bonus'] > 5e6 and record['salary'] > 1e6:
            # print() call form works on both Python 2 and Python 3.
            print(item)

# Plot each row as its own scatter point.  Presumably the columns follow the
# order of `features` (salary, bonus) -- confirm against featureFormat.
for point in data:
    salary = point[0]
    bonus = point[1]
    matplotlib.pyplot.scatter(salary, bonus)

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()