# coding=utf-8
"""Predict graduate-school admission with logistic regression.

Dataset columns:
    gre   - Graduate Record Exam score, continuous between 200 and 800.
    gpa   - cumulative grade point average, continuous between 0.0 and 4.0.
    admit - binary label; 1 means the applicant was admitted.

Difference between linear and logistic regression: linear regression
predicts a continuous value, logistic regression predicts the
probability of belonging to a class.
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
# NOTE(review): the original `from sklearn.preprocessing import label` was
# unused and fails on modern scikit-learn, so it was removed.


def logit(x):
    """Logistic (sigmoid) function: exp(x) / (1 + exp(x)).

    Accepts a scalar or a numpy array; returns values in (0, 1).
    """
    return np.exp(x) / (1 + np.exp(x))


def find_logit_():
    """Plot the logistic function over [-6, 6]."""
    t = np.linspace(-6, 6, 50, dtype=float)
    ylogit = logit(t)
    plt.plot(t, ylogit, label='logistic')
    plt.ylabel("Probability")
    plt.xlabel("t")
    plt.title("Logistic Function")
    plt.show()
    # Sanity checks at the tails:
    #   logit(-10) ~ 4.5397868702434395e-05
    #   logit(10)  ~ 0.99995460213129761
    a = logit(-10)
    b = logit(10)


def test_plt(admissions):
    """Scatter-plot gpa against the binary admit label."""
    plt.scatter(admissions["gpa"], admissions["admit"])
    plt.show()


def logit_stand(admissions):
    """Fit logistic regression on gpa and plot the hard 0/1 class predictions."""
    logistic_model = LogisticRegression()
    # Pass y as a 1-D Series: a one-column DataFrame triggers a
    # DataConversionWarning in scikit-learn.
    logistic_model.fit(admissions[['gpa']], admissions['admit'])
    pre = logistic_model.predict(admissions[['gpa']])
    mse = mean_squared_error(admissions['admit'], pre)
    plt.scatter(admissions['gpa'], pre)
    plt.show()


def logit_pro(admissions):
    """Plot the predicted probability of admission (class 1) versus gpa."""
    logistic_model = LogisticRegression()
    logistic_model.fit(admissions[['gpa']], admissions['admit'])
    # predict_proba returns two columns: P(class 0) and P(class 1).
    pre = logistic_model.predict_proba(admissions[['gpa']])
    plt.scatter(admissions['gpa'], pre[:, 1])
    plt.show()


def logit_admin(admissions):
    """Measure accuracy and sensitivity (recall) of the model.

    Note: the model is evaluated on the very data it was trained on;
    a realistic evaluation uses a held-out test set (see test_train).
    """
    logistic_model = LogisticRegression()
    logistic_model.fit(admissions[['gpa']], admissions['admit'])
    labels = logistic_model.predict(admissions[['gpa']])
    admissions['predicted_label'] = labels
    #print(admissions['predicted_label'].value_counts())
    admissions['actual_label'] = admissions['admit']
    matches = admissions['predicted_label'] == admissions['actual_label']
    correct_predictions = admissions[matches]
    # True division (float coercion BEFORE dividing; the original
    # `len(a)/len(b)*1.0` multiplied after an integer division under Py2).
    accuracy = len(correct_predictions) / float(len(admissions))
    print('预测的精度:')
    print(accuracy)
    # Class-imbalance metrics: accuracy alone hides performance on the
    # positive class, so count confusion-matrix cells explicitly.
    # TP: predicted admitted and actually admitted.
    true_positive_filter = (admissions['predicted_label'] == 1) & (admissions['actual_label'] == 1)
    true_positives = len(admissions[true_positive_filter])
    # TN: predicted rejected and actually rejected (computed for reference).
    true_negatives_filter = (admissions['predicted_label'] == 0) & (admissions['actual_label'] == 0)
    true_negatives = len(admissions[true_negatives_filter])
    # FN: predicted rejected but actually admitted.
    false_negatives_filter = (admissions['predicted_label'] == 0) & (admissions['actual_label'] == 1)
    false_negatives = len(admissions[false_negatives_filter])
    # Sensitivity / recall = TP / (TP + FN).
    sensitivity = true_positives / float(true_positives + false_negatives)
    print(sensitivity)


def test_train(admissions):
    """Evaluate with a train/test split, then plot the ROC curve and AUC."""
    np.random.seed(8)
    admissions['actual_label'] = admissions['admit']
    # drop removes a row or column; axis=1 selects columns.
    admissions = admissions.drop('admit', axis=1)
    # np.random.shuffle works in place and returns None;
    # permutation returns the shuffled index instead.
    shuffled_index = np.random.permutation(admissions.index)
    shuffled_admissions = admissions.loc[shuffled_index]
    train = shuffled_admissions.iloc[0:515]
    # .copy() so the predicted_label assignment below writes into a real
    # frame, not a view (avoids SettingWithCopyWarning).
    test = shuffled_admissions.iloc[515:len(shuffled_admissions)].copy()
    logistic_model = LogisticRegression()
    logistic_model.fit(train[['gpa']], train['actual_label'])
    labels = logistic_model.predict(test[['gpa']])
    test['predicted_label'] = labels
    matches = test['predicted_label'] == test['actual_label']
    correct_predictions = test[matches]
    accuracy = len(correct_predictions) / float(len(test))
    #print('预测的精度:')
    #print(accuracy)
    # ROC curve: roc_curve(true labels, score of the positive class)
    # returns (false positive rate, true positive rate, thresholds).
    probabilities = logistic_model.predict_proba(test[['gpa']])
    fpr, tpr, thresholds = metrics.roc_curve(test['actual_label'], probabilities[:, 1])
    # Area under the ROC curve: a single summary score for the model.
    area = metrics.roc_auc_score(test['actual_label'], probabilities[:, 1])
    plt.plot(fpr, tpr)
    plt.show()


if __name__ == '__main__':
    admissions = pd.read_csv("admissions.csv")
    #find_logit_()
    #logit_stand(admissions)
    #logit_pro(admissions)
    #logit_admin(admissions)
    test_train(admissions)
# coding=utf-8
"""Fit a simple linear regression of mpg on vehicle weight (auto-mpg data)."""
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy

# Linear regression.
# (The original `//线性回归` used a C++-style comment, which is a Python
# SyntaxError; replaced with a `#` comment.)
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model year', 'origin', 'car name']
cars = pd.read_table('auto-mpg.data', delim_whitespace=True, names=columns)

# Import the linear model; fit_intercept=True learns a bias term too.
lr = LinearRegression(fit_intercept=True)
# fit(X, y):
#   first argument: sample matrix
#   second argument: label (column vector for a single target,
#   matrix for multiple targets)
lr.fit(cars[['weight']], cars['mpg'])
# Predict on the training inputs.
predictions = lr.predict(cars[['weight']])
#print(predictions[0:5])
#print(cars['mpg'][0:5])

# Mean squared error = sum((prediction - truth)^2) / count
mse = mean_squared_error(cars['mpg'], predictions)
print(mse)
# Root mean squared error, in the same units as mpg.
rmse = mse ** 0.5

# Red: actual mpg; blue: fitted line's predictions.
plt.scatter(cars[['weight']], cars['mpg'], c='red')
plt.scatter(cars[['weight']], predictions, c='blue')
plt.show()
# coding=utf-8
"""One-vs-rest multiclass logistic regression on the auto-mpg data.

Dataset columns: mpg (miles per gallon), cylinders, displacement,
horsepower, weight, acceleration, model year, origin, car name.

To classify among the three origins A, B, C with a binary model,
decompose the problem:
    * treat A as positive, B+C as negative;
    * treat B as positive, A+C as negative;
    * treat C as positive, A+B as negative;
then take the class whose model reports the highest probability.

Steps:
    1. load the data and shuffle it;
    2. split into train and test sets;
    3. fit one logistic regression per class;
    4. pick the class with the maximum predicted probability.
"""
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model year', 'origin', 'car name']
cars = pd.read_table('auto-mpg.data', delim_whitespace=True, names=columns)

# One-hot encode the categorical columns; get_dummies names each new
# column "<prefix>_<value>", then concat glues the new columns onto cars.
dummy_cylinders = pd.get_dummies(cars['cylinders'], prefix='cyl')
cars = pd.concat([cars, dummy_cylinders], axis=1)
dummy_years = pd.get_dummies(cars['model year'], prefix='year')
cars = pd.concat([cars, dummy_years], axis=1)
# The raw categorical columns are no longer needed.
cars = cars.drop('model year', axis=1)
cars = cars.drop('cylinders', axis=1)

# permutation shuffles the row index; iloc reorders the frame by it.
shuffled_rows = np.random.permutation(cars.index)
shuffed_cars = cars.iloc[shuffled_rows]

# 70/30 train/test split.
highest_train_row = int(cars.shape[0] * 0.7)
train = shuffed_cars.iloc[0:highest_train_row]
test = shuffed_cars.iloc[highest_train_row:]

# The distinct origin codes ([1, 2, 3]), sorted in place.
unique_origins = cars['origin'].unique()
unique_origins.sort()

# Only the one-hot cylinder/year columns serve as features.
features = [c for c in train.columns
            if c.startswith('cyl') or c.startswith('year')]
#print(features)

# One-vs-rest: fit a binary model per origin (that origin vs. the rest).
models = {}
X_train = train[features]
for origin in unique_origins:
    clf = LogisticRegression()
    target = train['origin'] == origin
    clf.fit(X_train, target)
    models[origin] = clf

# One column per origin holding each model's positive-class probability.
testing_probs = pd.DataFrame(columns=unique_origins)
X_test = test[features]
for origin in unique_origins:
    testing_probs[origin] = models[origin].predict_proba(X_test)[:, 1]

# Predicted class = column (origin) with the highest probability per row.
predicted_origins = testing_probs.idxmax(axis=1)
print(predicted_origins)