## Python回归简介

### 线性回归Python实现

# Import necessary packages
import pandas as pd
# NOTE: the original had `%matplotlib inline` here — that is IPython/Jupyter
# magic and a SyntaxError in a plain .py script, so it is commented out.
# %matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn import datasets
from sklearn import linear_model
import numpy as np

# Load the Boston housing data — the original referenced `boston` without
# ever defining it (NameError).
# NOTE(review): load_boston was removed in scikit-learn 1.2; on newer
# versions substitute another regression dataset.
boston = datasets.load_boston()

# Target: median house value (in $1000s); predictor: column 5 = avg rooms.
# reshape(-1, 1) turns each 1-D array into the (n_samples, 1) column
# that scikit-learn estimators expect.
yb = boston.target.reshape(-1, 1)
Xb = boston['data'][:, 5].reshape(-1, 1)

# Plot data (the original label used '\$', an invalid escape that
# rendered a literal backslash)
plt.scatter(Xb, yb)
plt.ylabel('value of house /1000 ($)')
plt.xlabel('number of rooms')

# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model on the full data (no train/test split in this demo)
regr.fit(Xb, yb)

# Plot the fitted line over the raw points
plt.scatter(Xb, yb, color='black')
plt.plot(Xb, regr.predict(Xb), color='blue', linewidth=3)
plt.show()

### 逻辑回归Python实现

# Synthesize data: 150 latent normal draws; label is 1 when the draw is
# positive, then positives are stretched and noise is added so the two
# classes overlap slightly.
X1 = np.random.normal(size=150)
y1 = (X1 > 0).astype(float)  # np.float was removed in NumPy 1.24; use builtin float
X1[X1 > 0] *= 4
X1 += .3 * np.random.normal(size=150)
X1 = X1.reshape(-1, 1)  # column vector, as scikit-learn expects

# Run the classifier
clf = linear_model.LogisticRegression()
clf.fit(X1, y1)

# Sort the predictor so the probability curve plots left-to-right —
# the original referenced X1_ordered without ever defining it (NameError).
X1_ordered = np.sort(X1, axis=0)

# Plot the result: raw points plus the fitted P(y=1 | x) sigmoid
plt.scatter(X1.ravel(), y1, color='black', zorder=20, alpha=0.5)
plt.plot(X1_ordered, clf.predict_proba(X1_ordered)[:, 1], color='blue', linewidth=3)
plt.ylabel('target variable')
plt.xlabel('predictor variable')
plt.show()

## 逻辑回归和数据缩放：红酒数据集

# Import necessary modules
from sklearn import linear_model
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Load the UCI red-wine quality data (trailing space removed from the URL,
# which would otherwise break the request on strict servers)
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';')
# Positional axis argument to drop() was removed in pandas 2.0 — use axis=1.
X = df.drop('quality', axis=1).values  # drop target variable
y1 = df['quality'].values
y = y1 <= 5  # binarized target: is the rating <= 5?

# plot histograms of the original target variable
# and the aggregated (binarized) target variable side by side
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
plt.hist(y1)
plt.xlabel('original target value')
plt.ylabel('count')
plt.subplot(1, 2, 2)
plt.hist(y)
plt.xlabel('aggregated target value')
plt.show()

# Hold out 20% of the samples as a test set; fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Baseline: logistic regression on the raw (unscaled) features
lr = linear_model.LogisticRegression()
lr = lr.fit(X_train, y_train)
train_score = lr.score(X_train, y_train)
print('Logistic Regression score for training set: %f' % train_score)

# Per-class precision/recall/F1 on the held-out test set
from sklearn.metrics import classification_report
y_true = y_test
y_pred = lr.predict(X_test)
print(classification_report(y_true, y_pred))
Logistic Regression score for training set: 0.752932
precision    recall  f1-score   support

False       0.78      0.74      0.76       179
True       0.69      0.74      0.71       141

avg / total       0.74      0.74      0.74       320

from sklearn.preprocessing import scale

# Center and scale all features to zero mean / unit variance, then
# re-split with the same seed so the rows match the unscaled split.
Xs = scale(X)
Xs_train, Xs_test, y_train, y_test = train_test_split(Xs, y, test_size=0.2, random_state=42)

# Train a FRESH estimator — the original `lr_2 = lr.fit(...)` re-fit the
# earlier unscaled model in place, leaving `lr` and `lr_2` as aliases of
# the same (now scaled-data) object.
lr_2 = linear_model.LogisticRegression().fit(Xs_train, y_train)
print('Scaled Logistic Regression score for test set: %f' % lr_2.score(Xs_test, y_test))
y_true, y_pred = y_test, lr_2.predict(Xs_test)
print(classification_report(y_true, y_pred))
Scaled Logistic Regression score for test set: 0.740625
precision    recall  f1-score   support

False       0.79      0.74      0.76       179
True       0.69      0.74      0.72       141

avg / total       0.74      0.74      0.74       320

# Set sc = True if you want to scale your features
sc = False

# Load the data (trailing space removed from the URL)
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';')
# Positional axis argument to drop() was removed in pandas 2.0 — use axis=1.
X = df.drop('quality', axis=1).values  # drop target variable

# Here we scale, if desired.
# The original body `X = scale(X)` was NOT indented under the `if`,
# which is an IndentationError; `== True` is also un-idiomatic.
if sc:
    X = scale(X)

# Target value
y1 = df['quality'].values  # original target variable
y = y1 <= 5  # new target variable: is the rating <= 5?

# Split the data into a test set and a training set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model and print performance on the test set
lr = linear_model.LogisticRegression()
lr = lr.fit(X_train, y_train)
print('Logistic Regression score for training set: %f' % lr.score(X_train, y_train))
y_true, y_pred = y_test, lr.predict(X_test)
print(classification_report(y_true, y_pred))
<script.py> output:
Logistic Regression score for training set: 0.752932
precision    recall  f1-score   support

False       0.78      0.74      0.76       179
True       0.69      0.74      0.71       141

avg / total       0.74      0.74      0.74       320

### 术语表

K近邻（k-Nearest Neighbors）：分类任务的一种算法，一个数据点的标签由离它最近的k个数据点（邻居）投票决定。

#### 机器学习基础与实践（一）----数据清洗

2016-06-23 14:39:26

#### 机器学习——数据预处理

2014-08-19 16:40:12

#### 机器学习基础（三十七） —— 处理类别特征

2016-03-31 22:42:29

#### 机器学习里数据预处理及特征工程总结

2017-11-30 11:58:11

#### 机器学习_特征处理

2018-01-07 10:29:50

#### 数据科学与机器学习管道中预处理的重要性（一）：中心化、缩放和K近邻

2016-05-21 21:50:41

#### 数据科学与机器学习管道中预处理的重要性（二）：中心化、缩放和逻辑回归

2016-05-23 22:08:07

#### 机器学习作业3 - 中心化的作用

2017-10-12 19:51:06

#### 机器学习中的特征缩放（feature scaling）浅谈

2017-12-23 18:51:56

#### 机器学习数据预处理

2015-11-07 22:06:16