Learning scikit-learn: Machine Learning in Python


Chapter 4: Advanced Features - Feature Engineering and Selection

%pylab inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

titanic = pd.read_csv('data/titanic.csv')
print titanic

print titanic.head()[['pclass', 'survived', 'age', 'embarked', 'boat', 'sex']]  # note the syntax: the list of columns goes inside [[...]] after head()

titanic.describe()

from sklearn import feature_extraction

def one_hot_dataframe(data, cols, replace=False):                       # feature extraction: one-hot encode the given categorical columns
    vec = feature_extraction.DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)     # note this lambda: it is applied row by row via pandas' apply, so row is a Series taken from the DataFrame data[cols]
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())   # note the use of apply with axis=1 (row-wise)
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)  # join attaches the encoded columns back onto the original frame
    return (data, vecData)

titanic, titanic_n = one_hot_dataframe(titanic, ['pclass', 'embarked', 'sex'], replace=True)
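
To see what DictVectorizer is doing inside one_hot_dataframe, here is a minimal sketch on a hypothetical two-row input (the values are only illustrative): every string-valued (column, value) pair becomes its own 0/1 column, which is exactly how 'pclass', 'embarked' and 'sex' get expanded above.

from sklearn.feature_extraction import DictVectorizer

toy = [{'pclass': '1st', 'sex': 'female'},
       {'pclass': '3rd', 'sex': 'male'}]
toy_vec = DictVectorizer()
print toy_vec.fit_transform(toy).toarray()   # one 0/1 column per (column, value) pair
print toy_vec.get_feature_names()            # ['pclass=1st', 'pclass=3rd', 'sex=female', 'sex=male']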


print titanic_n.head(5)
print titanic_n[titanic_n['embarked'] != 0].head()


print titanic.head()
titanic, titanic_n = one_hot_dataframe(titanic, ['home.dest', 'room', 'ticket', 'boat'], replace=True)

print titanic['age'].describe()
mean = titanic['age'].mean()
titanic['age'].fillna(mean, inplace=True)
print titanic['age'].describe()

titanic.fillna(0, inplace=True)

from sklearn.cross_validation import train_test_split
titanic_target = titanic['survived']
titanic_data = titanic.drop(['name', 'row.names', 'survived'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(titanic_data, titanic_target, test_size=0.25, random_state=33)


from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)

import pydot, StringIO
dot_data = StringIO.StringIO()
tree.export_graphviz(dt, out_file=dot_data, feature_names=titanic_data.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('titanic.png')
from IPython.core.display import Image
Image(filename='titanic.png')



from sklearn import feature_selection
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)   # keep the top 20% of features ranked by chi2 score
X_train_fs = fs.fit_transform(X_train, y_train)
print titanic_data.columns[fs.get_support()]
print fs.scores_[2]
print titanic_data.columns[2]
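
The measure_performance helper called below is defined in an earlier chapter of the book and is not repeated here. A minimal sketch of what it is assumed to do (print the accuracy and, optionally, the classification report and confusion matrix of a fitted classifier on a held-out set; the misspelled show_confussion_matrix keyword is kept so the calls below work unchanged):

from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True,
                        show_classification_report=True,
                        show_confussion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n"
    if show_classification_report:
        print "Classification report"
        print metrics.classification_report(y, y_pred), "\n"
    if show_confussion_matrix:
        print "Confusion matrix"
        print metrics.confusion_matrix(y, y_pred), "\n"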

dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=False)




from sklearn import cross_validation

percentiles = range(1, 100, 5)   # plot the relationship between the percentage of features kept and accuracy
results = []
for i in range(1, 100, 5):
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=5)
    #print i, scores.mean()
    results = np.append(results, scores.mean())

optimal_percentil = np.where(results == results.max())[0][0]   # index of the best-scoring percentile
print "Optimal percentile of features:{0}".format(percentiles[optimal_percentil]), "\n"

# Plot number of features VS. cross-validation scores
import pylab as pl
pl.figure()
pl.xlabel("Number of features selected")
pl.ylabel("Cross-validation accuracy")
pl.plot(percentiles, results)
print "Mean scores:", results

fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=percentiles[optimal_percentil])
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=False)

Model selection

dt = tree.DecisionTreeClassifier(criterion='entropy')
scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=5)
print "Entropy criterion accuracy on cv: {0:.3f}".format(scores.mean())
dt = tree.DecisionTreeClassifier(criterion='gini')
scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=5)
print "Gini criterion accuracy on cv: {0:.3f}".format(scores.mean())

dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False, show_classification_report=False)



Model Selection

from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')

n_samples = 3000

X = news.data[:n_samples]
y = news.target[:n_samples]

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


def get_stop_words():
    result = set()
    for line in open('stopwords_en.txt', 'r').readlines():
        result.add(line.strip())
    return result

stop_words = get_stop_words()
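
If the stopwords_en.txt file from the book's companion material is not at hand, one rough substitute (an assumption on my part; the two lists are not identical, so cross-validation scores may differ slightly) is scikit-learn's built-in English stop word list:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stop_words = set(ENGLISH_STOP_WORDS)   # fallback stop word list, not the one used in the book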

clf = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
    )),
    ('nb', MultinomialNB(alpha=0.01)),
])

from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross-validation iterator with K folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print ("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores))
		
		
evaluate_cross_validation(clf, X, y, 3)

def calc_params(X, y, clf, param_values, param_name, K):
    # initialize training and testing scores with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))
    
    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print param_name, ' = ', param_value
        
        # set classifier parameters
        clf.set_params(**{param_name: param_value})      # note the use of ** to unpack the parameter dict
        
        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)
        
        # create KFold cross validation
        cv = KFold(n_samples, K, shuffle=True, random_state=0)
        
        # iterate over the K folds
        for j, (train, test) in enumerate(cv):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit([X[k] for k in train], y[train])
            k_train_scores[j] = clf.score([X[k] for k in train], y[train])
            k_test_scores[j] = clf.score([X[k] for k in test], y[test])
            
        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)
       
    # plot the training and testing scores in a log scale
    plt.semilogx(param_values, train_scores, alpha=0.4, lw=2, c='b')
    plt.semilogx(param_values, test_scores, alpha=0.4, lw=2, c='g')
    
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    # return the training and testing scores on each parameter value
    return train_scores, test_scores
	
	
alphas = np.logspace(-7, 0, 8)
print alphas

train_scores, test_scores = calc_params(X, y, clf, alphas, 'nb__alpha', 3)

from sklearn.svm import SVC

clf = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
    )),
    ('svc', SVC()),
])
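
Before running a full grid search, the calc_params helper defined above can be reused to sweep a single SVM hyperparameter; a sketch, assuming the same three folds, over the same gamma range that the grid below explores:

gammas = np.logspace(-2, 1, 4)
train_scores, test_scores = calc_params(X, y, clf, gammas, 'svc__gamma', 3)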


from sklearn.grid_search import GridSearchCV

parameters = {
    'svc__gamma': np.logspace(-2, 1, 4),
    'svc__C': np.logspace(-1, 1, 3),
}

clf = Pipeline([
    ('vect', TfidfVectorizer(
                stop_words=stop_words,
                token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
    )),
    ('svc', SVC()),
])

gs = GridSearchCV(clf, parameters, verbose=2, refit=False, cv=3)
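
The grid search object is only constructed above; to actually run it, fit it on the data and inspect the best parameter combination (with refit=False the winning model is not retrained on the full set, but best_params_ and best_score_ are still available). A minimal usage sketch:

gs.fit(X, y)
print gs.best_params_, gs.best_score_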

Parallelizing

from sklearn.externals import joblib
from sklearn.cross_validation import ShuffleSplit
import os

def persist_cv_splits(X, y, K=3, name='data', suffix="_cv_%03d.pkl"):
    """Dump K folds to filesystem."""
    
    cv_split_filenames = []
    
    # create KFold cross validation
    cv = KFold(n_samples, K, shuffle=True, random_state=0)
    
    # iterate over the K folds
    for i, (train, test) in enumerate(cv):
        cv_fold = ([X[k] for k in train], y[train], [X[k] for k in test], y[test])
        cv_split_filename = name + suffix % i
        cv_split_filename = os.path.abspath(cv_split_filename)
        joblib.dump(cv_fold, cv_split_filename)
        cv_split_filenames.append(cv_split_filename)
    
    return cv_split_filenames
	
def compute_evaluation(cv_split_filename, clf, params):
    
    # All module imports should be executed in the worker namespace
    from sklearn.externals import joblib

    # load the fold training and testing partitions from the filesystem
    X_train, y_train, X_test, y_test = joblib.load(
        cv_split_filename, mmap_mode='c')
    
    clf.set_params(**params)
    clf.fit(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    return test_score
	
	
from sklearn.grid_search import IterGrid

def parallel_grid_search(lb_view, clf, cv_split_filenames, param_grid):
    
    all_tasks = []
    all_parameters = list(IterGrid(param_grid))
    
    # iterate over parameter combinations
    for i, params in enumerate(all_parameters):
        task_for_params = []
        
        # iterate over the K folds
        for j, cv_split_filename in enumerate(cv_split_filenames):    
            t = lb_view.apply(
                compute_evaluation, cv_split_filename, clf, params)
            task_for_params.append(t) 
        
        all_tasks.append(task_for_params)
        
    return all_parameters, all_tasks
	
from sklearn.svm import SVC
from IPython.parallel import Client

client = Client()
lb_view = client.load_balanced_view()

# the fold files must be written to disk first so each engine can load its own split
cv_filenames = persist_cv_splits(X, y)

all_parameters, all_tasks = parallel_grid_search(
    lb_view, clf, cv_filenames, parameters)
   
 
def print_progress(tasks):
    progress = np.mean([task.ready() for task_group in tasks
                                 for task in task_group])
    print "Tasks completed: {0}%".format(100 * progress)
	
	
def find_bests(all_parameters, all_tasks, n_top=5):
    """Compute the mean score of the completed tasks"""
    mean_scores = []
    
    for param, task_group in zip(all_parameters, all_tasks):
        scores = [t.get() for t in task_group if t.ready()]
        if len(scores) == 0:
            continue
        mean_scores.append((np.mean(scores), param))
                   
    return sorted(mean_scores, reverse=True)[:n_top]
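
A sketch of how the two helpers above are meant to be used while the engines work through the queue: call print_progress from time to time, then look at the current leaders.

print_progress(all_tasks)
print find_bests(all_parameters, all_tasks)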

