Reference: https://github.com/jadianes/spark-py-notebooks
MLlib: Basic Statistics and Exploratory Data Analysis
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext, SparkConf

f = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
                       "kddcup.data_10_percent.gz")
data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating an RDD from a file
raw_data = sc.textFile(data_file)

# Local vectors
# An RDD of dense vectors
import numpy as np

def parse_interaction(line):
    line_split = line.split(",")
    # keep just numeric and logical values
    symbolic_indexes = [1, 2, 3, 41]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return np.array([float(x) for x in clean_line_split])

vector_data = raw_data.map(parse_interaction)

# Inspect the element type of one parsed vector (each element is a NumPy array)
print vector_data.first().dtype

# Summary statistics
from pyspark.mllib.stat import Statistics
from math import sqrt

# Compute column summary statistics.
summary = Statistics.colStats(vector_data)

print "Duration Statistics:"
print " Mean: {}".format(round(summary.mean()[0], 3))
print " St. deviation: {}".format(round(sqrt(summary.variance()[0]), 3))
print " Max value: {}".format(round(summary.max()[0], 3))
print " Min value: {}".format(round(summary.min()[0], 3))
print " Total value count: {}".format(summary.count())
print " Number of non-zero values: {}".format(summary.numNonzeros()[0])

# Summary statistics by label
def parse_interaction_with_key(line):
    line_split = line.split(",")
    # keep just numeric and logical values
    symbolic_indexes = [1, 2, 3, 41]
    clean_line_split = [item for i, item in enumerate(line_split) if i not in symbolic_indexes]
    return (line_split[41], np.array([float(x) for x in clean_line_split]))

label_vector_data = raw_data.map(parse_interaction_with_key)
normal_label_data = label_vector_data.filter(lambda x: x[0] == "normal.")

normal_summary = Statistics.colStats(normal_label_data.values())

print "Duration Statistics for label: {}".format("normal")
print " Mean: {}".format(round(normal_summary.mean()[0], 3))
print " St. deviation: {}".format(round(sqrt(normal_summary.variance()[0]), 3))
print " Max value: {}".format(round(normal_summary.max()[0], 3))
print " Min value: {}".format(round(normal_summary.min()[0], 3))
print " Total value count: {}".format(normal_summary.count())
print " Number of non-zero values: {}".format(normal_summary.numNonzeros()[0])

def summary_by_label(raw_data, label):
    label_vector_data = raw_data.map(parse_interaction_with_key).filter(lambda x: x[0] == label)
    return Statistics.colStats(label_vector_data.values())

normal_sum = summary_by_label(raw_data, "normal.")

print "Duration Statistics for label: {}".format("normal")
print " Mean: {}".format(round(normal_sum.mean()[0], 3))
print " St. deviation: {}".format(round(sqrt(normal_sum.variance()[0]), 3))
print " Max value: {}".format(round(normal_sum.max()[0], 3))
print " Min value: {}".format(round(normal_sum.min()[0], 3))
print " Total value count: {}".format(normal_sum.count())
print " Number of non-zero values: {}".format(normal_sum.numNonzeros()[0])

guess_passwd_summary = summary_by_label(raw_data, "guess_passwd.")

print "Duration Statistics for label: {}".format("guess_passwd")
print " Mean: {}".format(round(guess_passwd_summary.mean()[0], 3))
print " St. deviation: {}".format(round(sqrt(guess_passwd_summary.variance()[0]), 3))
print " Max value: {}".format(round(guess_passwd_summary.max()[0], 3))
print " Min value: {}".format(round(guess_passwd_summary.min()[0], 3))
print " Total value count: {}".format(guess_passwd_summary.count())
print " Number of non-zero values: {}".format(guess_passwd_summary.numNonzeros()[0])

label_list = ["back.", "buffer_overflow.", "ftp_write.", "guess_passwd.",
              "imap.", "ipsweep.", "land.", "loadmodule.", "multihop.",
              "neptune.", "nmap.", "normal.", "perl.", "phf.", "pod.", "portsweep.",
              "rootkit.", "satan.", "smurf.", "spy.", "teardrop.", "warezclient.",
              "warezmaster."]

stats_by_label = [(label, summary_by_label(raw_data, label)) for label in label_list]

duration_by_label = [
    (stat[0], np.array([float(stat[1].mean()[0]),
                        float(sqrt(stat[1].variance()[0])),
                        float(stat[1].min()[0]),
                        float(stat[1].max()[0]),
                        int(stat[1].count())]))
    for stat in stats_by_label]

import pandas as pd
pd.set_option('display.max_columns', 50)

stats_by_label_df = pd.DataFrame.from_items(duration_by_label,
                                            columns=["Mean", "Std Dev", "Min", "Max", "Count"],
                                            orient='index')

print "Duration statistics, by label"
print stats_by_label_df

def get_variable_stats_df(stats_by_label, column_i):
    column_stats_by_label = [
        (stat[0], np.array([float(stat[1].mean()[column_i]),
                            float(sqrt(stat[1].variance()[column_i])),
                            float(stat[1].min()[column_i]),
                            float(stat[1].max()[column_i]),
                            int(stat[1].count())]))
        for stat in stats_by_label]
    return pd.DataFrame.from_items(column_stats_by_label,
                                   columns=["Mean", "Std Dev", "Min", "Max", "Count"],
                                   orient='index')

# Same duration table, now produced by the generic helper
print get_variable_stats_df(stats_by_label, 0)

print "src_bytes statistics, by label"
print get_variable_stats_df(stats_by_label, 1)

# Correlations
correlation_matrix = Statistics.corr(vector_data, method="spearman")

col_names = ["duration", "src_bytes", "dst_bytes", "land", "wrong_fragment",
             "urgent", "hot", "num_failed_logins", "logged_in", "num_compromised",
             "root_shell", "su_attempted", "num_root", "num_file_creations",
             "num_shells", "num_access_files", "num_outbound_cmds",
             "is_hot_login", "is_guest_login", "count", "srv_count", "serror_rate",
             "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
             "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
             "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
             "dst_host_rerror_rate", "dst_host_srv_rerror_rate"]

corr_df = pd.DataFrame(correlation_matrix, index=col_names, columns=col_names)

# get a boolean dataframe where True means that a pair of variables is highly correlated
highly_correlated_df = (abs(corr_df) > .8) & (corr_df < 1.0)

# get the names of the variables so we can use them to slice the dataframe
correlated_vars_index = (highly_correlated_df == True).any()
correlated_var_names = correlated_vars_index[correlated_vars_index == True].index

# slice it
print highly_correlated_df.loc[correlated_var_names, correlated_var_names]
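The correlation step ends with a boolean mask of highly correlated columns. The same information can also be printed as explicit variable pairs, which is easier to scan. This is a minimal follow-up sketch, not part of the original notebook; it assumes the corr_df and col_names objects built in the script above and uses only pandas indexing.

# Sketch: list every pair of distinct variables whose absolute Spearman
# correlation exceeds 0.8 (assumes corr_df and col_names from above).
for i, name_a in enumerate(col_names):
    for name_b in col_names[i + 1:]:
        r = corr_df.loc[name_a, name_b]
        if abs(r) > 0.8:
            print "{} ~ {}: {}".format(name_a, name_b, round(r, 3))

These pairs are what motivate dropping redundant columns when building the reduced logistic regression model in the next section.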
MLlib: Classification with Logistic Regression
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext, SparkConf

f = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
                       "kddcup.data_10_percent.gz")
data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating an RDD from a file
raw_data = sc.textFile(data_file)
print "Train data size is {}".format(raw_data.count())

ft = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz")
test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
print "Test data size is {}".format(test_raw_data.count())

# Labeled Points
# Preparing the training data
from pyspark.mllib.regression import LabeledPoint
from numpy import array

def parse_interaction(line):
    line_split = line.split(",")
    # leave_out = [1,2,3,41]
    clean_line_split = line_split[0:1] + line_split[4:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = raw_data.map(parse_interaction)

# Preparing the test data
test_data = test_raw_data.map(parse_interaction)

# Detecting network attacks using Logistic Regression
# Training a classifier
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from time import time

# Build the model
t0 = time()
logit_model = LogisticRegressionWithLBFGS.train(training_data)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt, 3))

# Evaluating the model on new data
labels_and_preds = test_data.map(lambda p: (p.label, logit_model.predict(p.features)))
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4))

# ----------------------------------------------------------------------------
# Model selection
# Using a correlation matrix
# Evaluating the new model
def parse_interaction_corr(line):
    line_split = line.split(",")
    # leave_out = [1,2,3,25,27,35,38,40,41]
    clean_line_split = line_split[0:1] + line_split[4:25] + line_split[26:27] + \
                       line_split[28:35] + line_split[36:38] + line_split[39:40]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

corr_reduced_training_data = raw_data.map(parse_interaction_corr)
corr_reduced_test_data = test_raw_data.map(parse_interaction_corr)

# Build the model
t0 = time()
logit_model_2 = LogisticRegressionWithLBFGS.train(corr_reduced_training_data)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt, 3))

labels_and_preds = corr_reduced_test_data.map(lambda p: (p.label, logit_model_2.predict(p.features)))
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(corr_reduced_test_data.count())
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4))

# ----------------------------------------------------------------------------
# Using hypothesis testing
feature_names = ["land", "wrong_fragment",
                 "urgent", "hot", "num_failed_logins", "logged_in", "num_compromised",
                 "root_shell", "su_attempted", "num_root", "num_file_creations",
                 "num_shells", "num_access_files", "num_outbound_cmds",
                 "is_hot_login", "is_guest_login", "count", "srv_count", "serror_rate",
                 "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
                 "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
                 "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
                 "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
                 "dst_host_rerror_rate", "dst_host_srv_rerror_rate"]

def parse_interaction_categorical(line):
    line_split = line.split(",")
    clean_line_split = line_split[6:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data_categorical = raw_data.map(parse_interaction_categorical)

from pyspark.mllib.stat import Statistics

chi = Statistics.chiSqTest(training_data_categorical)

import pandas as pd
pd.set_option('display.max_colwidth', 30)

records = [(result.statistic, result.pValue) for result in chi]

chi_df = pd.DataFrame(data=records, index=feature_names, columns=["Statistic", "p-value"])
print chi_df

# Evaluating the new model
def parse_interaction_chi(line):
    line_split = line.split(",")
    # leave_out = [1,2,3,6,19,41]
    clean_line_split = line_split[0:1] + line_split[4:6] + line_split[7:19] + line_split[20:41]
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0
    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data_chi = raw_data.map(parse_interaction_chi)
test_data_chi = test_raw_data.map(parse_interaction_chi)

# Build the model
t0 = time()
logit_model_chi = LogisticRegressionWithLBFGS.train(training_data_chi)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt, 3))

# Evaluate on the test data
labels_and_preds = test_data_chi.map(lambda p: (p.label, logit_model_chi.predict(p.features)))
t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data_chi.count())
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4))
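Accuracy alone can be hard to interpret here because attack and normal interactions are not balanced in the test set. A small sketch of a finer-grained check, reusing the (label, prediction) pairs already held in labels_and_preds for the chi-square model above; the tp/tn/fp/fn names are just illustrative, not part of the original notebook.

# Sketch: break accuracy down into a confusion matrix and per-class rates,
# assuming labels_and_preds holds (true_label, predicted_label) pairs.
tp = labels_and_preds.filter(lambda (v, p): v == 1.0 and p == 1.0).count()
tn = labels_and_preds.filter(lambda (v, p): v == 0.0 and p == 0.0).count()
fp = labels_and_preds.filter(lambda (v, p): v == 0.0 and p == 1.0).count()
fn = labels_and_preds.filter(lambda (v, p): v == 1.0 and p == 0.0).count()
print "TP={} TN={} FP={} FN={}".format(tp, tn, fp, fn)
print "Attack precision: {}".format(round(tp / float(tp + fp), 4))
print "Attack recall: {}".format(round(tp / float(tp + fn), 4))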
MLlib: Decision Trees
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext, SparkConf

f = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
                       "kddcup.data_10_percent.gz")
data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating an RDD from a file
raw_data = sc.textFile(data_file)
print "Train data size is {}".format(raw_data.count())

ft = urllib.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz", "corrected.gz")
test_data_file = "./corrected.gz"
test_raw_data = sc.textFile(test_data_file)
print "Test data size is {}".format(test_raw_data.count())

# Detecting network attacks using Decision Trees
# Preparing the data
from pyspark.mllib.regression import LabeledPoint
from numpy import array

csv_data = raw_data.map(lambda x: x.split(","))
test_csv_data = test_raw_data.map(lambda x: x.split(","))

protocols = csv_data.map(lambda x: x[1]).distinct().collect()
services = csv_data.map(lambda x: x[2]).distinct().collect()
flags = csv_data.map(lambda x: x[3]).distinct().collect()

def create_labeled_point(line_split):
    # leave_out = [41]
    clean_line_split = line_split[0:41]

    # convert protocol to numeric categorical variable
    try:
        clean_line_split[1] = protocols.index(clean_line_split[1])
    except:
        clean_line_split[1] = len(protocols)

    # convert service to numeric categorical variable
    try:
        clean_line_split[2] = services.index(clean_line_split[2])
    except:
        clean_line_split[2] = len(services)

    # convert flag to numeric categorical variable
    try:
        clean_line_split[3] = flags.index(clean_line_split[3])
    except:
        clean_line_split[3] = len(flags)

    # convert label to binary label
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data = csv_data.map(create_labeled_point)
test_data = test_csv_data.map(create_labeled_point)

# Training a classifier
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from time import time

# Build the model
t0 = time()
tree_model = DecisionTree.trainClassifier(training_data, numClasses=2,
                                          categoricalFeaturesInfo={1: len(protocols),
                                                                   2: len(services),
                                                                   3: len(flags)},
                                          impurity='gini', maxDepth=4, maxBins=100)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt, 3))

# Evaluating the model
predictions = tree_model.predict(test_data.map(lambda p: p.features))
labels_and_preds = test_data.map(lambda p: p.label).zip(predictions)

t0 = time()
test_accuracy = labels_and_preds.filter(lambda (v, p): v == p).count() / float(test_data.count())
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4))

# Interpreting the model
print "Learned classification tree model:"
print tree_model.toDebugString()

print "Service 0 is {}".format(services[0])
print "Service 52 is {}".format(services[52])

# Building a minimal model using the three main splits
def create_labeled_point_minimal(line_split):
    # leave_out = [41]
    clean_line_split = line_split[3:4] + line_split[5:6] + line_split[22:23]

    # convert flag to numeric categorical variable
    try:
        clean_line_split[0] = flags.index(clean_line_split[0])
    except:
        clean_line_split[0] = len(flags)

    # convert label to binary label
    attack = 1.0
    if line_split[41] == 'normal.':
        attack = 0.0

    return LabeledPoint(attack, array([float(x) for x in clean_line_split]))

training_data_minimal = csv_data.map(create_labeled_point_minimal)
test_data_minimal = test_csv_data.map(create_labeled_point_minimal)

# Build the model
t0 = time()
tree_model_minimal = DecisionTree.trainClassifier(training_data_minimal, numClasses=2,
                                                  categoricalFeaturesInfo={0: len(flags)},
                                                  impurity='gini', maxDepth=3, maxBins=32)
tt = time() - t0
print "Classifier trained in {} seconds".format(round(tt, 3))

predictions_minimal = tree_model_minimal.predict(test_data_minimal.map(lambda p: p.features))
labels_and_preds_minimal = test_data_minimal.map(lambda p: p.label).zip(predictions_minimal)

t0 = time()
test_accuracy = labels_and_preds_minimal.filter(lambda (v, p): v == p).count() / float(test_data_minimal.count())
tt = time() - t0
print "Prediction made in {} seconds. Test accuracy is {}".format(round(tt, 3), round(test_accuracy, 4))
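The DecisionTreeModel import at the top of this script is only needed if the trained tree is written to disk and read back. A minimal sketch of that round trip, assuming Spark 1.3 or later (when MLlib model save/load became available) and a writable path; the path name is purely illustrative.

# Sketch: persist the trained tree and reload it ("kdd_tree_model" is an example path).
tree_model.save(sc, "kdd_tree_model")
loaded_model = DecisionTreeModel.load(sc, "kdd_tree_model")
print loaded_model.toDebugString()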