pyspark-RDD

Reference: https://github.com/jadianes/spark-py-notebooks


RDD Creation

 #!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext,SparkConf
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating a RDD from a file
raw_data = sc.textFile(data_file)
raw_data.count() # 494021
raw_data.take(5) # We can also check the first few entries in our data.

# Creating an RDD using parallelize
a = range(100)
data = sc.parallelize(a)
data.count() # 100
data.take(5) # [0, 1, 2, 3, 4]
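
# Aside (illustrative sketch, not part of the original notebook): parallelize
# also accepts an explicit number of partitions, and getNumPartitions shows
# how an RDD is split up.
data_4 = sc.parallelize(a, 4)   # ask for 4 partitions explicitly
data_4.getNumPartitions()       # 4
raw_data.getNumPartitions()     # whatever textFile chose for the gz file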


RDD Basics

 #!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext,SparkConf
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating a RDD from a file
raw_data = sc.textFile(data_file)

# The filter transformation
# Imagine we want to count how many 'normal.' interactions there are in our dataset.
normal_raw_data = raw_data.filter(lambda x: 'normal.' in x)

from time import time
t0 = time()
normal_count = normal_raw_data.count()
tt = time() - t0
print "There are {} 'normal' interactions".format(normal_count)
print "Count completed in {} seconds".format(round(tt,3))

# The map transformation
# By using the map transformation in Spark, we can apply a function to every element in our RDD
from pprint import pprint
csv_data = raw_data.map(lambda x: x.split(","))
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0
print "Parse completed in {} seconds".format(round(tt,3))
pprint(head_rows[0])

t0 = time()
head_rows = csv_data.take(100000)
tt = time() - t0
print "Parse completed in {} seconds".format(round(tt,3))

# Using map and predefined functions
def parse_interaction(line):
    elems = line.split(",")
    tag = elems[41]
    return (tag, elems)

key_csv_data = raw_data.map(parse_interaction)
head_rows = key_csv_data.take(5)
pprint(head_rows[0])
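
# Alternative sketch (assumption: same csv_data as above): keyBy builds an
# equivalent pair RDD by computing only the key for each element.
key_csv_data_alt = csv_data.keyBy(lambda x: x[41])
pprint(key_csv_data_alt.take(1)[0][0])  # the tag of the first interaction, e.g. 'normal.'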

# The collect action
t0 = time()
all_raw_data = raw_data.collect()
tt = time() - t0
print "Data collected in {} seconds".format(round(tt,3))

-------------------------------------------------------------------
# get data from file
data_file = "./kddcup.data_10_percent.gz"
raw_data = sc.textFile(data_file)

# parse into key-value pairs
key_csv_data = raw_data.map(parse_interaction)

# filter normal key interactions
normal_key_interactions = key_csv_data.filter(lambda x: x[0] == "normal.")

# collect all
t0 = time()
all_normal = normal_key_interactions.collect()
tt = time() - t0
normal_count = len(all_normal)
print "Data collected in {} seconds".format(round(tt,3))
print "There are {} 'normal' interactions".format(normal_count)
 

Sampling RDDs

 #!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext,SparkConf
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating a RDD from a file
raw_data = sc.textFile(data_file)

# Sampling RDDs
# The sample transformation
raw_data_sample = raw_data.sample(False, 0.1, 1234) # sample ~10% of the lines, without replacement, seed 1234
sample_size = raw_data_sample.count() # roughly 49400, i.e. ~10% of the file
total_size = raw_data.count() # 494021
print "Sample size is {} of {}".format(sample_size, total_size)


from time import time

# transformations to be applied
raw_data_sample_items = raw_data_sample.map(lambda x: x.split(","))
sample_normal_tags = raw_data_sample_items.filter(lambda x: "normal." in x)

# actions + time
t0 = time()
sample_normal_tags_count = sample_normal_tags.count()
tt = time() - t0

sample_normal_ratio = sample_normal_tags_count / float(sample_size)
print "The ratio of 'normal' interactions is {}".format(round(sample_normal_ratio,3))
print "Count done in {} seconds".format(round(tt,3))


# transformations to be applied
raw_data_items = raw_data.map(lambda x: x.split(","))
normal_tags = raw_data_items.filter(lambda x: "normal." in x)

# actions + time
t0 = time()
normal_tags_count = normal_tags.count()
tt = time() - t0

normal_ratio = normal_tags_count / float(total_size)
print "The ratio of 'normal' interactions is {}".format(round(normal_ratio,3))
print "Count done in {} seconds".format(round(tt,3))

# The takeSample action
t0 = time()
raw_data_sample = raw_data.takeSample(False, 400000, 1234) # takeSample is an action: return 400000 randomly chosen lines as a local list
normal_data_sample = [x.split(",") for x in raw_data_sample if "normal." in x]
tt = time() - t0

normal_sample_size = len(normal_data_sample)

normal_ratio = normal_sample_size / 400000.0
print "The ratio of 'normal' interactions is {}".format(normal_ratio)
print "Count done in {} seconds".format(round(tt,3))


Set Operations on RDDs

 #!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext,SparkConf
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating a RDD from a file
raw_data = sc.textFile(data_file)

# Getting attack interactions using subtract
normal_raw_data = raw_data.filter(lambda x: "normal." in x)

attack_raw_data = raw_data.subtract(normal_raw_data) # keep only the interactions that are not 'normal.' (attacks)

from time import time

# count all
t0 = time()
raw_data_count = raw_data.count()
tt = time() - t0
print "All count in {} secs".format(round(tt,3))

# count normal
t0 = time()
normal_raw_data_count = normal_raw_data.count()
tt = time() - t0
print "Normal count in {} secs".format(round(tt,3))

# count attacks
t0 = time()
attack_raw_data_count = attack_raw_data.count()
tt = time() - t0
print "Attack count in {} secs".format(round(tt,3))

print "There are {} normal interactions and {} attacks, \
from a total of {} interactions".format(normal_raw_data_count,attack_raw_data_count,raw_data_count)
# There are 97278 normal interactions and 396743 attacks, from a total of 494021 interactions


# Protocol and service combinations using cartesian
csv_data = raw_data.map(lambda x: x.split(","))
protocols = csv_data.map(lambda x: x[1]).distinct() # distinct removes duplicate values
protocols.collect()

services = csv_data.map(lambda x: x[2]).distinct()
services.collect()

product = protocols.cartesian(services).collect() # cartesian returns every (protocol, service) pair (Cartesian product)
print "There are {} combinations of protocol X service".format(len(product))
# There are 198 combinations of protocol X service
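
# Aside (minimal sketch on small hypothetical RDDs): subtract and cartesian
# belong to a family of set-style operations that also includes union,
# intersection and distinct.
a_rdd = sc.parallelize(["tcp", "udp", "icmp"])
b_rdd = sc.parallelize(["tcp", "udp", "http"])
print a_rdd.union(b_rdd).collect()        # all elements of both RDDs (duplicates kept)
print a_rdd.intersection(b_rdd).collect() # elements present in both
print a_rdd.subtract(b_rdd).collect()     # elements of a_rdd not in b_rdd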


Data Aggregations on RDDs

 #!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext,SparkConf
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating a RDD from a file
raw_data = sc.textFile(data_file)

# Inspecting interaction duration by tag
# parse data
csv_data = raw_data.map(lambda x: x.split(","))

# separate into different RDDs
normal_csv_data = csv_data.filter(lambda x: x[41]=="normal.")
attack_csv_data = csv_data.filter(lambda x: x[41]!="normal.")

normal_duration_data = normal_csv_data.map(lambda x: int(x[0]))
attack_duration_data = attack_csv_data.map(lambda x: int(x[0]))

total_normal_duration = normal_duration_data.reduce(lambda x, y: x + y)
total_attack_duration = attack_duration_data.reduce(lambda x, y: x + y)

print "Total duration for 'normal' interactions is {}".\
    format(total_normal_duration)
print "Total duration for 'attack' interactions is {}".\
    format(total_attack_duration)

normal_count = normal_duration_data.count()
attack_count = attack_duration_data.count()

print "Mean duration for 'normal' interactions is {}".\
    format(round(total_normal_duration/float(normal_count),3))
print "Mean duration for 'attack' interactions is {}".\
    format(round(total_attack_duration/float(attack_count),3))

# A better way, using aggregate
normal_sum_count = normal_duration_data.aggregate(
    (0,0), # the initial value
    (lambda acc, value: (acc[0] + value, acc[1] + 1)), # combine value with acc
    (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])) # combine accumulators
)

print "Mean duration for 'normal' interactions is {}".\
    format(round(normal_sum_count[0]/float(normal_sum_count[1]),3))


attack_sum_count = attack_duration_data.aggregate(
    (0,0), # the initial value
    (lambda acc, value: (acc[0] + value, acc[1] + 1)), # combine value with acc
    (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])) # combine accumulators
)

print "Mean duration for 'attack' interactions is {}".\
    format(round(attack_sum_count[0]/float(attack_sum_count[1]),3))
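
# Aside (cross-check sketch): PySpark's numeric helpers compute the same mean
# directly on an RDD of numbers.
print "Mean duration for 'normal' interactions is {}".format(round(normal_duration_data.mean(), 3))
print normal_duration_data.stats()  # count, mean, stdev, max, min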

Working with Key/Value Pair RDDs

 #!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyspark import SparkContext,SparkConf
f = urllib.urlretrieve ("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")

data_file = "./kddcup.data_10_percent.gz"
sc = SparkContext(conf=SparkConf().setAppName("The first example"))

# Creating a RDD from a file
raw_data = sc.textFile(data_file)

# Creating a pair RDD for interaction types
csv_data = raw_data.map(lambda x: x.split(","))
key_value_data = csv_data.map(lambda x: (x[41], x)) # x[41] contains the network interaction tag

key_value_data.take(1)

# Data aggregations with key/value pair RDDs
key_value_duration = csv_data.map(lambda x: (x[41], float(x[0])))
durations_by_key = key_value_duration.reduceByKey(lambda x, y: x + y)

durations_by_key.collect()

counts_by_key = key_value_data.countByKey()
counts_by_key

# Using combineByKey
sum_counts = key_value_duration.combineByKey(
    (lambda x: (x, 1)), # the initial value, with value x and count 1
    (lambda acc, value: (acc[0]+value, acc[1]+1)), # how to combine a pair value with the accumulator: sum value, and increment count
    (lambda acc1, acc2: (acc1[0]+acc2[0], acc1[1]+acc2[1])) # combine accumulators
)

sum_counts.collectAsMap()

duration_means_by_type = sum_counts.map(lambda (key,value): (key, round(value[0]/value[1],3))).collectAsMap()

# Print them sorted
for tag in sorted(duration_means_by_type, key=duration_means_by_type.get, reverse=True):
    print tag, duration_means_by_type[tag]
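
# Aside (cross-check sketch, reusing the RDDs above): the same per-tag means
# can be rebuilt from the reduceByKey totals and the countByKey counts.
totals_by_key = durations_by_key.collectAsMap()
means_by_key = dict((tag, round(totals_by_key[tag] / counts_by_key[tag], 3)) for tag in counts_by_key)
for tag in sorted(means_by_key, key=means_by_key.get, reverse=True):
    print tag, means_by_key[tag]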
