1.上传一个words.txt文件到HDFS中
vim words.txt
输入i 进入插入模式
添加以下内容:
hadoop hive hive hadoop sqoop
sqoop kafka hadoop sqoop hive hive
hadoop hadoop hive sqoop kafka kafka
kafka hue kafka hbase hue hadoop hadoop hive
sqoop sqoop kafka hue hue kafka
上传到HDFS
hdfs dfs -mkdir -p /pyspark/wd/input
hdfs dfs -put words.txt /pyspark/wd/input
2. 从HDFS上读取文件, 完成WordCount案例实现
from pyspark import SparkContext, SparkConf
import os

# Introductory pyspark example: every Spark program needs a driver entry point.
# Pin the remote environment so the driver and executors use the same Python.
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("pyspark的入门案例: WordCount")

    # 1- Create the SparkContext (the core entry object of Spark Core).
    spark_conf = SparkConf().setAppName("WordCount").setMaster("local[*]")
    sc = SparkContext(conf=spark_conf)

    # 2.1- Read the input file.
    #   Path format is protocol + path:
    #     local files: file:///     HDFS files: hdfs://host:port/
    #   NOTE: we connect to a remote environment, so a "local" path means the
    #   remote Linux filesystem — never write a Windows path here.
    #   textFile reads line by line: one RDD element per input line, e.g.
    #   ['hadoop hive hive hadoop sqoop', 'sqoop kafka hadoop sqoop hive hive', ...]
    input_path = "hdfs://node1:8020/pyspark/wd/input/words.txt"
    lines_rdd = sc.textFile(name=input_path)

    # 2.2- Split every line into words (one element becomes many: flatMap),
    #   flattening to ['hadoop', 'hive', 'hive', 'hadoop', 'sqoop', ...]
    words_rdd = lines_rdd.flatMap(lambda content: content.split(' '))

    # 2.3- Pair each word with an initial count of 1 (one-to-one: map),
    #   producing [('hadoop', 1), ('hive', 1), ('hive', 1), ...]
    pairs_rdd = words_rdd.map(lambda w: (w, 1))

    # 2.4- Group by word and sum the counts.
    #   e.g. [('hadoop', 7), ('hive', 6), ('hue', 4), ('sqoop', 6), ('kafka', 7), ('hbase', 1)]
    counts_rdd = pairs_rdd.reduceByKey(lambda total, cnt: total + cnt)

    # 2.5- Collect the result back to the driver and print it.
    print(counts_rdd.collect())

    # 3- Release resources.
    sc.stop()
3.将结果输出到目的地
from pyspark import SparkContext, SparkConf
import os

# WordCount variant that writes its result to HDFS instead of printing it.
# Pin the remote environment so the driver and executors use the same Python.
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("pyspark的入门案例: WordCount")

    # 1- Create the SparkContext (the core entry object of Spark Core).
    spark_conf = SparkConf().setAppName("WordCount").setMaster("local[*]")
    sc = SparkContext(conf=spark_conf)

    # 2.1- Read the input file.
    #   Path format is protocol + path:
    #     local files: file:///     HDFS files: hdfs://host:port/
    #   NOTE: we connect to a remote environment, so a "local" path means the
    #   remote Linux filesystem — never write a Windows path here.
    #   textFile reads line by line: one RDD element per input line, e.g.
    #   ['hadoop hive hive hadoop sqoop', 'sqoop kafka hadoop sqoop hive hive', ...]
    input_path = "hdfs://node1:8020/pyspark/wd/input/words.txt"
    lines_rdd = sc.textFile(name=input_path)

    # 2.2- Split every line into words (one element becomes many: flatMap),
    #   flattening to ['hadoop', 'hive', 'hive', 'hadoop', 'sqoop', ...]
    words_rdd = lines_rdd.flatMap(lambda content: content.split(' '))

    # 2.3- Pair each word with an initial count of 1 (one-to-one: map),
    #   producing [('hadoop', 1), ('hive', 1), ('hive', 1), ...]
    pairs_rdd = words_rdd.map(lambda w: (w, 1))

    # 2.4- Group by word and sum the counts.
    #   e.g. [('hadoop', 7), ('hive', 6), ('hue', 4), ('sqoop', 6), ('kafka', 7), ('hbase', 1)]
    counts_rdd = pairs_rdd.reduceByKey(lambda total, cnt: total + cnt)

    # 2.5- Write the result to HDFS.
    #   NOTE: the output directory must NOT already exist, otherwise the job fails.
    counts_rdd.saveAsTextFile(path='hdfs://node1:8020/pyspark/wd/output1')

    # 3- Release resources.
    sc.stop()