[Boxuegu Study Notes | Summary and Sharing] Reading a File from HDFS and Implementing Sorting

1. Upload a words.txt file to HDFS

vim words.txt

Press i to enter insert mode, then add the following content:
hadoop hive hive hadoop sqoop
sqoop kafka hadoop sqoop hive hive
hadoop hadoop hive sqoop kafka kafka
kafka hue kafka hbase hue hadoop hadoop hive
sqoop sqoop kafka hue hue kafka


Upload the file to HDFS:
hdfs dfs -mkdir -p /pyspark/wd/input
hdfs dfs -put words.txt  /pyspark/wd/input
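
To confirm the upload (optional; assumes the standard hdfs CLI is available):
hdfs dfs -ls /pyspark/wd/input
hdfs dfs -cat /pyspark/wd/input/words.txt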

2. Read the file from HDFS and implement the WordCount example

from pyspark import SparkContext, SparkConf
import os
# PySpark getting-started example: every PySpark program must define an entry point
# Pin the remote environment so the interpreter is consistent everywhere
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

# IDE shortcut: type main, then press Enter
if __name__ == '__main__':
    print("pyspark的入门案例: WordCount")

    # 1- Create the SparkContext object (the core object of Spark Core)
    # Extract-variable shortcut: Ctrl + Alt + V
    conf = SparkConf().setAppName("WordCount").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # 2- Perform the processing steps
    # 2.1 Read the data from an external file
    """
        Path format: protocol + path
            Local filesystem protocol:  file:///
            HDFS protocol: hdfs://host:port/

        Note: never use a Windows path here. Since we are connected to a remote
        environment, a "local" path refers to the local filesystem of the remote
        Linux machine.

        textFile: reads the file line by line
    """
    path = "hdfs://node1:8020/pyspark/wd/input/words.txt"
    rdd_init = sc.textFile(name=path)

    """
        Result at this point:
            [
                'hadoop hive hive hadoop sqoop', 
                'sqoop kafka hadoop sqoop hive hive', 
                'hadoop hadoop hive sqoop kafka kafka', 
                'kafka hue kafka hbase hue hadoop hadoop hive', 
                'sqoop sqoop kafka hue hue kafka'
            ]
        Desired result (each line split into individual words):
            [
                hadoop,hive,hive,hadoop,sqoop,
                sqoop,kafka,hadoop,sqoop,hive,hive,
                hadoop,hadoop,hive,sqoop,kafka,kafka,
                kafka,hue,kafka,hbase,hue,hadoop,hadoop,hive,
                sqoop,sqoop,kafka,hue,hue,kafka
            ]
    """
    # 2.2 Split each line of data into words
    # one-to-many transformation: flatMap
    rdd_flatMap = rdd_init.flatMap(lambda line: line.split(' '))
    """
        Result:
            [
                'hadoop', 'hive', 'hive', 'hadoop', 'sqoop', 
                'sqoop', 'kafka', 'hadoop', 'sqoop', 'hive', 'hive', 
                'hadoop', 'hadoop', 'hive', 'sqoop', 'kafka', 'kafka', 
                'kafka', 'hue', 'kafka', 'hbase', 'hue', 'hadoop', 'hadoop', 'hive', 
                'sqoop', 'sqoop', 'kafka', 'hue', 'hue', 'kafka'
            ]
        
        Desired result (each word paired with a count of 1):
            [
                ('hadoop',1), ('hive',1), ('hive',1), ('hadoop',1), ('sqoop',1), 
                ('sqoop',1), ('kafka',1), ('hadoop',1), ('sqoop',1), ('hive',1), ('hive',1), 
                ('hadoop',1), ('hadoop',1), ('hive',1), ('sqoop',1), ('kafka',1), ('kafka',1), 
                ('kafka',1), ('hue',1), ('kafka',1), ('hbase',1), ('hue',1), ('hadoop',1), ('hadoop',1), ('hive',1), 
                ('sqoop',1), ('sqoop',1), ('kafka',1), ('hue',1), ('hue',1), ('kafka',1)
            ]

    """
    # 2.3 Convert each word into a (word, 1) pair
    # one-to-one transformation: map
    rdd_map = rdd_flatMap.map(lambda word: (word, 1))
    """
        [
            ('hadoop', 1), ('hive', 1), ('hive', 1), ('hadoop', 1), ('sqoop', 1), ('sqoop', 1), 
            ('kafka', 1), ('hadoop', 1), ('sqoop', 1), ('hive', 1), ('hive', 1), 
            ('hadoop', 1), ('hadoop', 1), ('hive', 1), ('sqoop', 1), ('kafka', 1), ('kafka', 1), 
            ('kafka', 1), ('hue', 1), ('kafka', 1), ('hbase', 1), ('hue', 1), ('hadoop', 1), ('hadoop', 1), ('hive', 1), 
            ('sqoop', 1), ('sqoop', 1), ('kafka', 1), ('hue', 1), ('hue', 1), ('kafka', 1)
        ]
    """

    # 2.4 Group by word and aggregate the counts
    rdd_res = rdd_map.reduceByKey(lambda agg, curr: agg + curr)
    # [('hadoop', 7), ('hive', 6), ('hue', 4), ('sqoop', 6), ('kafka', 7), ('hbase', 1)]
    # 2.5 Print the result
    print(rdd_res.collect())

    # 3. Release resources
    sc.stop()
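
The title also calls for sorting, which the script above stops short of. Below is a minimal sketch of how the counts could be ordered with the standard RDD.sortBy API (descending by count is an assumed choice); these lines would go right before sc.stop(), reusing rdd_res from the script above:

    # Hypothetical addition: sort the (word, count) pairs by count, descending
    rdd_sorted = rdd_res.sortBy(lambda kv: kv[1], ascending=False)
    print(rdd_sorted.collect())
    # Given the counts above, this prints something like:
    # [('hadoop', 7), ('kafka', 7), ('hive', 6), ('sqoop', 6), ('hue', 4), ('hbase', 1)]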

3. Write the results to a destination

from pyspark import SparkContext, SparkConf
import os
# PySpark getting-started example: every PySpark program must define an entry point
# Pin the remote environment so the interpreter is consistent everywhere
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

# IDE shortcut: type main, then press Enter
if __name__ == '__main__':
    print("pyspark的入门案例: WordCount")

    # 1- Create the SparkContext object (the core object of Spark Core)
    # Extract-variable shortcut: Ctrl + Alt + V
    conf = SparkConf().setAppName("WordCount").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # 2- Perform the processing steps
    # 2.1 Read the data from an external file
    """
        Path format: protocol + path
            Local filesystem protocol:  file:///
            HDFS protocol: hdfs://host:port/

        Note: never use a Windows path here. Since we are connected to a remote
        environment, a "local" path refers to the local filesystem of the remote
        Linux machine.

        textFile: reads the file line by line
    """
    path = "hdfs://node1:8020/pyspark/wd/input/words.txt"
    rdd_init = sc.textFile(name=path)

    """
        Result at this point:
            [
                'hadoop hive hive hadoop sqoop', 
                'sqoop kafka hadoop sqoop hive hive', 
                'hadoop hadoop hive sqoop kafka kafka', 
                'kafka hue kafka hbase hue hadoop hadoop hive', 
                'sqoop sqoop kafka hue hue kafka'
            ]
        Desired result (each line split into individual words):
            [
                hadoop,hive,hive,hadoop,sqoop,
                sqoop,kafka,hadoop,sqoop,hive,hive,
                hadoop,hadoop,hive,sqoop,kafka,kafka,
                kafka,hue,kafka,hbase,hue,hadoop,hadoop,hive,
                sqoop,sqoop,kafka,hue,hue,kafka
            ]
    """
    # 2.2 Split each line of data into words
    # one-to-many transformation: flatMap
    rdd_flatMap = rdd_init.flatMap(lambda line: line.split(' '))
    """
        Result:
            [
                'hadoop', 'hive', 'hive', 'hadoop', 'sqoop', 
                'sqoop', 'kafka', 'hadoop', 'sqoop', 'hive', 'hive', 
                'hadoop', 'hadoop', 'hive', 'sqoop', 'kafka', 'kafka', 
                'kafka', 'hue', 'kafka', 'hbase', 'hue', 'hadoop', 'hadoop', 'hive', 
                'sqoop', 'sqoop', 'kafka', 'hue', 'hue', 'kafka'
            ]
        
        Desired result (each word paired with a count of 1):
            [
                ('hadoop',1), ('hive',1), ('hive',1), ('hadoop',1), ('sqoop',1), 
                ('sqoop',1), ('kafka',1), ('hadoop',1), ('sqoop',1), ('hive',1), ('hive',1), 
                ('hadoop',1), ('hadoop',1), ('hive',1), ('sqoop',1), ('kafka',1), ('kafka',1), 
                ('kafka',1), ('hue',1), ('kafka',1), ('hbase',1), ('hue',1), ('hadoop',1), ('hadoop',1), ('hive',1), 
                ('sqoop',1), ('sqoop',1), ('kafka',1), ('hue',1), ('hue',1), ('kafka',1)
            ]

    """
    # 2.3 Convert each word into a (word, 1) pair
    # one-to-one transformation: map
    rdd_map = rdd_flatMap.map(lambda word: (word, 1))
    """
        [
            ('hadoop', 1), ('hive', 1), ('hive', 1), ('hadoop', 1), ('sqoop', 1), ('sqoop', 1), 
            ('kafka', 1), ('hadoop', 1), ('sqoop', 1), ('hive', 1), ('hive', 1), 
            ('hadoop', 1), ('hadoop', 1), ('hive', 1), ('sqoop', 1), ('kafka', 1), ('kafka', 1), 
            ('kafka', 1), ('hue', 1), ('kafka', 1), ('hbase', 1), ('hue', 1), ('hadoop', 1), ('hadoop', 1), ('hive', 1), 
            ('sqoop', 1), ('sqoop', 1), ('kafka', 1), ('hue', 1), ('hue', 1), ('kafka', 1)
        ]
    """

    # 2.4 Group by word and aggregate the counts
    rdd_res = rdd_map.reduceByKey(lambda agg, curr: agg + curr)
    # [('hadoop', 7), ('hive', 6), ('hue', 4), ('sqoop', 6), ('kafka', 7), ('hbase', 1)]
    # 2.5 Write the result to HDFS
    # Note: the output directory must not already exist; saveAsTextFile fails with an error if it does
    rdd_res.saveAsTextFile(path='hdfs://node1:8020/pyspark/wd/output1')


    # 3. Release resources
    sc.stop()
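
To inspect the output written to HDFS (Spark writes one part-* file per partition; assumes the standard hdfs CLI):
hdfs dfs -ls /pyspark/wd/output1
hdfs dfs -cat /pyspark/wd/output1/part-*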
