Hive:
set mapreduce.job.reduces=2;
select word, count(1) as num
from (select explode(split(line, " ")) word from docs) t
group by word;
Without ORDER BY: only one MR job.
With ORDER BY: two MR jobs. The global sort takes an extra MR stage, and that stage always runs with a single reducer, no matter what mapreduce.job.reduces is set to. Keep this in mind!
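For reference, the variant that triggers the second MR job just appends a global sort to the same query (sorting by num desc is an assumption; any ORDER BY has the same effect):

select word, count(1) as num
from (select explode(split(line, " ")) word from docs) t
group by word
order by num desc;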
Spark:
package main.spark_core

import org.apache.spark.{SparkConf, SparkContext}

object wordcount练习 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("练习wordcount").setMaster("local[2]")
    val sc = new SparkContext(conf)
    // each element of the RDD is one line of the input file
    val rdd = sc.textFile("/root/IdeaProjects/spark_train/src/main/data/hello.txt")
    val data = rdd.flatMap(_.split(" ")) // split every line into words
      .map((_, 1))                       // pair each word with a count of 1
      .reduceByKey(_ + _)                // sum the counts per word
      .map(x => x._1 + "\t" + x._2)      // format as "word<TAB>count"
    data.foreach(println)
    sc.stop()
  }
}
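To run the job outside the IDE, package it and submit with spark-submit; a minimal sketch, assuming the project is packaged as spark_train.jar (the class name matches the object above):

$ spark-submit --class main.spark_core.wordcount练习 --master local[2] spark_train.jar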
Python:
Map side:
import sys

for line in sys.stdin:
    ss = line.strip().split(" ")
    for word in ss:
        # emit "word<TAB>1" for every token on the line
        print("\t".join([word.strip(), '1']))
Reduce side:
import sys

curword = ""
total = 0

for line in sys.stdin:
    ss = line.strip().split("\t")
    if len(ss) != 2:  # skip blank or malformed lines
        continue
    word, num = ss
    if curword == "":
        curword = word
    if word != curword:
        # key changed: the input is sorted, so curword is complete
        print("\t".join([curword, str(total)]))
        curword = word
        total = 0
    total += int(num)

# flush the last key
if curword != "":
    print("\t".join([curword, str(total)]))
Shell script:
#!/bin/bash

HADOOP_CMD="/usr/local/src/hadoop-2.6.5/bin/hadoop"
STREAM_JAR_PATH="/usr/local/src/hadoop-2.6.5/share/hadoop/tools/lib/hadoop-streaming-2.6.5.jar"

INPUT_FILE_PATH_1="/The_Man_of_Property.txt"
OUTPUT_PATH="/output1"

# clear any previous output; the job fails if the output directory already exists
$HADOOP_CMD fs -rm -r -skipTrash $OUTPUT_PATH

# Step 1.
$HADOOP_CMD jar $STREAM_JAR_PATH \
    -input $INPUT_FILE_PATH_1 \
    -output $OUTPUT_PATH \
    -mapper "python map.py" \
    -reducer "python reduce.py" \
    -file ./map.py \
    -file ./reduce.py
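After the job finishes, the counts can be read straight from HDFS (the path matches OUTPUT_PATH above; streaming writes one part-* file per reducer):

$ /usr/local/src/hadoop-2.6.5/bin/hadoop fs -cat /output1/part-* | head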