Requirements
Use Spark to count word pairs within each line of a document. Punctuation must be stripped and all uppercase letters converted to lowercase.
For example, suppose the original document is:
a a, A b
a b c
Then the processed result is:
(a a) 2
(a b) 2
(a c) 1
(b a) 4
(b c) 1
(c a) 1
(c b) 1
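The pairing rule can be read off this example: after cleaning, the first line becomes "a a a b"; each distinct word, at its first occurrence, is paired (in order) with every other token on the same line, so a repeated word contributes its pairs only once. The sketch below, using a hypothetical PairRuleDemo class purely for illustration, shows this per-line rule in isolation (the full Spark job appears at the end of the post):

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class PairRuleDemo {
    public static void main(String[] args) {
        String line = "a a, A b";
        // lowercase, strip digits/punctuation, then split on spaces
        String[] tokens = line.toLowerCase().replaceAll("[\\d\\pP\\p{Punct}]", "").split(" ");
        Set<String> seen = new HashSet<String>();
        List<String> pairs = new ArrayList<String>();
        for (int i = 0; i < tokens.length; i++) {
            if (!seen.add(tokens[i])) {
                continue; // a repeated word emits no further pairs
            }
            for (int j = 0; j < tokens.length; j++) {
                if (i != j) {
                    pairs.add("(" + tokens[i] + " " + tokens[j] + ")");
                }
            }
        }
        // prints [(a a), (a a), (a b), (b a), (b a), (b a)]
        System.out.println(pairs);
    }
}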
Implementation
Start Hadoop and Spark
Go to the Hadoop installation directory and run the startup script:
$ sbin/start-all.sh
Go to the Spark installation directory and run the startup script:
$ sbin/start-all.sh
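Optionally, you can confirm the daemons are up with the JDK's jps tool (for HDFS you should see NameNode and DataNode; for standalone Spark, Master and Worker):
$ jps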
Build the JAR
Package the compiled program into a JAR so it can be submitted to Spark; one possible packaging command is shown below.
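For example, assuming the compiled .class files for WordCount and HdfsOperate sit under a classes/ directory (the directory name is only an assumption here), the JAR can be built with the JDK's jar tool:
$ jar cf wordcountspark.jar -C classes .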
Set up the HDFS files
Create a directory on HDFS and upload the input file into it:
$ hadoop dfs -mkdir -p /wordcount2/input
$ hadoop dfs -put /Users/liuqi/Desktop/input2.txt /wordcount2/input
$ bin/hdfs dfs -ls /wordcount2/input
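To double-check the upload, the file can be printed back from HDFS (input2.txt is the file uploaded by the put command above):
$ bin/hdfs dfs -cat /wordcount2/input/input2.txt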
Submit the job with spark-submit
Run the Spark program:
$ bin/spark-submit --class WordCount --num-executors 2 --executor-memory 6g --executor-cores 4 /Users/liuqi/Desktop/wordcountspark.jar /wordcount2/output
The results are displayed once the job finishes.
Note: if something goes wrong along the way, delete the corresponding files and repeat the step:
$ bin/hdfs dfs -rm -r /wordcount2/input
Retrieve the results
Save the results to the local file system:
$ hadoop dfs -getmerge /wordcount2/output /Users/liuqi/Desktop/wordcount2/
Full code
Note: only the Java code is shown here; the complete project can be downloaded from my CSDN blog.
WordCount.java:
import scala.Tuple2;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
/**
*
* WordCount
* @author 刘琦
*/
public final class WordCount {
    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: WordCount <file>");
            System.exit(1);
        }
        SparkConf sparkConf = new SparkConf().setAppName("WordCount");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
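        // NOTE: the input path is hard-coded below; the output path passed to spark-submit is not read by this program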
        JavaRDD<String> lines = ctx.textFile("hdfs://localhost:54310/wordcount2/input");
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                // convert to lowercase and strip digits and punctuation
                String newStr = s.toLowerCase().replaceAll("[\\d\\pP\\p{Punct}]", "");
                String[] wordResult = SPACE.split(newStr);
                List<String> wordNewResult = new ArrayList<String>();
                // result[i][0] holds the word, result[i][1] marks whether it already appeared in this line
                String[][] result = new String[wordResult.length][2];
                for (int i = 0; i < wordResult.length; i++) {
                    result[i][0] = wordResult[i];
                    result[i][1] = "0";
                }
                // pair up the words of this line
                for (int i = 0; i < wordResult.length; i++) {
                    for (int j = 0; j < wordResult.length; j++) {
                        if (i == j) {
                            continue;
                        } else if (result[i][1].equals("1")) {
                            // this word already appeared earlier in the line: only mark later
                            // duplicate occurrences and emit nothing (a repeated word contributes its pairs once)
                            if (i < j && result[i][0].equals(result[j][0])) {
                                result[j][1] = "1";
                            }
                        } else {
                            // first occurrence of this word in the line
                            if (!result[i][0].equals(result[j][0])) {
                                // the two words differ: emit the ordered pair
                                wordNewResult.add("(" + result[i][0] + " " + result[j][0] + ")");
                            } else {
                                // the two words are equal and not yet counted: mark the duplicate and emit the pair once
                                result[j][1] = "1";
                                wordNewResult.add("(" + result[i][0] + " " + result[j][0] + ")");
                            }
                        }
                    }
                }
                return wordNewResult.iterator();
            }
        });
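        // map each pair string to a (pair, 1) tuple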
        JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<String, Integer>(s, 1);
            }
        });
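        // sum the counts of identical pairs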
        JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });
        // create the output file on HDFS and open the output stream
        HdfsOperate.openHdfsFile("hdfs://localhost:54310/wordcount2/output");
        List<Tuple2<String, Integer>> output = counts.collect();
        for (Tuple2<?, ?> tuple : output) {
            System.out.println(tuple._1() + ": " + tuple._2());
            HdfsOperate.writeString(tuple._1() + ": " + tuple._2());
        }
        ctx.stop();
        // close the HDFS output stream
        HdfsOperate.closeHdfsFile();
    }
}
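Instead of collecting the counts to the driver and writing them out by hand, they could also be saved directly from Spark with counts.saveAsTextFile(...), which writes part-files under the given directory; the HdfsOperate helper below is used here so that the result ends up in a single HDFS file.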
HdfsOperate.java:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.URI;
/**
 * Write data to HDFS using Hadoop's FileSystem.
 */
public class HdfsOperate implements Serializable {
    private static Logger logger = LoggerFactory.getLogger(HdfsOperate.class);
    private static Configuration conf = new Configuration();
    private static BufferedWriter writer = null;

    // create a file at the target HDFS location and open an output stream to it
    public static void openHdfsFile(String path) throws Exception {
        FileSystem fs = FileSystem.get(URI.create(path), conf);
        writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(path))));
        if (null != writer) {
            logger.info("[HdfsOperate]>> initialize writer succeed!");
        }
    }
    // write one line to the HDFS file
    public static void writeString(String line) {
        try {
            writer.write(line + "\n");
        } catch (Exception e) {
            logger.error("[HdfsOperate]>> write a line error:", e);
        }
    }
    // close the HDFS output stream
    public static void closeHdfsFile() {
        try {
            if (null != writer) {
                writer.close();
                logger.info("[HdfsOperate]>> closeHdfsFile close writer succeed!");
            } else {
                logger.error("[HdfsOperate]>> closeHdfsFile writer is null");
            }
        } catch (Exception e) {
            logger.error("[HdfsOperate]>> closeHdfsFile close hdfs error:", e);
        }
    }
}