wordcount多种写法（单机版、MapReduce、Hive、Spark、Scala）

最新推荐文章于 2023-12-15 16:02:44 发布

M10F

最新推荐文章于 2023-12-15 16:02:44 发布

阅读量1.3k

点赞数 11

分类专栏：综合　文章标签：wordcount

本文链接：https://blog.csdn.net/weixin_37139561/article/details/91463367

版权

综合专栏收录该内容

1 篇文章 0 订阅

订阅专栏

单机版



/**
 * Single-machine word count.
 *
 * <p>Tallies the tab-separated tokens of five local text files, merges the
 * per-file tallies and prints the combined result.
 */
public class WordCount {

    /** Number of input files; they are named {@code 1.txt} through {@code 5.txt}. */
    private static final int DOCUMENT_COUNT = 5;

    /**
     * Counts every input document, merges the per-document maps and prints each
     * resulting map entry on its own line.
     *
     * @param args unused
     * @throws IOException if any input file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        // Java cannot create a generic array directly, hence the raw HashMap[]
        // and the narrowly scoped suppression.
        @SuppressWarnings("unchecked")
        Map<String, Integer>[] maps = new HashMap[DOCUMENT_COUNT];
        for (int i = 1; i <= DOCUMENT_COUNT; i++) {
            maps[i - 1] = docunment(i);
        }
        Map<String, Integer> merged = merge(maps);
        merged.entrySet().forEach(System.out::println);
    }

    /**
     * Counts the tab-separated tokens of input file number {@code i}.
     *
     * <p>The method name keeps its original spelling so existing callers still
     * compile.
     *
     * @param i file number; the file read is
     *          {@code D:\bd_example\data\wordcount\<i>.txt}
     * @return a new map from token to its number of occurrences in that file
     * @throws IOException if the file cannot be opened or read
     */
    public static Map<String, Integer> docunment(int i) throws IOException {
        Map<String, Integer> map = new HashMap<String, Integer>();
        // try-with-resources closes the reader even when readLine() throws;
        // previously the reader was never closed at all.
        try (BufferedReader br = new BufferedReader(
                new FileReader("D:\\bd_example\\data\\wordcount\\" + i + ".txt"))) {
            String line;
            while ((line = br.readLine()) != null) {
                for (String token : line.split("\t")) {
                    map.merge(token, 1, Integer::sum);
                }
            }
        }
        return map;
    }

    /**
     * Sums several word-count maps into one.
     *
     * @param maps the maps to combine; none of them is modified
     * @return a new map in which every key carries the sum of its counts across
     *         all input maps (empty when no maps are given)
     */
    @SafeVarargs
    public static Map<String, Integer> merge(Map<String, Integer>... maps) {
        Map<String, Integer> total = new HashMap<String, Integer>();
        for (Map<String, Integer> current : maps) {
            for (Map.Entry<String, Integer> entry : current.entrySet()) {
                total.merge(entry.getKey(), entry.getValue(), Integer::sum);
            }
        }
        return total;
    }
}

MapReduce

/**
 * Hadoop MapReduce word count: counts the tab-separated tokens found under the
 * input path and writes one (token, count) pair per distinct token to the
 * output path.
 */
public class WC {

	/**
	 * Emits {@code (token, 1)} for every tab-separated token of an input line.
	 *
	 * <p>Declared {@code static} because Hadoop instantiates mapper classes
	 * reflectively through a no-arg constructor, which a non-static inner class
	 * does not have.
	 */
	public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

		/** Constant count emitted with every token. */
		private static final IntWritable ONE = new IntWritable(1);

		/** Reused output key; context.write serializes it before the next set(). */
		private final Text word = new Text();

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {

			String[] words = value.toString().split("\t");

			for (String s : words) {
				// Wrap the Java String in Hadoop's Writable type before emitting.
				word.set(s);
				context.write(word, ONE);
			}
		}

	}

	/**
	 * Sums the counts received for one token and emits {@code (token, total)}.
	 *
	 * <p>Declared {@code static} for the same reflective-instantiation reason
	 * as {@link WCMapper}.
	 */
	public static class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable>{

		/** Reused output value holding the total for the current key. */
		private final IntWritable total = new IntWritable();

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {

			int sum=0;
			for (IntWritable v : values) {
				sum+=v.get();
			}

			total.set(sum);
			context.write(key, total);
		}
	}


	/**
	 * Configures and runs the job, then exits with status 0 on success or 1 on
	 * failure.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws IOException            if job setup or submission fails
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws InterruptedException   if waiting for the job is interrupted
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		if (args.length < 2) {
			System.err.println("Usage: WC <input path> <output path>");
			System.exit(2);
		}

		Configuration conf=new Configuration();

		Job job=Job.getInstance(conf);

		job.setJarByClass(WC.class);
		job.setMapperClass(WCMapper.class);
		job.setReducerClass(WCReduce.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.addInputPath(job,new Path(args[0]));

		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Surface job failure to the caller instead of discarding the result.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

Hive

-- Word count in Hive: load raw text lines into a one-column table, then
-- explode each line into tokens and count them.
CREATE DATABASE wc;

USE wc;

-- Single string column; NOTE(review): the "\n" field terminator presumably
-- keeps each whole input line in `word` -- confirm against the data.
CREATE TABLE wordcount (word STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\n"
LOCATION '/wordcount';

LOAD DATA LOCAL INPATH "/home/fanger/example/wordcount" INTO TABLE wordcount;

-- Inspect the loaded rows.
SELECT * FROM wordcount;

-- Split every row on tabs, turn each token into its own row, then count the
-- occurrences of each token.
SELECT
    words_flat.word AS word,
    COUNT(*)        AS num
FROM (
    SELECT exploded.word AS word
    FROM wordcount
    LATERAL VIEW explode(split(word, "\t")) exploded AS word
) words_flat
GROUP BY words_flat.word;

Spark

/**
 * Spark word count written against the Java API.
 *
 * <p>Runs the same count twice to contrast two styles: anonymous function
 * classes (approach 1) and lambdas (approach 2).
 */
public class _01sparkCount {

    /**
     * Counts the tab-separated tokens of the files under the input directory
     * and prints one line per distinct token for each of the two approaches.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName(_01sparkCount.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Stop the context even if one of the jobs below throws.
        try {
            JavaRDD<String> text = jsc.textFile("D:\\bd_example\\data\\wordcount\\");

            // Approach 1: explicit anonymous function classes.
            int numPartitions = text.getNumPartitions();
            System.out.println(numPartitions);
            JavaRDD<String> words = text.flatMap(new FlatMapFunction<String, String>() {
                @Override
                public Iterator<String> call(String s) throws Exception {
                    return Arrays.asList(s.split("\t")).iterator();
                }
            });
            JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) throws Exception {
                    // Parameterized instead of the raw Tuple2 type.
                    return new Tuple2<String, Integer>(s, 1);
                }
            });
            JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            });
            counts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> t) throws Exception {
                    System.out.println(t._1+"---->"+t._2);
                }
            });

            // Approach 2: the same pipeline written with lambdas.
            jsc.textFile("D:\\bd_example\\data\\wordcount\\")
                    .flatMap(x -> Arrays.asList(x.split("\t")).iterator())
                    .mapToPair(v -> new Tuple2<>(v, 1))
                    .reduceByKey((v1, v2) -> v1 + v2)
                    .foreach(t -> {
                        System.out.println(t._1+"\t"+t._2);
                    });
        } finally {
            jsc.stop();
        }
    }
}

Scala

 /*
   Expected output: List((s,1), (g,1), (e,2), (d,2), (c,2), (b,1), (a,3))
     */
// Approach 1: split each line, flatten, pair every word with 1, group by word,
// take each group's size as the count, then sort by word in descending order.
  val array2=Array("a b c","a c d e s","a d e g")
  // Reads `array2`; the original referenced an undefined name `array`.
  print(array2
    .map(x=>x.split(" "))
    .flatten
    .map(x=>(x,1))
    .groupBy(x=>x._1)
    .map(x=>(x._1,x._2.length))
    .toList
    .sortWith((x,y)=>x._1>y._1))
// Approach 2: the same count over `array2`, grouping the words directly
// and taking each group's size as the count; sorted by word, descending.
  val stringToInt = array2
    .flatMap(line => line.split(" "))
    .groupBy(identity)
    .map { case (word, occurrences) => (word, occurrences.length) }
    .toList
    .sortWith(_._1 > _._1)
  println(stringToInt)
// Approach 3: count on Spark; countByKey returns the per-word totals as a
// local Map.
val conf=new SparkConf().setAppName(AggreatBy.getClass.getSimpleName).setMaster("local[1]")
val sc=new SparkContext(conf)
// Distinct name so this snippet does not redefine the `array2` declared by
// the earlier approach when both live in the same scope.
val sparkInput=Array("a b c","a c d e s","a d e g")
val valuerdd: RDD[String] = sc.parallelize(sparkInput)
val value = valuerdd.flatMap(_.split("\\s+")).map((_,1))
val stringToLong: collection.Map[String, Long] = value.countByKey()
for((k,v)<-stringToLong){
  println(s"${k},${v}")
}
// Release the SparkContext once the result has been printed.
sc.stop()