Standalone version
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class WordCount {
    public static void main(String[] args) throws IOException {
        // Count words in five input files, then merge the per-file counts.
        Map<String, Integer>[] maps = new HashMap[5];
        int a = 0;
        for (int i = 1; i <= 5; i++) {
            maps[a++] = document(i);
        }
        Map<String, Integer> merged = merge(maps);
        merged.entrySet().forEach(System.out::println);
    }

    // Read one file and build a word -> count map for it.
    public static Map<String, Integer> document(int i) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        BufferedReader br = new BufferedReader(
                new FileReader("D:\\bd_example\\data\\wordcount\\" + i + ".txt"));
        String s;
        while ((s = br.readLine()) != null) {
            for (String word : s.split("\t")) {
                if (map.containsKey(word)) {
                    map.put(word, map.get(word) + 1);
                } else {
                    map.put(word, 1);
                }
            }
        }
        br.close();
        return map;
    }

    // Merge several word-count maps into one by summing counts per key.
    public static Map<String, Integer> merge(Map<String, Integer>... maps) {
        Map<String, Integer> map = new HashMap<>();
        for (Map<String, Integer> currentMap : maps) {
            for (String k : currentMap.keySet()) {
                Integer count = currentMap.get(k);
                if (map.containsKey(k)) {
                    map.put(k, map.get(k) + count);
                } else {
                    map.put(k, count);
                }
            }
        }
        return map;
    }
}
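The containsKey/get/put pattern used in both document and merge can be collapsed with Map.merge (Java 8+). A minimal sketch of the counting loop, not part of the original listing:

// merge() inserts 1 for a new word, otherwise applies Integer::sum to the old count.
for (String word : s.split("\t")) {
    map.merge(word, 1, Integer::sum);
}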
MapReduce
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WC {
    // Mapper: emit (word, 1) for every tab-separated word in a line.
    // The nested classes must be static so Hadoop can instantiate them by reflection.
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] words = value.toString().split("\t");
            for (String s : words) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    // Reducer: sum the 1s collected for each word.
    public static class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WC.class);
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
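Because WCReduce only sums values, it can also serve as a combiner, pre-aggregating (word, 1) pairs on the map side before the shuffle. An optional one-line addition to main, not in the original listing:

// Safe as a combiner because integer addition is associative and commutative.
job.setCombinerClass(WCReduce.class);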
Hive
create database wc;
use wc;

-- One column; each raw line is stored whole and split into words later.
create table wordcount (word string)
row format delimited fields terminated by "\n" location '/wordcount';

load data local inpath "/home/fanger/example/wordcount" into table wordcount;

select * from wordcount;

-- Explode each line into words on "\t", then count per word.
select
    a.word as word, count(*) as num
from (
    select
        wc.word as word
    from wordcount
    lateral view explode(split(word, "\t")) wc as word
) a
group by a.word;
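As a quick check of the query: assuming a stored line a\tb\ta, the inner lateral view emits three rows (a, b, a), and the outer group by then returns a 2 and b 1.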
Spark
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

public class _01sparkCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName(_01sparkCount.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<String> text = jsc.textFile("D:\\bd_example\\data\\wordcount\\");
        System.out.println(text.getNumPartitions());

        // Anonymous-class style: split each line into words ...
        JavaRDD<String> lines = text.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split("\t")).iterator();
            }
        });
        // ... pair each word with a count of 1 ...
        JavaPairRDD<String, Integer> maps = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        // ... and sum the counts per word.
        JavaPairRDD<String, Integer> reduces = maps.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        reduces.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + "---->" + t._2);
            }
        });

        // The same pipeline, written with lambdas.
        jsc.textFile("D:\\bd_example\\data\\wordcount\\")
            .flatMap(x -> Arrays.asList(x.split("\t")).iterator())
            .mapToPair(v -> new Tuple2<>(v, 1))
            .reduceByKey((v1, v2) -> v1 + v2)
            .foreach(t -> System.out.println(t._1 + "\t" + t._2));

        jsc.stop();
    }
}
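One caveat: foreach runs on the executors, so its println output only appears in the console under local[*]. To print on the driver instead, one option (reasonable here because the result is small) is to collect first; a sketch, not in the original listing:

// Bring the small result set back to the driver before printing.
for (Tuple2<String, Integer> t : reduces.collect()) {
    System.out.println(t._1 + "\t" + t._2);
}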
Scala
// Plain Scala collections: split, flatten, group, count, sort.
val array2 = Array("a b c", "a c d e s", "a d e g")
print(array2
  .map(x => x.split(" "))
  .flatten
  .map(x => (x, 1))
  .groupBy(x => x._1)
  .map(x => (x._1, x._2.length))
  .toList
  .sortWith((x, y) => x._1 > y._1))

// The same pipeline with placeholder syntax.
val stringToInt = array2
  .flatMap(_.split(" "))
  .map((_, 1))
  .groupBy(_._1)
  .map(t => (t._1, t._2.length))
  .toList
  .sortWith((x, y) => x._1 > y._1)
println(stringToInt)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

// Spark version: countByKey collects the per-word counts to the driver.
val conf = new SparkConf().setAppName(AggreatBy.getClass.getSimpleName).setMaster("local[1]")
val sc = new SparkContext(conf)
val array2 = Array("a b c", "a c d e s", "a d e g")
val valuerdd: RDD[String] = sc.parallelize(array2)
val value = valuerdd.flatMap(_.split("\\s+")).map((_, 1))
val stringToLong: collection.Map[String, Long] = value.countByKey()
for ((k, v) <- stringToLong) {
  println(s"$k,$v")
}
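The countByKey shortcut also exists on the Java API, so the Spark section above could end the same way. A sketch against the maps pair RDD from that listing, not part of the original:

// countByKey returns a java.util.Map<String, Long> built on the driver.
java.util.Map<String, Long> counts = maps.countByKey();
counts.forEach((word, n) -> System.out.println(word + "\t" + n));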