package first;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.StringReader;
public class FirstMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    // Sample input line: 3823890210294392<TAB>今天我约了豆浆,油条 (weibo id, TAB, content)
    // Output records: "word_id" -> 1, plus one "count" -> 1 per line for the line total
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().trim().split("\t");
        if (split.length >= 2) {
            String id = split[0].trim();
            String content = split[1].trim();
            // Segment the content into words with the IK analyzer (smart mode)
            StringReader sr = new StringReader(content);
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme lexeme;
            while ((lexeme = ikSegmenter.next()) != null) {
                String w = lexeme.getLexemeText();
                // One count per word occurrence, keyed by word and document id
                outKey.set(w + "_" + id);
                context.write(outKey, one);
            }
        }
        // Emit a marker record for every input line so the total line count can be summed
        outKey.set("count");
        context.write(outKey, one);
    }
}
Word count
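A minimal sketch of the matching reducer and driver, so the mapper above can run as a complete job. The class names FirstJob and FirstReduce, the combiner setting, and the argument handling are assumptions for illustration, not taken from the original code; the reduce step is a plain per-key sum.

package first;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class FirstJob {

    // Sums the 1s emitted by FirstMap: the count of each "word_id" key,
    // and under the "count" key the total number of input lines.
    public static class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(FirstJob.class);
        job.setMapperClass(FirstMap.class);
        job.setCombinerClass(FirstReduce.class); // safe here: the reduce is a pure sum
        job.setReducerClass(FirstReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Packaged into a jar, the job could then be launched along the lines of hadoop jar first.jar first.FirstJob <input> <output>, where the jar name and both paths are placeholders.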