Computing the weight of each word in each Weibo post
Approach:
Formula: weight = TF * ln(N / DF)
TF: the number of times the current word appears in this post
N: the total number of posts
DF: the number of posts in which the current word appears
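For instance, with made-up numbers TF = 2, N = 9 and DF = 3, the weight is 2 * ln(9/3) ≈ 2.197. Below is a minimal sketch of the formula in Java; WeightFormulaDemo and all of the values are hypothetical and only meant to illustrate the calculation:

public class WeightFormulaDemo {
    public static void main(String[] args) {
        int tf = 2;      //occurrences of the word in this post (made up)
        int n = 9;       //total number of posts (made up)
        int dfCount = 3; //posts containing the word (made up)
        //multiply by 1.0 so the division is done in floating point, not truncated to an int
        double weight = tf * Math.log(n * 1.0 / dfCount);
        System.out.println(weight); //prints roughly 2.1972
    }
}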
When coding, pay special attention to importing the correct packages; otherwise many strange errors can appear:
1. Test data
3823890335901756 今天是今年最暖和的一天,果断出来逛街!
3823890364788305 春天来了,约好友一起出去去踏青,去赏花!
3823890369489295 我在平湖,让你开挂练九阳真经,走火入魔毁了三叉神经了吧,改练九阴真经吧小子。 (免费下载 )
3823890373686361 约了小伙伴一起去理发!
3823890378201539 今天约了姐妹去逛街吃美食,周末玩得很开心啊!
3823890382081678 这几天一直在约,因为感冒发烧了,所以和老公约好了陪我去打针,求九阳安慰,我想喝豆浆,药好苦的
3823890399188850 和吃货的约会么就是吃
3823890419856548 全国包邮!九阳
3823890436963972 我亲爱的
2. Code:
FirstMapper:
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

//Counts how many times each word appears in one post (the TF in the formula)
//and emits one "count" record per post so that N (the total number of posts) can be tallied
public class FirstMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] v = value.toString().split("\t");
        if (v.length >= 2) {
            String id = v[0].trim();
            String content = v[1].trim();
            StringReader sr = new StringReader(content);
            //IK segmenter in smart mode splits the post content into words
            IKSegmenter ikSegmenter = new IKSegmenter(sr, true);
            Lexeme word = null;
            while ((word = ikSegmenter.next()) != null) {
                String w = word.getLexemeText();
                //key: word_postId, value: 1 -> summed by the reducer to get TF
                context.write(new Text(w + "_" + id), new IntWritable(1));
            }
            sr.close();
            //one "count" record per post -> summed to get N in the formula
            context.write(new Text("count"), new IntWritable(1));
        } else {
            System.out.println(value.toString() + "------------------------------");
        }
    }
}
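If the segmentation results look odd, it helps to run IKSegmenter outside of MapReduce first. The sketch below is a standalone check, assuming the same IK Analyzer jar is on the classpath; SegmentDemo and the choice of the first test post as input are only for illustration:

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

//Prints the tokens IKSegmenter produces for one post, outside of MapReduce
public class SegmentDemo {
    public static void main(String[] args) throws IOException {
        String content = "今天是今年最暖和的一天,果断出来逛街!";
        StringReader sr = new StringReader(content);
        IKSegmenter seg = new IKSegmenter(sr, true); //true = smart mode, same as FirstMapper
        Lexeme word = null;
        while ((word = seg.next()) != null) {
            System.out.println(word.getLexemeText());
        }
        sr.close();
    }
}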
FirstReduce:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Sums the 1s for each key: word_postId keys yield TF, the "count" key yields N
public class FirstReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        if (key.equals(new Text("count"))) {
            System.out.println(key.toString() + "__________" + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}
FirstPartition:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
//Importing the wrong HashPartitioner caused several test runs to produce no output -- take special care:
//import org.apache.hadoop.mapred.lib.HashPartitioner;

//Routes the "count" key to its own reducer (partition 3); all other keys are hashed across partitions 0-2
public class FirstPartition extends HashPartitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        if (key.equals(new Text("count")))
            return 3;
        else
            //default HashPartitioner behaviour: hash of the key modulo the number of reducers (here 3)
            return super.getPartition(key, value, numReduceTasks - 1);
    }
}
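The routing can also be exercised locally before submitting the job. A rough sketch, assuming the Hadoop client jars are on the classpath; PartitionDemo and the sample keys are made up for illustration:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

//Feeds a few keys through FirstPartition with 4 reducers, as FirstJob configures it
public class PartitionDemo {
    public static void main(String[] args) {
        FirstPartition p = new FirstPartition();
        IntWritable one = new IntWritable(1);
        //"count" always lands in partition 3
        System.out.println(p.getPartition(new Text("count"), one, 4));
        //ordinary word_postId keys are hashed into partitions 0-2
        System.out.println(p.getPartition(new Text("今天_3823890335901756"), one, 4));
        System.out.println(p.getPartition(new Text("逛街_3823890335901756"), one, 4));
    }
}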
FirstJob:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
//do NOT import this class by mistake:
//import org.apache.hadoop.examples.SecondarySort.FirstPartitioner;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FirstJob {
    public static void main(String[] args) throws ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo1");
            job.setJarByClass(FirstJob.class);
            //key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            //number of reducers: 0-2 for word_postId keys, 3 for the "count" key
            job.setNumReduceTasks(4);
            //setting the wrong partitioner class here led to no results being written
            //job.setPartitionerClass(FirstPartitioner.class);
            job.setPartitionerClass(FirstPartition.class);
            job.setMapperClass(FirstMapper.class);
            job.setCombinerClass(FirstReduce.class);
            job.setReducerClass(FirstReduce.class);
            //HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/input/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo1"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 1 finished successfully");
                TwoJob.mainJob();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
TwoMapper:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

//Counts the DF of each word: the number of posts it appears in
public class TwoMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        //part-r-00003 only holds the "count" record (N), so skip it here
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    //one record per (word, post) pair -> summed by the reducer to get DF
                    context.write(new Text(w), new IntWritable(1));
                }
            } else {
                System.out.println(value.toString() + "---------------");
            }
        }
    }
}
TwoReduce:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Sums the 1s per word to produce its DF
public class TwoReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    protected void reduce(Text key, Iterable<IntWritable> count, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i : count) {
            sum = sum + i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
TwoJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoJob {
    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo2");
            job.setJarByClass(TwoJob.class);
            //key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(TwoMapper.class);
            job.setCombinerClass(TwoReduce.class);
            job.setReducerClass(TwoReduce.class);
            //HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/output/weibo1"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo2"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 2 finished successfully");
                LastJob.mainJob();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
LastMapper:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

//Combines TF (from the map input), N (from part-r-00003) and DF (from job 2's output)
//to compute the weight TF * ln(N / DF) of each word in each post
public class LastMapper extends Mapper<LongWritable, Text, Text, Text> {
    public static Map<String, Integer> cmap = null; //holds N, keyed by "count"
    public static Map<String, Integer> df = null;   //word -> DF

    //runs once before the map calls; loads the cached files into memory
    protected void setup(Context context) throws IOException, InterruptedException {
        if (cmap == null || cmap.size() == 0 || df == null || df.size() == 0) {
            URI[] ss = context.getCacheFiles();
            if (ss != null) {
                for (int i = 0; i < ss.length; i++) {
                    URI uri = ss[i];
                    if (uri.getPath().endsWith("part-r-00003")) {
                        //this file holds the single "count" record, i.e. N;
                        //cached files are localized into the task's working directory, so open them by file name
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = br.readLine();
                        if (line.startsWith("count")) {
                            String[] ls = line.split("\t");
                            cmap = new HashMap<String, Integer>();
                            cmap.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    } else if (uri.getPath().endsWith("part-r-00000")) {
                        //this file holds one "word \t DF" line per word
                        df = new HashMap<String, Integer>();
                        Path path = new Path(uri.getPath());
                        BufferedReader br = new BufferedReader(new FileReader(path.getName()));
                        String line = null;
                        while ((line = br.readLine()) != null) {
                            String[] ls = line.split("\t");
                            df.put(ls[0], Integer.parseInt(ls[1].trim()));
                        }
                        br.close();
                    }
                }
            }
        }
    }

    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        FileSplit fs = (FileSplit) context.getInputSplit();
        //skip part-r-00003, which only holds N
        if (!fs.getPath().getName().contains("part-r-00003")) {
            String[] v = value.toString().trim().split("\t");
            if (v.length >= 2) {
                int tf = Integer.parseInt(v[1].trim());
                String[] ss = v[0].split("_");
                if (ss.length >= 2) {
                    String w = ss[0];
                    String id = ss[1];
                    //weight = TF * ln(N / DF); multiply by 1.0 to force floating-point division
                    double s = tf * Math.log(cmap.get("count") * 1.0 / df.get(w));
                    NumberFormat nf = NumberFormat.getInstance();
                    nf.setMaximumFractionDigits(5);
                    context.write(new Text(id), new Text(w + ":" + nf.format(s)));
                }
            } else {
                System.out.println(value.toString() + "-----------------");
            }
        }
    }
}
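Note that cmap.get("count") and df.get(w) are both Integer, so the * 1.0 in the weight line matters: without it the division is carried out on ints and the ratio is truncated before Math.log is applied, which often collapses the weight to 0. A tiny illustration with made-up counts (DivisionDemo is hypothetical):

//Made-up counts: N = 9 posts, the word appears in 6 of them
public class DivisionDemo {
    public static void main(String[] args) {
        int n = 9, dfCount = 6;
        System.out.println(Math.log(n / dfCount));       //0.0 -- 9/6 is truncated to 1 before the log
        System.out.println(Math.log(n * 1.0 / dfCount)); //~0.405 -- the intended ln(N/DF)
    }
}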
LastReduce:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//Concatenates all word:weight pairs of one post into a single output line
public class LastReduce extends Reducer<Text, Text, Text, Text> {
    protected void reduce(Text key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();
        for (Text text : value) {
            sb.append(text.toString() + "\t");
        }
        context.write(key, new Text(sb.toString()));
    }
}
LastJob:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LastJob {
    public static void mainJob() {
        Configuration conf = new Configuration();
        try {
            Job job = Job.getInstance(conf, "weibo3");
            job.setJarByClass(LastJob.class);
            //cache the N file (job 1, partition 3) and the DF file (job 2) on every node
            job.addCacheFile(new Path("/output/weibo1/part-r-00003").toUri());
            job.addCacheFile(new Path("/output/weibo2/part-r-00000").toUri());
            //key and value types of the map output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(LastMapper.class);
            job.setCombinerClass(LastReduce.class);
            job.setReducerClass(LastReduce.class);
            //HDFS directories the job reads from and writes to
            FileInputFormat.addInputPath(job, new Path("/output/weibo1/"));
            FileOutputFormat.setOutputPath(job, new Path("/output/weibo3"));
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job 3 finished successfully");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
3. Results: