基于MapReduce框架的PageRank算法实战（上）

最新推荐文章于 2019-11-27 21:24:03 发布

weixin_34292924

最新推荐文章于 2019-11-27 21:24:03 发布

阅读量195

点赞数

文章标签：大数据、数据库、爬虫

原文链接：https://my.oschina.net/eager/blog/676603

版权

为什么80%的码农都做不了架构师？>>>

1、本次实战的数据是通过爬虫获取，若有需要可以找我获取。

部分数据展示：

将数据库数据导出为txt格式的文档，命名为userrelation.txt，并将其上传至HDFS的/input/目录下（下文代码从/input/userrelation.txt读取）。

2、将数据转换为类似于下图格式的links.txt。第一列是微博所属者的ID，后面为其所有关注人的ID（多个ID之间以英文逗号分隔）。

3、代码实现

/**
 * Converts raw Weibo follow relations into adjacency-list form.
 *
 * <p>Each input line is expected to contain two space-separated IDs: the
 * follower first, then the account being followed. The job groups the lines
 * by follower and writes one record per follower whose value is the
 * comma-joined list of followed IDs (conceptually {@code A -> B,C,D}).
 *
 * @author ZD
 */
public class UserRelation {

    /** Delimiter between the two IDs on an input line. */
    private static final String FIELD_SEPARATOR = " ";

    /** Delimiter placed between followed IDs in the reducer output. */
    private static final String FOLLOWEE_SEPARATOR = ",";

    /** Counter group/name used to report input lines that were skipped. */
    private static final String COUNTER_GROUP = "UserRelation";
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    /**
     * Emits one (follower ID, followed ID) pair per well-formed input line.
     */
    private static class UserRelationMapper extends Mapper<LongWritable, Text, Text, Text> {

        /**
         * Splits the line on a single space and forwards the first two fields.
         *
         * <p>Lines with fewer than two fields, or with an empty ID after
         * trimming, are skipped and counted instead of failing the task with
         * an {@code ArrayIndexOutOfBoundsException} or polluting the output
         * with empty IDs.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of the relation file
         * @param context sink for the (follower, followed) pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(FIELD_SEPARATOR);
            if (fields.length < 2) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            String follower = fields[0].trim();
            String followee = fields[1].trim();
            if (follower.isEmpty() || followee.isEmpty()) {
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }

            // Hand the follower and the followed account over to the reducer.
            context.write(new Text(follower), new Text(followee));
        }
    }

    /**
     * Joins all followed IDs of one follower into a single comma-separated value.
     */
    private static class UserRelationReducer extends Reducer<Text, Text, Text, Text> {

        /**
         * Writes the follower ID together with its followed IDs as
         * {@code ID1,ID2,ID3...}.
         *
         * @param key     follower ID
         * @param datas   followed IDs collected for this follower
         * @param context sink for the joined record
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<Text> datas, Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            // StringBuilder: the buffer never leaves this method, so no synchronisation is needed.
            StringBuilder joined = new StringBuilder();
            Iterator<Text> it = datas.iterator();
            if (it.hasNext()) {
                joined.append(it.next().toString());
            }
            while (it.hasNext()) {
                joined.append(FOLLOWEE_SEPARATOR).append(it.next().toString());
            }
            context.write(key, new Text(joined.toString()));
        }
    }

    /**
     * Configures and runs the job, reading {@code /input/userrelation.txt} and
     * writing to {@code /second/sec2/}.
     *
     * <p>The process exits with status 0 when the job succeeds and 1 when the
     * job fails or when setup/submission throws.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        int exitCode;
        try {
            Configuration cfg = HadoopCfg.getConfigration();
            Job job = Job.getInstance(cfg);
            job.setJobName("UserRelation");
            job.setJarByClass(UserRelation.class);
            job.setMapperClass(UserRelationMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(UserRelationReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileInputFormat.addInputPath(job, new Path("/input/userrelation.txt"));
            FileOutputFormat.setOutputPath(job, new Path("/second/sec2/"));
            exitCode = job.waitForCompletion(true) ? 0 : 1;
        } catch (Exception e) {
            // Top-level boundary: report the failure and make sure the shell sees a non-zero status.
            e.printStackTrace();
            exitCode = 1;
        }
        System.exit(exitCode);
    }
}