(20) [Sogou Search Log Analysis] MapReduce Operations

The previous post covered how to process the logs with Hive; this one processes them with MapReduce programs instead.

Since changing paths is awkward when working from Windows, this time the MapReduce code is written and run in Eclipse inside the virtual machine.

Environment setup

Inside the VM we do not use a Maven project, so the required Hadoop jars have to be imported into the project manually.

Locate your Hadoop download and import the jars from the following directories under share/ into your project.

Enter each selected directory and import all jars in its lib/ folder as well as the top-level jars marked in the screenshot (do the same for all four directories).

 

Once the jars are imported, you can start writing the MapReduce programs.

Writing the programs

The IP below is the IP of my local Hadoop environment; if you want to use this code, change the IP and the file paths to match your own setup. While running, make sure the output directory does not already exist in HDFS: before running a program a second time, delete the output of the first run (a small helper for this is sketched below).
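If you would rather not delete the old output by hand every time, something like the following can be called at the top of main() before the job is submitted. This is only a convenience sketch using the HDFS FileSystem API; the class and method names (OutputCleaner, deleteIfExists) are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OutputCleaner {
    // Deletes the job's output directory if it already exists, so that
    // FileOutputFormat does not abort with "output directory already exists".
    public static void deleteIfExists(Configuration conf, String outputPath) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(outputPath);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = delete recursively
        }
    }
}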

(1) Total number of records

package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-27 21:46
 * @version: 1.0.0
 **/
public class MRCountAll {
    public static Integer i = 0;
    public static boolean flag = true;

    public static class CountAllMap extends Mapper<Object, Text, Text, Text> {

        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            i++;
        }
    }
    public static void  runcount(String Inputpath, String Outpath) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = null;
        try {
            job = Job.getInstance(conf, "count");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        job.setJarByClass(MRCountAll.class);
        job.setMapperClass(CountAllMap.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        try {
            FileInputFormat.addInputPath(job, new Path(Inputpath));
        } catch (IllegalArgumentException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        FileOutputFormat.setOutputPath(job, new Path(Outpath));

        try {
            job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    public static void main(String[] args) throws Exception {
        runcount("/sogou.500w.utf8", "/sogou/data/CountAll");
        System.out.println("总条数:  " + i);
    }

}
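
Note that counting with the static field i only works because the job runs in Eclipse's local runner, where the mapper and the driver share one JVM; on a real cluster each mapper runs in its own JVM and i in the driver would stay 0. A more portable way to get the same number is Hadoop's built-in counter API; a minimal sketch (the group and counter names "stats" / "totalRecords" are arbitrary labels chosen here):

public static class CountAllMap extends Mapper<Object, Text, Text, Text> {
    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // increment a named counter instead of a static field
        context.getCounter("stats", "totalRecords").increment(1);
    }
}

// in the driver, after job.waitForCompletion(true):
long total = job.getCounters().findCounter("stats", "totalRecords").getValue();
System.out.println("Total records: " + total);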

 

(2) Number of records with a non-empty query

package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-27 21:49
 * @version: 1.0.0
 **/
public class CountNotNull {
    public static String Str = "";
    public static int i = 0;
    public static boolean flag = true;

    public static class wyMap extends Mapper<Object, Text, Text, IntWritable> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String[] values = value.toString().split("\t");
            if (!values[2].equals(null) && values[2] != "") {
                context.write(new Text(values[1]), new IntWritable(1));
                i++;
            }
        }
    }



    public static void run(String inputPath, String outputPath) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");

        Job job = null;
        try {
            job = Job.getInstance(conf, "countnotnull");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        assert job != null;
        job.setJarByClass(CountNotNull.class);
        job.setMapperClass(wyMap.class);
        //job.setReducerClass(wyReduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        try {
            FileInputFormat.addInputPath(job, new Path(inputPath));
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            FileOutputFormat.setOutputPath(job, new Path(outputPath));
            job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

    }
    public static void main(String[] args) {
        run("hdfs://192.168.60.1:9000/sogou.500w.utf8", "hdfs://192.168.60.1:9000/sogou/data/CountNotNull");
       // System.out.println("非空条数:  " + i);
    }
}

(3) Number of distinct records

package MR;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:26
 * @version: 1.0.0
 **/
public class CountNotRepeat {

    public static int i = 0;
    public static class NotRepeatMap extends Mapper<Object , Text , Text, Text>{
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context) throws IOException, InterruptedException {

            String text = value.toString();
            String[] values = text.split("\t");
            String time = values[0];
            String uid = values[1];
            String name = values[2];
            String url = values[5];
            context.write(new Text(time+uid+name+url), new Text("1"));

        }
    }

    public static class NotRepeatReduc extends Reducer<Text, Text, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // each distinct (time + uid + query + url) key reaches the reducer exactly once
            i++;
            context.write(new Text(key.toString()), new IntWritable(i));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");

        Job job = null;
        try {
            job = Job.getInstance(conf, "countnotnull");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        assert job != null;
        job.setJarByClass(CountNotRepeat.class);
        job.setMapperClass(NotRepeatMap.class);
        job.setReducerClass(NotRepeatReduc.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        try {
            FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountNotRepeat"));
            job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        System.out.println("无重复总条数为:  " + i);
    }
}

 

(4) Number of unique UIDs

package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:37
 * @version: 1.0.0
 **/
public class CountNotMoreUid {

    public static int i = 0;
    public static class UidMap extends Mapper<Object , Text , Text, Text>{
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context) throws IOException, InterruptedException {

            String text = value.toString();
            String[] values = text.split("\t");
            String uid = values[1];
            context.write(new Text(uid), new Text("1"));

        }
    }

    public static class UidReduc extends Reducer<Text, Text, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // each distinct UID reaches the reducer exactly once
            i++;
            context.write(new Text(key.toString()), new IntWritable(i));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");

        Job job = null;
        try {
            job = Job.getInstance(conf, "countnotnull");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        assert job != null;
        job.setJarByClass(CountNotMoreUid.class);
        job.setMapperClass(UidMap.class);
        job.setReducerClass(UidReduc.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        try {
            FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountNotMoreUid"));
            job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.out.println("独立UID条数:  " + i);
    }
}
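
The same caveat about static fields applies to programs (3) and (4). If all you need is the number of distinct keys, the job's built-in REDUCE_INPUT_GROUPS counter already holds that value (the number of distinct keys that reached the reducers), so no static state is required. A sketch of reading it in the driver, assuming an extra import of org.apache.hadoop.mapreduce.TaskCounter:

// in the driver, after job.waitForCompletion(true):
// REDUCE_INPUT_GROUPS = number of distinct keys seen by the reducers,
// i.e. the number of unique UIDs in program (4)
long uniqueUids = job.getCounters()
        .findCounter(TaskCounter.REDUCE_INPUT_GROUPS)
        .getValue();
System.out.println("Unique UIDs: " + uniqueUids);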

All of the following programs run the same way; screenshots are omitted from here on.

(5) Query frequency ranking (top 50 most frequent query terms)

package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.TreeMap;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:41
 * @version: 1.0.0
 **/
public class CountTop50 {
    public static class TopMapper  extends Mapper<LongWritable, Text, Text, LongWritable>{
        Text text =new Text();
        @Override
        protected void map(LongWritable key, Text value,Context context)
                throws IOException, InterruptedException {
            String[] line= value.toString().split("\t");
            String keys = line[2];
            text.set(keys);
            context.write(text,new LongWritable(1));
        }
    }

    public static class TopReducer extends Reducer< Text,LongWritable, Text, LongWritable>{
        Text text = new Text();
        TreeMap<Integer,String > map = new TreeMap<Integer,String>();
        @Override
        protected void reduce(Text key, Iterable<LongWritable> value, Context context)
                throws IOException, InterruptedException {
            int sum = 0; // number of times this query term appears
            for (LongWritable ltext : value) {
                sum += ltext.get();
            }
            // the map is keyed by count, so terms with an identical count overwrite each other
            map.put(sum, key.toString());
            // keep only the 50 highest counts
            if (map.size() > 50) {
                map.remove(map.firstKey());
            }
        }
        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            for(Integer count:map.keySet()){
                context.write(new Text(map.get(count)), new LongWritable(count));
            }
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");

        Job job = Job.getInstance(conf, "count");
        job.setJarByClass(CountTop50.class);
        job.setJobName("Five");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setMapperClass(TopMapper.class);
        job.setReducerClass(TopReducer.class);

        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountTop50"));

        job.waitForCompletion(true);
    }
}
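
One caveat in the top-50 logic above: because the TreeMap is keyed by the count, two query terms with exactly the same frequency overwrite each other and only one survives. Below is a minimal variant of the reducer that keeps ties, sketched under the same structure (the class name TopReducerKeepTies is just for the sketch, and it additionally needs java.util.ArrayList, java.util.List and java.util.Map imports); terms tied at the cut-off are kept, so slightly more than 50 lines may be written.

public static class TopReducerKeepTies extends Reducer<Text, LongWritable, Text, LongWritable> {
    private final TreeMap<Long, List<String>> map = new TreeMap<Long, List<String>>();
    private int stored = 0; // total number of terms currently held

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        List<String> terms = map.get(sum);
        if (terms == null) {
            terms = new ArrayList<String>();
            map.put(sum, terms);
        }
        terms.add(key.toString());
        stored++;
        // drop whole lowest-count buckets as long as at least 50 terms remain without them
        while (map.size() > 1 && stored - map.firstEntry().getValue().size() >= 50) {
            stored -= map.remove(map.firstKey()).size();
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<Long, List<String>> e : map.entrySet()) {
            for (String term : e.getValue()) {
                context.write(new Text(term), new LongWritable(e.getKey()));
            }
        }
    }
}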

 

(6) Number of users with more than 2 queries

package MR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:53
 * @version: 1.0.0
 **/
public class CountQueriesGreater2 {

    public static int total = 0;
    public static class MyMaper extends Mapper<Object, Text, Text, IntWritable> {

        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String[] str = value.toString().split("\t");
            Text word;
            IntWritable one = new IntWritable(1);
            word = new Text(str[1]);
            context.write(word, one);
        }
    }
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text arg0, Iterable<IntWritable> arg1,
                              Reducer<Text, IntWritable, Text, IntWritable>.Context arg2) throws IOException, InterruptedException {
            // arg0 is a UID, arg1 holds one 1 for each of that UID's queries
            int sum = 0;
            for (IntWritable i : arg1) {
                sum += i.get();
            }
            if(sum>2){
                total=total+1;
            }
            //arg2.write(arg0, new IntWritable(sum));
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        // 1. Instantiate a Job
        Job job = Job.getInstance(conf, "six");
        // 2. Set the Mapper class
        job.setMapperClass(MyMaper.class);
        // 3. Set the Combiner class (optional)
        // job.setCombinerClass(MyReducer.class);
        // 4. Set the Reducer class
        job.setReducerClass(MyReducer.class);
        // 5. Set the output key type
        job.setOutputKeyClass(Text.class);
        // 6. Set the output value type
        job.setOutputValueClass(IntWritable.class);
        // Set the class used to locate the job's jar
        job.setJarByClass(CountQueriesGreater2.class);
        // 7. Set the input path
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        // 8. Set the output path
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountQueriesGreater2"));
        // 9. Run the job
        job.waitForCompletion(true);
        System.out.println("Users with more than 2 queries: " + total);

    }
}

 

(7) Proportion of users with more than 2 queries

package MR;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:49
 * @version: 1.0.0
 **/
public class CountQueriesGreaterPro {
    /*
     * InputFormat -> FileInputFormat (subclass) -> TextInputFormat (subclass)
     * TextInputFormat is used by default: 1. it validates the input path, 2. it reads the data line by line.
     *
     * Then the map phase runs; each map is one task (a Java process running on its own JVM).
     * After a map finishes, the combiner (optional) may run, merging duplicate keys within that map;
     * the value becomes the number of times the same key appeared in that map.
     * Shuffle (the partitioner splits the map output and sorts it by key).
     * The Reducer receives all map output and merges the values that share the same key.
     *
     * OutputFormat -> FileOutputFormat (subclass) -> TextOutputFormat (subclass)
     * 1. it validates the output path, 2. TextOutputFormat writes records as key + "\t" + value + "\n".
     */
    // Text is a Java String type that can be written to files
    // IntWritable is an int type that can be written to files
    // The first two type parameters are the input key and value types
    // The last two type parameters are the output key and value types
    public static int total1 = 0;
    public static int total2 = 0;
    public static class MyMaper extends Mapper<Object, Text, Text, IntWritable> {
        // key is the byte offset of the line
        // value is one line of text
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            total2++;
            String[] str = value.toString().split("\t");
            Text word;
            IntWritable one = new IntWritable(1);
            word = new Text(str[1]);
            context.write(word, one);
            // after this, each UID has been emitted with a value of 1
        }
    }
    // The first two type parameters are the input key and value types
    // The last two type parameters are the output key and value types
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text arg0, Iterable<IntWritable> arg1,
                              Reducer<Text, IntWritable, Text, IntWritable>.Context arg2) throws IOException, InterruptedException {

            // arg0 is a UID, arg1 holds one 1 for each of that UID's queries
            int sum = 0;
            for (IntWritable i : arg1) {
                sum += i.get();
            }
            if(sum>2){
                total1++;
            }
            arg2.write(arg0, new IntWritable(sum));
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.out.println("seven begin");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        // 1. Instantiate a Job
        Job job = Job.getInstance(conf, "seven");
        // 2. Set the Mapper class
        job.setMapperClass(MyMaper.class);
        // 3. Set the Combiner class (optional)
        // job.setCombinerClass(MyReducer.class);
        // 4. Set the Reducer class
        job.setReducerClass(MyReducer.class);
        // 5. Set the output key type
        job.setOutputKeyClass(Text.class);
        // 6. Set the output value type
        job.setOutputValueClass(IntWritable.class);
        // Set the class used to locate the job's jar
        job.setJarByClass(CountQueriesGreaterPro.class);
        // 7. Set the input path
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        // 8. Set the output path
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountQueriesGreaterPro"));
        // 9. Run the job
        job.waitForCompletion(true);
        System.out.println("total1="+total1+"\ttotal2="+total2);
        float percentage = (float)total1/(float)total2;
        System.out.println("Proportion of users with more than 2 queries: " + percentage*100 + "%");
        System.out.println("over");
    }
}

 

(8) Proportion of clicks with rank within 10

package MR;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:52
 * @version: 1.0.0
 **/
public class CountRank {
    public static int sum1 = 0;
    public static int sum2 = 0;
    public static class MyMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            sum2++;
            String[] str = value.toString().split("\t");
            int rank = Integer.parseInt(str[3]);
            if(rank<11)
            {
                sum1=sum1+1;
            }
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "eight");
        job.setMapperClass(MyMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(CountRank.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountRank"));
        job.waitForCompletion(true);
        System.out.println("sum1="+sum1+"\tsum2="+sum2);
        float percentage = (float)sum1/(float)sum2;
        System.out.println("Rank在10以内的点击次数占比:" +percentage*100+"%");
    }
}
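
Program (8) only counts inside the mapper and never writes key/value output, yet the job still runs a (default, identity) reduce phase. As an optional tweak, the reduce phase can be skipped entirely by requesting a map-only job before submission:

// request a map-only job: map output is written directly, no shuffle or reduce phase runs
job.setNumReduceTasks(0);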

 

(9) Proportion of queries entered directly as a URL

package MR;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:55
 * @version: 1.0.0
 **/
public class CountURL {
    public static int sum1 = 0;
    public static int sum2 = 0;

    public static class MyMapper extends Mapper<Object, Text, Text, Text> {

        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String[] str = value.toString().split("\t");
            // a record counts as a direct URL query when the keyword contains "www"
            Pattern p = Pattern.compile("www");
            Matcher matcher = p.matcher(str[2]);
            if (matcher.find()) {
                sum1++;
            }
            sum2++;
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");
        Job job = Job.getInstance(conf, "nine");
        job.setMapperClass(MyMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(CountURL.class);
        FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountURL"));
        job.waitForCompletion(true);
        System.out.println("sum1="+sum1+"\tsum2="+sum2);
        float percentage = (float)sum1/(float)sum2;
        System.out.println("直接用url'%www%'查询的用户占比:" +percentage*100+"%");
    }
}

 

(10) UIDs that searched for "仙剑奇侠传" more than 3 times

package MR;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * @Title:
 * @author: 陈宏松
 * @create: 2018-11-28 11:58
 * @version: 1.0.0
 **/
public class CountUidGreater3 {
    public static String Str="";
    public static int i=0;
    public static class Map extends Mapper<Object, Text, Text, IntWritable>{
        @Override
        protected void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String []values=value.toString().split("\t");
            String pattern="仙剑奇侠传";
            if(values[2].equals(pattern)){
                context.write(new Text(values[1]), new IntWritable(1));
            }
        }
    }
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> value,
                              Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            int sum=0;
            for(IntWritable v:value){
                sum=sum+v.get();
            }
            if(sum>3){
                Str=Str+key.toString()+"\n";
                i++;
            }
        }
    }

    public static void main(String[] args) {
        Configuration conf=new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.1:9000");

        Job job = null;
        try {
            job = Job.getInstance(conf, "count");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        job.setJarByClass(CountUidGreater3.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        try {
            FileInputFormat.addInputPath(job, new Path("/sogou.500w.utf8"));
        } catch (IllegalArgumentException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        try {
            FileOutputFormat.setOutputPath(job, new Path("/sogou/data/CountUidGreater3"));
            job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        System.out.println("i:  "+i);
        System.out.println(Str);
    }
}

All of the results above can also be viewed in the Hadoop web UI.

 