Hadoop in Action: Search Data Analysis -- Computing and Displaying Searches per Second (5)

Original work by 林炳文 (Evankaka). Please credit the source when reposting: http://blog.csdn.net/evankaka

Abstract: This article uses Hadoop to compute the number of searches issued in each second, stores the results in MySQL, and displays them.

Project source code: https://github.com/appleappleapple/BigDataLearning/tree/master/Hadoop-Demo

I. Environment and Data

1. Local development environment
Windows 7 + Eclipse Luna
Hadoop version: 2.6.2
JDK version: 1.8
2. Data source:
Sogou Labs
http://www.sogou.com/labs/resource/q.php
3. Data format
Access time\tUser ID\t[Query]\tRank of the URL in the returned results\tSequence number of the user's click\tURL clicked by the user
The user ID is assigned automatically from the cookie sent by the browser that queried the search engine, so different queries issued in the same browser session share the same user ID.

Sample: (note that, compared with "Hadoop in Action: Search Data Analysis -- Data Deduplication (1)", each record has an extra first column, the time of the search)
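Since the original screenshot is not reproduced here, a record in this format looks roughly like the following line (made-up, illustrative values; fields are tab-separated):

00:00:00	2982199073774412	[hadoop]	8	3	www.example.com/page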


4. Statistics

This article counts the number of searches issued in each second and stores the results in MySQL.


II. Implementation

1. Counting

package com.lin.counttime;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Summary: count the number of searches in each second.
 * 
 * @author linbingwen
 * @since  August 1, 2016
 */
public class CountTime {
	public static class Map extends Mapper<Object, Text, Text, IntWritable> {
		private final static IntWritable one = new IntWritable(1);
		// Implement the map function: emit (time, 1) for every record
		@Override
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			// Convert the input text data to a String
			String line = value.toString();

			// First split the input into lines
			StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");
			// Process each line in turn
			while (tokenizerArticle.hasMoreElements()) {
				// Split the line on whitespace
				StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());
				// The first column is the access time, with one-second resolution
				String c1 = tokenizerLine.nextToken();
				Text newline = new Text(c1);
				context.write(newline, one);
			}
		}

	}
	
	public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		// Implement the reduce function: sum the 1s emitted for each second
		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable val : values) {
				count += val.get();
			}
			result.set(count);
			context.write(key, result);
		}
	}
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Set the Hadoop master host and port
		conf.set("mapred.job.tracker", "10.75.201.125:9000");
		// Set the input and output directories on HDFS
		String[] ioArgs = new String[] { "hdfs://hmaster:9000/one_in", "hdfs://hmaster:9000/one_out" };
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: <in> <out>");
			System.exit(2);
		}
		// Create the job
		Job job = Job.getInstance(conf, "CountTime");
		job.setJarByClass(CountTime.class);
		
		// Set the Map, Combine and Reduce classes (summation is associative, so the reducer can double as the combiner)
		job.setMapperClass(CountTime.Map.class);
		job.setCombinerClass(CountTime.Reduce.class);
		job.setReducerClass(CountTime.Reduce.class);
		
		// Set the output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// Split the input data set into splits and provide a RecordReader implementation
		job.setInputFormatClass(TextInputFormat.class);
		
		// Provide a RecordWriter implementation that handles the data output
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// Set the input and output directories
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}


}
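Before submitting to the cluster, the map and reduce logic can be exercised locally with MRUnit. This is a minimal sketch, not part of the project above: it assumes the MRUnit test jar (org.apache.mrunit:mrunit, hadoop2 classifier) and JUnit are on the test classpath, and the three input lines are made-up records in the Sogou format.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.junit.Test;

public class CountTimeTest {
	@Test
	public void countsSearchesPerSecond() throws Exception {
		MapReduceDriver<Object, Text, Text, IntWritable, Text, IntWritable> driver =
				MapReduceDriver.newMapReduceDriver(new CountTime.Map(), new CountTime.Reduce());
		// Two searches in the same second, one in the next (made-up records)
		driver.withInput(new LongWritable(0), new Text("00:00:00\tuser1\t[q1]\t1\t1\turl1"));
		driver.withInput(new LongWritable(1), new Text("00:00:00\tuser2\t[q2]\t2\t1\turl2"));
		driver.withInput(new LongWritable(2), new Text("00:00:01\tuser3\t[q3]\t1\t1\turl3"));
		// The mapper emits (time, 1); the reducer sums the values per key
		driver.withOutput(new Text("00:00:00"), new IntWritable(2));
		driver.withOutput(new Text("00:00:01"), new IntWritable(1));
		driver.runTest();
	}
}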
Output:
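(The screenshot of the output is not reproduced. Each line of the result file pairs one second with its search count, in the form shown below; the numbers are only illustrative, not actual results:)

00:00:00	213
00:00:01	194
00:00:02	207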



2. Writing the results to MySQL:

package com.lin.counttime;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Summary: save the per-second search counts to the database
 * 
 * @author linbingwen
 */
public class SaveCountTimeResult {
   /**
    * Implements DBWritable.
    *
    * TblsWritable writes records into MySQL.
    */
   public static class TblsWritable implements Writable, DBWritable {  
       String tbl_name; // the second, written to the `time` column
       int tbl_age;     // the search count, written to the `total` column
 
       public TblsWritable() {  
       }  
 
       public TblsWritable(String name, int age) {  
           this.tbl_name = name;  
           this.tbl_age = age;  
       }  

       @Override  
       public void write(PreparedStatement statement) throws SQLException {  
           statement.setString(1, this.tbl_name);  
           statement.setInt(2, this.tbl_age);  
       }  

       @Override  
       public void readFields(ResultSet resultSet) throws SQLException {  
           this.tbl_name = resultSet.getString(1);  
           this.tbl_age = resultSet.getInt(2);  
       }  

       @Override  
       public void write(DataOutput out) throws IOException {  
           out.writeUTF(this.tbl_name);  
           out.writeInt(this.tbl_age);  
       }  

       @Override  
       public void readFields(DataInput in) throws IOException {  
           this.tbl_name = in.readUTF();  
           this.tbl_age = in.readInt();  
       }  
 
       public String toString() {
           return this.tbl_name + " " + this.tbl_age;
       }
   }
   
   public static class StudentMapper extends Mapper<LongWritable, Text, LongWritable, Text>{
       @Override
       protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException {
           context.write(key, value);
       }
   }
   
   public static class StudentReducer extends Reducer<LongWritable, Text, TblsWritable, TblsWritable> {
       @Override
       protected void reduce(LongWritable key, Iterable<Text> values,Context context) throws IOException, InterruptedException {
           // values holds exactly one element, because the keys (file offsets) are all distinct
           StringBuilder value = new StringBuilder();
           for(Text text : values){
               value.append(text);
           }
           
           // Each line of the counting job's output is "time<TAB>count"
           String[] fields = value.toString().split("\t");

           // Guard against malformed lines; split() never yields null elements,
           // so the length check is the meaningful one
           if (fields.length >= 2) {
               String time = fields[0].trim();

               int total = 0;
               try {
                   total = Integer.parseInt(fields[1].trim());
               } catch (NumberFormatException e) {
                   // leave 0 for an unparsable count
               }

               context.write(new TblsWritable(time, total), null);
           }
       }
   }
   
   public static void main(String[] args) throws Exception {
	   Configuration conf = new Configuration();
		
	   DBConfiguration.configureDB(conf, "com.mysql.cj.jdbc.Driver","jdbc:mysql://localhost:3306/learning?serverTimezone=UTC","root", "linlin");  
		
		// Set the Hadoop master host and port
		conf.set("mapred.job.tracker", "10.75.201.125:9000");
		// Input directory: the output of the counting job above
		String[] ioArgs = new String[] { "hdfs://hmaster:9000/one_out" };
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
		if (otherArgs.length != 1) {
			System.err.println("Usage: <in>");
			System.exit(2);
		}
		// Create the job
		Job job = Job.getInstance(conf, "SaveResult");
		job.setJarByClass(SaveCountTimeResult.class);
		
		// Input path
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        
        // Mapper
        job.setMapperClass(StudentMapper.class);
        // Reducer
        job.setReducerClass(StudentReducer.class);
        
        // Mapper output key/value types
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        
        // Output format: write records to the database through DBOutputFormat
        job.setOutputFormatClass(DBOutputFormat.class);  
        
        // Target table and its columns
        DBOutputFormat.setOutput(job, "count_time", "time", "total");
        System.exit(job.waitForCompletion(true) ? 0 : 1);
   }
   

}

The results were saved successfully. Note that the MySQL JDBC driver jar must be available to the map and reduce tasks at runtime, for example by packaging it into the job jar.

Table creation SQL:

CREATE TABLE `count_time` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `time` varchar(255) DEFAULT NULL,
  `total` bigint(20) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
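After the job completes, a quick query verifies the load, for example listing the ten busiest seconds (an illustrative check, not output from the actual run):

SELECT time, total FROM count_time ORDER BY total DESC LIMIT 10;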



3. Displaying the results
I built a page with Baidu ECharts to display the results, shown below:
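The page itself is not included in this post. The chart only needs the rows of count_time as two parallel arrays: the seconds on the x-axis and the totals as the series. Below is a minimal sketch of pulling that data out of MySQL as JSON; the class name ExportChartData and the JSON layout are illustrative, and the connection settings are assumed to match SaveCountTimeResult above.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.StringJoiner;

public class ExportChartData {
	public static void main(String[] args) throws Exception {
		String url = "jdbc:mysql://localhost:3306/learning?serverTimezone=UTC";
		try (Connection conn = DriverManager.getConnection(url, "root", "linlin");
		     Statement st = conn.createStatement();
		     ResultSet rs = st.executeQuery("SELECT time, total FROM count_time ORDER BY time")) {
			// ECharts wants two parallel arrays: xAxis.data and series[0].data
			StringJoiner times = new StringJoiner(",", "[", "]");
			StringJoiner totals = new StringJoiner(",", "[", "]");
			while (rs.next()) {
				times.add("\"" + rs.getString("time") + "\"");
				totals.add(String.valueOf(rs.getLong("total")));
			}
			System.out.println("{\"times\":" + times + ",\"totals\":" + totals + "}");
		}
	}
}

The printed JSON can then be dropped straight into the chart's xAxis.data and series data.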
