Hadoop in Action: Search Data Analysis -- Computing and Displaying Searches per Second (5)

Original work by 林炳文 (Evankaka). Please credit the source when reposting: http://blog.csdn.net/evankaka

Abstract: This article uses Hadoop to compute the number of searches issued in each second, stores the results in MySQL, and displays them.

Project source code: https://github.com/appleappleapple/BigDataLearning/tree/master/Hadoop-Demo

I. Environment and Data

1. Local development environment
Windows 7 + Eclipse Luna
Hadoop version: 2.6.2
JDK version: 1.8
2. Data source:
Sogou Labs
http://www.sogou.com/labs/resource/q.php
3. Data format
Access time\tUser ID\t[Query]\tRank of the URL in the returned results\tSequence number of the user's click\tURL clicked by the user
The user ID is assigned automatically from the cookie sent by the browser that queried the search engine, so different queries issued in the same browser session share the same user ID.

Sample: (note that, compared with "Hadoop in Action: Search Data Analysis -- Data Deduplication (1)", each record has an extra first column, the time of the search)
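Since the original screenshot is not reproduced here, a record in this format looks roughly like the following line (made-up, illustrative values; fields are tab-separated):

00:00:00	2982199073774412	[hadoop]	8	3	www.example.com/page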


4. Statistics

This article counts the number of searches issued in each second and stores the results in MySQL.


II. Implementation

1. Counting

package com.lin.counttime;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Summary: count the number of searches in each second.
 * 
 * @author linbingwen
 * @since  August 1, 2016
 */
public class CountTime {
	public static class Map extends Mapper<Object, Text, Text, IntWritable> {
		private final static IntWritable one = new IntWritable(1);
		// Implement the map function: emit (time, 1) for every record
		@Override
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			// Convert the input text data to a String
			String line = value.toString();

			// First split the input into lines
			StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");
			// Process each line in turn
			while (tokenizerArticle.hasMoreElements()) {
				// Split the line on whitespace
				StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());
				// The first column is the access time, with one-second resolution
				String c1 = tokenizerLine.nextToken();
				Text newline = new Text(c1);
				context.write(newline, one);
			}
		}

	}
	
	public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
		private IntWritable result = new IntWritable();

		// Implement the reduce function: sum the 1s emitted for each second
		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
			int count = 0;
			for (IntWritable val : values) {
				count += val.get();
			}
			result.set(count);
			context.write(key, result);
		}
	}
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Set the Hadoop master host and port
		conf.set("mapred.job.tracker", "10.75.201.125:9000");
		// Set the input and output directories on HDFS
		String[] ioArgs = new String[] { "hdfs://hmaster:9000/one_in", "hdfs://hmaster:9000/one_out" };
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: <in> <out>");
			System.exit(2);
		}
		// Create the job
		Job job = Job.getInstance(conf, "CountTime");
		job.setJarByClass(CountTime.class);
		
		// Set the Map, Combine and Reduce classes (summation is associative, so the reducer can double as the combiner)
		job.setMapperClass(CountTime.Map.class);
		job.setCombinerClass(CountTime.Reduce.class);
		job.setReducerClass(CountTime.Reduce.class);
		
		// Set the output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		// Split the input data set into splits and provide a RecordReader implementation
		job.setInputFormatClass(TextInputFormat.class);
		
		// Provide a RecordWriter implementation that handles the data output
		job.setOutputFormatClass(TextOutputFormat.class);
		
		// Set the input and output directories
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}


}
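Before submitting to the cluster, the map and reduce logic can be exercised locally with MRUnit. This is a minimal sketch, not part of the project above: it assumes the MRUnit test jar (org.apache.mrunit:mrunit, hadoop2 classifier) and JUnit are on the test classpath, and the three input lines are made-up records in the Sogou format.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.junit.Test;

public class CountTimeTest {
	@Test
	public void countsSearchesPerSecond() throws Exception {
		MapReduceDriver<Object, Text, Text, IntWritable, Text, IntWritable> driver =
				MapReduceDriver.newMapReduceDriver(new CountTime.Map(), new CountTime.Reduce());
		// Two searches in the same second, one in the next (made-up records)
		driver.withInput(new LongWritable(0), new Text("00:00:00\tuser1\t[q1]\t1\t1\turl1"));
		driver.withInput(new LongWritable(1), new Text("00:00:00\tuser2\t[q2]\t2\t1\turl2"));
		driver.withInput(new LongWritable(2), new Text("00:00:01\tuser3\t[q3]\t1\t1\turl3"));
		// The mapper emits (time, 1); the reducer sums the values per key
		driver.withOutput(new Text("00:00:00"), new IntWritable(2));
		driver.withOutput(new Text("00:00:01"), new IntWritable(1));
		driver.runTest();
	}
}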
Output:
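(The screenshot of the output is not reproduced. Each line of the result file pairs one second with its search count, in the form shown below; the numbers are only illustrative, not actual results:)

00:00:00	213
00:00:01	194
00:00:02	207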



2. Writing the results to MySQL:

package com.lin.counttime;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Summary: save the per-second search counts to the database
 * 
 * @author linbingwen
 */
public class SaveCountTimeResult {
   /**
    * Implements DBWritable.
    *
    * TblsWritable writes records into MySQL.
    */
   public static class TblsWritable implements Writable, DBWritable {  
       String tbl_name; // the second, written to the `time` column
       int tbl_age;     // the search count, written to the `total` column
 
       public TblsWritable() {  
       }  
 
       public TblsWritable(String name, int age) {  
           this.tbl_name = name;  
           this.tbl_age = age;  
       }  

       @Override  
       public void write(PreparedStatement statement) throws SQLException {  
           statement.setString(1, this.tbl_name);  
           statement.setInt(2, this.tbl_age);  
       }  

       @Override  
       public void readFields(ResultSet resultSet) throws SQLException {  
           this.tbl_name = resultSet.getString(1);  
           this.tbl_age = resultSet.getInt(2);  
       }  

       @Override  
       public void write(DataOutput out) throws IOException {  
           out.writeUTF(this.tbl_name);  
           out.writeInt(this.tbl_age);  
       }  

       @Override  
       public void readFields(DataInput in) throws IOException {  
           this.tbl_name = in.readUTF();  
           this.tbl_age = in.readInt();  
       }  
 
       public String toString() {
           return this.tbl_name + " " + this.tbl_age;
       }
   }
   
   public static class StudentMapper extends Mapper<LongWritable, Text, LongWritable, Text>{
       @Override
       protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException {
           context.write(key, value);
       }
   }
   
   public static class StudentReducer extends Reducer<LongWritable, Text, TblsWritable, TblsWritable> {
       @Override
       protected void reduce(LongWritable key, Iterable<Text> values,Context context) throws IOException, InterruptedException {
           // values holds exactly one element, because the keys (file offsets) are all distinct
           StringBuilder value = new StringBuilder();
           for(Text text : values){
               value.append(text);
           }
           
           // Each line of the counting job's output is "time<TAB>count"
           String[] fields = value.toString().split("\t");

           // Guard against malformed lines; split() never yields null elements,
           // so the length check is the meaningful one
           if (fields.length >= 2) {
               String time = fields[0].trim();

               int total = 0;
               try {
                   total = Integer.parseInt(fields[1].trim());
               } catch (NumberFormatException e) {
                   // leave 0 for an unparsable count
               }

               context.write(new TblsWritable(time, total), null);
           }
       }
   }
   
   public static void main(String[] args) throws Exception {
	   Configuration conf = new Configuration();
		
	   DBConfiguration.configureDB(conf, "com.mysql.cj.jdbc.Driver","jdbc:mysql://localhost:3306/learning?serverTimezone=UTC","root", "linlin");  
		
		// Set the Hadoop master host and port
		conf.set("mapred.job.tracker", "10.75.201.125:9000");
		// Input directory: the output of the counting job above
		String[] ioArgs = new String[] { "hdfs://hmaster:9000/one_out" };
		String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
		if (otherArgs.length != 1) {
			System.err.println("Usage: <in>");
			System.exit(2);
		}
		// Create the job
		Job job = Job.getInstance(conf, "SaveResult");
		job.setJarByClass(SaveCountTimeResult.class);
		
		// Input path
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        
        // Mapper
        job.setMapperClass(StudentMapper.class);
        // Reducer
        job.setReducerClass(StudentReducer.class);
        
        // Mapper output key/value types
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        
        // Output format: write records to the database through DBOutputFormat
        job.setOutputFormatClass(DBOutputFormat.class);  
        
        // Target table and its columns
        DBOutputFormat.setOutput(job, "count_time", "time", "total");
        System.exit(job.waitForCompletion(true) ? 0 : 1);
   }
   

}

The results were saved successfully. Note that the MySQL JDBC driver jar must be available to the map and reduce tasks at runtime, for example by packaging it into the job jar.

Table creation SQL:

CREATE TABLE `count_time` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `time` varchar(255) DEFAULT NULL,
  `total` bigint(20) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
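After the job completes, a quick query verifies the load, for example listing the ten busiest seconds (an illustrative check, not output from the actual run):

SELECT time, total FROM count_time ORDER BY total DESC LIMIT 10;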



3. Displaying the results
I built a page with Baidu ECharts to display the results, shown below:
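The page itself is not included in this post. The chart only needs the rows of count_time as two parallel arrays: the seconds on the x-axis and the totals as the series. Below is a minimal sketch of pulling that data out of MySQL as JSON; the class name ExportChartData and the JSON layout are illustrative, and the connection settings are assumed to match SaveCountTimeResult above.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.StringJoiner;

public class ExportChartData {
	public static void main(String[] args) throws Exception {
		String url = "jdbc:mysql://localhost:3306/learning?serverTimezone=UTC";
		try (Connection conn = DriverManager.getConnection(url, "root", "linlin");
		     Statement st = conn.createStatement();
		     ResultSet rs = st.executeQuery("SELECT time, total FROM count_time ORDER BY time")) {
			// ECharts wants two parallel arrays: xAxis.data and series[0].data
			StringJoiner times = new StringJoiner(",", "[", "]");
			StringJoiner totals = new StringJoiner(",", "[", "]");
			while (rs.next()) {
				times.add("\"" + rs.getString("time") + "\"");
				totals.add(String.valueOf(rs.getLong("total")));
			}
			System.out.println("{\"times\":" + times + ",\"totals\":" + totals + "}");
		}
	}
}

The printed JSON can then be dropped straight into the chart's xAxis.data and series data.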
