The code is still organized into the three main classes, and this time there is also a special custom Writable class to simplify the database operations.
First, the data. This demo is only meant to verify that the whole pipeline works, so finishing without exceptions is good enough. The data looks like this:
2018-06-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-06-01 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-01-03 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-06-02 11:12:21,12321323423,裤子和衣服和洗漱品,甘肃省
2016-06-01 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-02-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-06-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-11-11 11:12:21,12321323423,裤子和衣服和洗漱品,甘肃省
2018-01-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-11-11 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-03-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2011-01-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2014-01-02 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
2018-11-11 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省
Of course the data carries much more information, and the output could have been made richer, but to keep up with my study schedule (things have genuinely been busy lately) I'll just use part of it for now. With this sample, three orders fall on 2018-11-11, so the expected result is 陕西省 = 2 and 甘肃省 = 1.
Enough talk, here's the code:
Mapper class:
package com.fyg.bigdata.shopcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ShopCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // each line looks like: timestamp,order id,item description,province
        String[] fields = value.toString().split(",");
        // the first 10 characters of the timestamp are the date, e.g. "2018-11-11"
        String dateStr = fields[0].substring(0, 10);
        // only count orders placed on Singles' Day 2018; emit (province, 1)
        if (dateStr.equals("2018-11-11")) {
            context.write(new Text(fields[3]), new IntWritable(1));
        }
    }
}
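As a quick aside, here's a throwaway snippet (plain Java, no Hadoop runtime; the ParseCheck class is just my own sanity check, not part of the job) that confirms the split/substring logic on one sample line:
package com.fyg.bigdata.shopcount;

public class ParseCheck {
    public static void main(String[] args) {
        String line = "2018-11-11 11:12:21,12321323423,裤子和衣服和洗漱品,陕西省";
        String[] fields = line.split(",");
        String dateStr = fields[0].substring(0, 10);
        // prints: true -> 陕西省
        System.out.println(dateStr.equals("2018-11-11") + " -> " + fields[3]);
    }
}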
Reducer class:
package com.fyg.bigdata.shopcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// input shape: <陕西省,[1,1,...]> <甘肃省,[1,1,...]>
public class ShopCountReducer extends Reducer<Text, IntWritable, MyDBWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // DBOutputFormat only persists the key, so the value slot can stay empty
        context.write(new MyDBWritable(key.toString(), sum), NullWritable.get());
    }
}
Job class:
package com.fyg.bigdata.shopcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class ShopCountJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // database connection settings; note these classes come from
        // org.apache.hadoop.mapreduce.lib.db, not the old mapred package
        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
                "jdbc:mysql://demo05:3306/d1?characterEncoding=utf-8", "root", "root");
        // leave the line below commented out to run against the local filesystem instead of HDFS
        // conf.set("fs.defaultFS","nn1");
        Job job = Job.getInstance(conf);
        job.setJarByClass(ShopCountJob.class);
        // set the mapper and reducer classes
        job.setMapperClass(ShopCountMapper.class);
        job.setReducerClass(ShopCountReducer.class);
        // set the types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // set the types of the reduce output (needed here because they differ from the map output)
        job.setOutputKeyClass(MyDBWritable.class);
        job.setOutputValueClass(NullWritable.class);
        // input/output format classes; TextInputFormat is the default and feeds the map phase,
        // while the output format handles the reduce phase's output
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(DBOutputFormat.class);
        // set the input path and the target table/columns
        TextInputFormat.setInputPaths(job, new Path("/data/input"));
        DBOutputFormat.setOutput(job, "shops", "name", "count");
        // submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
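One thing to remember: DBOutputFormat expects the target table to exist before the job runs. Below is a minimal one-off sketch that creates it; the table and column names are fixed by the setOutput call above, while the VARCHAR(64)/INT column types and the CreateShopsTable class itself are my own assumptions:
package com.fyg.bigdata.shopcount;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateShopsTable {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://demo05:3306/d1?characterEncoding=utf-8", "root", "root");
        Statement st = conn.createStatement();
        // names must match DBOutputFormat.setOutput(job, "shops", "name", "count");
        // the column types here are assumed
        st.execute("CREATE TABLE IF NOT EXISTS shops (name VARCHAR(64), `count` INT)");
        st.close();
        conn.close();
    }
}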
MyDBWritable class:
package com.fyg.bigdata.shopcount;

import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

public class MyDBWritable implements DBWritable {
    private String name;
    private Integer count;

    public MyDBWritable(String name, Integer count) {
        this.name = name;
        this.count = count;
    }

    @Override
    public void write(PreparedStatement preparedStatement) throws SQLException {
        // JDBC parameter indexes start at 1, not 0
        preparedStatement.setString(1, name);
        preparedStatement.setInt(2, count);
    }

    @Override
    public void readFields(ResultSet resultSet) throws SQLException {
        // nothing to do: this job only writes to the database, never reads from it
    }
}
Although the custom Writable class exists only to simplify the database output, it was quite an eye-opener for me. In larger real-world projects this kind of design lets Java's elegance shine through even more, and I hope to keep this habit of encapsulating custom classes in mind as I keep learning.
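Just to sketch where that idea could go (this ShopRecord class is my own speculative extension, not something this demo needs): implementing Hadoop's Writable interface alongside DBWritable would let the same object travel between map and reduce as well as into the database:
package com.fyg.bigdata.shopcount;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

// hypothetical fuller version: usable both as an intermediate key/value and as a DB record
public class ShopRecord implements Writable, DBWritable {
    private String name;
    private int count;

    public ShopRecord() { }  // Hadoop's serialization needs a no-arg constructor

    public ShopRecord(String name, int count) {
        this.name = name;
        this.count = count;
    }

    // Writable: binary (de)serialization between map and reduce
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(count);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        name = in.readUTF();
        count = in.readInt();
    }

    // DBWritable: mapping to and from SQL statements
    @Override
    public void write(PreparedStatement ps) throws SQLException {
        ps.setString(1, name);
        ps.setInt(2, count);
    }

    @Override
    public void readFields(ResultSet rs) throws SQLException {
        name = rs.getString(1);
        count = rs.getInt(2);
    }
}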
For reference, here's my pom configuration:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.fyg.homework</groupId>
    <artifactId>ShopCount</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>6</source>
                    <target>6</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <!-- Hadoop version -->
        <hadoop.version>2.6.4</hadoop.version>
    </properties>

    <dependencies>
        <!-- Hadoop common components -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Hadoop client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- HDFS -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- YARN common components -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- YARN client components -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- YARN server-side components -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- YARN ResourceManager components -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- YARN NodeManager components -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-nodemanager</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-applicationhistoryservice</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.39</version>
        </dependency>
    </dependencies>
</project>
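Once the job finishes, a quick query (again just a throwaway check of mine, reusing the same connection settings as the job) should show 陕西省 = 2 and 甘肃省 = 1 for the sample data:
package com.fyg.bigdata.shopcount;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class CheckResult {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://demo05:3306/d1?characterEncoding=utf-8", "root", "root");
        Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery("SELECT name, `count` FROM shops");
        while (rs.next()) {
            // expected with the sample data: 陕西省 2, 甘肃省 1
            System.out.println(rs.getString("name") + " " + rs.getInt("count"));
        }
        rs.close();
        st.close();
        conn.close();
    }
}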
This demo wasn't particularly hard; it's basically a simple task with a few extra features bolted on. Once I've finished learning Hive, I plan to build a more complete project combining Hive and HBase.
See ya~