Preface
This post crawls book listings from Dangdang and counts the total number of titles published in each year and month.
The raw data is first cleaned with Python, discarding everything except each book's publication date.
After that, the counting step is essentially the classic WordCount example, with dates as the "words".
I. Hadoop Dependencies
hadoop-common
hadoop-hdfs
hadoop-mapreduce-client-core
hadoop-mapreduce-client-jobclient
hadoop-mapreduce-client-common
II. Official Dependency Repository
All of the artifacts above are available from Maven Central; the https://mvnrepository.com link for each one is included as a comment in the pom.xml below.
III. Configuration File (pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hutonm</groupId>
    <artifactId>datacount</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>2.7.4</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>2.7.4</version>
        </dependency>
    </dependencies>
</project>
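With this pom.xml in place, the job jar can be built with Maven. The post doesn't show this step, so the exact output path is my assumption, but with Maven's defaults for this pom it looks like:
mvn clean package
# produces target/datacount-1.0-SNAPSHOT.jar; copy or rename it to
# /usr/local/hadoop/datacount.jar to match the run command in section VI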
IV. Data File Processing
Each crawled record takes four lines: title, author, publisher, and publication date. ERROR marks a field the crawler failed to parse. A sample:
人间失格
ERROR
作家出版社
2015-08-01
少年读史记(套装全5册)
张嘉骅
青岛出版社
2015-09-01
解忧杂货店
东野圭吾
南海出版公司
2014-05-01
雪落香杉树 (福克纳奖得主,全球畅销500万册)
戴维・伽特森
作家出版社
2017-06-18
我喜欢生命本来的样子(周国平经典散文作品集)
周国平
作家出版社
2017-02-01
神奇校车・桥梁书版(全20册)
ERROR
贵州人民出版社
2014-04-01
我不(大冰2017新作。再度突破当当预售新记录!10分钟卖出10万册!30分钟登顶全网NO.1。不容错过的奇书!)
大冰
湖南文艺出版社
2017-08-01
天才在左疯子在右(完整版)
高铭
北京联合出版公司
2016-01-01
Use a short Python script to cut out just the publication date (year and month) from each record:
import re

# Match a line that begins with a date such as 2015-08-01; the greedy \d+
# stops at the second '-', so match.group() yields only "2015-08".
pattern = re.compile(r'^\d{4}-\d+')

# Open the output in 'w' mode so re-running the script does not append
# duplicates, and let the with-block close both files.
with open('图书信息数据文档.txt', 'r', encoding='utf-8') as src, \
        open('date.txt', 'w', encoding='utf-8') as out:
    for line in src:
        match = pattern.search(line.strip())
        if match:
            # Title, author, and publisher lines never match the pattern,
            # so only the year-month strings are written, one per line.
            out.write(match.group() + '\n')
After processing, the file contains one year-month per line:
2015-08
2015-09
2014-05
2017-06
2017-02
2014-04
2017-08
2016-01
2014-08
2007-11
2011-01
2006-05
2015-09
2014-04
2015-06
2016-06
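Before moving to the cluster, you can sanity-check the expected counts locally with Python's collections.Counter, which performs the same grouping the MapReduce job will do:
from collections import Counter

# Tally how often each year-month appears in the cleaned file.
with open('date.txt', encoding='utf-8') as f:
    counts = Counter(line.strip() for line in f if line.strip())

for date, n in sorted(counts.items()):
    print(date, n)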
V. MapReduce Program
package com.hutonm;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

public class Datacount {

    public static class SumMap extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text date = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer stringTokenizer = new StringTokenizer(value.toString());
            while (stringTokenizer.hasMoreTokens()) {
                // Each token is a year-month string such as "2017-02".
                date.set(stringTokenizer.nextToken());
                // Emit (2017-02, 1) for every occurrence.
                context.write(date, one);
            }
        }
    }

    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // key is a date, values are the 1s emitted for it, e.g.
            // (2014-02, [1, 1]) -> (2014-02, 2)
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args)
            throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Optional argument check:
        // String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // if (otherArgs.length != 2) {
        //     System.err.println("Usage: datacount <in> <out>");
        //     System.exit(2);
        // }
        Job job = Job.getInstance(conf, "Datacount");
        job.setJarByClass(Datacount.class);
        job.setMapperClass(SumMap.class);
        // Summing is associative, so the reducer also works as a combiner.
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);
        // Output key/value types of both the map and reduce functions.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // TextInputFormat (splits the input, provides a RecordReader) and
        // TextOutputFormat (provides a RecordWriter) are the defaults:
        // job.setInputFormatClass(TextInputFormat.class);
        // job.setOutputFormatClass(TextOutputFormat.class);
        // Input and output paths come from the command line.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
VI. Running the Job
Upload the data file to HDFS:
hadoop fs -put /usr/local/hadoop/data.txt /input
Run the Hadoop job:
hadoop jar /usr/local/hadoop/datacount.jar com.hutonm.Datacount /input /output
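When the job finishes, read the result back with hadoop fs -cat. Assuming the default single reducer (output file part-r-00000) and that /input holds only the sixteen sample dates listed above, the counts work out as:
hadoop fs -cat /output/part-r-00000
2006-05	1
2007-11	1
2011-01	1
2014-04	2
2014-05	1
2014-08	1
2015-06	1
2015-08	1
2015-09	2
2016-01	1
2016-06	1
2017-02	1
2017-06	1
2017-08	1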
With nothing better to do, I also ran the count through Hive.
Create the table:
create table year_count(year string)
row format delimited fields terminated by ',';
Load the data:
load data local inpath '/usr/local/data.txt' into table year_count;
Query and export the result to a fresh, dedicated directory:
insert overwrite local directory '/usr/local/hive_out'
select year, count(*)
from year_count
group by year;
Beware: Hive only supports INSERT OVERWRITE for directory exports, and OVERWRITE first deletes everything under the target path. I originally pointed it at /usr/local/hadoop, and orz, all my Hadoop files were wiped. Never aim it at a directory you care about.
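If you only need to look at the counts, a safer option is to skip the directory export entirely and let Hive print the result to the console:
select year, count(*)
from year_count
group by year;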
Still working on it…