Building a Hadoop Project with Maven

Preface

I crawled book publication information from dangdang.com and want to count the total number of books published in each month of each year.
Python is used to clean the data, throwing away the useless fields so that only the publication date remains.
After that, the job is essentially a wordcount.

1. Hadoop Dependencies

hadoop-common
hadoop-hdfs
hadoop-mapreduce-client-core
hadoop-mapreduce-client-jobclient
hadoop-mapreduce-client-common

2. Official Dependency Repository

The dependency snippets below were taken from the official Maven repository (mvnrepository.com).

3. Configuration File (pom.xml)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hutonm</groupId>
    <artifactId>datacount</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-jobclient -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>2.7.4</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>2.7.4</version>
        </dependency>

    </dependencies>

</project>
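
With this pom in place, the job jar can be built with the standard Maven lifecycle. A minimal sketch (the final copy to /usr/local/hadoop/datacount.jar is an assumption, chosen to match the path used in section 6):

mvn clean package
# the jar is produced under target/
cp target/datacount-1.0-SNAPSHOT.jar /usr/local/hadoop/datacount.jar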

4. Data File Processing

A sample of the crawled book data (each record is title, author, publisher, publication date; ERROR marks a missing field):

人间失格
ERROR
作家出版社
2015-08-01

少年读史记(套装全5册)
张嘉骅
青岛出版社
2015-09-01

解忧杂货店
东野圭吾
南海出版公司
2014-05-01

雪落香杉树 (福克纳奖得主,全球畅销500万册)
戴维・伽特森
作家出版社
2017-06-18

我喜欢生命本来的样子(周国平经典散文作品集)
周国平
作家出版社
2017-02-01

神奇校车・桥梁书版(全20册)
ERROR
贵州人民出版社
2014-04-01

我不(大冰2017新作。再度突破当当预售新记录!10分钟卖出10万册!30分钟登顶全网NO.1。不容错过的奇书!)
大冰
湖南文艺出版社
2017-08-01

天才在左疯子在右(完整版)
高铭
北京联合出版公司
2016-01-01

Use Python to extract just the year and month of each publication date:

import re

# keep only the leading "YYYY-MM" part of a publication date such as 2015-08-01
pattern = re.compile(r'^\d{4}-\d+')

with open('图书信息数据文档.txt', 'r', encoding='utf-8') as f, \
        open('date.txt', 'a', encoding='utf-8') as out:
    for line in f:
        match = pattern.search(line.strip())
        if match:
            out.write(match.group() + '\n')

After processing, date.txt contains:

2015-08
2015-09
2014-05
2017-06
2017-02
2014-04
2017-08
2016-01
2014-08
2007-11
2011-01
2006-05
2015-09
2014-04
2015-06
2016-06
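
Before touching the cluster, the per-month totals can be sanity-checked locally with a quick shell one-liner over date.txt; the counts it prints should match the MapReduce output later:

sort date.txt | uniq -c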

5. MapReduce Program

package com.hutonm;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

public class Datacount {
    public static class SumMap extends
            Mapper<Object,Text,Text,IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text date = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {

            StringTokenizer stringTokenizer = new StringTokenizer(value.toString());
            while (stringTokenizer.hasMoreElements()){
                // e.g. date = "2017-02"
                date.set(stringTokenizer.nextToken());
                // emit (2017-02, 1)
                context.write(date, one);
            }
        }
    }

    public static class SumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

        private IntWritable result = new IntWritable();
        @Override
        public void reduce(Text key, Iterable<IntWritable> value, Context context) throws IOException, InterruptedException {
            // key is a date such as 2014-02; value iterates over the 1s emitted for that date,
            // e.g. (2014-02, [1, 1]) reduces to (2014-02, 2)
            int sum = 0;
            for (IntWritable val : value){
                sum += val.get();
            }
            result.set(sum);
            context.write(key,result);
        }
    }

    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        Configuration conf = new Configuration();
//        String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
//        if(otherArgs.length != 2){
//            System.err.println("Usage: wordcount <in> <out>");
//            System.exit(2);
//        }
        Job job = Job.getInstance(conf,"Datacount");

        job.setJarByClass(Datacount.class);
        job.setMapperClass(SumMap.class);
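        // reusing the reducer as a combiner is safe here because the reduce step is a pure sum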
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);

        // output key/value types of the map and reduce functions
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // split the input data set into splits and provide a RecordReader implementation
//        job.setInputFormatClass(TextInputFormat.class);
//        // provide a RecordWriter implementation responsible for writing the output
//        job.setOutputFormatClass(TextOutputFormat.class);


        // set the input and output paths
        FileInputFormat.addInputPath(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));
        System.exit(job.waitForCompletion(true)? 0 : 1);
    }

}

6. Running the Job

Upload the data file to HDFS:
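
If the /input directory does not exist yet, create it first so that data.txt lands inside it rather than becoming a file named /input:

hadoop fs -mkdir -p /input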

hadoop fs -put /usr/local/hadoop/data.txt /input

Run the Hadoop job:

hadoop jar /usr/local/hadoop/datacount.jar com.hutonm.Datacount /input /output
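
Once the job finishes, the per-month counts can be read back from HDFS (part-r-00000 is the default name of the single reducer's output file):

hadoop fs -cat /output/part-r-00000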

Since I had nothing better to do, I ran the same count with Hive as well.

Create the table:

create table year_count(year string)
row format delimited fields terminated by ',';

Load the data:

load data local inpath '/usr/local/data.txt' into table year_count;

Query, exporting the result to a local directory:

insert overwrite local directory '/usr/local/hive_output'
select year, count(*)
from year_count
group by year;

Be very careful with insert overwrite local directory: Hive first wipes everything in the target directory, so always point it at a fresh directory (hence /usr/local/hive_output above). I originally exported straight into /usr/local/hadoop.
orz — all my Hadoop files were deleted. Still cleaning that up….
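
The exported result ends up as plain-text files inside the target directory. Assuming Hive's defaults, the first (and here only) file is named 000000_0, with fields separated by the \001 control character:

cat /usr/local/hive_output/000000_0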
