09 - Hadoop Programming

This article introduces Hadoop programming: MapReduce case studies such as WordCount, PageRank, and TF-IDF implementations, as well as ItemCF in collaborative-filtering recommendation. Worked examples walk through the MapReduce workflow and source code.

Hadoop Programming

Download link for the Hadoop mind map

Example

/opt/sxt/hadoop-2.6.5/share/hadoop/mapreduce/

JAR file

  • hadoop-mapreduce-examples-2.6.5.jar

Preparation

  • for i in $(seq 100000); do echo "hello sxt $i" >> test.txt; done
  • hdfs dfs -mkdir -p /user/root
  • hdfs dfs -ls -R /
  • hdfs dfs -D dfs.blocksize=1048576 -put ./test.txt /user/root

Command

  • hadoop jar hadoop-mapreduce-examples-2.6.5.jar wordcount /input /output

    • wordcount is the example program to run
    • input: the HDFS directory that holds the input data
    • output: an HDFS directory that must not yet exist; the MR job writes its results there (nothing may already be in the output path)

Explanation

  • Contents of the output directory:

  • -rw-r--r-- 3 root supergroup 0 2017-07-02 02:49 /mr/test/output/_SUCCESS

    • _SUCCESS: a marker file signalling that the job completed successfully
  • -rw-r--r-- 3 root supergroup 49 2017-07-02 02:49 /mr/test/output/part-r-00000

    • part-r-00000: a data file written by a reducer; "r" stands for reduce and 00000 is the index of the reducer that produced it
  • With multiple reducers there are multiple such data files
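
To check the job output (paths as in the listing above):

  • hdfs dfs -cat /mr/test/output/part-r-00000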

WordCount Case Study

Startup

  • zkServer.sh start
  • start-dfs.sh
  • yarn-daemon.sh start resourcemanager
  • start-yarn.sh
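  • jps

    • Verify that the expected daemons (NameNode, DataNode, ResourceManager, NodeManager, QuorumPeerMain, etc.) are running.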

WordCount

  • MyWC

    • package com.sxt.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyWC {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        // Create a new job
        Job job = Job.getInstance(conf);
        // Entry class packaged into the runnable jar
        job.setJarByClass(MyWC.class);

        // Specify various job-specific parameters
        // Job name
        job.setJobName("myjob");

        // Input path
        Path inPath = new Path("/user/root/test.txt");
        FileInputFormat.addInputPath(job, inPath);

        // Output path (must not already exist)
        Path outPath = new Path("/output/wordcount");
        // Delete it first if it does exist
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(MyReducer.class);

        // Submit the job, then poll for progress until the job is complete
        job.waitForCompletion(true);
    }
}
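
To run the job on the cluster, package the classes into a jar and submit it with hadoop jar (the jar name wc.jar below is just an example):

  • hadoop jar wc.jar com.sxt.mr.wc.MyWC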

  • MyMapper

    • package com.sxt.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Tokenize the line by whitespace, e.g. "hello sxt 1" -> "hello", "sxt", "1"
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
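
For example, the input line "hello sxt 1" makes the mapper emit (hello, 1), (sxt, 1), and (1, 1); the framework then sorts and groups these pairs by key before handing them to the reducer.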

  • MyReducer

    • package com.sxt.mr.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value; each reduce() call sums the counts for one key
    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
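
For the test.txt generated above ("hello sxt $i" for i = 1..100000), the output would contain hello and sxt with count 100000 and every number with count 1, for example:

1	1
hello	100000
sxt	100000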

Source Code Analysis

MapReduce Case Studies

Case 1: weather data (top temperatures per month)

  • MyTQ

    • package com.bjsxt.tq;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/*
 * Input format: 1949-10-01 14:21:02\t34c
 */
public class MyTQ {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // 1. Configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(MyTQ.class);
        job.setJobName("tq");

        // 2. Input and output paths
        Path inPath = new Path("/tq/input");
        FileInputFormat.addInputPath(job, inPath);
        Path outPath = new Path("/tq/output");
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // 3. Mapper, with a custom key type (Tq) for the shuffle
        job.setMapperClass(Tmapper.class);
        job.setMapOutputKeyClass(Tq.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 4. Custom sort comparator
        job.setSortComparatorClass(TSortComparator.class);

        // 5. Custom partitioner
        job.setPartitionerClass(TPartitoner.class);

        // 6. Custom grouping comparator
        job.setGroupingComparatorClass(TGroupComparator.class);

        // 7. Number of reduce tasks
        job.setNumReduceTasks(2);

        // 8. Reducer
        job.setReducerClass(Treducer.class);

        // 9. Submit and report progress until the job completes
        job.waitForCompletion(true);
    }
}
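
Illustrative input lines for /tq/input (tab-separated timestamp and temperature; the values are made up):

1949-10-01 14:21:02	34c
1949-10-02 14:01:02	36c
1950-01-01 11:21:02	32c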

  • TGroupComparator

    • package com.bjsxt.tq;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/*
 * Groups map output by (year, month) so that one reduce() call
 * sees all readings for a single month.
 */
public class TGroupComparator extends WritableComparator {

    Tq t1 = null;
    Tq t2 = null;

    public TGroupComparator() {
        // Register Tq and let the framework instantiate keys for comparison
        super(Tq.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        t1 = (Tq) a;
        t2 = (Tq) b;

        int c1 = Integer.compare(t1.getYear(), t2.getYear());
        if (c1 == 0) {
            return Integer.compare(t1.getMonth(), t2.getMonth());
        }
        return c1;
    }
}

  • Tmapper

    • package com.bjsxt.tq;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class Tmapper extends Mapper<LongWritable, Text, Tq, IntWritable> {

    Tq tkey = new Tq();
    IntWritable tval = new IntWritable();

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // Split the line into the date/time and temperature fields
        String[] words = StringUtils.split(value.toString(), '\t');
        String pattern = "yyyy-MM-dd";
        SimpleDateFormat sdf = new SimpleDateFormat(pattern);
        try {
            Date date = sdf.parse(words[0]);
            Calendar cal = Calendar.getInstance();
            cal.setTime(date);

            tkey.setYear(cal.get(Calendar.YEAR));
            tkey.setMonth(cal.get(Calendar.MONTH) + 1);
            tkey.setDay(cal.get(Calendar.DAY_OF_MONTH));
            // Strip the trailing "c" unit, e.g. "34c" -> 34
            int wd = Integer.parseInt(words[1].substring(0, words[1].lastIndexOf("c")));
            tkey.setWd(wd);

            tval.set(wd);
            context.write(tkey, tval);

        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
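
For example, the line "1949-10-01 14:21:02	34c" produces the key Tq(year=1949, month=10, day=1, wd=34) with the value 34.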

  • TPartitoner

    • package com.bjsxt.tq;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class TPartitoner extends Partitioner<Tq, IntWritable> {

    @Override
    public int getPartition(Tq key, IntWritable value, int numPartitions) {
        // Send all records of one year to the same reducer
        return key.getYear() % numPartitions;
    }
}
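
With job.setNumReduceTasks(2) as set in MyTQ, year 1949 goes to partition 1949 % 2 = 1 and year 1950 to partition 0, so a year's records never straddle two reducers.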

  • Tq

    • package com.bjsxt.tq;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class Tq implements WritableComparable<Tq> {

private int year;
private int month;
private int day;
private int wd;

public int getYear() {
    return year;
}

public void setYear(int year) {
    this.year = year;
}

public int getMonth() {
    return month;
}

public void setMonth(int month) {
    this.month = month;
}

public int getDay() {
    return day;
}

public void setDay(int day) {
    this.day = day;
}

public int getWd() {
    return wd;
}

public void setWd(int wd) {
    this.wd = wd;
}

@Override
public String toString() {
    return year + "-" + month + "-" + day;
}

@Override
public void write(DataOutput dataOutput) throws IOException {
    dataOutput.writeInt(this.getYear());
    dataOutput.writeInt(this.getMonth());
    dataOutput.writeInt(this.getDay());
    dataOutput.writeInt(this.getWd());
}

@Override
public void readFields(DataInput dataInput) throws IOException {
    this.setYear(dataInput.readInt());
    this.setMonth(dataInput.readInt());
    this.setDay(dataInput.readInt());
    this.setWd(dataInput.readInt());


}

@Override
public int compareTo(Tq o) {
    int c1 = Integer.compare(this.getYear(), o.getYear());

    if (c1 == 0) {
        int c2 = Integer.compare(this.getMonth(), o.getMonth());

        if (c2 == 0) {
            return Integer.compare(this.getDay(), o.getDay());
        }
        return c2;
    }
    return c1;
}

}

  • Treducer

    • package com.bjsxt.tq;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
 * Output format (one line per emitted record):
 * 1949-10-01 34
 * 1949-10-02 34
 * 1949-10-03 34
 * 1949-10-05 34
 */
public class Treducer extends Reducer<Tq, IntWritable, Text, IntWritable> {

    Text tkey = new Text();
    IntWritable tval = new IntWritable();

    @Override
    protected void reduce(Tq key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // One call handles one (year, month) group; values arrive sorted by
        // temperature descending. Iterating over values also advances the
        // fields of key, so key always reflects the current record.
        int flag = 0;
        int day = 0;
        for (IntWritable val : values) {
            if (flag == 0) {
                // Hottest reading of the month
                tkey.set(key.toString());
                tval.set(val.get());
                context.write(tkey, tval);
                flag++;
                day = key.getDay();
            } else if (day != key.getDay()) {
                // Hottest reading on a different day: the month's runner-up
                tkey.set(key.toString());
                tval.set(val.get());
                context.write(tkey, tval);
                return;
            }
        }
    }
}
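
Each (year, month) group thus yields at most two output lines: the month's hottest reading and the hottest reading from a different day. Illustrative output (made-up values):

1949-10-1	38
1949-10-2	36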

  • TSortComparator

    • package com.bjsxt.tq;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/*
 * Sort by year and month ascending, then by temperature descending.
 */
public class TSortComparator extends WritableComparator {

    Tq t1 = null;
    Tq t2 = null;

    public TSortComparator() {
        super(Tq.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        t1 = (Tq) a;
        t2 = (Tq) b;

        int c1 = Integer.compare(t1.getYear(), t2.getYear());
        if (c1 == 0) {
            int c2 = Integer.compare(t1.getMonth(), t2.getMonth());
            if (c2 == 0) {
                // Negate for descending temperature order
                return -Integer.compare(t1.getWd(), t2.getWd());
            }
            return c2;
        }
        return c1;
    }
}

Case 2

  • Set difference of lists

  • Approach

  • MyFD

    • package com.bjsxt.fd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyFD {
public static void main(String[]
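
The original post breaks off mid-line here. Based on the driver pattern used by MyWC and MyTQ above, the rest of main plausibly follows the same steps; the sketch below is a reconstruction under that assumption (FdMapper/FdReducer and the /fd/* paths are placeholders, not the original names):

package com.bjsxt.fd;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyFD {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MyFD.class);
        job.setJobName("fd");

        // Placeholder paths, mirroring the /tq/* layout used in Case 1
        Path inPath = new Path("/fd/input");
        FileInputFormat.addInputPath(job, inPath);
        Path outPath = new Path("/fd/output");
        if (outPath.getFileSystem(conf).exists(outPath)) {
            outPath.getFileSystem(conf).delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // FdMapper/FdReducer are hypothetical class names; the originals are not shown
        job.setMapperClass(FdMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(FdReducer.class);

        job.waitForCompletion(true);
    }
}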
