Hadoop Programming
Example
/opt/sxt/hadoop-2.6.5/share/hadoop/mapreduce/
Jar file
- hadoop-mapreduce-examples-2.6.5.jar
Preparation
- for i in `seq 100000`; do echo "hello sxt $i" >> test.txt; done
- hdfs dfs -mkdir -p /user/root
- hdfs dfs -ls -R /
- hdfs dfs -D dfs.blocksize=1048576 -put ./test.txt /user/root
- dfs.blocksize=1048576 stores the file in 1 MB blocks, so the upload is split across several blocks and the job later gets more than one map task
Command
- hadoop jar hadoop-mapreduce-examples-2.6.5.jar wordcount /input /output
- wordcount is the example program to run
- input: the directory in HDFS that holds the input data
- output: a directory that must not yet exist in HDFS; the MR job writes its result there (nothing may be placed in the output path beforehand)
Explanation
- Contents of the output directory:
- -rw-r--r-- 3 root supergroup 0 2017-07-02 02:49 /mr/test/output/_SUCCESS
- _SUCCESS: a marker file signaling that the job finished successfully
- -rw-r--r-- 3 root supergroup 49 2017-07-02 02:49 /mr/test/output/part-r-00000
- part-r-00000: a data file written by a reducer; "r" stands for reduce and 00000 is the reducer's index
- With several reducers there are several such data files
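The result files can also be read back through the HDFS Java API; a minimal sketch (assuming the example output path above and a cluster config on the classpath):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.InputStreamReader;
public class ReadOutput {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
//open one reduce output file and print it line by line
try (FSDataInputStream in = fs.open(new Path("/mr/test/output/part-r-00000"));
BufferedReader br = new BufferedReader(new InputStreamReader(in))) {
String line;
while ((line = br.readLine()) != null) System.out.println(line);
}
}
}
The same content can be viewed with hdfs dfs -cat /mr/test/output/part-r-00000.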
WordCount Example
Startup
- zkServer.sh start
- start-dfs.sh
- yarn-daemon.sh start resourcemanager
- start-yarn.sh
WordCount
- MyWC
- package com.sxt.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyWC {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
//Create a new job
Job job = Job.getInstance(conf);
//entry class: lets Hadoop locate the jar containing this job when it is packaged
job.setJarByClass(MyWC.class);
//Specify various job-specific parameters
//set the job name
job.setJobName("myjob");
//input path
Path inPath = new Path("/user/root/test.txt");
FileInputFormat.addInputPath(job, inPath);
//output path (must not already exist)
Path outPath = new Path("/output/wordcount");
//delete the output path if it already exists
if (outPath.getFileSystem(conf).exists(outPath)) {
outPath.getFileSystem(conf).delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(MyReducer.class);
//Submit the job, then poll for progress until the job is complete
job.waitForCompletion(true);
}
}
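Since word-count sums are associative, the reducer can optionally double as a combiner to pre-aggregate on the map side and shrink shuffle traffic; a one-line sketch for the driver above:
//optional: run MyReducer locally on each mapper's output before the shuffle
job.setCombinerClass(MyReducer.class);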
- MyMapper
- package com.sxt.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString()); //split the line into whitespace-separated tokens, e.g. "hello", "sxt", "1"
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
- MyReducer
- package com.sxt.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
//sum the counts for one key
private IntWritable result = new IntWritable();
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
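To make the shuffle between map and reduce concrete, the following standalone sketch (illustrative only, with hypothetical data) mimics what the framework does: group the mapper's (word, 1) pairs by key, then hand each key and its value list to one reduce call:
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
public class ShuffleSketch {
public static void main(String[] args) {
//simulated map output: each word stands for one (word, 1) pair
String[] words = {"hello", "sxt", "hello"};
Map<String, List<Integer>> grouped = new TreeMap<>();
for (String w : words) {
grouped.computeIfAbsent(w, k -> new ArrayList<>()).add(1);
}
//each entry corresponds to one reduce(key, values) call
for (Map.Entry<String, List<Integer>> e : grouped.entrySet()) {
int sum = 0;
for (int v : e.getValue()) sum += v;
System.out.println(e.getKey() + "\t" + sum); //hello 2, sxt 1
}
}
}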
Source Code Analysis
MapReduce Examples
Case 1
- MyTQ
- package com.bjsxt.tq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/*
 * Input record format: "1949-10-01 14:21:02\t34c" (date and time, a tab, then the temperature)
 */
public class MyTQ {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//1. configuration
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(MyTQ.class);
job.setJobName("tq");
//2. set input and output paths
Path inPath = new Path("/tq/input");
FileInputFormat.addInputPath(job, inPath);
Path outPath = new Path("/tq/output");
if (outPath.getFileSystem(conf).exists(outPath)) {
outPath.getFileSystem(conf).delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
//3. set the mapper; the custom type Tq is the key shuffled between map and reduce
job.setMapperClass(Tmapper.class);
job.setMapOutputKeyClass(Tq.class);
job.setMapOutputValueClass(IntWritable.class);
//4. custom sort comparator
job.setSortComparatorClass(TSortComparator.class);
//5. custom partitioner
job.setPartitionerClass(TPartitoner.class);
//6. custom grouping comparator
job.setGroupingComparatorClass(TGroupComparator.class);
//7. number of reduce tasks
job.setNumReduceTasks(2);
//8. set the reducer
job.setReducerClass(Treducer.class);
//9. submit and report progress until the job completes
job.waitForCompletion(true);
}
}
- TGroupComparator
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
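//groups map output so that all records of the same year and month reach one reduce() call together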
public class TGroupComparator extends WritableComparator {
Tq t1 = null;
Tq t2 = null;
public TGroupComparator() {
super(Tq.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
t1 = (Tq) a;
t2 = (Tq) b;
int c1 = Integer.compare(t1.getYear(), t2.getYear());
if (c1 == 0) {
return Integer.compare(t1.getMonth(), t2.getMonth());
}
return c1;
}
}
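For intuition: under this comparator two records from different days of the same month compare as equal, so they fall into the same reduce group. A quick hypothetical check:
Tq a = new Tq(); a.setYear(1949); a.setMonth(10); a.setDay(1);
Tq b = new Tq(); b.setYear(1949); b.setMonth(10); b.setDay(2);
System.out.println(new TGroupComparator().compare(a, b)); //prints 0: same group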
- Tmapper
- package com.bjsxt.tq;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
public class Tmapper extends Mapper<LongWritable, Text, Tq, IntWritable> {
Tq tkey = new Tq();
IntWritable tval = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { //key must be LongWritable to actually override Mapper.map; with Object the identity mapper would run and the job would fail
//获得时间、温度数组
String[] words = StringUtils.split(value.toString(), '\t');
String pattern = "yyyy-MM-dd";
SimpleDateFormat sdf = new SimpleDateFormat(pattern);
try {
Date date = sdf.parse(words[0]);
Calendar cal = Calendar.getInstance();
cal.setTime(date);
tkey.setYear(cal.get(Calendar.YEAR));
tkey.setMonth(cal.get(Calendar.MONTH) + 1);
tkey.setDay(cal.get(Calendar.DAY_OF_MONTH));
int wd = Integer.parseInt(words[1].substring(0, words[1].lastIndexOf("c")));
tkey.setWd(wd);
tval.set(wd);
context.write(tkey, tval);
} catch (ParseException e) {
e.printStackTrace();
}
}
}
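SimpleDateFormat is acceptable here because each map task runs single-threaded, but as a sketch (not part of the original code) the immutable, thread-safe java.time API can parse the same field:
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
public class DateParseSketch {
public static void main(String[] args) {
String field = "1949-10-01 14:21:02"; //words[0] in the mapper
LocalDate d = LocalDate.parse(field.substring(0, 10), DateTimeFormatter.ISO_LOCAL_DATE);
System.out.println(d.getYear() + " " + d.getMonthValue() + " " + d.getDayOfMonth()); //1949 10 1
}
}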
- TPartitoner
- package com.bjsxt.tq;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class TPartitoner extends Partitioner<Tq, IntWritable> {
@Override
public int getPartition(Tq key, IntWritable value, int numPartitions) {
//route by year; the return value must lie in [0, numPartitions)
return key.getYear() % numPartitions;
}
}
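The last argument is the number of reduce tasks set in the driver (2 here), so years are spread across the reducers by parity; a hypothetical check:
Tq k = new Tq(); k.setYear(1949);
System.out.println(new TPartitoner().getPartition(k, new IntWritable(0), 2)); //1949 % 2 = 1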
- Tq
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
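//composite key: year, month, day plus temperature (wd); compareTo below defines the default sort order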
public class Tq implements WritableComparable<Tq> {
private int year;
private int month;
private int day;
private int wd;
public int getYear() {
return year;
}
public void setYear(int year) {
this.year = year;
}
public int getMonth() {
return month;
}
public void setMonth(int month) {
this.month = month;
}
public int getDay() {
return day;
}
public void setDay(int day) {
this.day = day;
}
public int getWd() {
return wd;
}
public void setWd(int wd) {
this.wd = wd;
}
@Override
public String toString() {
return year + "-" + month + "-" + day;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(this.getYear());
dataOutput.writeInt(this.getMonth());
dataOutput.writeInt(this.getDay());
dataOutput.writeInt(this.getWd());
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.setYear(dataInput.readInt());
this.setMonth(dataInput.readInt());
this.setDay(dataInput.readInt());
this.setWd(dataInput.readInt());
}
@Override
public int compareTo(Tq o) {
int c1 = Integer.compare(this.getYear(), o.getYear());
if (c1 == 0) {
int c2 = Integer.compare(this.getMonth(), o.getMonth());
if (c2 == 0) {
return Integer.compare(this.getDay(), o.getDay());
}
return c2;
}
return c1;
}
}
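write and readFields must serialize the fields in exactly the same order; a quick round-trip sketch (illustrative, plain java.io streams):
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class TqRoundTrip {
public static void main(String[] args) throws IOException {
Tq in = new Tq();
in.setYear(1949); in.setMonth(10); in.setDay(1); in.setWd(34);
//serialize to bytes, as the shuffle would
ByteArrayOutputStream bos = new ByteArrayOutputStream();
in.write(new DataOutputStream(bos));
//deserialize into a fresh object
Tq out = new Tq();
out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
System.out.println(out + " wd=" + out.getWd()); //1949-10-1 wd=34
}
}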
- Treducer
- package com.bjsxt.tq;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
 * Sample output (date and temperature):
 * 1949-10-01 34
 * 1949-10-02 34
 * 1949-10-03 34
 * 1949-10-05 34
 */
public class Treducer extends Reducer<Tq, IntWritable, Text, IntWritable> {
Text tkey = new Text();
IntWritable tval = new IntWritable();
@Override
protected void reduce(Tq key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
//Values arrive sorted by temperature descending within one year-month group,
//and the key instance is updated as the iteration advances.
//Emit the hottest record, then the hottest record that falls on a different day.
//flag and day must be declared outside the loop, otherwise they reset on every value.
int flag = 0;
int day = 0;
for (IntWritable val : values) {
if (flag == 0) {
//first value: the month's highest temperature
tkey.set(key.toString());
tval.set(val.get());
context.write(tkey, tval);
flag++;
day = key.getDay();
}
if (flag != 0 && day != key.getDay()) {
//first value from a different day: the second-highest day's temperature
tkey.set(key.toString());
tval.set(val.get());
context.write(tkey, tval);
return;
}
}
}
}
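The same selection logic in isolation, as a standalone sketch with hypothetical data, showing why the descending temperature sort makes the first record the monthly maximum:
public class Top2Sketch {
public static void main(String[] args) {
//(day, temperature) pairs of one month, pre-sorted by temperature descending
int[][] sorted = {{1, 34}, {2, 34}, {1, 31}, {3, 30}};
int firstDay = sorted[0][0];
System.out.println("day " + sorted[0][0] + " -> " + sorted[0][1]); //hottest record
for (int[] rec : sorted) {
if (rec[0] != firstDay) { //first record from a different day
System.out.println("day " + rec[0] + " -> " + rec[1]);
break;
}
}
}
}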
- TSortComparator
- package com.bjsxt.tq;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/*
 * Sort years and months ascending, temperatures descending
 */
public class TSortComparator extends WritableComparator {
Tq t1 = null;
Tq t2 = null;
public TSortComparator() {
super(Tq.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
t1 = (Tq) a;
t2 = (Tq) b;
int c1 = Integer.compare(t1.getYear(), t2.getYear());
if (c1 == 0) {
int c2 = Integer.compare(t1.getMonth(), t2.getMonth());
if (c2 == 0) {
return -Integer.compare(t1.getWd(), t2.getWd()); //negated so that temperatures sort in descending order, as the reducer expects
}
return c2;
}
return c1;
}
}
Case 2
- List difference
- Approach
- MyFD
- package com.bjsxt.fd;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MyFD {
public static void main(String[]