====HDFS Federation=====================================
Business 1  namenode1 (hdfs01)  log files [log]   --> website PV/UV analysis...
Business 2  namenode2 (hdfs02)  database [table]  --> analysis of users' purchase records
Business 3  namenode3 (hdfs03)  images            --> traffic prediction, user-uploaded images
** isolation: each business line works in its own namespace
** reduces both the storage pressure and the access pressure on a single namenode
** to some extent this also improves namenode safety: a failure in one namespace does not affect the others
How does Federation differ from several physically independent clusters?
** in a Federation, all the HDFS namespaces share the same datanode servers
** physically independent clusters do not share datanodes; each namenode manages only its own datanodes
By contrast, with a single HDFS cluster:
* there is one namenode, and the metadata/fsimage sit on that one server
* that single namenode is the only access entry point for every client
----Configuration----------------------
Plan:
** two HDFS clusters federated
blue01 blue02 blue03
namenode namenode
datanode datanode datanode
On [blue01]:
a)
$ cp -ra cluster-hadoop-2.5.0/ hadoop-2.5.0
b)
hdfs-site.xml: replace its original contents with
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>ns1,ns2</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.ns1</name>
        <value>blue01.mydomain:8020</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.ns1</name>
        <value>blue01.mydomain:50070</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.ns2</name>
        <value>blue02.mydomain:8020</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.ns2</name>
        <value>blue02.mydomain:50070</value>
    </property>
</configuration>
** each dfs.namenode.rpc-address.<ns>/http-address.<ns> pair binds one namenode to one nameservice; the datanodes read dfs.nameservices and register with every namenode listed, which is how the namespaces share datanodes
c)
Copy the configuration file to nodes 02 and 03:
$ scp etc/hadoop/hdfs-site.xml blue02.mydomain:/opt/modules/hadoop-2.5.0/etc/hadoop/
$ scp etc/hadoop/hdfs-site.xml blue03.mydomain:/opt/modules/hadoop-2.5.0/etc/hadoop/
d)
Format a NameNode on each of [blue01] and [blue02]:
$ rm -rf data/
** hdfs-cluster is the clusterId; the name is arbitrary, but both namenodes must be formatted with the same one
$ bin/hdfs namenode -format -clusterId hdfs-cluster
e)
Start the cluster from either NameNode node, e.g. on [blue01]:
$ sbin/start-dfs.sh
http://192.168.122.128:50070/ (active)
http://192.168.122.130:50070/ (active)
Test:
On [blue03], create a directory and upload a file, then check which HDFS namespace it lands in (see the example commands below).
** the default namespace is decided by core-site.xml; change it to:
<property>
    <name>fs.defaultFS</name>
    <value>hdfs://blue02.mydomain:8020</value>
</property>
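** for example (a sketch, assuming the hosts above and a local file a.txt): an explicit URI overrides fs.defaultFS, so either namespace can be targeted directly
$ bin/hdfs dfs -put a.txt hdfs://blue01.mydomain:8020/
$ bin/hdfs dfs -ls hdfs://blue01.mydomain:8020/
$ bin/hdfs dfs -put a.txt hdfs://blue02.mydomain:8020/
$ bin/hdfs dfs -ls hdfs://blue02.mydomain:8020/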
====distcp==================================================
Purpose:
** copy data from one HDFS cluster to another
Use cases:
** cluster migration
** data migration
** distcp may run into problems when source and destination run different Hadoop versions
bin/hadoop distcp hdfs://nn1:8020/foo/bar hdfs://nn2:8020/bar/foo
Example:
** the copy runs as a distributed MapReduce job, so MapReduce must be started on the clusters involved
** the target directory /bbb does not need to exist in advance
$ bin/hadoop distcp hdfs://192.168.122.128:8020/aaa/a.txt hdfs://192.168.122.130:8020/bbb/b.txt
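** for migrations, two standard distcp switches are worth knowing (a sketch reusing the hosts above): -update copies only files that are missing or differ at the target, -overwrite unconditionally replaces them
$ bin/hadoop distcp -update hdfs://192.168.122.128:8020/aaa hdfs://192.168.122.130:8020/aaa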
** an hftp:// source avoids the cross-version problem: hftp is a read-only, HTTP-based interface, so run the job on the destination cluster (-i means ignore failures)
bin/hadoop distcp -i hftp://sourceFS:50070/src hdfs://destFS:8020/dest
Example:
$ bin/hadoop distcp -i hftp://192.168.122.128:50070/aaa/a.txt hdfs://192.168.122.130:8020/ccc/c.txt
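** alternatively (an assumption; requires dfs.webhdfs.enabled=true on the source), a webhdfs:// source offers the same version independence as hftp and also supports writes:
$ bin/hadoop distcp -i webhdfs://192.168.122.128:50070/aaa/a.txt hdfs://192.168.122.130:8020/ccc/c.txt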
====Secondary Sort=================================================
Idea:
** MapReduce automatically sorts map output by key
** combine the first field and the second field that need sorting into a single new composite key (a worked example follows the implementation list below)
(optional)
** override the partitioning rule so it partitions on the original (first) key
** override the grouping rule so it groups on the original (first) key
Implementation:
** define a custom key type that implements the WritableComparable interface
(optional)
** define a custom partitioner class that extends the Partitioner class
** define a custom grouping class, either by
** extending the WritableComparator class, or by
** implementing the RawComparator interface
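A worked example (hypothetical data): suppose the input file holds first,second lines
a,3
b,2
a,1
b,5
During the shuffle the composite keys sort as (a,1) (a,3) (b,2) (b,5); with FirstGroupingComparator enabled, all the "a" pairs reach one reduce() call already ordered by the second field, so the job below prints:
a	1
a	3
b	2
b	5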
----PairWritable-------------------
package com.myblue.mymapreduce;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Custom composite key type: first (String) + second (int)
public class PairWritable implements WritableComparable<PairWritable> {

    private String first;
    private int second;

    public PairWritable() {
    }

    public PairWritable(String first, int second) {
        this.set(first, second);
    }

    public void set(String first, int second) {
        this.first = first;
        this.second = second;
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((first == null) ? 0 : first.hashCode());
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        PairWritable other = (PairWritable) obj;
        if (first == null) {
            if (other.first != null)
                return false;
        } else if (!first.equals(other.first))
            return false;
        if (second != other.second)
            return false;
        return true;
    }

    @Override
    public String toString() {
        return first + "," + second;
    }

    // serialize both fields, in a fixed order
    public void write(DataOutput out) throws IOException {
        out.writeUTF(first);
        out.writeInt(second);
    }

    // deserialize in the same order as write()
    public void readFields(DataInput in) throws IOException {
        this.first = in.readUTF();
        this.second = in.readInt();
    }

    // compare two keys: by the first field, then by the second
    public int compareTo(PairWritable o) {
        int result = this.getFirst().compareTo(o.getFirst());
        if (result != 0) { // first fields already differ
            return result;
        } else {
            return Integer.valueOf(getSecond()).compareTo(Integer.valueOf(o.getSecond()));
        }
    }
}
----SecondarySortMapReduce-------------------
package com.myblue.mymapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SecondarySortMapReduce extends Configured implements Tool {

    public static class SecondarySortMapper extends
            Mapper<LongWritable, Text, PairWritable, LongWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // input line format: first,second
            String[] splits = value.toString().split(",");

            // output key: the composite key (first, second)
            PairWritable mapOutputKey = new PairWritable();
            mapOutputKey.set(splits[0], Integer.valueOf(splits[1]));

            // output value: the second field on its own
            LongWritable mapOutputValue = new LongWritable();
            mapOutputValue.set(Long.valueOf(splits[1]));

            context.write(mapOutputKey, mapOutputValue);
        }
    }

    public static class SecondarySortReducer extends
            Reducer<PairWritable, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(PairWritable key, Iterable<LongWritable> values,
                Context context) throws IOException, InterruptedException {
            // emit the first field as the key; the values arrive
            // already sorted by the second field
            Text outputKey = new Text();
            for (LongWritable value : values) {
                outputKey.set(key.getFirst());
                context.write(outputKey, value);
            }
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());

        // Mapper
        job.setMapperClass(SecondarySortMapper.class);
        job.setMapOutputKeyClass(PairWritable.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 1. partition (optional): partition on the first field only
        job.setPartitionerClass(FirstPartitioner.class);
        // 2. sort
        // job.setSortComparatorClass(cls);
        // 3. combine
        // job.setCombinerClass(WordCountReducer.class);
        // 4. compress
        // conf.set("mapreduce.map.output.compress", "false");
        // 5. group (optional): group on the first field only
        job.setGroupingComparatorClass(FirstGroupingComparator.class);

        // Reducer
        job.setReducerClass(SecondarySortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // delete the output directory if it already exists
        Path outPath = new Path(args[1]);
        FileSystem dfs = FileSystem.get(conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // paths hardcoded for testing; they override the command line
        args = new String[] { "hdfs://blue01.mydomain:8020/input",
                "hdfs://blue01.mydomain:8020/output" };
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new SecondarySortMapReduce(), args);
        System.exit(status);
    }
}
----FirstPartitioner-------------------
package com.myblue.mymapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Partition on the first field only, so every record sharing a
// first field lands on the same reducer
public class FirstPartitioner extends Partitioner<PairWritable, LongWritable> {

    public int getPartition(PairWritable key, LongWritable value,
            int numPartitions) {
        // alternative: partition by leading character ranges, e.g. [0-9] [a-n] [o-z]
        // if (key.getFirst().matches("[0-9].*")) {
        //     return 0;
        // } else if (key.getFirst().matches("[a-n].*")) {
        //     return 1;
        // } else
        //     return 2;

        // & Integer.MAX_VALUE clears the sign bit so the result is never negative
        return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
----FirstGroupingComparator-------------------
package com.myblue.mymapreduce;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

// Group on the first field only: keys with equal first fields are
// fed to the same reduce() call
public class FirstGroupingComparator implements RawComparator<PairWritable> {

    public int compare(PairWritable o1, PairWritable o2) {
        return o1.getFirst().compareTo(o2.getFirst());
    }

    // byte-level compare: skip the trailing 4 bytes (the serialized int
    // "second" field) so only the "first" field takes part in the comparison
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, l1 - 4, b2, s2, l2 - 4);
    }
}
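** to launch the job on the cluster (a sketch: the jar name is hypothetical, and main() would need to read its paths from the command line instead of hardcoding them):
$ bin/yarn jar mymapreduce.jar com.myblue.mymapreduce.SecondarySortMapReduce /input /output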
====Hadoop Distributions================================================
** Apache version
** open source
** the original, "vanilla" Hadoop
** every other Hadoop distribution is derived from this version
** download: archive.apache.org/dist/
** latest release: 2.7.3
** Cloudera Hadoop
** derived from the Apache version
** the Hadoop distribution with the largest market share; most companies use it
** the CDH release itself is free
** download: archive.cloudera.com/cdh5/cdh/5/
** paid offerings: support, consulting services, training
** why not simply use the Apache version?
its compatibility, security, and stability need strengthening, which is what CDH provides
** product line: CDH, Cloudera Manager, Cloudera Support
** Hortonworks Hadoop
** founded in 2011
** HDP: Hortonworks Data Platform, a 100% open-source product
** Ambari: an open-source installation and management tool
** MapR
** not built on Apache Hadoop
** re-implemented the HDFS layer with its own file system
** claims considerably better performance than stock Hadoop
** other distributions: Intel, IBM, Microsoft...