Environment configuration
Hadoop version: 2.7.2
Ubuntu version: lubuntu-16.04.1-desktop-amd64
Java version: 1.8.0_102
Windows version: Microsoft Windows [Version 10.0.10586]
Eclipse version: Neon Release (4.6.0)
Ubuntu username: zzz
LAN IP address: 192.168.56.101
The Ubuntu guest runs in a VirtualBox 5.0.26 virtual machine on the Windows host
1. Passwordless SSH to localhost
ssh-keygen -t rsa -P ''
The identification (private key) is saved in /home/zzz/.ssh/id_rsa.
The public key is saved in /home/zzz/.ssh/id_rsa.pub.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
DSA, like MD5, can only be used for signing and verification, whereas RSA can also be used for decryption; the key generated above is RSA. Afterwards, ssh localhost should log in without prompting for a password.
2. Set the JAVA_HOME variable
Edit etc/hadoop/hadoop-env.sh and add the JDK installation path:
export JAVA_HOME=/opt/jdk1.8.0_102/
3. Edit core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://192.168.56.101:9000</value>
        <description>Set fs.defaultFS to this machine's LAN IP address; it must not be localhost, 127.0.0.1 or 0.0.0.0, otherwise remote clients cannot connect</description>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/zzz/hdata/</value>
        <description>Directory where Hadoop keeps its data</description>
    </property>
</configuration>
4. Format and start HDFS
hdfs namenode -format
start-dfs.sh
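To confirm from the Windows side that the NameNode is now reachable, a few lines against the FileSystem API print the cluster's capacity. This is a minimal sketch under the same IP and user as the rest of this post; the class name TestHdfsUp is mine, not part of the original project:
package cn.zzz.test.hadoop;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsStatus;
public class TestHdfsUp
{
    public static void main(String[] args) throws Exception
    {
        // The URI must match fs.defaultFS; the third argument is the remote user name
        try (FileSystem fs = FileSystem.get(new URI("hdfs://192.168.56.101:9000"), new Configuration(), "zzz"))
        {
            FsStatus status = fs.getStatus();
            System.out.println(fs.getUri() + " capacity=" + status.getCapacity() + " used=" + status.getUsed());
        }
    }
}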
5. Create the HDFS home directory
hadoop fs -mkdir /user/
hadoop fs -mkdir /user/zzz
6. Copy mapred-site.xml.template to mapred-site.xml and edit it
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
        <description>The runtime framework for MapReduce: one of local, classic or yarn</description>
    </property>
</configuration>
7. Edit yarn-site.xml
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>192.168.56.101</value>
        <description>Set to this machine's LAN IP address; it must not be localhost, 127.0.0.1 or 0.0.0.0</description>
    </property>
    <property>
        <name>yarn.nodemanager.hostname</name>
        <value>192.168.56.101</value>
        <description>Set to this machine's LAN IP address; it must not be localhost, 127.0.0.1 or 0.0.0.0</description>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
        <description>NodeManager auxiliary service for the MapReduce shuffle</description>
    </property>
</configuration>
8. Start YARN
start-yarn.sh
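All daemons should now be running; jps on the VM should list NameNode, DataNode, SecondaryNameNode, ResourceManager and NodeManager. To check from the Windows side that the ResourceManager is reachable before submitting a job, a minimal sketch with the YarnClient API lists the registered NodeManagers (the class name TestYarn is mine, not part of the original project):
package cn.zzz.test.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.client.api.YarnClient;
public class TestYarn
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        // Same address as yarn.resourcemanager.hostname in yarn-site.xml
        conf.set("yarn.resourcemanager.hostname", "192.168.56.101");
        YarnClient yarn = YarnClient.createYarnClient();
        yarn.init(conf);
        yarn.start();
        // Expect a single RUNNING node in this pseudo-distributed setup
        for (NodeReport node : yarn.getNodeReports(NodeState.RUNNING))
        {
            System.out.println(node.getNodeId() + " " + node.getNodeState());
        }
        yarn.stop();
    }
}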
9. Connect to and manage the HDFS file system
package cn.zzz.test.hadoop;
import static cn.zzz.test.hadoop.Utils.listFile;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
/**
 * @author zzz
 */
public class TestFs
{
    public static final String IP = "192.168.56.101";
    public static final String USER_NAME = "zzz";

    /**
     * @param args
     */
    public static void main(String[] args)
    {
        // Connect to the remote HDFS and manage files
        try (FileSystem fs = FileSystem.get(new URI("hdfs://" + IP + ":9000"), new Configuration(), USER_NAME))
        {
            Path home = fs.getHomeDirectory();
            Path input = new Path(home, "input");
            Path output = new Path(home, "output");
            if (!fs.exists(input))
            {
                System.out.println("Creating the ~/input directory");
                fs.mkdirs(input);
            }
            if (!fs.exists(output))
            {
                System.out.println("Creating the ~/output directory");
                fs.mkdirs(output);
            }
            Path file = new Path(input, "test.txt");
            if (!fs.exists(file))
            {
                File _file = new File("src");
                if (_file.exists())
                {
                    // List all files under the local src directory
                    List<File> _dirs = listFile(_file);
                    System.out.println("Copying " + _file.getCanonicalPath() + ", file count: " + _dirs.size());
                    if (!_dirs.isEmpty())
                    {
                        Path[] ps = new Path[_dirs.size()];
                        for (int i = 0; i < ps.length; i++)
                        {
                            ps[i] = new Path(_dirs.get(i).getCanonicalPath());
                        }
                        fs.copyFromLocalFile(false, true, ps, input);
                    }
                    // Write a file directly on the remote HDFS
                    try (FSDataOutputStream os = fs.create(file, true))
                    {
                        os.write("It works!\nWritten successfully.".getBytes());
                    }
                } else
                {
                    System.out.println("Local directory does not exist: " + _file);
                }
            } else
            {
                System.out.println("Reading contents");
                try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file))))
                {
                    br.lines().forEach(System.out::println);
                }
            }
            System.out.println("~/input");
            RemoteIterator<LocatedFileStatus> dirs = fs.listLocatedStatus(input);
            while (dirs.hasNext())
            {
                LocatedFileStatus dir = dirs.next();
                System.out.println((dir.isDirectory() ? "DIR" : "FILE") + " " + dir.getPath() + " " + dir.getLen());
            }
        } catch (IOException | InterruptedException | URISyntaxException e)
        {
            e.printStackTrace();
        }
    }
}
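TestFs above (and TestMapReduce in step 11) statically imports helpers from a cn.zzz.test.hadoop.Utils class that is not shown in this post. A minimal sketch of what it might contain, assuming listFile recursively collects the regular files under a local directory and catAll concatenates every file directly under an HDFS directory (such as the part files of a job's output):
package cn.zzz.test.hadoop;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class Utils
{
    // Recursively collect the regular files under a local directory
    public static List<File> listFile(File dir)
    {
        List<File> files = new ArrayList<>();
        File[] children = dir.listFiles();
        if (children != null)
        {
            for (File f : children)
            {
                if (f.isDirectory())
                {
                    files.addAll(listFile(f));
                } else
                {
                    files.add(f);
                }
            }
        }
        return files;
    }

    // Concatenate the contents of every file directly under an HDFS directory
    public static String catAll(Path dir) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        // The paths used in this post are fully qualified (hdfs://...),
        // so the Path itself knows which file system it belongs to
        FileSystem fs = dir.getFileSystem(new Configuration());
        for (FileStatus status : fs.listStatus(dir))
        {
            if (!status.isFile())
            {
                continue;
            }
            try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status.getPath()))))
            {
                br.lines().forEach(line -> sb.append(line).append('\n'));
            }
        }
        return sb.toString();
    }
}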
10. Build the jar automatically with Eclipse Ant
Under Project > Properties > Builders, add a new Ant builder and point it at a build file like the following:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<project default="create_run_jar" name="Create Runnable Jar for Project TestHadoop">
    <!--this file was created by Eclipse Runnable JAR Export Wizard-->
    <!--ANT 1.7 is required -->
    <!--define folder properties-->
    <property name="dir.buildfile" value="."/>
    <property name="dir.jarfile" value="${dir.buildfile}/target"/>
    <target name="create_run_jar">
        <jar destfile="${dir.jarfile}/dist.jar" filesetmanifest="mergewithoutmain">
            <manifest>
                <attribute name="Main-Class" value="cn.zzz.test.hadoop.TestMapReduce"/>
                <attribute name="Class-Path" value="."/>
            </manifest>
            <fileset dir="${dir.buildfile}/bin"/>
        </jar>
    </target>
</project>
11. Submit the Job remotely
First make sure the jar has been built with Ant (or exported from Eclipse) so that target/dist.jar exists; that is the path referenced via JobContext.JAR below.
package cn.zzz.test.hadoop;
import static cn.zzz.test.hadoop.Utils.catAll;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * @author zhhaogen
 */
public class TestMapReduce
{
    public static final String IP = "192.168.56.101";
    public static final String USER_NAME = "zzz";

    public static class MyMap extends Mapper<LongWritable, Text, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens())
            {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable>
    {
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException
        {
            int sum = 0;
            for (IntWritable val : values)
            {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    /**
     * @param args
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException
    {
        Configuration conf = new Configuration();
        System.setProperty("HADOOP_USER_NAME", USER_NAME);
        conf.set("fs.defaultFS", "hdfs://" + IP + ":9000");
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", IP);
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set(JobContext.JAR, "target\\dist.jar");
        Path homeDirectory = new Path("hdfs://" + IP + ":9000/user/" + USER_NAME);
        Path input = new Path(homeDirectory, "input");
        Path output = new Path(homeDirectory, "output/result-" + System.currentTimeMillis());
        System.out.println("Input path: " + input);
        System.out.println("Output path: " + output);
        Job job = Job.getInstance(conf, "job_test" + new SimpleDateFormat("HHmmss").format(new Date()));
        System.out.println("Job name: " + job.getJobName());
        // When running on the cluster, Job#setJarByClass can locate the jar;
        // for remote submission the JobContext.JAR parameter must be set instead
        job.setMapperClass(MyMap.class);
        job.setCombinerClass(MyReduce.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        System.out.println("Jar path: " + job.getConfiguration().get(JobContext.JAR));
        FileInputFormat.addInputPath(job, input);
        FileOutputFormat.setOutputPath(job, output);
        boolean ret = job.waitForCompletion(true);
        // The jar is actually uploaded to a tmp directory on the server
        System.out.println("Job finished: " + ret + ", " + job.getStatus().getState() + ", job.jar location: " + job.getJar());
        if (ret)
        {
            // Print the results
            System.out.println(catAll(output));
        } else
        {
            System.exit(0);
        }
    }
}
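As a quick sanity check of the word-count logic: for an input line "hello hadoop hello", MyMap emits (hello,1), (hadoop,1) and (hello,1); because MyReduce also serves as the combiner, equal keys are summed, and the output read back by catAll contains the lines "hadoop 1" and "hello 2" (key and count separated by a tab).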
12. Miscellaneous
Web UIs: the YARN ResourceManager is at http://192.168.56.101:8088/ and the HDFS NameNode at http://192.168.56.101:50070
To have log4j print debug information in the client, create a log4j.properties file under src with the following content:
log4j.rootLogger=INFO,console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n