HDFS Operations
Command-Line Operations
# List the files under the HDFS root directory
hdfs dfs -ls /
# mkdir: create a test directory under the HDFS root
hdfs dfs -mkdir /test
# put: upload 1.txt from the current directory to the /test directory on HDFS
hdfs dfs -put 1.txt /test
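A few more commonly used commands, assuming the 1.txt uploaded above:
# View the contents of the uploaded file
hdfs dfs -cat /test/1.txt
# Download a file from HDFS into the current local directory
hdfs dfs -get /test/1.txt .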
Java API Operations
The project's pom.xml file:
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.version>2.7.6</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
</dependency>
</dependencies>
Java API code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.net.URI;
/**
 * Operating HDFS via the Java API
 */
public class HDFSApp {
public static final String HDFS_PATH = "hdfs://192.168.26.111:9000";
Configuration configuration = null;
FileSystem fileSystem = null;
@Before
public void setUp() throws Exception {
System.out.println("HDFSApp.setUp()");
configuration = new Configuration();
fileSystem = FileSystem.get(new URI(HDFS_PATH), configuration);
}
@After
public void tearDown() throws Exception {
fileSystem.close();
fileSystem = null;
configuration = null;
System.out.println("HDFSApp.tearDown()");
}
/**
 * Create a directory
 */
@Test
public void mkdir() throws Exception {
fileSystem.mkdirs(new Path("/hdfsapi/test"));
}
/**
 * Create a file
 */
@Test
public void create() throws Exception {
FSDataOutputStream output = fileSystem.create(new Path("/hdfsapi/test/a.txt"));
output.write("hello world".getBytes());
output.flush();
output.close();
}
/**
 * Rename a file
 */
@Test
public void rename() throws Exception {
Path oldPath = new Path("/hdfsapi/test/a.txt");
Path newPath = new Path("/hdfsapi/test/b.txt");
System.out.println(fileSystem.rename(oldPath, newPath));
}
/**
 * Upload a local file to HDFS
 */
@Test
public void copyFromLocalFile() throws Exception {
Path src = new Path("/home/hadoop/data/hello.txt");
Path dst = new Path("/hdfsapi/test/");
fileSystem.copyFromLocalFile(src, dst);
}
/**
 * List all files under a directory
 */
@Test
public void listFiles() throws Exception {
FileStatus[] listStatus = fileSystem.listStatus(new Path("/hdfsapi/test"));
for (FileStatus fileStatus : listStatus) {
String isDir = fileStatus.isDirectory() ? "directory" : "file"; // file or directory
String permission = fileStatus.getPermission().toString(); // permissions
short replication = fileStatus.getReplication(); // replication factor
long len = fileStatus.getLen(); // length in bytes
String path = fileStatus.getPath().toString(); // full path
System.out.println(isDir + "\t" + permission + "\t" + replication + "\t" + len + "\t" + path);
}
}
/**
 * View a file's block locations
 */
@Test
public void getFileBlockLocations() throws Exception {
FileStatus fileStatus = fileSystem.getFileStatus(new Path("/hdfsapi/test/b.txt"));
BlockLocation[] blocks = fileSystem.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
for (BlockLocation block : blocks) {
for (String host : block.getHosts()) {
System.out.println(host);
}
}
}
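/**
 * Read a file's contents and print them to stdout. A minimal sketch; assumes
 * /hdfsapi/test/b.txt exists (e.g. after running create() and rename() above).
 */
@Test
public void cat() throws Exception {
FSDataInputStream in = fileSystem.open(new Path("/hdfsapi/test/b.txt"));
org.apache.hadoop.io.IOUtils.copyBytes(in, System.out, 1024);
in.close();
}
/**
 * Delete a file or directory recursively. A minimal sketch.
 */
@Test
public void delete() throws Exception {
System.out.println(fileSystem.delete(new Path("/hdfsapi/test"), true));
}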
}
If HDFS denies access for the current user, switch the effective user to root via an environment variable.
Set the environment variable HADOOP_USER_NAME=root.
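Alternatively, the user can be passed directly when obtaining the FileSystem instance, which avoids the environment variable. A minimal sketch using the three-argument overload of FileSystem.get in setUp():
// Equivalent to setting HADOOP_USER_NAME=root
fileSystem = FileSystem.get(new URI(HDFS_PATH), configuration, "root");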
Word Count with MapReduce
Create the input directory for the word count:
hdfs dfs -mkdir /wc
# Prepare a text file to upload
vi 1.txt
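For illustration, assume 1.txt contains the following sample text:
hello world
hello hadoop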
Upload the local 1.txt file to HDFS:
hdfs dfs -put 1.txt /wc
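The MapReduce implementation, WordCountApp.java: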
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
/**
 * A MapReduce implementation of WordCount
 */
public class WordCountApp {
public static class MyMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
public static class MyReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
String INPUT_PATH = "hdfs://192.168.26.111:9000/wc";
String OUTPUT_PATH = "hdfs://192.168.26.111:9000/outputwc";
Configuration conf = new Configuration();
final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
if (fileSystem.exists(new Path(OUTPUT_PATH))) {
fileSystem.delete(new Path(OUTPUT_PATH), true);
}
Job job = Job.getInstance(conf, "WordCountApp");
// locate the jar to run via this driver class
job.setJarByClass(WordCountApp.class);
// configure the map stage
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// configure the reduce stage
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
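// Optional: use the reducer as a combiner to pre-aggregate counts on the
// map side (a common WordCount optimization; valid here because summing
// is associative and commutative).
job.setCombinerClass(MyReducer.class);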
// set the input format
job.setInputFormatClass(TextInputFormat.class);
Path inputPath = new Path(INPUT_PATH);
FileInputFormat.addInputPath(job, inputPath);
// set the output format
job.setOutputFormatClass(TextOutputFormat.class);
Path outputPath = new Path(OUTPUT_PATH);
FileOutputFormat.setOutputPath(job, outputPath);
// submit the job and wait for completion
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
If the MapReduce job fails to run, this article may help: https://blog.csdn.net/whs0329/article/details/121878162
Remember to set the HADOOP_USER_NAME=root environment variable; an exit code of 0 means the word count ran successfully.
View the word count results:
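The reducer writes its output to part files under /outputwc; with a single reducer this is typically part-r-00000:
hdfs dfs -cat /outputwc/part-r-00000
With the sample 1.txt assumed above, the output would look like:
hadoop	1
hello	2
world	1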