是什么?
使用 Java 读取 Hadoop（HDFS）中的一个文件。
为什么?
用于初步接触hadoop开发。
怎么做?
一、依赖jar
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>myHadoop</groupId>
<artifactId>myHadoop</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<hadoop.hdfs.version>2.4.0</hadoop.hdfs.version>
<hadoop.mapreduce.client.core.version>2.4.0</hadoop.mapreduce.client.core.version>
<hadoop.common.version>2.4.0</hadoop.common.version>
<hadoop.mapreduce.client.common.version>2.4.0</hadoop.mapreduce.client.common.version>
<hadoop.mapreduce.client.jobclient.version>2.4.0</hadoop.mapreduce.client.jobclient.version>
<commons.lang.version>3.3.2</commons.lang.version>
<commons.io.version>2.4</commons.io.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.hdfs.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.mapreduce.client.core.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.common.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-common</artifactId>
<version>${hadoop.mapreduce.client.common.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>${hadoop.mapreduce.client.jobclient.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons.lang.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons.io.version}</version>
</dependency>
</dependencies>
<build>
<!-- Fix: bare ${version} is deprecated/ambiguous in Maven 3; use ${project.version}. -->
<finalName>gQuery-${project.version}</finalName>
<!-- <resources> <resource> <directory>src/main/resources</directory> <filtering>true</filtering>
<excludes> <exclude>*</exclude> <exclude>*/*</exclude> </excludes> </resource>
</resources> -->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- The jar plugin builds the application jar. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>2.4</version>
<!-- The configuration of the plugin -->
<configuration>
<!-- Configuration of the archiver -->
<archive>
<!-- Do not embed pom.xml / pom.properties in the generated jar. -->
<addMavenDescriptor>false</addMavenDescriptor>
<!-- Manifest specific configuration -->
<manifest>
<!-- Put third-party jars on the manifest Class-Path. -->
<addClasspath>true</addClasspath>
<!-- Class-Path prefix: dependency jars are copied to lib/, see maven-dependency-plugin below. -->
<classpathPrefix>lib/</classpathPrefix>
<!-- Application entry point. -->
<mainClass>com.geotmt.hadoop.App</mainClass>
</manifest>
<!-- Also add the current directory (.) to the manifest Class-Path. -->
<manifestEntries>
<Class-Path>.</Class-Path>
</manifestEntries>
</archive>
<!-- Patterns excluded from the jar, e.g. **/*.xml -->
<excludes>
<exclude>**/*.properties</exclude>
<exclude>**/*.xml</exclude>
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>3.1.0</version>
<executions>
<execution>
<id>copy-resources</id>
<phase>package</phase>
<goals>
<goal>copy-resources</goal>
</goals>
<configuration>
<encoding>UTF-8</encoding>
<!-- Target directory for the copied resource files. -->
<outputDirectory>${project.build.directory}/etc</outputDirectory>
<resources>
<!-- Resource roots to copy from, relative to the POM. -->
<resource>
<directory>src/main/resources/</directory>
<!-- Fix: these must be <include> elements — the original used <exclude>
     inside <includes>, which is invalid and silently breaks the filter. -->
<includes>
<include>**/*.properties</include>
<include>**/*.xml</include>
</includes>
<!-- Substitute ${...} placeholders from properties/filter files while copying. -->
<filtering>true</filtering>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
<!-- Copy dependency jars to the lib/ directory referenced by the manifest Class-Path. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<!-- Bound to the package phase so the jars are in place next to the built jar. -->
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<!-- Dependency jars land here; must match classpathPrefix (lib/) above. -->
<outputDirectory>
${project.build.directory}/etc/lib
</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>false</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
<excludeArtifactIds>junit</excludeArtifactIds>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
java代码
package com.geotmt.hadoop.hdfs;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.InputStream;
import java.net.URI;
/**
* 从Hdfs读取一个文件
*
* Created by c.z on 2018/8/10. */
public class ReadFileHandle {

    /** Default HDFS file read by the no-arg {@link #readFile()}. */
    private static final String DEFAULT_URI =
            "hdfs://10.111.32.142:8020/user/zhaochao/20180727/sms.1532681090912.log.tmp";

    public static void main(String[] args) throws Exception {
        ReadFileHandle readFileHandle = new ReadFileHandle();
        readFileHandle.readFile();
    }

    /**
     * Reads the default HDFS file and copies its bytes to stdout.
     *
     * @throws Exception on any HDFS or I/O failure
     */
    public void readFile() throws Exception {
        readFile(DEFAULT_URI);
    }

    /**
     * Reads the file at the given HDFS URI and copies its bytes to stdout.
     *
     * <p>Fix: the original leaked the {@code FileSystem} handle — only the
     * {@code InputStream} was closed. Both resources are now managed by
     * try-with-resources (closed in reverse order: stream first, then fs).
     * Note: {@code FileSystem.get} may return a cached, shared instance;
     * closing it is fine for a standalone tool like this one.
     *
     * @param uri fully-qualified HDFS URI of the file to read
     * @throws Exception on any HDFS or I/O failure
     */
    public void readFile(String uri) throws Exception {
        Configuration conf = new Configuration();
        try (FileSystem fs = FileSystem.get(URI.create(uri), conf);
             InputStream in = fs.open(new Path(uri))) {
            IOUtils.copy(in, System.out);
        }
    }
}