A simple WordCount example
Write the following into wordcount.txt:
Spark Hadoop JAVA Python Spark Hadoop HIVE
WordCountMapper
package Mapreducer.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
 * <p>
 * LongWritable: corresponds to Java's long; its value is the byte offset at which the line was read
 * Text: one line of the input file, corresponding to Java's String
 * <p>
 * Text: the output key is of type Text
 * IntWritable: the output value is of type IntWritable, corresponding to Java's int
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // No need to call the parent implementation
        // super.map(key, value, context);

        // Split the line on spaces and emit (word, 1) for every word
        String[] words = value.toString().split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
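A common, optional tweak (not part of the original example) is to reuse the output writables inside WordCountMapper instead of allocating a new Text and IntWritable for every word; MapReduce serializes the key and value on each write, so reuse is safe. A minimal sketch of the same map method with that change:

    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Reuse the same writable objects for every output record
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }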
WordCountReducer
package Mapreducer.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * The concrete implementation on the Reduce side
     *
     * @param key     the key passed from the Map side
     * @param values  the collection of values sharing the same key from the Map side
     * @param context the context object used to write the output
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // No need to call the parent implementation
        // super.reduce(key, values, context);

        // Sum the counts for this word and emit (word, total)
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
WordCountDriver
package Mapreducer.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

// The driver is used to submit our job
public class WordCountDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // conf.set();
        // getInstance(Configuration conf) returns a Job instance
        Job job = Job.getInstance(conf);
        // Tell Hadoop which jar contains the job classes (needed when running on a cluster)
        job.setJarByClass(WordCountDriver.class);
        // Set the concrete Mapper and Reducer implementations
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Set the output types of the Map side
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the output types of the Reduce side
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        /**
         * setInputPaths(Job job, String commaSeparatedPaths)
         * setOutputPath(Job job, Path outputDir)
         */
        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("E:\\JAVA XMS\\Hadoop\\input"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\JAVA XMS\\Hadoop\\output"));
        // Submit the job and wait for it to finish
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
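Assuming the input directory contains only the wordcount.txt shown at the top, the output directory should end up with a part-r-00000 file whose contents look roughly like this (keys are sorted by Text's byte order, so all-uppercase words come first; key and value are separated by a tab by the default TextOutputFormat):

HIVE	1
Hadoop	2
JAVA	1
Python	1
Spark	2

Note that the output directory must not exist before the job runs, otherwise FileOutputFormat refuses to start the job.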
HDFS
Connecting to HDFS with Java
HDFS_0_FileSystem
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class HDFS_0_FileSystem {
    public static void main(String[] args) throws URISyntaxException, IOException, InterruptedException {
        // Create a configuration object
        Configuration conf = new Configuration();
        /**
         * Get a filesystem instance based on the uri, the passed configuration and the user
         * Params:
         *   uri  – of the filesystem
         *   conf – the configuration to use
         *   user – to perform the get as
         * Returns:
         *   the filesystem instance
         */
        // The address can be found in core-site.xml under hadoop-2.6.7\etc\hadoop on the master node
        URI uri = new URI("hdfs://master:9000");
        FileSystem fs = FileSystem.get(uri, conf, "root");
        System.out.println(fs.getClass().getName());
        fs.close();
    }
}
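If the connection works, this should print org.apache.hadoop.hdfs.DistributedFileSystem, the concrete FileSystem implementation that backs the hdfs:// scheme.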
HDFS_1_UPFile
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFS_1_UPFile {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        FileSystem fileSystem = FileSystem.get(conf);
        /**
         * copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst)
         * Params:
         *   delSrc    – whether to delete the src
         *   overwrite – whether to overwrite an existing file
         *   src       – source path (local)
         *   dst       – destination path (HDFS)
         */
        fileSystem.copyFromLocalFile(false, true, new Path("E:\\JAVA XMS\\Hadoop\\data\\sanguo.txt"), new Path("/sanguo/"));
        fileSystem.close();
        System.out.println("upFiles has finished");
    }
}
HDFS_2_UPFileReplication
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

// The default replication factor can be found in hdfs-site.xml under hadoop-2.6.7\etc\hadoop on the master node
public class HDFS_2_UPFileReplication {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Replication factor for files written by this client
        conf.set("dfs.replication", "3");
        FileSystem fs = FileSystem.get(conf);
        fs.copyFromLocalFile(false, true, new Path("E:\\JAVA XMS\\Hadoop\\data\\sanguo.txt"), new Path("/sanguo/"));
        fs.close();
        System.out.println("upFiles has finished");
    }
}
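Note that dfs.replication set on the client configuration only affects files written by this client. For a file that already exists in HDFS, the replication factor can be changed with FileSystem.setReplication; a minimal sketch, reusing the file uploaded above:

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        FileSystem fs = FileSystem.get(conf);
        // Change the replication factor of a file that is already in HDFS to 3
        fs.setReplication(new Path("/sanguo/sanguo.txt"), (short) 3);
        fs.close();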
HDFS_3_DownFile
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFS_3_DownFile {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        conf.set("dfs.replication", "3");
        FileSystem fs = FileSystem.get(conf);
        fs.copyToLocalFile(false, new Path("/sanguo/sanguo.txt"), new Path("E:\\JAVA XMS\\Hadoop\\data"));
        fs.close();
        System.out.println("Finished");
    }
}
HDFS_4_RenameFile
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFS_4_RenameFile {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        conf.set("dfs.replication", "3");
        FileSystem fs = FileSystem.get(conf);
        fs.rename(new Path("/sanguo/sanguo.txt"), new Path("/sanguo/sanguo3.txt"));
        fs.close();
        System.out.println("Finished");
    }
}
HDFS_5_DeletFiles
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFS_5_DeletFiles {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        conf.set("dfs.replication", "3");
        FileSystem fs = FileSystem.get(conf);
        /**
         * boolean delete(Path f, boolean recursive)
         * Params:
         *   f         – the path to delete.
         *   recursive – if path is a directory and set to true, the directory is deleted,
         *               else an exception is thrown. For a file, recursive can be either true or false.
         * Returns:
         *   true if the delete is successful, else false.
         */
        fs.delete(new Path("/test1/test2"), false);
        fs.close();
        System.out.println("Finished");
    }
}
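The call above removes /test1/test2 only if it is a file or an empty directory; as the javadoc notes, deleting a directory together with its contents requires recursive = true. A minimal sketch, using the same /test1 path purely as an example:

        // Recursively delete /test1 and everything underneath it
        boolean deleted = fs.delete(new Path("/test1"), true);
        System.out.println("deleted: " + deleted);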
HDFS_6_FileStatus
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;

import java.io.IOException;
import java.util.Arrays;

public class HDFS_6_FileStatus {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        conf.set("dfs.replication", "3");
        FileSystem fs = FileSystem.get(conf);
        /**
         * Params:
         *   f         – is the path
         *   recursive – if the subdirectories need to be traversed recursively (set to true to recurse)
         */
        RemoteIterator<LocatedFileStatus> lsr = fs.listFiles(new Path("/sanguo"), true);
        while (lsr.hasNext()) {
            LocatedFileStatus next = lsr.next();
            // Get the location information of every block
            BlockLocation[] blocks = next.getBlockLocations();
            // Block size
            System.out.println(next.getBlockSize());
            // Path
            System.out.println(next.getPath());
            // Owner
            System.out.println(next.getOwner());
            for (BlockLocation block : blocks) {
                // getHosts() and getNames() return arrays, so print them with Arrays.toString
                System.out.println("hosts:" + Arrays.toString(block.getHosts()));
                System.out.println("name:" + Arrays.toString(block.getNames()));
                System.out.println("length:" + block.getLength());
            }
        }
        // Close the filesystem after the iteration, not inside the loop
        fs.close();
        System.out.println("Finished");
    }
}
HDFS_7_isFileorDirector
package HDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFS_7_isFileorDirector {
    public static void main(String[] args) throws IOException {
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://master:9000");
        configuration.set("dfs.replication", "3");
        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] fileStatuses = fs.listStatus(new Path("/sanguo"));
        for (FileStatus fileStatus : fileStatuses) {
            System.out.println(fileStatus.isFile());
            System.out.println(fileStatus.isDirectory());
        }
        fs.close();
        System.out.println("application has finished");
    }
}