1. Create a local file "words.txt" and upload it to HDFS under "/neusoftin"
package hdfs;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * HDFS file upload
 * @author Administrator
 *
 */
public class CreateFile {
public static void main(String[] args) throws Exception {
// Configuration object holding the cluster connection settings
Configuration conf = new Configuration();
// HDFS (NameNode) access address on the Linux server
conf.set("fs.defaultFS", "hdfs://master:8020");
// Obtain a FileSystem handle for HDFS operations
FileSystem hdfs = FileSystem.get(conf);
// Content to write into the uploaded file
byte[] buf = ("BEIJING, April 14 (Xinhua) According to Sputnik's report on April 14, the Russian " +
"side said that two Ukrainian combat helicopters entered Russian airspace and attacked " +
"residential buildings. At present, the Ukrainian side has not yet responded to this " +
"news. This is the second time since the outbreak of the Russian-Ukrainian conflict " +
"that Russia has claimed that Ukrainian helicopters have entered Russian airspace." +
"According to reports, the Russian investigation committee said that Ukrainian helicopters" +
" carried out at least six attacks on the residential building in Klimovo. The Committee " +
"has launched a criminal investigation into the attack." +
"Not long ago, Alexander Kuznetsov, assistant minister of health of Russia, said that " +
"the Ukrainian army attacked Bryansk, causing seven injuries, and all the injured were " +
"hospitalized." +
"For the second time, Russia claimed that Ukrainian warplanes entered Russian airspace." +
"On April 1, the Russian Defense Ministry announced that an oil facility in Russia was attacked " +
"and caught fire by Ukrainian military helicopters on the same day. This is the first time that " +
"Russia has reported Ukraine's air strikes on its territory since Russia launched a special" +
" military operation against Ukraine on February 24th." +
"Russian Defense Ministry spokesman Konashenkov said that two Ukrainian Mi -24 helicopters" +
" entered Russian airspace, targeting a civilian petroleum product on the outskirts" +
" of Belgorod, the capital of Belgorod.").getBytes();
// Target path in HDFS
Path dst = new Path("/neusoftin/words.txt");
// Create the file at the target path and open an output stream
FSDataOutputStream out = hdfs.create(dst);
out.write(buf, 0, buf.length); // Write the content to the file
out.close();
// Verify that the file was created successfully
System.out.println(hdfs.exists(dst));
}
}
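Note that the code above writes the text directly into HDFS rather than uploading a file that already exists on the local disk. If words.txt has actually been created locally first, as the step title suggests, a minimal alternative sketch uses FileSystem.copyFromLocalFile (assuming the same master:8020 address and that words.txt sits in the program's working directory; the class name UploadFile is only for illustration):
package hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class UploadFile {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://master:8020");
FileSystem hdfs = FileSystem.get(conf);
// Copy the local words.txt into /neusoftin on HDFS
hdfs.copyFromLocalFile(new Path("words.txt"), new Path("/neusoftin/words.txt"));
// Verify that the upload succeeded
System.out.println(hdfs.exists(new Path("/neusoftin/words.txt")));
}
}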
2. Implement MapReduce with the Tool utility class: WordCountTool
package mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.BufferedReader;
import java.io.InputStreamReader;
/**
 * WordCount driver: configures the job and launches it through ToolRunner and run()
 */
public class WordCountTool extends Configured implements Tool {
public static void main(String[] args) throws Exception {
// Cluster connection configuration
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://master:8020");
FileSystem hdfs = FileSystem.get(conf);
// Input path and files to read
String input ="/neusoftin/*.txt";
String output= "/neusoftout"; // Final MapReduce output; this path must not exist before the job runs
Path outputpath = new Path(output);
// Delete the output folder before running, if it already exists
if(hdfs.exists(outputpath)){
hdfs.delete(outputpath, true); // recursive delete
}
// Launch the job through the Tool utility (args are hard-coded so the class can run directly from the IDE)
args = new String[]{input, output};
int re =ToolRunner.run(conf,new WordCountTool(),args);
System.exit(re);
}
@Override
public int run(String[] strings) throws Exception {
// Build the job from the configuration injected by ToolRunner
Job job =Job.getInstance(getConf());
job.setJarByClass(WordCountTool.class); // Driver class used to locate the job jar
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job,strings[0]); // Input path(s)
//Mapper
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setCombinerClass(WordCountReducer.class); // Combiner: pre-aggregates counts on the map side
// reducer
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Result file output
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path(strings[1])); // Output path
// Run the job and wait for it to finish
boolean result = job.waitForCompletion(true);
FileSystem hdfs = FileSystem.get(getConf());
if(result){
// List the result files under the output path and print their contents
for(FileStatus fs: hdfs.listStatus(new Path(strings[1]))){
FSDataInputStream dis = hdfs.open(fs.getPath());
// Read the stream line by line and print it to the console
BufferedReader reader = new BufferedReader(new InputStreamReader(dis)); // bytes to characters
String line = reader.readLine();
while(line!=null){
System.out.println(line);
line = reader.readLine();
}
reader.close();
}
}
return result ? 0 : 1; // Non-zero exit code if the job failed
}
}
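As an alternative to the BufferedReader loop in run(), IOUtils.copyBytes from org.apache.hadoop.io.IOUtils can dump each result file to the console in a single call. A minimal sketch of that variant of the loop body, under the same assumptions as the class above:
FSDataInputStream dis = hdfs.open(fs.getPath());
// Copy the whole stream to standard output; the last argument closes the stream when done
org.apache.hadoop.io.IOUtils.copyBytes(dis, System.out, 4096, true);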
3. The WordCountMapper class
package mapreduce;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
/**
 * Mapper for word counting.
 *
 * Input key/value: LongWritable, Text
 *   LongWritable : byte offset of the current line within the input file
 *                  (the input is read line by line by default)
 *   Text         : the content of one line; map() is called once per line
 * Output key/value: Text, IntWritable
 *   key   : a single word
 *   value : the count 1, to be summed by the combiner/reducer
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text, IntWritable> {
// Reusable output objects, so new ones are not created for every record
private Text outMapKey= new Text();
private static final IntWritable outMapValue = new IntWritable(1);
/**
 * @param key     byte offset of the line in the input file
 * @param value   the line of text to process
 * @param context used to emit the output key/value pairs
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
// Get the line of text to be counted
String line = value.toString();
// Skip blank lines before splitting
if(StringUtils.isBlank(line)){
return ;
}
// Use StringTokenizer to split the line into words
StringTokenizer st = new StringTokenizer(line);
while(st.hasMoreTokens()){ // Loop while there are more words to fetch
String word =st.nextToken(); // Get the next word
outMapKey.set(word);
context.write(outMapKey, outMapValue); // Emit the word and a count of 1 to the combiner/reducer
}
}
}
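As an illustration (not taken from the actual input text), if map() receives the line "Russia has claimed that Russia", it emits the pairs (Russia,1), (has,1), (claimed,1), (that,1) and (Russia,1); merging the two (Russia,1) pairs into (Russia,2) is left to the combiner and reducer.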
4. The WordCountReducer class
package mapreduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer: receives each word together with all the counts emitted by the
 * mappers/combiners and writes the word with its total count.
 * Input/output key/value: Text, IntWritable, Text, IntWritable
 */
public class WordCountReducer extends Reducer<Text, IntWritable,Text ,IntWritable> {
/**
 * @param key     the word
 * @param values  all counts emitted for this word
 * @param context used to write the final (word, total count) pair
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int sum=0; // Accumulator for the total count
for(IntWritable value :values){ // Iterate over the counts for this word
sum+= value.get(); // Add each partial count
}
context.write(key,new IntWritable(sum)); // Emit the word with its total count
}
}
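Continuing the illustrative example from the mapper, the reducer would receive (Russia, [1, 1]) and write (Russia, 2). Because the job uses TextOutputFormat with Text keys and IntWritable values, each such pair appears in the part-r-00000 file under /neusoftout as a tab-separated line of the form word<TAB>count, which is exactly what the loop at the end of WordCountTool.run() prints to the console.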