Classic Hadoop WordCount, plus HDFS create, delete, and append

This is the classic example for learning Hadoop. I collected a lot of code from around the web and made some modifications of my own; the example covers Hadoop's WordCount together with the full workflow of operating on HDFS, and it has been run successfully end to end.

1. RunJob.java

package com.dinfo.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import com.dinfo.oec.had.common.ConfigurationCommon;

public class RunJob {

    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration config = ConfigurationCommon.conf;

        /*config.set("fs.default.name", "hdfs://192.168.2.213:9000");
        config.set("hadoop.job.user", "hadoop");
        config.set("mapreduce.framework.name", "yarn");
        config.set("mapreduce.jobtracker.address", "192.168.2.213:9001");
        config.set("yarn.resourcemanager.hostname", "192.168.2.213");
        config.set("yarn.resourcemanager.admin.address", "192.168.2.213:8033");
        config.set("yarn.resourcemanager.address", "192.168.2.213:8032");
        config.set("yarn.resourcemanager.resource-tracker.address", "192.168.2.213:8036");
        config.set("yarn.resourcemanager.scheduler.address", "192.168.2.213:8030");*/

        config.set("fs.default.name", "hdfs://dinfo213:9000/");
        config.set("hadoop.job.user", "hadoop");
        config.set("mapred.job.tracker", "dinfo213:9001");
        try {
            String inputpath = "/input/wordcount.txt";
            Path outpath = new Path("/output/yjh/word");

            JobConf conf = new JobConf(config, RunJob.class);
            conf.setJobName("yangjianghong");
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);
            conf.setMapperClass(WordCountMapper.class);
            conf.setReducerClass(WordCountReducer.class);
            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);
            //conf.setJar("F:\\Oec-HadRtas-4.0.0-jar-with-dependencies.jar");
            conf.setJarByClass(RunJob.class);
            FileInputFormat.setInputPaths(conf, new Path(inputpath));
            FileOutputFormat.setOutputPath(conf, outpath);
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
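Once JobClient.runJob returns, the counts sit in the output directory as text part files. As a quick follow-up, here is a minimal sketch (not part of the original code) of dumping the result to the console, reusing the config object from main and assuming the default single reducer writes the standard part-00000 file:

// minimal sketch, assuming the job finished and wrote /output/yjh/word/part-00000
// (needs org.apache.hadoop.fs.FileSystem, FSDataInputStream, Path and org.apache.hadoop.io.IOUtils)
FileSystem fs = FileSystem.get(config);
FSDataInputStream in = fs.open(new Path("/output/yjh/word/part-00000"));
IOUtils.copyBytes(in, System.out, 4096, false);  // copy the word counts to stdout
in.close();
fs.close();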

2. WordCountMapper.java

package com.dinfo.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;

public class WordCountMapper implements Mapper<LongWritable, Text, Text, IntWritable> {

    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    /**
     * By default, the input split is fed to the map task one line at a time.
     * Each line is one record (a key/value pair): key = the byte offset of the
     * line within the file, value = the content of the line.
     */
    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        String line = value.toString();
        String[] ws = StringUtils.split(line, ' ');
        for (String word : ws) {
            k.set(word);
            output.collect(k, v);  // emit (word, 1) for every token on the line
        }
        try {
            // deliberately slow each map() call down, presumably so the job can be watched while it runs
            Thread.sleep(10000);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
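To see what the mapper's emit plus the reducer's sum actually compute, here is a tiny, Hadoop-free Java sketch of the same counting logic (not part of the original job; the sample text is made up):

import java.util.HashMap;
import java.util.Map;

public class WordCountSanityCheck {
    public static void main(String[] args) {
        String line = "hello world hello";                       // stands in for one input line
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String word : line.split(" ")) {                    // same tokenisation idea as the mapper
            Integer old = counts.get(word);
            counts.put(word, old == null ? 1 : old + 1);         // same summation idea as the reducer
        }
        System.out.println(counts);                              // prints {world=1, hello=2}
    }
}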

3. WordCountReducer.java

package com.dinfo.hadoop;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCountReducer implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
            Reporter reporter) throws IOException {
        // sum up all the 1s emitted by the mappers for this word
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}

4. ConfigurationCommon.java

package com.dinfo.oec.had.common;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.log4j.Logger;

import com.dinfo.oec.hadoop.util.PropertyUtil;

/**
 * <p>Date: 2016-03-24 13:49:29</p>
 * <p>Module:</p>
 * <p>Description: builds the connection configuration for the cluster</p>
 * <p>Remark: </p>
 * @author xilina
 * @version 4.0.1
 * <p>------------------------------------------------------------</p>
 * <p> Change history</p>
 * <p> No.    date    modified by    reason</p>
 * <p> 1 </p>
 */
public class ConfigurationCommon {
    public static Configuration conf;
    public static Logger logger = Logger.getLogger(ConfigurationCommon.class);
    public static String claster_name = PropertyUtil.getValue("claster_name");
    public static String namenode_address1 = PropertyUtil.getValue("namenode_address1");
    public static String namenode_address2 = PropertyUtil.getValue("namenode_address2");
    public static String zookeeper_hosts = PropertyUtil.getValue("zookeeper_hosts");
    public static String hadoop_home_dir = PropertyUtil.getValue("hadoop_home_dir");
    public static String hadoop_user_name = PropertyUtil.getValue("hadoop_user_name");

    static {
        conf = getConfiguration(claster_name, namenode_address1, namenode_address2, zookeeper_hosts);
        System.setProperty("hadoop.home.dir", hadoop_home_dir);
        System.setProperty("HADOOP_USER_NAME", hadoop_user_name);
    }

    /**
     * Re-initializes ConfigurationCommon from a properties stream.
     * @param in properties input stream
     * @throws UnsupportedEncodingException
     * @throws IOException
     */
    public static void InitConfigurationCommon(InputStream in) throws UnsupportedEncodingException, IOException {
        Properties property = new Properties();
        property.load(new InputStreamReader(in, "utf-8"));
        claster_name = property.getProperty("claster_name");
        namenode_address1 = property.getProperty("namenode_address1");
        namenode_address2 = property.getProperty("namenode_address2");
        zookeeper_hosts = property.getProperty("zookeeper_hosts");
        hadoop_home_dir = property.getProperty("hadoop_home_dir");
        hadoop_user_name = property.getProperty("hadoop_user_name");
        conf = getConfiguration(claster_name, namenode_address1, namenode_address2, zookeeper_hosts);
        System.setProperty("hadoop.home.dir", hadoop_home_dir);
        System.setProperty("HADOOP_USER_NAME", hadoop_user_name);
        logger.info("Hadoop ConfigurationCommon initialized");
        logger.info("claster_name:" + claster_name + " namenode_address1:" + namenode_address1 + " namenode_address2:" + namenode_address2);
        logger.info("zookeeper_hosts:" + zookeeper_hosts + " hadoop_home_dir:" + hadoop_home_dir + " hadoop_user_name:" + hadoop_user_name);
    }

    /**
     * <p>Description: basic Hadoop environment settings (HA HDFS plus HBase)</p>
     * <p>Remark:</p>
     * @param clusterName cluster (nameservice) name
     * @param nn1Address first namenode host:port
     * @param nn2Address second namenode host:port
     * @param zkhosts ZooKeeper quorum used by HBase
     * @return the connection Configuration object
     */
    public static Configuration getConfiguration(String clusterName, String nn1Address, String nn2Address, String zkhosts) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://" + clusterName);
        conf.set("dfs.nameservices", clusterName);
        // the nameservice id used in the keys below must match dfs.nameservices ("hacluster" in the shipped properties)
        conf.set("dfs.ha.namenodes." + clusterName, "nn1,nn2");
        conf.set("dfs.namenode.rpc-address." + clusterName + ".nn1", nn1Address);
        conf.set("dfs.namenode.rpc-address." + clusterName + ".nn2", nn2Address);
        conf.set("dfs.client.failover.proxy.provider." + clusterName,
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        conf.set("hbase.zookeeper.quorum", zkhosts);
        // allow append() and tolerate replace-datanode failures on small clusters
        conf.setBoolean("dfs.support.append", true);
        conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true");
        conf = HBaseConfiguration.create(conf);
        return conf;
    }
}
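If the defaults baked into hdaemonServer.properties are not what you need, the same settings can be overridden at runtime by feeding InitConfigurationCommon a different properties stream. A minimal sketch (the file name here is hypothetical, and the call throws IOException):

// hypothetical override file containing the same keys (claster_name, namenode_address1, ...)
InputStream in = new FileInputStream("D:/conf/my-cluster.properties");
ConfigurationCommon.InitConfigurationCommon(in);
in.close();
Configuration conf = ConfigurationCommon.conf;  // now reflects the overridden settings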

5. HdfsUtil.java

package com.dinfo.oec.hadoop.util;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.log4j.Logger;

import com.dinfo.oec.had.common.ConfigurationCommon;

// HDFS data-file utility: create, delete, append and read
public class HdfsUtil {
    public static Configuration conf;
    public static Logger logger = Logger.getLogger(HdfsUtil.class);
    static {
        conf = ConfigurationCommon.conf;
    }

    public static void main(String[] args) throws IOException {
        /*Path dirpath = new Path("/aa");
        FileSystem fs = getFS();*/
        List<String> list = new ArrayList<String>();
        list.add("2222 中国邮政poc演示");
        // boolean b = HdfsUtil.createDataFile(list, "/input/disanalytest/sanalydata.txt", "hadoop");
        // System.err.println(b);
        String str = HdfsUtil.readFile("/input/disanalytest/sanalydata.txt");
        System.out.println(str);
    }

    /**
     * Uses FileSystem.exists to check for the directory first.
     * <p>Description: creates the directory if it does not exist yet</p>
     * <p>Remark:</p>
     * @param path directory path
     * @param ownerUser system user that should own the directory
     */
    public static void makeDirectory(String path, String ownerUser) {
        Path dirpath = new Path(path);
        FileSystem fs = getFS();
        try {
            if (!fs.exists(dirpath)) {
                fs.mkdirs(dirpath);
                fs.setOwner(dirpath, ownerUser, ownerUser);
                logger.info("Create folder ###" + path + "### successfully!");
            } else {
                logger.info("Folder ###" + path + "### existed!");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * <p>Description: checks whether the file at the given path exists</p>
     * <p>Remark:</p>
     * @param filepath file path
     * @return true if it exists, false otherwise
     */
    public static boolean isHaveFile(String filepath) {
        Path path = new Path(filepath);
        FileSystem fs = getFS();
        try {
            return fs.exists(path);
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Writes data to HDFS via FileSystem.create.
     * <p>Description: saves a batch of analysis records to the distributed file system</p>
     * <p>Remark:</p>
     * @param datas analysis records, written one per line
     * @param filepath file path
     * @param owneruser system user that should own the file
     * @return true if the data was saved, false otherwise
     */
    public static boolean createDataFile(List<String> datas, String filepath, String owneruser) {
        Path path = new Path(filepath);
        FSDataOutputStream os = null;
        FileSystem fs = getFS();
        try {
            os = fs.create(path);
            fs.setOwner(path, owneruser, owneruser);
            for (String data : datas) {
                os.write(data.getBytes());
                os.write("\r\n".getBytes());
                os.flush();
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                if (null != os) {
                    os.close();
                }
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Opens the file as an FSDataInputStream via FileSystem.open(new Path(filePath)) and reads it line by line.
     * <p>Description: reads a file from HDFS</p>
     * <p>Remark:</p>
     * @param path file path
     * @return the concatenated content, records separated by "###"
     */
    public static String readFile(String path) {
        BufferedReader br = null;
        StringBuffer buf = new StringBuffer();
        FileSystem fs = getFS();
        FSDataInputStream in = null;
        try {
            in = fs.open(new Path(path));
            br = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line = "";
            while ((line = br.readLine()) != null) {
                line = line.split("\t")[0];
                line = line.split("###")[line.split("###").length - 1];
                buf.append("###" + line);
            }
            return buf.toString().replaceFirst("###", "");
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (in != null)
                    in.close();
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return "";
    }

    /**
     * <p>Description: gets a FileSystem handle for the configured cluster</p>
     * <p>Remark:</p>
     * @return the FileSystem, or null if it could not be obtained
     */
    public static FileSystem getFS() {
        try {
            return FileSystem.get(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Deletes via FileSystem.delete.
     * <p>Description: deletes a file</p>
     * <p>Remark:</p>
     * @param path file path
     */
    public static void deleteFile(String path) {
        if (isHaveFile(path)) {
            FileSystem fs = getFS();
            try {
                fs.delete(new Path(path), true);  // recursive delete; the single-argument delete(Path) is deprecated
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Creates a file on HDFS and writes one record into it.
     * @param data the record to write
     * @param filepath path of the file to create
     * @param owneruser system user that should own the file
     * @return true on success, false otherwise
     */
    public static boolean createDataFile2(String data, String filepath, String owneruser) {
        Path path = new Path(filepath);
        FSDataOutputStream os = null;
        FileSystem fs = getFS();
        try {
            os = fs.create(path);
            fs.setOwner(path, owneruser, owneruser);
            os.write(data.getBytes("utf-8"));
            os.write("\r\n".getBytes());
            os.flush();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            try {
                if (null != os) {
                    os.close();
                }
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Appends a record to an existing HDFS file.
     * @param hdfs_path HDFS file path
     * @param str the string to append
     * @param owneruser system user that should own the file
     * @return true on success, false otherwise
     */
    public static boolean appendFile(String hdfs_path, String str, String owneruser) {
        Path path = new Path(hdfs_path);
        FileSystem fs = null;
        OutputStream out = null;
        try {
            fs = getFS();
            fs.setOwner(path, owneruser, owneruser);
            //fs = FileSystem.get(URI.create(hdfs_path), conf);
            ByteArrayInputStream in = new ByteArrayInputStream((str + "\r\n").getBytes());
            out = fs.append(path);
            IOUtils.copyBytes(in, out, 4096, true);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            closeAll(fs, out);
        }
    }

    /**
     * Closes the output stream and the file system handle.
     * @param fs HDFS file system
     * @param os output stream
     */
    public static void closeAll(FileSystem fs, OutputStream os) {
        if (os != null) {
            try {
                os.flush();
                os.close();
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
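Putting the pieces together, here is a hedged end-to-end sketch of the create / append / read / delete cycle using the methods above (the paths and the user name are made up for illustration; append requires dfs.support.append, which ConfigurationCommon already enables):

// hypothetical demo path and user
String file = "/input/demo/append-demo.txt";

HdfsUtil.makeDirectory("/input/demo", "hadoop");          // ensure the parent directory exists
HdfsUtil.createDataFile2("first line", file, "hadoop");   // create the file with one record
HdfsUtil.appendFile(file, "second line", "hadoop");       // append a second record
System.out.println(HdfsUtil.readFile(file));              // prints "first line###second line"
HdfsUtil.deleteFile(file);                                // clean up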

6. PropertyUtil.java

package com.dinfo.oec.hadoop.util;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.mortbay.log.Log;

// Loads the Hadoop connection settings (hdaemonServer.properties on the classpath)
// into a Properties object via Properties.load(InputStream)
public class PropertyUtil {
    public static Logger logger = Logger.getLogger(PropertyUtil.class);
    private static Properties p = null;
    private static InputStream in;

    static {
        Log.info(System.getProperty("user.dir"));
        Log.info(PropertyUtil.class.getClassLoader()
                .getResource("hdaemonServer.properties").getPath());
        in = PropertyUtil.class.getClassLoader().getResourceAsStream(
                "hdaemonServer.properties");
    }

    public static String getValue(String key) {
        if (p == null) {
            try {
                p = new Properties();
                InputStreamReader fin = new InputStreamReader(in, "UTF-8");
                p.load(fin);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return p.getProperty(key);
    }
}
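Usage is one static call per key; for example (key names and values come from the properties file in the next section):

String clusterName = PropertyUtil.getValue("claster_name");   // "hacluster"
String nn1 = PropertyUtil.getValue("namenode_address1");      // "192.168.2.213:9000"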

7. hdaemonServer.properties

#1.------------------------hdaemonServer settings--------------------------------------
# distributed service port
hdaemonserver.serverPort=9991
# cluster-service flag: 0 = standalone, 1 = clustered
hdaemonserver.clusterFlg=0
# zookeeper nodes used by hdaemonServer
hdaemonserver.zookeeperHost=192.168.2.112:2191,192.168.2.212:2191,192.168.2.213:2191
# path of the executable jar used to call the hadoop cluster
hadoopJarPath=D:/hadoop/jar/hadoop.jar

#2.-----------------------hadoop settings---------------------------------------------
# hadoop cluster (nameservice) name
claster_name=hacluster
# first namenode address of the hadoop cluster
namenode_address1=192.168.2.213:9000
# second namenode address of the hadoop cluster
namenode_address2=192.168.2.212:9000
# zookeeper quorum of the hadoop cluster
zookeeper_hosts=192.168.2.213:2181,192.168.2.212:2181,192.168.2.112:2181
# local hadoop client home directory
hadoop_home_dir=C:/hadoopPlun/hadoop-2.6.0
# hadoop user name
hadoop_user_name=hadoop
# output path for job results
outpath=/output/disanalyresult
