菜鸟初探ETL全流程-Jar源码+shell自动化脚本实现Hadoop分布式自动化部署到Hive中

ETL流程

ETL是将业务系统的数据经过抽取、清洗转换之后加载到数据仓库的过程,目的是将企业中的分散、零乱、标准不统一的数据整合到一起,为企业的决策提供分析依据, ETL是BI(商业智能)项目重要的一个环节。

数据源:微博评论数据

数据格式

Content数据格式


<comment><content>颈椎肩周保健操  http://t.cn/a96LQZ  (分享自 @56网)</content><time>2012-4-5 21:53:54</time><repostsCount>3</repostsCount><commentsCount>0</commentsCount></comment>
<comment><content>颈椎保健操完整演示  http://t.cn/hbDyeB  (分享自 @56网)</content><time>2012-4-5 21:51:51</time><repostsCount>3</repostsCount><commentsCount>1</commentsCount></comment>
<comment><content>http://t.cn/hbDyeB</content><time>2012-4-5

User数据格式

User [id=1207231450, screenName=null, name=null, province=0, city=0, location=null, description=null, url=null, profileImageUrl=null, userDomain=null, gender=null, followersCount=0, friendsCount=0, statusesCount=0, favouritesCount=0, createdAt=null, following=false, verified=false, verifiedType=0, allowAllActMsg=false, allowAllComment=false, followMe=false, avatarLarge=null, onlineStatus=0, status=null, biFollowersCount=0, remark=情感, lang=null, verifiedReason=null]

Java代码,用来处理源数据并生成指定格式的输出文件,打成Jar包以便复用

Java代码:

package Util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

public class ContextUtil{
/**
 * 对外操作接口方法
 * @throws Exception 
 */
public static void startService(String path, String split,
		String args0,String args1, String inputcharset, String outputcharset,
		String outputpath0, String outputpath1) throws Exception {
		File dir = new File(path);
		List<File> result = new ArrayList<File>();
		fileToLine(dir, result, split);
		List<String> user = getContentDataList(result, args0);
		List<String> list = getStringList(user, inputcharset);
		for (String string : list) {
			System.out.println("开始输出文件2");
			wirteFile(string, outputpath0, outputcharset);
		}
		
		
		List<String> context = getContentDataList(result, args1);
		List<String> aa = getXMLList(context, inputcharset);
		for (String string : aa) {
			System.out.println("开始输出文件1");
			wirteFile(string, outputpath1, outputcharset);
		}
}
/**
 * 以G:\weibodata\公安/user\1893451894.txt 第四个字段为分割,获取对应文件路径
 * 
 * @param result
 * @return
 */
  private static List<String> getContentDataList(List<File> result, String str) {
	List<String> ContentDatalist = new ArrayList<String>();
	for (File file : result) {
		String[] Contentlist = file.getPath().toString().split("/");
 //	System.out.println("路径为 :"+file.getPath().toString());
//			System.out.println("字段长度为"+Contentlist.length);
		 if (Contentlist[Contentlist.length-2].equals(str)) {
			ContentDatalist.add(file.getPath());
		}
	}
	return ContentDatalist;
}

/**
 * 递归查找文件
 * 
 * @param dir
 * @param result
 */
private static void fileToLine(File dir, List<File> result, String str) {
	File[] files = dir.listFiles();
	if (files != null) {
		for (File file : files) {
			if (file.isDirectory()) {
				// 递归目录
				fileToLine(file, result, str);
			} else {
				// 判断后缀,这里可以加入其他判断
				if (file.getName().endsWith(str)) {
					result.add(file);
				}
			}
		}
	}
}

/**
 * 持久化数据
 * 
 * @param files
 * @param logName
 * @throws Exception
 */
private static void writeToFile(List<File> files, String logName)
		throws Exception {
	BufferedWriter bw = new BufferedWriter(new FileWriter(logName));
	for (File file : files) {
		String filePath = file.getAbsolutePath();
		bw.write(filePath);
		bw.newLine();
		bw.flush();
	}
	bw.close();
}

/**
 * 写出整理好的文件方法 不覆盖,只追加
 * 
 * @param str
 * @param path
 * @throws Exception
 */
private static void wirteFile(String str, String path, String charset)
		throws Exception {
	File file = new File(path);
	FileOutputStream f = new FileOutputStream(path, true);
	OutputStreamWriter wr = new OutputStreamWriter(f, charset);
	BufferedWriter bw = new BufferedWriter(wr);
	// System.out.println(file.exists());
	bw.write(str);
	bw.newLine();
	bw.flush();
	bw.close();
	//如果不加关闭这个流,那么每个新的流都会打开一次文件,那么会累计同时打开多个文件.windows系统隐藏掉了很多
	//细节,所以还是要注意
	//java.io.FileNotFoundException: *****(Too many open files)
	//修改方式ulimit -n XXXX   linux默认是1024.用户只能往下改,root可以提升,不过可能有bug,我一次性直接设置了3000,但是没什么用

}

/**
 * txt按行切割方法
 * 
 * @param list
 * @param bianma
 * @return
 * @throws Exception
 */
private static List<String> getStringList(List<String> list, String bianma)
		throws Exception {
	List<String> lineList = new ArrayList<String>();
	for (String str : list) {
		// System.out.println("str :"+str);
		FileInputStream fis = new FileInputStream(str);
		Reader reader = new InputStreamReader(fis, bianma);
		BufferedReader br = new BufferedReader(reader);
		String tempLine = null;
		while ((tempLine = br.readLine()) != null) {
			String finalstr = "";
			String s1 = tempLine.trim().substring(6, tempLine.length() - 1);
			String[] s2 = s1.trim().split(",");
			for (String string : s2) {
				String[] s3 = string.trim().split("=");
				if (s3[0].equals("id") || s3[0].equals("name")
						|| s3[0].equals("province")
						|| s3[0].equals("remark")) {
					finalstr += s3[1] + "\t";
				}

			}
			lineList.add(finalstr);
		}
	}
	return lineList;
}

/**
 * xml按行切割方法
 * 
 * @param list
 * @param bianma
 * @return
 * @throws Exception
 */
private static List<String> getXMLList(List<String> list, String bianma)
		throws Exception {
	List<String> lineList = new ArrayList<String>();
	for (String str : list) {
//			System.out.println(str);
		FileInputStream fis = new FileInputStream(str);
		InputStreamReader reader = new InputStreamReader(fis, bianma);
		BufferedReader br = new BufferedReader(reader);
		String tempLine = null;
		while ((tempLine = br.readLine()) != null) {
			String content = getstr(tempLine, "content");
			String time = getstr(tempLine, "time");
			String repostsCount = getstr(tempLine, "repostsCount");
			String commentsCount = getstr(tempLine, "commentsCount");
			String finalstr = str.split("/")[5].split(".txt")[0] + "\t"
					+ content + "\t" + time + "\t" + repostsCount + "\t"
					+ commentsCount;
			// System.out.println(finalstr);
			lineList.add(finalstr);
		}
	}
	return lineList;
}

/**
 * xml切割便捷方法
 * 
 * @param str
 * @param split
 * @return
 */
private static String getstr(String str, String split) {
	String arr = str.trim().substring(9, str.length() - 10);
	String as = split + ">";
	if (arr.trim().split(as).length < 1) {
		System.out.println("下标长度小于1");
		return "null";
	}
	String[] arr1 = arr.trim().split(as);
	if (arr1.length == 0 || arr1.length < 2) {
//			System.out.println("下标");
		System.out.println(str);
		return "null";
	} else {
		String[] arr2 = arr1[1].trim().split("</");
		if (arr2.length == 0) {
			return "null";
		}
		return arr2[0];
	}
}


public static void main(String[] args) throws Exception {
	String path = "G:\\weibodata";
	String split = ".txt";
	String arg0 = "user";
	String args1 = "content";
	String outputpath0 = "G:\\serdata.txt";
	String outputpath1 = "G:\\conten.txt";
	String inputcharset = "gbk";
	String outputcharset = "utf-8";
	if (args==null||args.length==0) {
		System.out.println("请传参");
	}
	for (int i = 0; i < args.length; i++) {
		System.out.println(args[i]);
	}
	startService(args[0], args[1], args[2], args[3], args[4],
			args[5], args[6], args[7]);


//void Util.ContextUtil.startService(String path, String split, String args0, String args1, String inputcharset,
//		String outputcharset, String outputpath0, String outputpath1) throws Exception

}
}

打成Jar包后,上传到HDFS中

数据源放在input中,main存放持久化脚本 ,output指定输出 Util存放jar包

查看shell脚本前,回顾ETL在分布式上的流程

1. 运行jar处理数据,输出到指定目录
2. 提交文件到HDFS系统中
3. Hive创建表
4. Load加载数据
1. jar包脚本 makdata
#!/bin/sh
# makdata.sh -- run the ETL jar over the raw data directory.
#
# Arguments (the old header listed 8 numbered args; the script takes 6,
# the two directory filters "user"/"content" are fixed below):
#   $1 FilePath       root directory of the raw data
#   $2 Split          file suffix to pick up (e.g. ".txt")
#   $3 inputCharSet   charset of the raw files (e.g. gbk)
#   $4 outputCharSet  charset of the generated files (e.g. utf-8)
#   $5 outputPath0    target file for the user records
#   $6 outputPath1    target file for the comment records

FilePath=$1
Split=$2
inputCharSet=$3
outputCharSet=$4
outputPath0=$5
outputPath1=$6

# Quote every expansion so paths containing spaces survive word splitting.
java -cp ../Util/WBEtl-0.0.1-SNAPSHOT.jar Util.ContextUtil \
    "$FilePath" "$Split" user content "$inputCharSet" "$outputCharSet" "$outputPath0" "$outputPath1"
2. push文件脚本putfile.sh
#!/bin/sh
# putfile.sh -- copy a local file into HDFS.
#   $1 local file path
#   $2 HDFS target directory
localpath=$1
hdfspath=$2

# Quoted so paths with spaces are passed as single arguments.
hdfs dfs -put "$localpath" "$hdfspath"
3. 建表脚本

User表
makeUserTable.sh

#!/bin/sh
# makeUserTable.sh -- create the Hive table for the extracted user records.
#   $1 database name
#   $2 table name
# The 4 string columns mirror the 4 tab-separated fields the jar emits:
# id, name, province, remark.
# NOTE(review): the third column is named "num" but receives the province
# field -- confirm before renaming, existing queries may use "num".
db=$1
tb_name=$2

hive -e "USE $db;
CREATE TABLE $tb_name(
  id string,
  name string,
  num string,
  remark string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;"

Content表
makeContontTable.sh

#!/bin/sh
# makeContontTable.sh -- create the Hive table for the extracted comments.
#   $1 database name
#   $2 table name
# The 5 string columns map positionally to the 5 tab-separated fields the
# jar emits: uid, content, time, repostsCount, commentsCount.
# NOTE(review): the column names (words/lotime/city/location) do not match
# those semantics -- confirm before renaming, existing queries may rely on them.
db=$1
tb_name=$2

hive -e "USE $db;
CREATE TABLE $tb_name(
  uid string,
  words string,
  lotime string,
  city string,
  location string
 )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;"
4. Load导入数据

loaddata.sh

#!/bin/sh
# loaddata.sh -- load an HDFS file into a Hive table, replacing its content.
#   $1 database name
#   $2 HDFS path of the data file
#   $3 table name
# Note: LOAD DATA INPATH *moves* the file out of $2 inside HDFS.
db=$1
data_path=$2
tb_name=$3

hive -e "USE $db;
LOAD DATA INPATH \"$data_path\" OVERWRITE INTO TABLE $tb_name;"

下面展示串联整个流程的 main 脚本:

  1. main脚本,串联运行所有脚本
    main.sh
   #!/bin/sh   
#var define
#1.make data
     echo '++++++start product data++++++'
sh makdata.sh ../input .txt gbk utf-8 ../output/user.txt ../output/content.txt
echo '++++++end product data++++++'
#2.put local to hdfs
echo '++++++start put local to hdfs++++++'
sh putfile.sh ../output/user.txt /tmp/demo/wtx/
sh putfile.sh ../output/content.txt  /tmp/demo/wtx/
echo '++++++end  put local to hdfs++++++'
#3.create table
echo '++++++start create table ++++++'
sh makeContontTable.sh wangtianxin this1
sh makeUserTable.sh wangtianxin this2
echo '++++++end create table++++++'
#4.load data
echo '++++++start load data++++++'
sh loaddata.sh wangtianxin /tmp/demo/wtx/user.txt this2
sh loaddata.sh wangtianxin /tmp/demo/wtx/content.txt this1
echo '++++++end load data++++++'

因为main脚本中的参数写的是相对路径,所以要运行需要在main包下运行main.sh

运行完成~

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值