大数据分析 Hadoop实现基于sort的groupby
#题目要求
#代码实现
1.hdfs输入
2.处理输入命令行,注意程序会自动删除逗号。利用命令行取得的数组大小来判断需要输出的统计结果
3.从hdfs读取文件内容,进行排序(自定义结构体,并定义sort的比较方式);比较器类通过实现Comparator接口来定义比较方式
4.遍历计算groupby的计数,平均数,最大值;注意最后一组的结果需要在循环结束后单独添加,因为循环体只在分组键发生变化时才输出上一组的结果
5.把结果添加到列表;结果是自定义的结构体,并提供获取结构体成员的函数。注意构造函数中要新建数组并复制数据,否则保存的只是同一个数组的引用,输出时所有记录都会是最后一组数据的内容
6.输出到hbase,涉及输出的格式
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.lang.Math;
import java.text.DecimalFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.log4j.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
 * Sort-based GROUP BY over a pipe-delimited file stored on HDFS.
 *
 * <p>Pipeline: read the file from HDFS -> sort the rows by the group-by column ->
 * fold runs of equal keys into count / avg / max -> save the result to the HBase
 * table {@code Result} (column family {@code res}).
 *
 * <p>Recognised command-line arguments (any order):
 * <ul>
 *   <li>{@code R=<path>} - path of the input file, appended to {@code hdfs://localhost:9000}</li>
 *   <li>{@code groupby:R<n>} - index of the group-by column (used directly as the array
 *       index into each split row; defaults to 0)</li>
 *   <li>{@code res:<op>,<op>,...} - aggregates to output: {@code count},
 *       {@code avg(R<n>)} and/or {@code max(R<n>)}</li>
 * </ul>
 */
public class Hw1Grp3 {

    /**
     * Program entry point: parses the arguments, aggregates the file and writes to HBase.
     *
     * @param args command-line arguments as described on the class
     * @throws IOException if HDFS or HBase access fails
     * @throws URISyntaxException declared for interface compatibility
     * @throws IllegalArgumentException if no {@code R=<path>} argument is given
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        String[] operator = {"", "", ""}; // requested aggregates, reused as HBase column qualifiers
        int avgCol = -1;                  // -1 means "avg not requested"
        int maxCol = -1;                  // -1 means "max not requested"
        String filepath = "";
        int groupbyCol = 0;
        int num = 0;

        for (String arg : args) {
            if (arg.startsWith("R=")) {
                filepath = arg.substring(2);
            } else if (arg.startsWith("groupby:R")) {
                groupbyCol = Integer.parseInt(arg.substring(9));
            } else if (arg.startsWith("res:")) {
                // Use the split array itself so any number of operators is accepted
                // (a fixed 3-slot array overflowed on a fourth entry).
                operator = arg.substring(4).split(",");
                num = operator.length;
                for (String op : operator) {
                    if (op.startsWith("avg(R")) {
                        avgCol = Integer.parseInt(op.substring(5, op.length() - 1));
                    } else if (op.startsWith("max(R")) {
                        maxCol = Integer.parseInt(op.substring(5, op.length() - 1));
                    }
                }
            }
        }
        if (filepath.isEmpty()) {
            throw new IllegalArgumentException("missing required argument R=<hdfs file path>");
        }

        List<String[]> rows = readRows("hdfs://localhost:9000" + filepath);

        // Sorting by the group-by column makes equal keys adjacent, so one linear pass can group them.
        Comparator<String[]> byGroupKey = new ComparatorListSort(groupbyCol);
        Collections.sort(rows, byGroupKey);

        List<Answer> result = aggregate(rows, groupbyCol, avgCol, maxCol);
        saveHbase(result, operator, num);
    }

    /**
     * Reads every line of the given HDFS file and splits it on {@code '|'}.
     * The reader and the file system handle are closed even when reading fails.
     */
    private static List<String[]> readRows(String file) throws IOException {
        Configuration conf = new Configuration();
        List<String[]> rows = new ArrayList<>();
        try (FileSystem fs = FileSystem.get(URI.create(file), conf);
             BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(file))))) {
            String line;
            while ((line = in.readLine()) != null) {
                rows.add(line.split("\\|"));
            }
        }
        return rows;
    }

    /**
     * Folds rows that are already sorted by the group-by column into one {@link Answer} per key.
     * Returns an empty list for empty input instead of failing on the first row.
     */
    private static List<Answer> aggregate(List<String[]> sortedRows, int groupbyCol, int avgCol, int maxCol) {
        List<Answer> result = new ArrayList<>();
        if (sortedRows.isEmpty()) {
            return result;
        }
        String[] first = sortedRows.get(0);
        String key = first[groupbyCol];
        double count = 1;
        double sum = avgCol != -1 ? Double.parseDouble(first[avgCol]) : 0;
        double max = maxCol != -1 ? Double.parseDouble(first[maxCol]) : 0;

        for (int i = 1; i < sortedRows.size(); i++) {
            String[] row = sortedRows.get(i);
            if (row[groupbyCol].equals(key)) {
                count++;
                if (avgCol != -1) {
                    sum += Double.parseDouble(row[avgCol]);
                }
                if (maxCol != -1) {
                    max = Math.max(max, Double.parseDouble(row[maxCol]));
                }
            } else {
                // Key changed: emit the finished group and restart the accumulators from this row.
                result.add(toAnswer(key, count, sum, max));
                key = row[groupbyCol];
                count = 1;
                sum = avgCol != -1 ? Double.parseDouble(row[avgCol]) : 0;
                // Parsed as double like every other read of this column; parsing it as an
                // int here threw NumberFormatException on decimal values.
                max = maxCol != -1 ? Double.parseDouble(row[maxCol]) : 0;
            }
        }
        // The loop only emits a group when the next key appears, so the last group is added here.
        result.add(toAnswer(key, count, sum, max));
        return result;
    }

    /** Packs one group's statistics as {count, average rounded to two decimals, max}. */
    private static Answer toAnswer(String key, double count, double sum, double max) {
        double avg = (double) Math.round((sum / count) * 100) / 100;
        return new Answer(key, new double[] {count, avg, max});
    }

    /**
     * Saves the aggregated result to HBase. The table {@code Result} is dropped if it
     * already exists and recreated with the single column family {@code res}; each
     * group becomes one row keyed by the group value, with one cell per requested
     * operator, e.g. (row key=good, res:count=1), (row key=good, res:avg(R5)=12),
     * (row key=good, res:max(R0)=100).
     *
     * @param result the result after sorted and computed
     * @param operator the required operators; each is also used as the column qualifier
     * @param num number of entries of {@code operator} to write
     * @throws MasterNotRunningException declared for the HBase admin connection
     * @throws ZooKeeperConnectionException declared for the HBase admin connection
     * @throws IOException if an HBase operation fails
     */
    public static void saveHbase(List<Answer> result, String[] operator, int num)
            throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
        String tableName = "Result";
        HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
        htd.addFamily(new HColumnDescriptor("res"));

        Configuration configuration = HBaseConfiguration.create();
        HBaseAdmin hAdmin = new HBaseAdmin(configuration);
        try {
            if (hAdmin.tableExists(tableName)) {
                System.out.println("Table already exists");
                // A table must be disabled before it can be deleted.
                hAdmin.disableTable(tableName);
                hAdmin.deleteTable(tableName);
            }
            hAdmin.createTable(htd);
            System.out.println("table " + tableName + " created successfully");
        } finally {
            hAdmin.close();
        }

        HTable table = new HTable(configuration, tableName);
        try {
            byte[] family = "res".getBytes();
            for (Answer ans : result) {
                double[] values = ans.get_res();
                // Created lazily so that a row with no recognised operator issues no Put at all.
                Put put = null;
                for (int j = 0; j < num; j++) {
                    String op = operator[j];
                    String cell;
                    if (op.startsWith("count")) {
                        cell = String.valueOf((int) values[0]);
                    } else if (op.startsWith("avg(R")) {
                        cell = String.valueOf(values[1]);
                    } else if (op.startsWith("max(R")) {
                        // NOTE(review): max is written truncated to int, as before - confirm
                        // this is the intended output format for non-integer columns.
                        cell = String.valueOf((int) values[2]);
                    } else {
                        continue;
                    }
                    if (put == null) {
                        put = new Put(ans.get_res0().getBytes());
                    }
                    put.add(family, op.getBytes(), cell.getBytes());
                }
                if (put != null) {
                    // All cells of one row key go out in a single Put.
                    table.put(put);
                }
            }
        } finally {
            table.close();
        }
        System.out.println("Hbase table put successfully");
    }
}//main class
/**
 * Comparator for split rows: orders two {@code String[]} records in ascending
 * order by comparing, as strings, the element found at one fixed column index.
 */
class ComparatorListSort implements Comparator<String[]> {
    /** Column whose value decides the ordering. */
    private final int index;

    public ComparatorListSort(int i) {
        this.index = i;
    }

    @Override
    public int compare(String[] S1, String[] S2) {
        String left = S1[index];
        String right = S2[index];
        return left.compareTo(right);
    }
}
/**
* define a structure to store the result record
*/
class Answer {
private String res0;
private double[] res;
public Answer(String res0, double[] nres){
this.res0 = res0;
this.res = new double[3];
for (int i = 0; i < res.length;i++){
res[i] = nres[i];
}
}
public String get_res0(){
return res0;
}
public double[] get_res(){
return res;
}
}