大数据分析 Hadoop实现基于sort的groupby

大数据分析 Hadoop实现基于sort的groupby

#题目要求
在这里插入图片描述
#代码实现
1.hdfs输入
2.处理输入命令行:res: 参数会按逗号切分成数组(逗号本身不会保留),再利用该数组的大小来判断需要输出哪些统计结果
3.从hdfs读取文件内容,按groupby列进行排序(每行拆分为字符串数组,并自定义sort的比较方式);比较器通过实现Comparator接口来定义
4.遍历排序后的记录,计算每个分组的总数、平均数和最大值。注意最后一个分组的结果需要在循环结束后单独获取,因为循环体内只有在遇到下一个分组的第一行时才会输出上一个分组的结果
5.向列表添加结果。结果是自定义的类(Answer),并带有获取其成员的方法。注意构造函数中要新建数组并拷贝数据来存放,否则列表中的所有结果会引用同一个数组,输出时只会得到最后一个数据的内容
6.输出到hbase,涉及输出的格式

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.lang.Math;
import java.text.DecimalFormat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

import org.apache.log4j.*;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
 * Sort-based GROUP BY over a '|'-delimited HDFS file, with the result stored in HBase.
 *
 * <p>Pipeline: read the file from HDFS -&gt; sort the rows by the group-by column -&gt;
 * aggregate count / avg / max per group in a single pass -&gt; write every group to the
 * HBase table {@code Result} (column family {@code res}).
 *
 * <p>Recognised command-line arguments:
 * <ul>
 *   <li>{@code R=<path>} - path of the input file below {@code hdfs://localhost:9000}</li>
 *   <li>{@code groupby:R<col>} - zero-based index of the group-by column</li>
 *   <li>{@code res:<op>[,<op>...]} - requested outputs, each one of {@code count},
 *       {@code avg(R<col>)} or {@code max(R<col>)}</li>
 * </ul>
 */
public class Hw1Grp3 {

    /**
     * Program entry point: parses the arguments, runs the group-by and stores the result.
     *
     * @param args command-line arguments as described on the class
     * @throws IOException if HDFS or HBase access fails
     * @throws URISyntaxException declared for backward compatibility with existing callers
     * @throws IllegalArgumentException if no {@code R=<path>} argument was given
     * @throws NumberFormatException if a column index or an aggregated cell is not numeric
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        String[] operator = {"", "", ""}; // requested outputs, also used as HBase column qualifiers
        int num = 0;                      // number of requested outputs
        int avgCol = -1;                  // -1 means "avg not requested"
        int maxCol = -1;                  // -1 means "max not requested"
        int groupbyCol = 0;
        String filepath = "";

        for (String arg : args) {
            if (arg.startsWith("R=")) {
                filepath = arg.substring(2);
            } else if (arg.startsWith("groupby:R")) {
                groupbyCol = Integer.parseInt(arg.substring(9));
            } else if (arg.startsWith("res:")) {
                // Use the split array directly so any number of outputs is accepted
                // (a fixed three-slot array overflowed on a fourth item).
                operator = arg.substring(4).split(",");
                num = operator.length;
                for (String op : operator) {
                    if (op.startsWith("avg(R")) {
                        avgCol = parseColumnIndex(op);
                    } else if (op.startsWith("max(R")) {
                        maxCol = parseColumnIndex(op);
                    }
                }
            }
        }
        if (filepath.isEmpty()) {
            throw new IllegalArgumentException("missing input file argument, expected R=<hdfs path>");
        }

        String file = "hdfs://localhost:9000" + filepath;
        Configuration conf = new Configuration();
        // try-with-resources closes the reader and then the file system, also on failure.
        try (FileSystem fs = FileSystem.get(URI.create(file), conf);
             BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(file))))) {
            List<String[]> rows = readRows(in);
            // Sorting makes all rows of one group adjacent, so one linear pass can aggregate them.
            Collections.sort(rows, new ComparatorListSort(groupbyCol));
            List<Answer> result = aggregate(rows, groupbyCol, avgCol, maxCol);
            saveHbase(result, operator, num);
        }
    }

    /** Extracts the column index N from an operator of the form {@code avg(RN)} or {@code max(RN)}. */
    private static int parseColumnIndex(String op) {
        // Both prefixes "avg(R" and "max(R" are five characters long; the last character is ')'.
        return Integer.parseInt(op.substring(5, op.length() - 1));
    }

    /** Reads every non-empty line from the reader and splits it on '|' into its columns. */
    private static List<String[]> readRows(BufferedReader in) throws IOException {
        List<String[]> rows = new ArrayList<>();
        String line;
        while ((line = in.readLine()) != null) {
            if (line.isEmpty()) {
                continue; // a blank line carries no columns and would otherwise form a bogus group
            }
            rows.add(line.split("\\|"));
        }
        return rows;
    }

    /**
     * Aggregates rows that are already sorted by the group-by column.
     *
     * @return one {@link Answer} per group, in sorted order; empty when there are no rows
     */
    private static List<Answer> aggregate(List<String[]> sortedRows, int groupbyCol, int avgCol, int maxCol) {
        List<Answer> groups = new ArrayList<>();
        String currentKey = null; // null until the first row has been seen
        double count = 0;
        double sum = 0;
        double max = 0;

        for (String[] row : sortedRows) {
            String key = row[groupbyCol];
            // Columns that were not requested contribute 0, which leaves sum and max at 0.
            double avgInput = avgCol != -1 ? Double.parseDouble(row[avgCol]) : 0;
            double maxInput = maxCol != -1 ? Double.parseDouble(row[maxCol]) : 0;

            if (key.equals(currentKey)) {
                count++;
                sum += avgInput;
                max = Math.max(max, maxInput);
            } else {
                if (currentKey != null) {
                    groups.add(toAnswer(currentKey, count, sum, max));
                }
                // The first row of a group seeds the accumulators. The max is parsed as a double
                // like every other read of that column (it used to be Integer.parseInt here).
                currentKey = key;
                count = 1;
                sum = avgInput;
                max = maxInput;
            }
        }
        // The loop only emits a group when the next one starts, so the last group is added here.
        if (currentKey != null) {
            groups.add(toAnswer(currentKey, count, sum, max));
        }
        return groups;
    }

    /** Packs one finished group as {count, average rounded to two decimals, max}. */
    private static Answer toAnswer(String key, double count, double sum, double max) {
        double average = (double) Math.round((sum / count) * 100) / 100;
        return new Answer(key, new double[] {count, average, max});
    }

    /**
     * Saves the result to HBase: recreates table {@code Result} with column family {@code res}
     * and writes one row per group, keyed by the group-by value, with one column per operator.
     *
     * @param result the result after sorted and computed
     * @param operator the required operators; each one is used as the column qualifier
     * @param num number of operators to write (values beyond {@code operator.length} are ignored)
     * @throws MasterNotRunningException declared for backward compatibility with existing callers
     * @throws ZooKeeperConnectionException declared for backward compatibility with existing callers
     * @throws IOException if an HBase operation fails
     */
    public static void saveHbase(List<Answer> result, String[] operator, int num) throws MasterNotRunningException,
            ZooKeeperConnectionException, IOException {
        String tableName = "Result";
        HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
        htd.addFamily(new HColumnDescriptor("res"));

        Configuration configuration = HBaseConfiguration.create();
        HBaseAdmin hAdmin = new HBaseAdmin(configuration);
        try {
            if (hAdmin.tableExists(tableName)) {
                System.out.println("Table already exists");
                // A table must be disabled before it can be deleted.
                hAdmin.disableTable(tableName);
                hAdmin.deleteTable(tableName);
            }
            hAdmin.createTable(htd);
            System.out.println("table " + tableName + " created successfully");
        } finally {
            hAdmin.close();
        }

        // Example cells: (row key=good, res:count=1), (row key=good, res:avg(R5)=12.0),
        // (row key=good, res:max(R0)=100)
        byte[] family = "res".getBytes();
        int operatorCount = Math.min(num, operator.length);
        HTable table = new HTable(configuration, tableName);
        try {
            for (Answer ans : result) {
                double[] stats = ans.get_res();
                Put put = null; // created lazily so a row without any cell is never submitted
                for (int j = 0; j < operatorCount; j++) {
                    String op = operator[j];
                    String value;
                    if (op.startsWith("count")) {
                        value = String.valueOf((int) stats[0]);
                    } else if (op.startsWith("avg(R")) {
                        value = String.valueOf(stats[1]);
                    } else if (op.startsWith("max(R")) {
                        // NOTE(review): the cast drops the fractional part of a non-integral max;
                        // kept because the stored format has always been an integer - confirm.
                        value = String.valueOf((int) stats[2]);
                    } else {
                        continue; // unknown operator: nothing to store
                    }
                    if (put == null) {
                        put = new Put(ans.get_res0().getBytes());
                    }
                    put.add(family, op.getBytes(), value.getBytes());
                }
                if (put != null) {
                    // All cells of one row travel in a single Put instead of one Put per cell.
                    table.put(put);
                }
            }
        } finally {
            table.close();
        }
        System.out.println("Hbase table put successfully");
    }
}
    /**
     * define a way to sort original record - ascending order
     */
 class ComparatorListSort implements Comparator<String[]>{
    private  int index;
    public ComparatorListSort(int i){
        index = i;
    }
    @Override
    public int compare(String[] S1,String[] S2){
        return S1[index].compareTo(S2[index]);
    }
}
/**
 * One result record: a key string plus a private three-slot array of numbers.
 * The constructor copies the first three values of the array it is given, so later
 * changes to the caller's array do not affect this record.
 */
class Answer {
    /** Key of the record. */
    private String groupKey;
    /** Private copy of the three numeric values. */
    private double[] stats;

    /**
     * @param res0 key of the record
     * @param nres source of the values; its first three elements are copied
     */
    public Answer(String res0, double[] nres) {
        this.groupKey = res0;
        // Build a fresh array so the record never shares storage with the caller.
        this.stats = new double[] {nres[0], nres[1], nres[2]};
    }

    /** @return the key of the record */
    public String get_res0() {
        return groupKey;
    }

    /** @return the internal three-element value array (not a copy) */
    public double[] get_res() {
        return stats;
    }
}

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值