国科大陈老师大数据课程第一次作业。
从命令行找出待操作列:
输入格式固定,使用String.contains()方法判断当前命令行的种类,然后取出需要的字符串部分。
注意:avg 和 max 可能各自出现多列(老师说的,我也不确定是否属实),同时列号可能不止一位数(例如 R11)。
// Expected command line: R=<file> groupby:R<col> res:count,avg(R<col>),max(R<col>)
if (args.length < 3) {
    throw new IllegalArgumentException(
            "usage: R=<file> groupby:R<col> res:count,avg(R<col>),max(R<col>)");
}
String filePath = "";
int rowKey = 0;
// The third argument lists the requested aggregates, separated by commas.
String[] count = args[2].split(",");
int length = count.length;
// Any term may be an avg or a max (the "count" term is optional), so reserve
// one slot per term instead of length - 1.
int[] avgCol = new int[length];
int avgColNum = 0;
int[] maxCol = new int[length];
int maxColNum = 0;
for (int i = 0; i < args.length; i++) { // locate the file path and the group-by column
    String aa = args[i];
    if (aa.contains("R=")) { // file path: everything after "R="
        int beginIdx = aa.indexOf("R=") + 2;
        filePath = "hdfs://localhost:9000" + aa.substring(beginIdx);
        System.out.println("the file path is \t" + filePath);
    } else if (aa.contains("groupby")) {
        // group-by column: the number after "by:R" may have more than one digit
        int beginIdx = aa.indexOf("by:R") + 4;
        rowKey = Integer.parseInt(aa.substring(beginIdx));
        System.out.println("the rowKey is \t R" + rowKey);
    }
}
if (filePath.isEmpty()) {
    throw new IllegalArgumentException("missing R=<file> argument");
}
// Parse the aggregate list exactly once, outside the argument loop, so that
// additional arguments cannot register the same columns twice.
for (int j = 0; j < count.length; j++) {
    String term = count[j];
    int avgIdx = term.indexOf("avg(R");
    if (avgIdx >= 0) { // avg column: the number between "avg(R" and ")"
        int beginIdx = avgIdx + 5;
        int endIdx = term.indexOf(")", beginIdx);
        avgCol[avgColNum++] = Integer.parseInt(term.substring(beginIdx, endIdx));
        System.out.println("the avg columu is R" + avgCol[avgColNum - 1]);
    }
    int maxIdx = term.indexOf("max(R");
    if (maxIdx >= 0) { // max column: the number between "max(R" and ")"
        int beginIdx = maxIdx + 5;
        int endIdx = term.indexOf(")", beginIdx);
        maxCol[maxColNum++] = Integer.parseInt(term.substring(beginIdx, endIdx));
        System.out.println("the max column is R" + maxCol[maxColNum - 1]);
    }
}
System.out.println("avg column number is:" + avgColNum);
System.out.println("max column number is:" + maxColNum);
从HDFS读文件:
这里读了两遍,第一遍获取行数,第二遍取数据
/**
 * Reads every line of a text file stored in HDFS.
 *
 * The file is read in a single pass into a growable list (the previous version
 * read it twice only to size an array), and the stream and file system are
 * released even when reading fails. The first four lines are echoed to stdout
 * as a sanity check.
 *
 * @param filePath full HDFS URI of the file, e.g. hdfs://host:port/path
 * @return the lines of the file, in file order, without line terminators
 * @throws IOException if the file cannot be opened or read
 * @throws URISyntaxException never thrown here; kept for source compatibility with callers
 */
public static String[] loadData(String filePath) throws IOException, URISyntaxException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(filePath), conf);
    List<String> lines = new ArrayList<String>();
    try {
        FSDataInputStream inStream = fs.open(new Path(filePath));
        try {
            // Decode explicitly as UTF-8 instead of the platform default charset.
            BufferedReader reader = new BufferedReader(new InputStreamReader(inStream, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                if (lines.size() < 4) { // preview of the first few lines
                    System.out.println(line);
                }
                lines.add(line);
            }
        } finally {
            inStream.close();
        }
    } finally {
        // As before, the file system handle is closed once loading is done.
        fs.close();
    }
    return lines.toArray(new String[lines.size()]);
}
把读入文件中需要的列挑出并排序:
挑出需要的列,先存avg再存max,结果中这些顺序无所谓
// Project every input row onto: rowKey | avg columns... | max columns...
String[] trimData = new String[data.length];
for (int index = 0; index < data.length; index++) {
    String[] trim = data[index].split("\\|");
    StringBuilder row = new StringBuilder(trim[rowKey]);
    for (int i = 0; i < avgColNum; i++) { // avg columns first
        row.append('|').append(trim[avgCol[i]]);
    }
    for (int i = 0; i < maxColNum; i++) { // then max columns
        row.append('|').append(trim[maxCol[i]]);
    }
    trimData[index] = row.toString();
    if (index < 5) { // echo the first rows as a sanity check
        System.out.println(data[index]);
        if (avgColNum != 0 && maxColNum != 0)
            System.out.println(trimData[index] + " " + trim[avgCol[0]] + " " + trim[maxCol[0]]);
    }
}
// Sort by row key so that equal keys become adjacent for the merge step.
sortData(trimData);
System.out.println("After sort and trim, the data is:~~~~~~~~~~~~~~");
// Preview at most five rows; the input may contain fewer than five.
for (int i = 0; i < Math.min(5, trimData.length); i++) {
    System.out.println(trimData[i]);
}
手写的冒泡排序,速度较慢
/**
 * Sorts the rows in place by their row key, i.e. the text before the first '|'.
 *
 * Uses the library sort, which is stable for object arrays, so rows with equal
 * keys keep their relative order exactly as the former bubble sort did, but in
 * O(n log n) instead of O(n^2).
 *
 * @param data rows of the form key|col|col...; reordered in place
 */
public static void sortData(String[] data) {
    Arrays.sort(data, new Comparator<String>() {
        @Override
        public int compare(String left, String right) {
            return rowKeyOf(left).compareTo(rowKeyOf(right));
        }
    });
}

/** Returns the text before the first '|' of a row, or the whole row if it has no '|'. */
private static String rowKeyOf(String row) {
    int sep = row.indexOf('|');
    return sep < 0 ? row : row.substring(0, sep);
}
groupby:
将rowkey相同的行合并,并返回合并后的行数
/**
 * Collapses consecutive rows that share a row key into one result line per key.
 *
 * Each input row has the form key|avg values...|max values... and the input must
 * already be sorted by key. Each output line has the form
 * key|count|avg...|max..., where averages are formatted with two decimals and
 * maxima are printed with Double.toString.
 *
 * @param data      sorted, projected rows
 * @param mergeData output array receiving one line per distinct key; must be at
 *                  least as long as the number of distinct keys
 * @param avgColNum number of avg columns following the key in each row
 * @param maxColNum number of max columns following the avg columns
 * @return the number of result lines written to mergeData
 */
public static int mergeData(String[] data, String[] mergeData, int avgColNum, int maxColNum) {
    double[] sum = new double[Math.max(avgColNum, 0)];
    double[] max = new double[Math.max(maxColNum, 0)];
    DecimalFormat df = new DecimalFormat("#########0.00"); // two decimals for averages
    // null means "no group opened yet"; an empty string is a legal row key and
    // must not be mistaken for that state.
    String currentKey = null;
    int countNum = 0;
    int groups = 0;
    for (String str : data) {
        String[] fields = str.split("\\|");
        String nowRK = fields[0];
        boolean sameGroup = nowRK.equals(currentKey);
        if (!sameGroup) {
            if (currentKey != null) { // close the previous group
                mergeData[groups++] = formatGroup(currentKey, countNum, sum, max, df);
            }
            currentKey = nowRK;
            countNum = 0;
        }
        countNum++;
        for (int i = 0; i < avgColNum; i++) { // running sums for the averages
            double value = Double.parseDouble(fields[i + 1]);
            sum[i] = sameGroup ? sum[i] + value : value;
        }
        for (int i = 0; i < maxColNum; i++) { // running maxima
            double value = Double.parseDouble(fields[i + 1 + avgColNum]);
            max[i] = sameGroup ? Math.max(value, max[i]) : value;
        }
    }
    if (currentKey != null) { // close the last group
        mergeData[groups++] = formatGroup(currentKey, countNum, sum, max, df);
    }
    return groups;
}

/** Renders one group as key|count|avg...|max.... */
private static String formatGroup(String key, int count, double[] sum, double[] max, DecimalFormat df) {
    StringBuilder line = new StringBuilder(key).append('|').append(count);
    for (double s : sum) {
        line.append('|').append(df.format(s / count));
    }
    for (double m : max) {
        line.append('|').append(Double.toString(m));
    }
    return line.toString();
}
存进Hbase:
将数据写入Hbase
/**
 * Writes the merged result lines into the HBase table "Result" (column family "res").
 *
 * An existing "Result" table is dropped and recreated first. Every result line
 * key|count|avg...|max... becomes one row whose cells are res:count,
 * res:avg(R<col>) and res:max(R<col>). All cells of a row are sent in a single
 * Put, and the admin and table handles are closed even if an operation fails.
 *
 * @param data      result lines of the form key|count|avg...|max...
 * @param avgCol    source column numbers of the avg columns (used for qualifier names)
 * @param avgColNum number of valid entries in avgCol
 * @param maxCol    source column numbers of the max columns (used for qualifier names)
 * @param maxColNum number of valid entries in maxCol
 * @param length    number of valid lines in data
 */
public static void saveData(String[] data, int[] avgCol, int avgColNum, int[] maxCol, int maxColNum, int length)
        throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
    String tableName = "Result";
    // Encode explicitly instead of relying on the platform default charset.
    String charset = "UTF-8";
    byte[] family = "res".getBytes(charset);
    // table with a single column family "res"
    HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
    htd.addFamily(new HColumnDescriptor("res"));
    Configuration configuration = HBaseConfiguration.create();
    HBaseAdmin hAdmin = new HBaseAdmin(configuration);
    try {
        if (hAdmin.tableExists(tableName)) { // start from an empty table
            System.out.println("Table already exists, the former will be deleted");
            hAdmin.disableTable(tableName);
            hAdmin.deleteTable(tableName);
        }
        hAdmin.createTable(htd);
        System.out.println("table " + tableName + " created successfully");
    } finally {
        hAdmin.close();
    }
    HTable table = new HTable(configuration, tableName);
    try {
        for (int i = 0; i < length; i++) {
            // record layout: [0] row key, [1] count, then avg values, then max values
            String[] record = data[i].split("\\|");
            Put put = new Put(record[0].getBytes(charset));
            put.add(family, "count".getBytes(charset), record[1].getBytes(charset));
            for (int j = 0; j < avgColNum; j++) {
                String col = "avg(R" + avgCol[j] + ")";
                put.add(family, col.getBytes(charset), record[j + 2].getBytes(charset));
            }
            for (int j = 0; j < maxColNum; j++) {
                String col = "max(R" + maxCol[j] + ")";
                put.add(family, col.getBytes(charset), record[j + avgColNum + 2].getBytes(charset));
            }
            table.put(put); // one round trip per row instead of one per cell
        }
    } finally {
        table.close();
    }
    System.out.println("ALL DONE!!!!!!!!!!!!!");
}
完整代码:
import java.io.*;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.io.IOUtils;
import org.apache.log4j.*;
/**
 * Loads a '|'-separated table from HDFS, groups its rows by one column, computes
 * count / avg / max per group and stores the result in the HBase table "Result".
 *
 * Example command line:
 * R=/hw1-input/input/customer.tbl groupby:R2 res:count,avg(R11),max(R0)
 */
public class Hw1Grp3 {

    /**
     * Parses the command line, then runs load, project, sort, merge and save.
     *
     * @param args R=<file>, groupby:R<col> and, as third argument, the
     *             comma-separated aggregate list res:count,avg(R<col>),max(R<col>)
     * @throws IOException if HDFS or HBase access fails
     * @throws URISyntaxException declared by loadData
     * @throws IllegalArgumentException if fewer than three arguments are given or
     *             no R=<file> argument is present
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                    "usage: Hw1Grp3 R=<file> groupby:R<col> res:count,avg(R<col>),max(R<col>)");
        }
        String filePath = "";
        int rowKey = 0;
        // The third argument lists the requested aggregates, separated by commas.
        String[] count = args[2].split(",");
        // Any term may be an avg or a max (the "count" term is optional), so
        // reserve one slot per term instead of length - 1.
        int[] avgCol = new int[count.length];
        int avgColNum = 0;
        int[] maxCol = new int[count.length];
        int maxColNum = 0;
        for (String aa : args) { // locate the file path and the group-by column
            if (aa.contains("R=")) { // file path: everything after "R="
                int beginIdx = aa.indexOf("R=") + 2;
                filePath = "hdfs://localhost:9000" + aa.substring(beginIdx);
                System.out.println("the file path is \t" + filePath);
            } else if (aa.contains("groupby")) {
                // group-by column: the number after "by:R" may have more than one digit
                int beginIdx = aa.indexOf("by:R") + 4;
                rowKey = Integer.parseInt(aa.substring(beginIdx));
                System.out.println("the rowKey is \t R" + rowKey);
            }
        }
        if (filePath.isEmpty()) {
            throw new IllegalArgumentException("missing R=<file> argument");
        }
        // Parse the aggregate list exactly once, outside the argument loop, so
        // that additional arguments cannot register the same columns twice.
        for (String term : count) {
            int avgIdx = term.indexOf("avg(R");
            if (avgIdx >= 0) { // avg column: the number between "avg(R" and ")"
                int beginIdx = avgIdx + 5;
                int endIdx = term.indexOf(")", beginIdx);
                avgCol[avgColNum++] = Integer.parseInt(term.substring(beginIdx, endIdx));
                System.out.println("the avg columu is R" + avgCol[avgColNum - 1]);
            }
            int maxIdx = term.indexOf("max(R");
            if (maxIdx >= 0) { // max column: the number between "max(R" and ")"
                int beginIdx = maxIdx + 5;
                int endIdx = term.indexOf(")", beginIdx);
                maxCol[maxColNum++] = Integer.parseInt(term.substring(beginIdx, endIdx));
                System.out.println("the max column is R" + maxCol[maxColNum - 1]);
            }
        }
        System.out.println("avg column number is:" + avgColNum);
        System.out.println("max column number is:" + maxColNum);

        // load data from file path
        String[] data = loadData(filePath);

        // Project every input row onto: rowKey | avg columns... | max columns...
        String[] trimData = new String[data.length];
        for (int index = 0; index < data.length; index++) {
            String[] trim = data[index].split("\\|");
            StringBuilder row = new StringBuilder(trim[rowKey]);
            for (int i = 0; i < avgColNum; i++) { // avg columns first
                row.append('|').append(trim[avgCol[i]]);
            }
            for (int i = 0; i < maxColNum; i++) { // then max columns
                row.append('|').append(trim[maxCol[i]]);
            }
            trimData[index] = row.toString();
            if (index < 5) { // echo the first rows as a sanity check
                System.out.println(data[index]);
                if (avgColNum != 0 && maxColNum != 0) {
                    System.out.println(trimData[index] + " " + trim[avgCol[0]] + " " + trim[maxCol[0]]);
                }
            }
        }

        // Sort by row key so that equal keys become adjacent for the merge step.
        sortData(trimData);
        System.out.println("After sort and trim, the data is:~~~~~~~~~~~~~~");
        // Preview at most five rows; the input may contain fewer than five.
        for (int i = 0; i < Math.min(5, trimData.length); i++) {
            System.out.println(trimData[i]);
        }

        // merge data
        String[] mergedData = new String[trimData.length];
        int mergeLen = mergeData(trimData, mergedData, avgColNum, maxColNum);
        System.out.println("After merge, the data is:~~~~~~~~~~~~~~");
        for (int i = 0; i < mergeLen; i++) {
            System.out.println(mergedData[i]);
        }

        // save data
        System.out.println("Start saving data into HBase>>>>>>>>>>>>");
        saveData(mergedData, avgCol, avgColNum, maxCol, maxColNum, mergeLen);
    }

    /**
     * Reads every line of a text file stored in HDFS.
     *
     * The file is read in a single pass into a growable list, and the stream and
     * file system are released even when reading fails. The first four lines are
     * echoed to stdout as a sanity check.
     *
     * @param filePath full HDFS URI of the file, e.g. hdfs://host:port/path
     * @return the lines of the file, in file order, without line terminators
     * @throws IOException if the file cannot be opened or read
     * @throws URISyntaxException never thrown here; kept for source compatibility with callers
     */
    public static String[] loadData(String filePath) throws IOException, URISyntaxException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(filePath), conf);
        List<String> lines = new ArrayList<String>();
        try {
            FSDataInputStream inStream = fs.open(new Path(filePath));
            try {
                // Decode explicitly as UTF-8 instead of the platform default charset.
                BufferedReader reader = new BufferedReader(new InputStreamReader(inStream, "UTF-8"));
                String line;
                while ((line = reader.readLine()) != null) {
                    if (lines.size() < 4) { // preview of the first few lines
                        System.out.println(line);
                    }
                    lines.add(line);
                }
            } finally {
                inStream.close();
            }
        } finally {
            // As before, the file system handle is closed once loading is done.
            fs.close();
        }
        return lines.toArray(new String[lines.size()]);
    }

    /**
     * Sorts the rows in place by their row key, i.e. the text before the first '|'.
     *
     * Uses the library sort, which is stable for object arrays, so rows with
     * equal keys keep their relative order.
     *
     * @param data rows of the form key|col|col...; reordered in place
     */
    public static void sortData(String[] data) {
        Arrays.sort(data, new Comparator<String>() {
            @Override
            public int compare(String left, String right) {
                return rowKeyOf(left).compareTo(rowKeyOf(right));
            }
        });
    }

    /** Returns the text before the first '|' of a row, or the whole row if it has no '|'. */
    private static String rowKeyOf(String row) {
        int sep = row.indexOf('|');
        return sep < 0 ? row : row.substring(0, sep);
    }

    /**
     * Collapses consecutive rows that share a row key into one result line per key.
     *
     * Each input row has the form key|avg values...|max values... and the input
     * must already be sorted by key. Each output line has the form
     * key|count|avg...|max..., where averages are formatted with two decimals
     * and maxima are printed with Double.toString.
     *
     * @param data      sorted, projected rows
     * @param mergeData output array receiving one line per distinct key; must be
     *                  at least as long as the number of distinct keys
     * @param avgColNum number of avg columns following the key in each row
     * @param maxColNum number of max columns following the avg columns
     * @return the number of result lines written to mergeData
     */
    public static int mergeData(String[] data, String[] mergeData, int avgColNum, int maxColNum) {
        double[] sum = new double[Math.max(avgColNum, 0)];
        double[] max = new double[Math.max(maxColNum, 0)];
        DecimalFormat df = new DecimalFormat("#########0.00"); // two decimals for averages
        // null means "no group opened yet"; an empty string is a legal row key
        // and must not be mistaken for that state.
        String currentKey = null;
        int countNum = 0;
        int groups = 0;
        for (String str : data) {
            String[] fields = str.split("\\|");
            String nowRK = fields[0];
            boolean sameGroup = nowRK.equals(currentKey);
            if (!sameGroup) {
                if (currentKey != null) { // close the previous group
                    mergeData[groups++] = formatGroup(currentKey, countNum, sum, max, df);
                }
                currentKey = nowRK;
                countNum = 0;
            }
            countNum++;
            for (int i = 0; i < avgColNum; i++) { // running sums for the averages
                double value = Double.parseDouble(fields[i + 1]);
                sum[i] = sameGroup ? sum[i] + value : value;
            }
            for (int i = 0; i < maxColNum; i++) { // running maxima
                double value = Double.parseDouble(fields[i + 1 + avgColNum]);
                max[i] = sameGroup ? Math.max(value, max[i]) : value;
            }
        }
        if (currentKey != null) { // close the last group
            mergeData[groups++] = formatGroup(currentKey, countNum, sum, max, df);
        }
        return groups;
    }

    /** Renders one group as key|count|avg...|max.... */
    private static String formatGroup(String key, int count, double[] sum, double[] max, DecimalFormat df) {
        StringBuilder line = new StringBuilder(key).append('|').append(count);
        for (double s : sum) {
            line.append('|').append(df.format(s / count));
        }
        for (double m : max) {
            line.append('|').append(Double.toString(m));
        }
        return line.toString();
    }

    /**
     * Writes the merged result lines into the HBase table "Result" (column family "res").
     *
     * An existing "Result" table is dropped and recreated first. Every result
     * line key|count|avg...|max... becomes one row whose cells are res:count,
     * res:avg(R<col>) and res:max(R<col>). All cells of a row are sent in a
     * single Put, and the admin and table handles are closed even if an
     * operation fails.
     *
     * @param data      result lines of the form key|count|avg...|max...
     * @param avgCol    source column numbers of the avg columns (used for qualifier names)
     * @param avgColNum number of valid entries in avgCol
     * @param maxCol    source column numbers of the max columns (used for qualifier names)
     * @param maxColNum number of valid entries in maxCol
     * @param length    number of valid lines in data
     */
    public static void saveData(String[] data, int[] avgCol, int avgColNum, int[] maxCol, int maxColNum, int length)
            throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
        String tableName = "Result";
        // Encode explicitly instead of relying on the platform default charset.
        String charset = "UTF-8";
        byte[] family = "res".getBytes(charset);
        // table with a single column family "res"
        HTableDescriptor htd = new HTableDescriptor(TableName.valueOf(tableName));
        htd.addFamily(new HColumnDescriptor("res"));
        Configuration configuration = HBaseConfiguration.create();
        HBaseAdmin hAdmin = new HBaseAdmin(configuration);
        try {
            if (hAdmin.tableExists(tableName)) { // start from an empty table
                System.out.println("Table already exists, the former will be deleted");
                hAdmin.disableTable(tableName);
                hAdmin.deleteTable(tableName);
            }
            hAdmin.createTable(htd);
            System.out.println("table " + tableName + " created successfully");
        } finally {
            hAdmin.close();
        }
        HTable table = new HTable(configuration, tableName);
        try {
            for (int i = 0; i < length; i++) {
                // record layout: [0] row key, [1] count, then avg values, then max values
                String[] record = data[i].split("\\|");
                Put put = new Put(record[0].getBytes(charset));
                put.add(family, "count".getBytes(charset), record[1].getBytes(charset));
                for (int j = 0; j < avgColNum; j++) {
                    String col = "avg(R" + avgCol[j] + ")";
                    put.add(family, col.getBytes(charset), record[j + 2].getBytes(charset));
                }
                for (int j = 0; j < maxColNum; j++) {
                    String col = "max(R" + maxCol[j] + ")";
                    put.add(family, col.getBytes(charset), record[j + avgColNum + 2].getBytes(charset));
                }
                table.put(put); // one round trip per row instead of one per cell
            }
        } finally {
            table.close();
        }
        System.out.println("ALL DONE!!!!!!!!!!!!!");
    }
}