In this post we will generate data by hand. This is very useful: we can produce data in exactly the format and with exactly the content we need, and because we create it ourselves it requires no cleaning. Without further ado, on to the code.
package com.dt.spark.SparkApps.sql.project;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Random;

/**
 * Generates forum log data automatically. Each record has the format:
 * date:      the date, formatted as yy-MM-dd
 * timestamp: a millisecond timestamp
 * userID:    the user ID
 * pageID:    the page ID
 * channelID: the ID of the forum channel
 * action:    the user action, either View or Register
 */
public class SparkSQLDataManually {

    /**
     * The concrete forum channels. In real development these could be
     * read from a file instead of being hard-coded.
     */
    static String[] channelNames = new String[]{
        "Spark", "Scala", "Kafka", "Flink", "Hadoop", "Storm",
        "HBase", "Impala", "Java", "Kudu", "ML", "DL"
    };
    static String[] actionNames = new String[]{"View", "Register"};
    static String yesterdayFormated;

    public static void main(String[] args) throws IOException {
        /**
         * Generate data at the scale given by the arguments:
         * args[0] is the number of records, args[1] the output directory.
         */
        long numberItems = 5000;
        String path = ".";
        if (args.length > 0) {
            numberItems = Long.parseLong(args[0]);
        }
        if (args.length > 1) {
            path = args[1];
        }
        System.out.println("User log number is: " + numberItems);
        /**
         * Compute yesterday's date; every generated record carries it.
         */
        yesterdayFormated = yesterday();
        userlogs(numberItems, path);
    }

    private static void userlogs(long numberItems, String path) throws IOException {
        StringBuilder userLogBuffer = new StringBuilder();
        Random random = new Random();
        for (int i = 0; i < numberItems; i++) {
            long timestamp = new Date().getTime();
            // Randomly generate the user ID and the page ID
            long userID = random.nextInt((int) numberItems);
            long pageID = random.nextInt((int) numberItems);
            // Randomly pick a channel
            String channel = channelNames[random.nextInt(channelNames.length)];
            // Randomly pick an action
            String action = actionNames[random.nextInt(actionNames.length)];
            userLogBuffer.append(yesterdayFormated)
                .append("\t")
                .append(timestamp)
                .append("\t")
                .append(userID)
                .append("\t")
                .append(pageID)
                .append("\t")
                .append(channel)
                .append("\t")
                .append(action)
                .append("\n");
        }
        System.out.println(userLogBuffer.toString());
        // Write the records to <path>/userLog.log; try-with-resources closes the writer.
        try (PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(new File(path, "userLog.log"))))) {
            printWriter.write(userLogBuffer.toString());
        }
    }

    /**
     * Returns yesterday's date, formatted as yy-MM-dd.
     */
    private static String yesterday() {
        SimpleDateFormat date = new SimpleDateFormat("yy-MM-dd");
        Calendar cal = Calendar.getInstance();
        cal.setTime(new Date());
        cal.add(Calendar.DATE, -1);
        Date yesterday = cal.getTime();
        return date.format(yesterday);
    }
}
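To try it out, compile the class and pass the number of records and the output directory on the command line; both arguments are optional and default to 5000 records written to the current directory. An illustrative invocation (assuming the compiled classes are on the classpath) would be:

java com.dt.spark.SparkApps.sql.project.SparkSQLDataManually 10000 /tmp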
A run produces results like the following:
16-05-02 1462244815099 3978 1325 Scala View
16-05-02 1462244815099 7833 4357 Hadoop View
16-05-02 1462244815099 3148 44 HBase Register
16-05-02 1462244815099 1012 4215 HBase Register
16-05-02 1462244815099 9679 5166 HBase View
16-05-02 1462244815099 7755 7357 DL Register
16-05-02 1462244815099 8790 7303 Kudu View
16-05-02 1462244815099 8354 456 Java Register
16-05-02 1462244815100 1412 6501 Storm View
16-05-02 1462244815100 6523 7202 Flink View
16-05-02 1462244815100 4701 3808 Storm Register
.......................
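Once the file is generated it can be fed straight into Spark SQL. The sketch below is not part of the generator above; it is a minimal example assuming Spark 2.x is available and that userLog.log sits in the working directory, and it simply names the columns after the format documented in the class comment:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class UserLogReader {
    public static void main(String[] args) {
        // A minimal sketch, assuming Spark 2.x running in local mode.
        SparkSession spark = SparkSession.builder()
            .appName("UserLogReader")
            .master("local[*]")
            .getOrCreate();
        // Read the tab-separated log and name the columns after the documented format.
        Dataset<Row> logs = spark.read()
            .option("sep", "\t")
            .csv("userLog.log")
            .toDF("date", "timestamp", "userID", "pageID", "channelID", "action");
        logs.createOrReplaceTempView("userlogs");
        // Example query: count page views per channel.
        spark.sql("SELECT channelID, COUNT(*) AS views FROM userlogs "
            + "WHERE action = 'View' GROUP BY channelID").show();
        spark.stop();
    }
}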