Big Data Architecture Comprehensive Training: Transaction Data Analysis
1 Raw file processing
1.1 Creating the JDBCUtils class
Importing the file into the database requires basic CRUD operations, so a simple hand-written JDBCUtils class is created to encapsulate them.
(1) First, create a jdbc.properties file that holds the database connection settings
user=root
password=yourpwd
url=jdbc:mysql://localhost:3306/yourDataBase?useUnicode=true&characterEncoding=utf8&allowMultiQueries=true&rewriteBatchedStatements=true
driverClass=com.mysql.cj.jdbc.Driver
(2) A getConnection method that obtains a database connection
public static Connection getConnection() throws Exception{
    //1. Read the four basic settings from the configuration file
    InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream("jdbc.properties");
    Properties properties = new Properties();
    properties.load(is);
    String user = properties.getProperty("user");
    String password = properties.getProperty("password");
    String url = properties.getProperty("url");
    String driverClass = properties.getProperty("driverClass");
    //2. Load the JDBC driver
    Class.forName(driverClass);
    //3. Obtain the connection
    Connection connection = DriverManager.getConnection(url, user, password);
    return connection;
}
(3) A closeResource method that releases the connection and statement
public static void closeResource(Connection conn, Statement ps){
try {
if (ps != null){
ps.close();
}
} catch (SQLException e) {
throw new RuntimeException(e);
}
try {
if (conn != null){
conn.close();
}
} catch (SQLException e) {
throw new RuntimeException(e);
}
}
(4) A createTable method used to create tables
public static void createTable(String sql){
    Connection conn = null;
    PreparedStatement ps = null;
    boolean success = false;
    try {
        conn = JDBCUtils.getConnection();
        ps = conn.prepareStatement(sql);
        //execute() throws an exception if the DDL fails, so reaching the next line means it ran
        ps.execute();
        success = true;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        JDBCUtils.closeResource(conn,ps);
    }
    if (success){
        System.out.println("Table created");
    } else {
        System.out.println("Table creation failed");
    }
}
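For reference, a minimal sketch of how these helpers can be used together; the table name and query below are only illustrative and do not appear in the actual project:
//Hypothetical usage of JDBCUtils: create a throwaway table, then run a simple count query
public static void main(String[] args) throws Exception {
    JDBCUtils.createTable("create table if not exists demo_table (id INT(11), note varchar(255))");
    Connection conn = JDBCUtils.getConnection();
    PreparedStatement ps = conn.prepareStatement("select count(*) from demo_table");
    ResultSet rs = ps.executeQuery();
    if (rs.next()) {
        System.out.println("rows: " + rs.getInt(1));
    }
    rs.close();
    JDBCUtils.closeResource(conn, ps);
}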
1.2 Creating the profileUtils class
This class contains the file-handling operations.
(1) A getFile method that reads the raw file from the desktop
public static List<String> getFile(){
    List<String> data = new ArrayList<>();
    //Read the raw file from the desktop; it is encoded in GB2312
    try (FileInputStream fis = new FileInputStream("C:\\Users\\23694\\Desktop\\1-14new.txt");
         InputStreamReader isr = new InputStreamReader(fis, "GB2312");
         BufferedReader bufferedReader = new BufferedReader(isr)) {
        String str;
        while ((str = bufferedReader.readLine()) != null){
            //Skip blank lines
            if (str.trim().length() > 0){
                data.add(str);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return data;
}
(2) A getNewFile method, which calls getFile to load the file and encapsulates the data-cleaning steps, so the result can be used directly for keyword analysis and for insertion into the database.
public static List<String> getNewFile(){
    //Clean the raw lines and flatten them into a list of user_id, payment, description, time fields
    List<String> file = profileUtils.getFile();
    List<String> newFile = new ArrayList<>();   //one cleaned record per element
    List<String> newFile1 = new ArrayList<>();  //flattened fields, four per record
    String[] newStrings;
    String[] temp;
for (String s : file) {
if (s.trim().length() > 10) {
s = s.replaceAll("\\(\\(", "");
s = s.replaceAll("\\)\\)", "");
s = s.replaceAll("None", "'None'");
s = s.replaceAll("\"", "'");
newStrings = s.split("\\), \\(");
for (String s1 : newStrings) {
temp = s1.split(", '");
newFile.add(Arrays.toString(temp).replaceAll("'", ""));
}
}
}
    //A record can split into more than 4 pieces when its description itself contains commas;
    //the extra pieces are merged back into the description field below.
    String[] strings1 = null;
for (int j = 0; j < newFile.size(); j++) {
strings1 = newFile.get(j).replaceAll("\\[", "").replaceAll("\\]", "").split(",");
if (strings1.length == 4) {
for (int k = 0; k < 4; k++) {
newFile1.add(strings1[k]);
}
}
if (strings1.length == 5) {
for (int k = 0; k < 5; k++) {
if (k == 2) {
strings1[2] = strings1[2].concat(strings1[3]).replaceAll(",", "");
}
if (k != 3) {
newFile1.add(strings1[k]);
}
}
}
if (strings1.length == 6) {
for (int k = 0; k < 6; k++) {
if (k == 2) {
strings1[2] = strings1[2].concat(strings1[3]).concat(strings1[4].replaceAll(",", ""));
}
if (k != 4 && k != 3) {
newFile1.add(strings1[k]);
}
}
}
if (strings1.length == 7) {
for (int k = 0; k < 7; k++) {
if (k == 2) {
strings1[2] = strings1[2].concat(strings1[3]).concat(strings1[4].concat(strings1[5]).replaceAll(",", ""));
}
if (k != 4 && k != 3 && k != 5) {
newFile1.add(strings1[k]);
}
}
}
if (strings1.length == 8) {
for (int k = 0; k < 8; k++) {
if (k == 2) {
strings1[2] = strings1[2].concat(strings1[3]).concat(strings1[4].concat(strings1[5]).concat(strings1[6]).replaceAll(",", ""));
}
if (k != 4 && k != 3 && k != 5 && k != 6) {
newFile1.add(strings1[k]);
}
}
}
}
return newFile1;
}
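To make the cleaning steps concrete, here is a hypothetical raw line (invented purely for illustration; the real file may differ in detail) and what getNewFile produces from it:
//Hypothetical raw line (one line of the source file):
//((1001, -25.5, '取款', 1398522840), (1002, 300.0, '网银转账', 1398530000))
//After cleaning and flattening, newFile1 would contain these eight elements in order
//(some still carrying a leading space): 1001, -25.5, 取款, 1398522840, 1002, 300.0, 网银转账, 1398530000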
(3) A sample of the processed text is shown below
1.3 Creating the insertIntoMysql class
Before inserting, the flow_data table is created. The dataset is fairly large (roughly 140,000 records), and executing and committing after every single PreparedStatement would mean far too many round trips to the database. Instead, addBatch is used to accumulate a number of statements and submit them together, which improves throughput considerably. For this to work, rewriteBatchedStatements=true must be appended to the JDBC URL in the configuration.
(1) The fileInsert method
//Insert the cleaned data into the database
public static void fileInsert() {
    //Create the table (run only once)
    JDBCUtils.createTable("create table if not exists flow_data (user_id INT(11),payment FLOAT,describe_info varchar(255),time INT(11))");
    List<String> newFile = profileUtils.getNewFile();
    Object[] objects = newFile.toArray();
    Connection conn = null;
    PreparedStatement ps = null;
    try {
        conn = JDBCUtils.getConnection();
        String sql = "insert into flow_data(user_id,payment,describe_info,time)values(?,?,?,?)";
        ps = conn.prepareStatement(sql);
        //Disable auto-commit so the whole import is one transaction
        conn.setAutoCommit(false);
        //Batch the inserts: four consecutive array elements form one record
        for (int o = 0; o + 3 < objects.length; o = o + 4) {
            ps.setObject(1, objects[o]);        //user_id
            ps.setObject(2, objects[o + 1]);    //payment
            ps.setObject(3, objects[o + 2]);    //describe_info
            ps.setObject(4, objects[o + 3]);    //time
            ps.addBatch();
            //Flush the batch every 500 records to limit memory use
            if (o % 2000 == 0) {
                ps.executeBatch();
            }
        }
        //Execute whatever is left in the batch, then commit everything
        ps.executeBatch();
        conn.commit();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        JDBCUtils.closeResource(conn, ps);
    }
}
(2) Result screenshot
(3) Export the data to CSV directly with SQLyog
1.4 Creating the keyWord class
Configure pom.xml to pull in the jieba word-segmentation dependency used for keyword extraction.
(1) pom.xml
<dependency>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
<version>1.0.2</version>
</dependency>
(2) Add the two required classes, Keyword and TFIDFAnalyzer
(3) The keyWord class
public class keyWord {
public static List<String> getKeyword(){
TFIDFAnalyzer tfi = new TFIDFAnalyzer();
List<String> newFile = profileUtils.getNewFile();
Object[] array = newFile.toArray();
        //Each record occupies four consecutive elements; index 2 of every record is the description.
        //Concatenate the descriptions (skipping ones that still contain spaces) into one text blob.
        int a = 2;
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < array.length; i++) {
            if (i == a){
                if (!String.valueOf(array[i]).trim().contains(" ")){
                    builder.append(array[i]);
                }
                a = a + 4;
            }
        }
        //Extract the 20 highest-scoring keywords with TF-IDF
        List<Keyword> anaRe = tfi.analyze(builder.toString(), 20);
List<String> result = new ArrayList<>();
for (Keyword keyword : anaRe){
result.add(keyword.getName());
}
return result;
}
}
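A minimal sketch of calling this class from a test class (purely illustrative):
//Hypothetical usage: print the extracted keywords
public static void main(String[] args) {
    List<String> keywords = keyWord.getKeyword();
    for (String kw : keywords) {
        System.out.println(kw);
    }
}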
(4) Part of the result for the 20 selected keywords is shown below
2 Writing the HdfsClient class
It contains operations such as put, close, and mkdir.
public class HdfsClient {
    private FileSystem fs;

    //Initialization
    public void init() throws URISyntaxException, IOException, InterruptedException {
        //NameNode address of the cluster to connect to
        URI uri = new URI("hdfs://hadoop102:8020");
        //Create a configuration object
        Configuration configuration = new Configuration();
        //User to connect as
        String user = "atguigu";
        //Obtain the client (FileSystem) object
        fs = FileSystem.get(uri, configuration, user);
    }

    public void close() throws IOException {
        fs.close();
    }

    public void Mkdir(String path) throws URISyntaxException, IOException, InterruptedException {
        fs.mkdirs(new Path(path));
    }

    //Upload
    public void Put(String localPath, String remotePath) throws IOException {
        //arg1: delete the local source file; arg2: allow overwriting; arg3: source path; arg4: destination path
        fs.copyFromLocalFile(true, true, new Path(localPath), new Path(remotePath));
    }
}
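As a rough sketch of how this client is driven from a test class (the directory and local paths below are assumptions, not values taken from the project):
//Hypothetical driver for HdfsClient: upload the exported CSV and the MapReduce jar to HDFS
public static void main(String[] args) throws Exception {
    HdfsClient client = new HdfsClient();
    client.init();                              //connects to hdfs://hadoop102:8020 as user atguigu
    client.Mkdir("/flow");                      //target directory name is an assumption
    client.Put("D:\\data\\data.csv", "/flow");  //note: Put deletes the local source because delSrc is true
    client.Put("D:\\data\\flow-analysis.jar", "/flow");
    client.close();
}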
3 MapReduce
3.1 Total spending per customer
(1) Add the Hadoop dependency to pom.xml
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
(2) mapper
public class mapper extends Mapper<LongWritable,Text, IntWritable, FloatWritable> {
@Override
protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, IntWritable, FloatWritable>.Context context) throws IOException, InterruptedException {
String[] data = v1.toString().split(",");
String userID = data[0];
String payMent = data[1];
context.write(
new IntWritable(Integer.parseInt(userID)),
new FloatWritable(Float.parseFloat(payMent))
);
}
}
(3) reducer
public class reducer extends Reducer<IntWritable, FloatWritable,IntWritable,FloatWritable> {
@Override
protected void reduce(IntWritable k3, Iterable<FloatWritable> v3, Reducer<IntWritable, FloatWritable, IntWritable, FloatWritable>.Context context) throws IOException, InterruptedException {
float sumPayment = 0f;
for (FloatWritable fw : v3){
sumPayment += fw.get();
}
context.write(
k3,new FloatWritable(sumPayment)
);
}
}
(4) aMain
public class aMain {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(aMain.class);
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(FloatWritable.class);
job.setReducerClass(reducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(FloatWritable.class);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,out);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.waitForCompletion(true);
}
}
(5) Start the virtual machines and bring up the required Hadoop services
(6) Use the HDFS API (the HdfsClient class from section 2) to upload data.csv and the job jar to HDFS
(7) Run the jar to process the data (a sample command is sketched below)
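The job can be launched with the hadoop jar command, roughly as follows; the jar name, the main class (qualify it with its package if one is used), and the HDFS paths are placeholders rather than the exact values from the project:
hadoop jar flow-analysis.jar aMain /flow/data.csv /flow/output1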
3.2 Spending per customer per year
(1) mapper
public class mapper extends Mapper<LongWritable,Text, Text, FloatWritable> {
@Override
protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, FloatWritable>.Context context) throws IOException, InterruptedException {
String[] data = v1.toString().split(",");
String userID = data[0];
String payMent = data[1];
int time = Integer.parseInt(data[3]);
Calendar calendar = Calendar.getInstance();
calendar.setTime(new Date(time * 1000L));
int year = calendar.get(Calendar.YEAR);
String a = userID+" "+year;
context.write(
new Text(a),
new FloatWritable(Float.parseFloat(payMent))
);
}
}
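The mapper treats the time column as a Unix timestamp in seconds, hence the multiplication by 1000L. A quick sanity check of that conversion, using an arbitrary sample value:
//Sanity check of the seconds-to-year conversion (sample value chosen arbitrarily)
Calendar calendar = Calendar.getInstance();
calendar.setTime(new Date(1400000000 * 1000L)); //1,400,000,000 seconds after the epoch falls in May 2014
System.out.println(calendar.get(Calendar.YEAR)); //prints 2014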
(2) reducer
public class reducer extends Reducer<Text, FloatWritable,Text,FloatWritable> {
@Override
protected void reduce(Text k3, Iterable<FloatWritable> v3, Reducer<Text, FloatWritable, Text, FloatWritable>.Context context) throws IOException, InterruptedException {
float sumPayment = 0f;
for (FloatWritable fw : v3){
sumPayment += fw.get();
}
context.write(
k3,new FloatWritable(sumPayment)
);
}
}
(3) aMain
public class aMain {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(aMain.class);
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FloatWritable.class);
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,out);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.waitForCompletion(true);
}
}
(4) Run
3.3 Composition of total spending
(1) mapper
public class mapper extends Mapper<LongWritable,Text, Text, FloatWritable> {
    @Override
    protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, FloatWritable>.Context context) throws IOException, InterruptedException {
        String[] keywords = new String[]{"中袖","一网打尽","客杯","热卖","客来","长方","金城","嘉福元","玻璃瓶",
                "BBQ","极睿","ipad2","ipad3","ipad4","快抢","肌漾","取款","网银","网络","跨行"};
        String[] data = v1.toString().split(",");
        String desc = data[2].trim();
        String payMent = data[1];
        //Classify the record by the first keyword its description contains;
        //records matching no keyword fall into the "其他" (other) category.
        for (int i = 0; i < keywords.length; i++) {
            if (desc.contains(keywords[i])){
                desc = keywords[i];
                context.write(
                        new Text(desc),
                        new FloatWritable(Math.abs(Float.parseFloat(payMent)))
                );
                //Stop after the first match so each record is counted only once
                break;
            } else if (i == keywords.length - 1) {
                desc = "其他";
                context.write(
                        new Text(desc),
                        new FloatWritable(Math.abs(Float.parseFloat(payMent)))
                );
            }
        }
    }
}
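For instance, with hypothetical descriptions:
//Hypothetical classification examples for the mapper above:
//  desc = "跨行转账手续费" -> first matching keyword is "跨行", so the record is written under "跨行"
//  desc = "超市购物"       -> matches none of the 20 keywords, so it is written under "其他"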
(2) reducer
public class reducer extends Reducer<Text, FloatWritable,Text,FloatWritable> {
@Override
protected void reduce(Text k3, Iterable<FloatWritable> v3, Reducer<Text, FloatWritable, Text, FloatWritable>.Context context) throws IOException, InterruptedException {
float sumPayment = 0f;
for (FloatWritable fw : v3){
sumPayment += fw.get();
}
context.write(
k3,new FloatWritable(sumPayment)
);
}
}
(3) aMain
public class aMain {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(aMain.class);
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FloatWritable.class);
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,out);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.waitForCompletion(true);
}
}
(4) Execution
3.4 Composition of spending per year
(1) mapper
public class mapper extends Mapper<LongWritable,Text, Text, FloatWritable> {
    @Override
    protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, FloatWritable>.Context context) throws IOException, InterruptedException {
        String[] keywords = new String[]{"中袖","一网打尽","客杯","热卖","客来","长方","金城","嘉福元","玻璃瓶",
                "BBQ","极睿","ipad2","ipad3","ipad4","快抢","肌漾","取款","网银","网络","跨行"};
        String[] data = v1.toString().split(",");
        String desc = data[2].trim();
        String payMent = data[1];
        //Derive the year from the Unix timestamp (seconds) in the time column
        int time = Integer.parseInt(data[3]);
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(new Date(time * 1000L));
        int year = calendar.get(Calendar.YEAR);
        //Classify the record by the first keyword its description contains;
        //records matching no keyword fall into the "其他" (other) category.
        for (int i = 0; i < keywords.length; i++) {
            if (desc.contains(keywords[i])){
                desc = keywords[i];
                context.write(
                        new Text(desc + " " + year),
                        new FloatWritable(Math.abs(Float.parseFloat(payMent)))
                );
                //Stop after the first match so each record is counted only once
                break;
            } else if (i == keywords.length - 1) {
                desc = "其他";
                context.write(
                        new Text(desc + " " + year),
                        new FloatWritable(Math.abs(Float.parseFloat(payMent)))
                );
            }
        }
    }
}
(2) reducer
public class reducer extends Reducer<Text, FloatWritable,Text,FloatWritable> {
@Override
protected void reduce(Text k3, Iterable<FloatWritable> v3, Reducer<Text, FloatWritable, Text, FloatWritable>.Context context) throws IOException, InterruptedException {
float sumPayment = 0f;
for (FloatWritable fw : v3){
sumPayment += fw.get();
}
context.write(
k3,new FloatWritable(sumPayment)
);
}
}
(3) aMain
public class aMain {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(aMain.class);
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FloatWritable.class);
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);
Path out = new Path(args[1]);
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,out);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)) {
fs.delete(out, true);
}
job.waitForCompletion(true);
}
}
(4) Execution