Method 1: A Flume interceptor
1. Import the dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zz.base</groupId>
    <artifactId>flume-interceptor</artifactId>
    <version>1.0</version>
    <repositories>
        <repository>
            <id>ali-maven</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
                <updatePolicy>always</updatePolicy>
                <checksumPolicy>fail</checksumPolicy>
            </snapshots>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>ali-maven</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public</url>
        </pluginRepository>
    </pluginRepositories>
    <properties>
        <flume.version>1.9.0</flume.version>
        <fastjson.version>1.2.68</fastjson.version>
        <junit.version>4.13</junit.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>${flume.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
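One practical note: flume-ng-core is provided scope (Flume ships it), but fastjson is compile scope and has to travel with your jar. A minimal sketch using the maven-shade-plugin to build a fat jar (the plugin version here is an assumption; add the block inside <project>):

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <!-- version is an assumption; any recent 3.x release should work -->
            <version>3.2.4</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>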
2.1 Analyzing the data format
Example: eyJwcm9qZWN0IjoibmV3cyIsImN0aW1lIjoxNjU1MjgwMTc4ODQwLCJpcCI6IjM5LjEwNy45Ny4xNTQifQ==
Each record consists of two Base64-encoded segments joined by a "-": a metadata segment (the example above) and a content segment.
The decoded metadata segment is JSON, e.g. {"project":"news","ctime":1655280178840,"ip":"39.107.97.154"}
Note: the JSON layout of the second (content) segment differs from that of the first.
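To sanity-check the format, you can decode a sample by hand. A minimal sketch using the same commons-codec Base64 class the interceptor uses (the sample string is the metadata segment above):

import org.apache.commons.codec.binary.Base64;

public class DecodeSample {
    public static void main(String[] args) {
        // Metadata segment of one record; a full record looks like "<meta>-<content>"
        String meta = "eyJwcm9qZWN0IjoibmV3cyIsImN0aW1lIjoxNjU1MjgwMTc4ODQwLCJpcCI6IjM5LjEwNy45Ny4xNTQifQ==";
        // Prints: {"project":"news","ctime":1655280178840,"ip":"39.107.97.154"}
        System.out.println(new String(Base64.decodeBase64(meta)));
    }
}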
2.2 Writing the code
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.google.common.collect.Lists;
import org.apache.commons.codec.binary.Base64;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
/**
 * @Author tutu
 * @Date 2022-06-18
 * @Version 1.0
 * @Description Flume interceptor that Base64-decodes the collected records
 */
public class B64Interceptor implements Interceptor {

    // Called once when the interceptor is initialized
    @Override
    public void initialize() {}

    // Process a single event
    @Override
    public Event intercept(Event event) {
        //1. Get the raw record
        String text = new String(event.getBody());
        //2. Split into the metadata and content segments
        String[] textArray = text.split("-");
        //3. Drop malformed records that do not have exactly two segments
        //   (the original set a null body here, which would break downstream sinks)
        if (textArray.length != 2) {
            return null;
        }
        try {
            //4. Base64-decode both segments
            String meta = new String(Base64.decodeBase64(textArray[0]));
            String content = new String(Base64.decodeBase64(textArray[1]));
            //5. Parse the metadata JSON string into a JSON object: ctime, project, ip
            JSONObject jsonMeta = JSONObject.parseObject(meta);
            //6. Extract the ctime field (epoch milliseconds) and format it as yyyyMMdd
            String ctime = JSONPath.eval(jsonMeta, "$.ctime").toString();
            DateFormat fmt = new SimpleDateFormat("yyyyMMdd");
            ctime = fmt.format(Long.parseLong(ctime)); // e.g. 20220622
            //7. Put ctime into the Flume event header
            event.getHeaders().put("ctime", ctime);
            //8. Parse the content segment
            JSONObject jsonContent = JSONObject.parseObject(content);
            //9. Merge jsonMeta and jsonContent into one JSON object
            JSONObject jsonObject = new JSONObject();
            jsonObject.put("ctime", JSONPath.eval(jsonMeta, "$.ctime"));
            jsonObject.put("project", JSONPath.eval(jsonMeta, "$.project"));
            jsonObject.put("content", JSONPath.eval(jsonContent, "$.content"));
            //10. Replace the event body with the merged JSON
            event.setBody(jsonObject.toString().getBytes());
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
        //11. Return the rewritten event
        return event;
    }

    /**
     * Called automatically for each batch of events
     */
    @Override
    public List<Event> intercept(List<Event> list) {
        //1. Create the list that will hold the results
        ArrayList<Event> inter = Lists.newArrayListWithCapacity(list.size());
        //2. Intercept each event, dropping any that failed to parse
        for (Event event : list) {
            Event e = intercept(event);
            if (e != null) inter.add(e);
        }
        return inter;
    }

    // Called on shutdown
    @Override
    public void close() {}

    // The Builder is what Flume calls when it creates the interceptor
    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new B64Interceptor();
        }
        @Override
        public void configure(Context context) {}
    }
}
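Since the pom already declares JUnit, a quick unit test can verify the decoding before deployment. A minimal sketch; the content segment is a made-up example built inside the test itself:

import org.apache.commons.codec.binary.Base64;
import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.junit.Assert;
import org.junit.Test;

public class B64InterceptorTest {
    @Test
    public void decodesAndMergesBothSegments() {
        // Encode the metadata segment from 2.1 plus a made-up content segment
        String meta = Base64.encodeBase64String(
            "{\"project\":\"news\",\"ctime\":1655280178840,\"ip\":\"39.107.97.154\"}".getBytes());
        String content = Base64.encodeBase64String(
            "{\"content\":{\"event\":\"view\"}}".getBytes());
        Event event = EventBuilder.withBody((meta + "-" + content).getBytes());

        Event result = new B64Interceptor().intercept(event);

        // The interceptor should keep the event, stamp the ctime header,
        // and replace the body with the merged JSON
        Assert.assertNotNull(result);
        Assert.assertNotNull(result.getHeaders().get("ctime"));
        Assert.assertTrue(new String(result.getBody()).contains("\"project\":\"news\""));
    }
}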
3. Package the jar, copy it into Flume's lib directory, and reference the interceptor in the agent configuration, as sketched below.
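A minimal agent configuration sketch; a1/r1 are placeholder agent and source names, and the fully qualified class name assumes the class lives in a package matching the groupId (adjust it to your actual package):

# a1/r1 are placeholder agent and source names
a1.sources.r1.interceptors = i1
# type must be the fully qualified name of the Builder inner class
a1.sources.r1.interceptors.i1.type = com.zz.base.B64Interceptor$Builder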
Method 2: Using Structured Streaming to read the data from Kafka and Base64-decode it
Note: the data is the same as in Method 1, so the Base64 payload is not described again here.
1. Import the dependencies
The spark-version property is 2.4.5.
<properties>
    <spark-version>2.4.5</spark-version>
</properties>
<dependencies>
    <!-- spark core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.12</artifactId>
        <version>${spark-version}</version>
    </dependency>
    <!-- spark sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.12</artifactId>
        <version>${spark-version}</version>
    </dependency>
    <!-- spark streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.12</artifactId>
        <version>${spark-version}</version>
    </dependency>
    <!-- spark-streaming-kafka-0-10 -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
        <version>${spark-version}</version>
    </dependency>
    <!-- spark-sql-kafka-0-10 (the Structured Streaming Kafka source) -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
        <version>${spark-version}</version>
    </dependency>
</dependencies>
2.1 Analyzing the requirement
Read the data from Kafka, Base64-decode it, and use Structured Streaming to compute the top 3.
This job consumes the second segment of each record (after the "-"); the keys of its JSON payload are the column names declared in the schema below.
import org.apache.commons.codec.binary.Base64
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * @Author tutu
 * @Description Reads Base64-encoded records from Kafka, decodes them, and computes the top 3 models
 * @Date 2022/6/18
 * @Version 1.0
 */
object MyStructured_Kafka {
  def main(args: Array[String]): Unit = {
    // The original used custom helpers (a NoLog trait and SparkUtils, not shown);
    // a plain local session behaves the same way
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("MyStructured_Kafka")
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._
    import org.apache.spark.sql.functions._
    //1. Read the Kafka source
    val source: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "x.x.x.x:9092") // x.x.x.x is your own broker IP
      .option("subscribe", "news") // the topic created earlier
      .option("startingOffsets", "earliest")
      .load()
    //2. Declare the columns of the JSON payload, i.e. the full schema of the JSON column
    //   (these column names come from inspecting the data)
    val propertiesType: StructType = new StructType()
      .add("model", StringType, true)
      .add("network_type", StringType, true)
      .add("is_charging", StringType, true)
      .add("app_version", StringType, true)
      .add("element_name", StringType, true)
      .add("element_page", StringType, true)
      .add("carrier", StringType, true)
      .add("os", StringType, true)
      .add("imei", StringType, true)
      .add("battery_level", StringType, true)
      .add("screen_width", StringType, true)
      .add("screen_height", StringType, true)
      .add("device_id", StringType, true)
      .add("client_time", StringType, true)
      .add("ip", StringType, true)
      .add("manufacture", StringType, true)
      .add("article_id", StringType, true)
      .add("action_type", StringType, true)
    val contentType: StructType = new StructType()
      .add("uuid", StringType, true)
      .add("distinct_id", StringType, true)
      .add("event", StringType, true)
      .add("properties", propertiesType, true)
      .add("type", StringType, true)
    val schema: StructType = new StructType()
      .add("content", contentType, true)
    //3. Process the data
    val df: DataFrame = source
      .selectExpr("cast(value as string) as value")
      .map(line => {
        val str: String = line.getString(0)
        val arr: Array[String] = str.split("-")
        // Decode the second segment; emit an empty string for malformed records
        if (arr.length == 2) new String(Base64.decodeBase64(arr(1))) else ""
      })
      .select(from_json('value, schema).alias("parsed_value"))
      .selectExpr("parsed_value.content.properties.model as model")
      // Group by model and count
      .groupBy('model)
      .count()
      // Filter out null, "null" and empty values
      .filter('model.isNotNull)
      .filter('model =!= "null")
      .filter('model =!= "")
      // Sort by count and keep the top 3
      .sort(desc("count")).limit(3)
    df.writeStream
      .outputMode(OutputMode.Complete())
      .format("console")
      .start()
      .awaitTermination()
  }
}
3. Testing
The results are printed to the console by the console sink.
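With the Complete output mode the console sink reprints the full top-3 table for every micro-batch. The output looks roughly like the following (the model names and counts are made up for illustration):

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+-----+
|    model|count|
+---------+-----+
|Mi 10 Pro| 1204|
|HUAWEI P40|  987|
|iPhone 12|  653|
+---------+-----+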
4. Summary
This article focused on Base64 decoding. The first approach uses a Flume interceptor, so the data is decoded the moment Flume collects it, which makes it easier to use downstream;
the second uses Structured Streaming to run a top-N computation over the Base64 data in Kafka, which likewise has to be decoded before it can be used.