Setting up a Doris and Hudi test environment with docker-compose

The local machine is an M1 Mac running macOS 12.5.1.

Related files

docker-compose.yml
version: "3.3"

services:

  namenode:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
    platform: linux/arm64
    hostname: namenode
    container_name: namenode
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    ports:
      - "50070:50070"
      - "8020:8020"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    env_file:
      - ./hadoop.env
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://namenode:50070" ]
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      mynet:
        ipv4_address: 172.18.0.100

  datanode1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
    platform: linux/arm64
    container_name: datanode1
    hostname: datanode1
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    env_file:
      - ./hadoop.env
    ports:
      - "50075:50075"
      - "50010:50010"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    links:
      - "namenode"
      - "historyserver"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://datanode1:50075" ]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - namenode
    networks:
      mynet:
        ipv4_address: 172.18.0.101

  historyserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
    hostname: historyserver
    container_name: historyserver
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    depends_on:
      - "namenode"
    links:
      - "namenode"
    ports:
      - "58188:8188"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://historyserver:8188" ]
      interval: 30s
      timeout: 10s
      retries: 3
    env_file:
      - ./hadoop.env
    volumes:
      - historyserver:/hadoop/yarn/timeline
    networks:
      mynet:
        ipv4_address: 172.18.0.102

  hive-metastore-postgresql:
    image: menorah84/hive-metastore-postgresql:2.3.0
    platform: linux/arm64
    environment:
      - POSTGRES_HOST_AUTH_METHOD=trust
    volumes:
      - hive-metastore-postgresql:/var/lib/postgresql
    hostname: hive-metastore-postgresql
    container_name: hive-metastore-postgresql
    networks:
      mynet:
        ipv4_address: 172.18.0.103

  hivemetastore:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hivemetastore
    container_name: hivemetastore
    links:
      - "hive-metastore-postgresql"
      - "namenode"
    env_file:
      - ./hadoop.env
    command: /opt/hive/bin/hive --service metastore
    environment:
      SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
    ports:
      - "9083:9083"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    healthcheck:
      test: [ "CMD", "nc", "-z", "hivemetastore", "9083" ]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.104

  hiveserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hiveserver
    container_name: hiveserver
    env_file:
      - ./hadoop.env
    environment:
      SERVICE_PRECONDITION: "hivemetastore:9083"
    ports:
      - "10000:10000"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    depends_on:
      - "hivemetastore"
    links:
      - "hivemetastore"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.105

  sparkmaster:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
    platform: linux/arm64
    hostname: sparkmaster
    container_name: sparkmaster
    env_file:
      - ./hadoop.env
    ports:
      - "8080:8080"
      - "7077:7077"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - INIT_DAEMON_STEP=setup_spark
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.106

  spark-worker-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
    platform: linux/arm64
    hostname: spark-worker-1
    container_name: spark-worker-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - "8081:8081"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.107

  zookeeper:
    image: 'registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12'
    platform: linux/arm64
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
    networks:
      mynet:
        ipv4_address: 172.18.0.108

  kafka:
    image: 'registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1'
    platform: linux/arm64
    hostname: kafkabroker
    container_name: kafkabroker
    ports:
      - "9092:9092"
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
      - KAFKA_ADVERTISED_HOST_NAME=kafkabroker
    networks:
      mynet:
        ipv4_address: 172.18.0.109

  adhoc-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
    platform: linux/arm64
    hostname: adhoc-1
    container_name: adhoc-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - '4040:4040'
      # JVM debugging port (mapped to 5006 on the host)
      - "5006:5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.110

  adhoc-2:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1
    platform: linux/arm64
    hostname: adhoc-2
    container_name: adhoc-2
    env_file:
      - ./hadoop.env
    ports:
      # JVM debugging port (mapped to 5005 on the host)
      - "5005:5005"
    depends_on:
      - sparkmaster
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.111

  doris-fe:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
    hostname: doris-fe
    container_name: doris-fe
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - FE_ID=1
    networks:
      mynet:
        ipv4_address: 172.18.0.112

  doris-be:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm
    hostname: doris-be
    container_name: doris-be
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - BE_ADDR=172.18.0.113:9050
    depends_on:
      - doris-fe
    networks:
      mynet:
        ipv4_address: 172.18.0.113

volumes:
  namenode:
  historyserver:
  hive-metastore-postgresql:

networks:
  mynet:
    external: true

The Hudi part of this docker-compose file follows the compose setup shipped with Hudi. In the upstream version, however, the hiveserver and adhoc containers mount a local Hudi directory and require Hudi to be compiled in advance. Here those two images have been rebuilt with the required jars and configuration files baked in, so they no longer depend on mounting an external directory.

networks must point at a custom, pre-created network. The network that docker-compose creates by default is named {folder containing docker-compose.yml}_default, and "_" is not a legal character in hostnames, so later access fails with errors like:

ERROR 1105 (HY000): HMSClientException, msg: org.apache.hadoop.hive.metastore.api.MetaException: Got exception: java.net.URISyntaxException Illegal character in hostname at index 30: thrift://hivemetastore.compose_default:9083

The mynet network can be created up front with:

docker network create mynet --driver bridge --subnet 172.18.0.0/16 --gateway 172.18.0.1
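
A quick check (not part of the original setup) confirms the subnet before bringing the stack up:

docker network inspect mynet --format '{{range .IPAM.Config}}{{.Subnet}}{{end}}'
# should print 172.18.0.0/16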
A note on macOS

Because macOS runs containers inside a VM, setting max_map_count directly on the host may not take effect. First start the following container:

docker run -it --privileged --pid=host --name=change_count debian nsenter -t 1 -m -u -n -i sh

Once the container is up, run:

sysctl -w vm.max_map_count=2000000

Then exit and create the Doris Docker cluster.
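
To double-check that the value stuck, the same debian/nsenter trick can be reused to read it back (a sketch; the value lives in the Docker VM's kernel, not on macOS itself):

# read the setting back from the VM kernel
docker run --rm --privileged --pid=host debian nsenter -t 1 -m -u -n -i sysctl vm.max_map_count
# expected: vm.max_map_count = 2000000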

hadoop.env
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083

HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
#HDFS_CONF_dfs_client_use_datanode_hostname=true
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
HDFS_CONF_dfs_replication=1

CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
YARN_CONF_yarn_nodemanager_vmem___check___enabled=false
batch_1.json
{"volume": 483951, "symbol": "MSFT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 111.74, "low": 111.55, "key": "MSFT_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 111.72, "open": 111.55, "day": "31"}
{"volume": 1533226, "symbol": "AAPL", "ts": "2018-08-31 09:30:00", "month": "08", "high": 227.3101, "low": 226.23, "key": "AAPL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 227.3101, "open": 226.53, "day": "31"}
{"volume": 36179, "symbol": "GOOG", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1236.0, "low": 1234.36, "key": "GOOG_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1234.54, "open": 1236.0, "day": "31"}
{"volume": 456506, "symbol": "FB", "ts": "2018-08-31 09:30:00", "month": "08", "high": 177.5, "low": 176.465, "key": "FB_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 176.83, "open": 177.29, "day": "31"}
{"volume": 142747, "symbol": "NFLX", "ts": "2018-08-31 09:30:00", "month": "08", "high": 372.0, "low": 370.49, "key": "NFLX_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 371.9, "open": 370.49, "day": "31"}
{"volume": 126884, "symbol": "TSLA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 301.81, "low": 300.11, "key": "TSLA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 300.61, "open": 301.81, "day": "31"}
{"volume": 1201915, "symbol": "F", "ts": "2018-08-31 09:30:00", "month": "08", "high": 9.63, "low": 9.6, "key": "F_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 9.61, "open": 9.63, "day": "31"}
{"volume": 176474, "symbol": "AMZN", "ts": "2018-08-31 09:30:00", "month": "08", "high": 2010.8101, "low": 2007.0, "key": "AMZN_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 2010.5, "open": 2009.8199, "day": "31"}
{"volume": 142523, "symbol": "NVDA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 277.1899, "low": 276.64, "key": "NVDA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 277.1899, "open": 276.875, "day": "31"}
{"volume": 351118, "symbol": "INTC", "ts": "2018-08-31 09:30:00", "month": "08", "high": 48.06, "low": 47.96, "key": "INTC_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 48.03, "open": 47.961, "day": "31"}

The batch_1.json data above is truncated; the full file can be downloaded from https://github.com/apache/hudi/blob/master/docker/demo/data/batch_1.json.

Running cat batch_1.json | kcat -b kafkabroker -t stock_ticks -P loads the data into Kafka.
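
To confirm the messages actually landed in the topic, one option is to consume a few of them back with kcat (assuming kafkabroker resolves from the host exactly as it does for the produce command above):

# -C consume, -c 3 stop after three messages, -e exit once the end of the topic is reached
kcat -b kafkabroker -t stock_ticks -C -c 3 -e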

start_demo.sh
#!/bin/bash

SCRIPT_PATH=$(cd `dirname $0`; pwd)
WS_ROOT=`dirname $SCRIPT_PATH`
COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"

# restart cluster
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} down
sleep 5
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} up -d
sleep 15

docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
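
A minimal usage sketch, assuming the compose file above is saved under the name the script expects (docker-compose_mac_aarch64.yml), next to hadoop.env and this script:

chmod +x start_demo.sh stop_demo.sh
# bring the cluster up and initialize the Hudi demo inside adhoc-1 and adhoc-2
./start_demo.sh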

stop_demo.sh
#!/bin/bash

SCRIPT_PATH=$(cd `dirname $0`; pwd)
HUDI_DEMO_ENV=$1
# set up root directory
WS_ROOT=`dirname $SCRIPT_PATH`

COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"

# shut down cluster
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} down

# remove host mount directories
rm -rf /tmp/hadoop_data
rm -rf /tmp/hadoop_name

All images involved:

docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm

For convenience, pull all the images first and only then start the docker-compose cluster with the script.
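
A small convenience sketch that pre-pulls everything in the list above in one go:

#!/bin/bash
# pull all images used by the compose file from the same registry namespace
REGISTRY=registry.cn-hangzhou.aliyuncs.com/jensenchen
for image in \
  hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64 \
  hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64 \
  hudi-hadoop_2.8.4-datanode:arm64 \
  hudi-hadoop_2.8.4-namenode:arm64 \
  hudi-hadoop_2.8.4-history:arm64 \
  hudi-hadoop_2.8.4-hive_2.3.3:arm64 \
  hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64 \
  kafka:2.13-2.8.1 \
  zookeeper:3.4.12 \
  doris:1.2.2-fe-arm \
  doris:1.2.2-be-arm
do
  docker pull "${REGISTRY}/${image}"
done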

Testing and verification

Ingesting data with DeltaStreamer

Hudi ships with a DeltaStreamer tool that connects to various sources (including Kafka), pulls changes, and applies them to a Hudi table using upsert/insert primitives. Here it is used to read the JSON data from Kafka and ingest it into both a COW and a MOR table. If the tables do not yet exist on the file system, the tool initializes them automatically.

docker exec -it adhoc-2 /bin/bash

# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS
spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
  --table-type COPY_ON_WRITE \
  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
  --source-ordering-field ts  \
  --target-base-path /user/hive/warehouse/stock_ticks_cow \
  --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider

# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS
spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
  --table-type MERGE_ON_READ \
  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
  --source-ordering-field ts \
  --target-base-path /user/hive/warehouse/stock_ticks_mor \
  --target-table stock_ticks_mor \
  --props /var/demo/config/kafka-source.properties \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --disable-compaction

# As part of the setup (look at setup_demo.sh), the configs needed for DeltaStreamer are uploaded to HDFS. The configs
# contain mostly Kafka connectivity settings, the avro schema used for ingestion, and the key and partitioning fields.

exit

After the ingestion finishes, the data can be browsed from the host at:

http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow

http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor
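
If namenode does not resolve from the host, the mapped port http://localhost:50070 reaches the same UI. The same paths can also be checked from inside a container (a sketch, assuming the hadoop client is on the PATH in adhoc-1, as in the upstream Hudi demo images):

docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_cow
docker exec -it adhoc-1 hdfs dfs -ls /user/hive/warehouse/stock_ticks_mor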

Syncing Hudi data to Hive

Sync the tables on HDFS into Hive, which creates the Hive tables, adds partitions, and so on.

docker exec -it adhoc-2 /bin/bash

# This command takes in HiveServer URL and COW Hudi table location in HDFS and sync the HDFS state to Hive
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
  --jdbc-url jdbc:hive2://hiveserver:10000 \
  --user hive \
  --pass hive \
  --partitioned-by dt \
  --base-path /user/hive/warehouse/stock_ticks_cow \
  --database default \
  --table stock_ticks_cow \
  --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
.....
2020-01-25 19:51:28,953 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow
.....

# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type)
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
  --jdbc-url jdbc:hive2://hiveserver:10000 \
  --user hive \
  --pass hive \
  --partitioned-by dt \
  --base-path /user/hive/warehouse/stock_ticks_mor \
  --database default \
  --table stock_ticks_mor \
  --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
...
2020-01-25 19:51:51,066 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro
...
2020-01-25 19:51:51,569 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt
....

exit
Querying through Hive
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 \
  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
  --hiveconf hive.stats.autogather=false

# List Tables
0: jdbc:hive2://hiveserver:10000> show tables;
+---------------------+--+
|      tab_name       |
+---------------------+--+
| stock_ticks_cow     |
| stock_ticks_mor_ro  |
| stock_ticks_mor_rt  |
+---------------------+--+
3 rows selected (1.199 seconds)
0: jdbc:hive2://hiveserver:10000>


# Look at partitions that were added
0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
+----------------+--+
|   partition    |
+----------------+--+
| dt=2018-08-31  |
+----------------+--+
1 row selected (0.24 seconds)


# COPY-ON-WRITE Queries:
=========================


0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+---------+----------------------+--+
| symbol  |         _c1          |
+---------+----------------------+--+
| GOOG    | 2018-08-31 10:29:00  |
+---------+----------------------+--+

Now, run a projection query:

0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20230621074927233    | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
| 20230621074927233    | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
+----------------------+---------+----------------------+---------+------------+-----------+--+


# Merge-On-Read Queries:
==========================

Let's run similar queries against the M-O-R table, looking at both the
Read Optimized and Snapshot (real-time) queries it supports.

# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol  |         _c1          |
+---------+----------------------+--+
| GOOG    | 2018-08-31 10:29:00  |
+---------+----------------------+--+
1 row selected (6.326 seconds)


# Run Snapshot Query. Notice that the latest timestamp is again 10:29

0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol  |         _c1          |
+---------+----------------------+--+
| GOOG    | 2018-08-31 10:29:00  |
+---------+----------------------+--+
1 row selected (1.606 seconds)


# Run Read Optimized and Snapshot projection queries

0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
+----------------------+---------+----------------------+---------+------------+-----------+--+

0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
+----------------------+---------+----------------------+---------+------------+-----------+--+

exit
Connecting to the FE
docker exec -ti doris-fe /bin/bash
# once inside the container, connect to the FE
mysql -h127.0.0.1 -uroot -P9030
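
Before creating any catalog it is worth a quick health check. Assuming the BE registered itself with the FE through the FE_SERVERS/BE_ADDR settings in the compose file, the BE's Alive column should show true:

SHOW FRONTENDS;
SHOW BACKENDS;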
Creating a Hudi catalog in Doris
CREATE CATALOG hudi PROPERTIES (
    'type'='hms',
    'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
    'hive.version' = '2.3.3'
);

CREATE CATALOG hudi2 PROPERTIES (
    'type'='hms',
    'hive.metastore.uris' = 'thrift://hivemetastore:9083',
    'hive.version' = '2.3.3'
);

CREATE CATALOG hudi3 PROPERTIES (
    'type'='hms',
    'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
    'hadoop.username' = 'hive',
    'dfs.nameservices'='namenode',
    'dfs.ha.namenodes.namenode'='nn1',
    'dfs.namenode.rpc-address.namenode.nn1'='172.18.0.100:8020',
    'dfs.client.failover.proxy.provider.namenode'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider'
);

The hivemetastore container above is pinned to 172.18.0.104, which is why the address is specified this way.
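
If the address ever needs to be verified rather than hard-coded, the container's IP on mynet can be looked up from the host (optional):

docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' hivemetastore
# prints 172.18.0.104 with the compose file above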

View catalogs
MySQL [(none)]> show catalogs;
+-----------+-------------+----------+-----------+
| CatalogId | CatalogName | Type     | IsCurrent |
+-----------+-------------+----------+-----------+
|     10004 | hudi        | hms      |           |
|         0 | internal    | internal | yes       |
+-----------+-------------+----------+-----------+
Switch to the catalog
MySQL [(none)]> switch hudi;
Select a database
MySQL [(none)]> use default;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
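
Listing the tables here should show the three tables synced through Hive earlier, i.e. stock_ticks_cow, stock_ticks_mor_ro and stock_ticks_mor_rt (output omitted):

MySQL [default]> show tables;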
Run a query
MySQL [default]> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+--------+---------------------+
| symbol | max(`ts`)           |
+--------+---------------------+
| GOOG   | 2018-08-31 10:29:00 |
+--------+---------------------+
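
The MOR tables synced earlier can be queried the same way and should return the same row as the Hive queries above (a sketch; whether the _rt snapshot view is queryable depends on the Doris version):

MySQL [default]> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
MySQL [default]> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';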