druid导入数据的几种方式

最新推荐文章于 2024-04-17 09:51:04 发布

weixin_34360651

最新推荐文章于 2024-04-17 09:51:04 发布

阅读量2.4k

点赞数

文章标签：大数据 json 数据库

原文链接：https://my.oschina.net/u/2000675/blog/1572612

版权

2019独角兽企业重金招聘Python工程师标准>>>

Stream Push

通过 Tranquility 连接外部数据源，读取数据源中的数据，然后将数据写入 Druid。它既可以读取流数据，也可以读取批数据。

Server

Druid 通过 Tranquility Server 连接 Druid 的 MiddleManager 和 Historical 节点。Tranquility Server 的启动方式如下：

bin/tranquility server -configFile <path_to_config_file>/server.json

Tranquility属于外部包，我们需要自己下载。

在 server.json 文件中可以自定义属性和数据源。

kafka 实例

启动 Tranquility，连接 Kafka 并将数据写入 Druid：

bin/tranquility kafka -configFile <path_to_config_file>/kafka.json

配置如下：

{
  "dataSources" : {
    "metrics-kafka" : {
      "spec" : {
        "dataSchema" : {
          "dataSource" : "metrics-kafka",
          "parser" : {
            "type" : "string",
            "parseSpec" : {
              "timestampSpec" : {
                "column" : "timestamp",
                "format" : "auto"
              },
              "dimensionsSpec" : {
                "dimensions" : [],
                "dimensionExclusions" : [
                  "timestamp",
                  "value"
                ]
              },
              "format" : "json"
            }
          },
          "granularitySpec" : {
            "type" : "uniform",
            "segmentGranularity" : "hour",
            "queryGranularity" : "none"
          },
          "metricsSpec" : [
            {
              "type" : "count",
              "name" : "count"
            },
            {
              "name" : "value_sum",
              "type" : "doubleSum",
              "fieldName" : "value"
            },
            {
              "fieldName" : "value",
              "name" : "value_min",
              "type" : "doubleMin"
            },
            {
              "type" : "doubleMax",
              "name" : "value_max",
              "fieldName" : "value"
            }
          ]
        },
        "ioConfig" : {
          "type" : "realtime"
        },
        "tuningConfig" : {
          "type" : "realtime",
          "maxRowsInMemory" : "100000",
          "intermediatePersistPeriod" : "PT10M",
          "windowPeriod" : "PT10M"
        }
      },
      "properties" : {
        "task.partitions" : "1",
        "task.replicants" : "1",
        "topicPattern" : "metrics"
      }
    }
  },
  "properties" : {
    "zookeeper.connect" : "master.example.com",
    "druid.discovery.curator.path" : "/druid/discovery",
    "druid.selectors.indexing.serviceName" : "druid/overlord",
    "commit.periodMillis" : "15000",
    "consumer.numThreads" : "2",
    "kafka.zookeeper.connect" : "kafka-zk.example.com",
    "kafka.group.id" : "tranquility-kafka"
  }
}

Stream Pull

Druid 的 Realtime 节点（Realtime Nodes）通过 Firehose 连接数据源并读取数据。

kafka实例

[
  {
    "dataSchema" : {
      "dataSource" : "wikipedia",
      "parser" : {
        "type" : "string",
        "parseSpec" : {
          "format" : "json",
          "timestampSpec" : {
            "column" : "timestamp",
            "format" : "auto"
          },
          "dimensionsSpec" : {
            "dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
            "dimensionExclusions" : [],
            "spatialDimensions" : []
          }
        }
      },
      "metricsSpec" : [{
        "type" : "count",
        "name" : "count"
      }, {
        "type" : "doubleSum",
        "name" : "added",
        "fieldName" : "added"
      }, {
        "type" : "doubleSum",
        "name" : "deleted",
        "fieldName" : "deleted"
      }, {
        "type" : "doubleSum",
        "name" : "delta",
        "fieldName" : "delta"
      }],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "DAY",
        "queryGranularity" : "NONE"
      }
    },
    "ioConfig" : {
      "type" : "realtime",
      "firehose": {
        "type": "kafka-0.8",
        "consumerProps": {
          "zookeeper.connect": "localhost:2181",
          "zookeeper.connection.timeout.ms" : "15000",
          "zookeeper.session.timeout.ms" : "15000",
          "zookeeper.sync.time.ms" : "5000",
          "group.id": "druid-example",
          "fetch.message.max.bytes" : "1048586",
          "auto.offset.reset": "largest",
          "auto.commit.enable": "false"
        },
        "feed": "wikipedia"
      },
      "plumber": {
        "type": "realtime"
      }
    },
    "tuningConfig": {
      "type" : "realtime",
      "maxRowsInMemory": 75000,
      "intermediatePersistPeriod": "PT10M",
      "windowPeriod": "PT10M",
      "basePersistDirectory": "/tmp/realtime/basePersist",
      "rejectionPolicy": {
        "type": "serverTime"
      }
    }
  }
]

从kafka读取数据的新方式

Kafka Indexing Service

Indexing Service 是负责“生产” Segment 的高可用、分布式、Master/Slave 架构的服务。它主要由三类组件构成：负责运行索引任务（indexing task）的 Peon、负责控制 Peon 的 MiddleManager，以及负责将任务分发给 MiddleManager 的 Overlord。三者的关系可以理解为：Overlord 是 MiddleManager 的 Master，而 MiddleManager 又是 Peon 的 Master。其中，Overlord 和 MiddleManager 可以分布式部署，但 Peon 和 MiddleManager 默认运行在同一台机器上。

{
  "type": "kafka",
  "dataSchema": {
    "dataSource": "metrics-kafka",
    "parser": {
      "type": "string",
      "parseSpec": {
        "format": "json",
        "timestampSpec": {
          "column": "timestamp",
          "format": "auto"
        },
        "dimensionsSpec": {
          "dimensions": [],
          "dimensionExclusions": [
            "timestamp",
            "value"
          ]
        }
      }
    },
    "metricsSpec": [
      {
        "name": "count",
        "type": "count"
      },
      {
        "name": "value_sum",
        "fieldName": "value",
        "type": "doubleSum"
      },
      {
        "name": "value_min",
        "fieldName": "value",
        "type": "doubleMin"
      },
      {
        "name": "value_max",
        "fieldName": "value",
        "type": "doubleMax"
      }
    ],
    "granularitySpec": {
      "type": "uniform",
      "segmentGranularity": "HOUR",
      "queryGranularity": "NONE"
    }
  },
  "tuningConfig": {
    "type": "kafka",
    "maxRowsPerSegment": 5000000
  },
  "ioConfig": {
    "topic": "metrics",
    "consumerProperties": {
      "bootstrap.servers": "localhost:9092"
    },
    "taskCount": 1,
    "replicas": 1,
    "taskDuration": "PT1H"
  }
}