5.2 ElasticSearch聚合分析之Bucket

1.terms
该分桶策略最简单,直接按照term来分桶。如果字段是text类型,默认不能直接聚合:开启fielddata后会按照分词后的结果分桶;若要按完整的原始值分桶,field的值需要设置为field_name.keyword。下面以对salary字段进行分桶为例。

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"salary_terms": {
			"terms": {
				"field": "salary",
				"size": 10
			}
		}
	}
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "salary_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : 20000.0,
          "doc_count" : 2
        },
        {
          "key" : 15000.0,
          "doc_count" : 1
        },
        {
          "key" : 18000.0,
          "doc_count" : 1
        },
        {
          "key" : 28000.0,
          "doc_count" : 1
        },
        {
          "key" : 29000.0,
          "doc_count" : 1
        },
        {
          "key" : 30000.0,
          "doc_count" : 1
        },
        {
          "key" : 50000.0,
          "doc_count" : 1
        }
      ]
    }
  }
}

2.range
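通过指定数值的范围来设定分桶规则,如对salary字段按照小于18000、大于等于18000并且小于30000、大于等于30000三种情况分桶。注意:下面示例请求中第二个范围的from实际写的是20000,因此salary为18000的文档不会落入任何桶(返回的三个桶合计7个文档,而命中总数为8);若要与上述描述及桶的key一致,应将该from改为18000。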

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"salary_range": {
			"range": {
				"field": "salary",
				"ranges": [{
						"key": "小于18000",
						"to": 18000
					},
					{
						"key": "大于等于18000并且小于30000",
						"from": 20000,
						"to": 30000
					},
					{
						"key": "大于等于30000",
						"from": 30000
					}
				]
			}
		}
	}
}
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "salary_range" : {
      "buckets" : [
        {
          "key" : "小于18000",
          "to" : 18000.0,
          "doc_count" : 1
        },
        {
          "key" : "大于等于18000并且小于30000",
          "from" : 20000.0,
          "to" : 30000.0,
          "doc_count" : 4
        },
        {
          "key" : "大于等于30000",
          "from" : 30000.0,
          "doc_count" : 2
        }
      ]
    }
  }
}

3.date_range
通过指定日期的范围来设定分桶规则,如对birthday字段按照"1980-1990"以及"1990-2020"两个时间段来分桶。

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"birthday_date_range": {
			"date_range": {
				"field": "birthday",
				"format": "yyyy",
				"ranges": [{
						"from": "1980",
						"to": "1990"
					},
					{
						"from": "1990",
						"to": "2020"
					}
				]
			}
		}
	}
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 7,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "birthday_date_range" : {
      "buckets" : [
        {
          "key" : "1980-1990",
          "from" : 3.155328E11,
          "from_as_string" : "1980",
          "to" : 6.31152E11,
          "to_as_string" : "1990",
          "doc_count" : 2
        },
        {
          "key" : "1990-2020",
          "from" : 6.31152E11,
          "from_as_string" : "1990",
          "to" : 1.5778368E12,
          "to_as_string" : "2020",
          "doc_count" : 5
        }
      ]
    }
  }
}

from和to在指定日期时,可以使用date math表达式(如now-10y/y)。

4.histogram
直方图,以固定间隔的策略来分割数据,如对salary字段按照5000的间隔进行分桶。

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"salary_histogram": {
			"histogram": {
				"field": "salary",
				"interval": 5000,
				"min_doc_count": 1,
				"extended_bounds": {
					"min": 10000,
					"max": 50000
				}
			}
		}
	}
}
  • interval:分桶的间隔数值
  • min_doc_count:桶内最小文档个数,文档数小于该值的桶不会返回
  • extended_bounds:强制直方图覆盖min到max的范围(即使其中的桶没有文档);仅在min_doc_count为0时才会补出空桶,本例min_doc_count为1,因此该设置对结果没有影响
{
  "took" : 14,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "salary_histogram" : {
      "buckets" : [
        {
          "key" : 15000.0,
          "doc_count" : 2
        },
        {
          "key" : 20000.0,
          "doc_count" : 2
        },
        {
          "key" : 25000.0,
          "doc_count" : 2
        },
        {
          "key" : 30000.0,
          "doc_count" : 1
        },
        {
          "key" : 50000.0,
          "doc_count" : 1
        }
      ]
    }
  }
}

5.date_histogram
针对日期的直方图或者柱状图,是时序数据分析中常用的聚合分析类型,如对birthday字段按照3660天(约10年)的间隔进行分桶。

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"birthday_date_histogram": {
			"date_histogram": {
				"field": "birthday",
				"fixed_interval": "3660d",
				"min_doc_count": 1,
				"format": "yyyy",
				"extended_bounds": {
		          	"min": "1980",
		          	"max": "2020"
			    }
			}
		}
	}
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 7,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "birthday_date_histogram" : {
      "buckets" : [
        {
          "key_as_string" : "1980",
          "key" : 316224000000,
          "doc_count" : 2
        },
        {
          "key_as_string" : "1990",
          "key" : 632448000000,
          "doc_count" : 5
        }
      ]
    }
  }
}

6.Bucket + Metric
(1).简介
bucket聚合分析允许通过添加子分析来进一步分析,该子分析可以是metric也可以是bucket。

(2).分桶后再分桶
如先对职位进行分桶,然后再对年龄进行分桶。

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"job_terms": {
			"terms": {
				"field": "job",
				"size": 10
			},
			"aggs": {
				"age_range": {
					"range": {
						"field": "age",
						"ranges": [{
								"to": 25
							},
							{
								"from": 25,
								"to": 35
							},
							{
								"from": 35
							}
						]
					}
				}
			}
		}
	}
}
{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 7,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "job_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Java engineer",
          "doc_count" : 4,
          "age_range" : {
            "buckets" : [
              {
                "key" : "*-25.0",
                "to" : 25.0,
                "doc_count" : 1
              },
              {
                "key" : "25.0-35.0",
                "from" : 25.0,
                "to" : 35.0,
                "doc_count" : 3
              },
              {
                "key" : "35.0-*",
                "from" : 35.0,
                "doc_count" : 0
              }
            ]
          }
        },
        {
          "key" : "Vue engineer",
          "doc_count" : 2,
          "age_range" : {
            "buckets" : [
              {
                "key" : "*-25.0",
                "to" : 25.0,
                "doc_count" : 0
              },
              {
                "key" : "25.0-35.0",
                "from" : 25.0,
                "to" : 35.0,
                "doc_count" : 2
              },
              {
                "key" : "35.0-*",
                "from" : 35.0,
                "doc_count" : 0
              }
            ]
          }
        },
        {
          "key" : "Technical director",
          "doc_count" : 1,
          "age_range" : {
            "buckets" : [
              {
                "key" : "*-25.0",
                "to" : 25.0,
                "doc_count" : 0
              },
              {
                "key" : "25.0-35.0",
                "from" : 25.0,
                "to" : 35.0,
                "doc_count" : 0
              },
              {
                "key" : "35.0-*",
                "from" : 35.0,
                "doc_count" : 1
              }
            ]
          }
        }
      ]
    }
  }
}

(3).分桶后进行数据分析
先对职位进行分桶,然后再对薪水进行分析。

POST /employee/_search
{
	"size": 0,
	"aggs": {
		"job_terms": {
			"terms": {
				"field": "job",
				"size": 10
			},
			"aggs": {
				"salary_stats": {
					"stats": {
						"field": "salary"
					}
				}
			}
		}
	}
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 7,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "job_terms" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Java engineer",
          "doc_count" : 4,
          "salary_stats" : {
            "count" : 4,
            "min" : 15000.0,
            "max" : 30000.0,
            "avg" : 23500.0,
            "sum" : 94000.0
          }
        },
        {
          "key" : "Vue engineer",
          "doc_count" : 2,
          "salary_stats" : {
            "count" : 2,
            "min" : 18000.0,
            "max" : 28000.0,
            "avg" : 23000.0,
            "sum" : 46000.0
          }
        },
        {
          "key" : "Technical director",
          "doc_count" : 1,
          "salary_stats" : {
            "count" : 1,
            "min" : 50000.0,
            "max" : 50000.0,
            "avg" : 50000.0,
            "sum" : 50000.0
          }
        }
      ]
    }
  }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值