【10】把 Elasticsearch 当数据库使：Drill Down 下钻

阅读量：6847 次

发布时间：2019-06-26

本文共 7534 字，大约阅读时间需要 25 分钟。

使用可以用 SQL 进行 elasticsearch 的查询。传统的 SQL 的语义是每一层查询是对下一层查询的综合，也就是每SELECT一层行数就会变少。比如

SELECT sum(per_sector) AS total FROM (    SELECT sector, count(*) AS per_sector FROM (        SELECT sector, ipo_year FROM symbol    ))

Elasticsearch是不支持这样的嵌套SELECT的。它支持一个更实用的功能，嵌套下钻（Drill Down）。与传统SQL的语义正好相反，Elasticsearch的嵌套不是SELECT FROM 而是 SELECT INSIDE，每经过一层SELECT其行数不减少反而增加。所以我把Elasticsearch的这种嵌套查询不称之为SELECT FROM而是SELECT INSIDE以示区别，当然为了习惯，用SELECT FROM也是支持的。

SELECT INSIDE all_symbols GROUP BY ipo_year

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);    WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols         GROUP BY ipo_year LIMIT 2)EOF

第一行SELECT定义了all_symbols，第二行SELECT是在第一次查询的基础之上进行自己的二次聚合。这样多层SELECT的好处是每一层可以定义自己的SELECT字段。比如GROUP BY a,b,c其实做了三次下钻，但是限于SQL的语法这样写是没法给没一层下钻的分桶计算指标，但是用上面这种WITH的语法，就可以每钻一层计算一层的指标。

{"_bucket_path": ["per_ipo_year"], "max_this_year": 54171930444.0, "max_all_times": 522690000000.0, "ipo_year": 2014}{"_bucket_path": ["per_ipo_year"], "max_this_year": 5416144671.0, "max_all_times": 522690000000.0, "ipo_year": 2015}

Elasticsearch

{  "aggs": {    "max_all_times": {      "max": {        "field": "market_cap"      }    },     "ipo_year": {      "terms": {        "field": "ipo_year",         "size": 2      },       "aggs": {        "max_this_year": {          "max": {            "field": "market_cap"          }        }      }    }  },   "size": 0}

下钻的概念直接看Elasticsearch的查询其实更清楚。每下钻一层，括号就往右深了一层。从某种意义上来说，Elasticsearch的DSL其实更接近聚合的实质。

{  "hits": {    "hits": [],     "total": 6714,     "max_score": 0.0  },   "_shards": {    "successful": 1,     "failed": 0,     "total": 1  },   "took": 2,   "aggregations": {    "max_all_times": {      "value": 522690000000.0    },     "ipo_year": {      "buckets": [        {          "max_this_year": {            "value": 54171930444.0          },           "key": 2014,           "doc_count": 390        },         {          "max_this_year": {            "value": 5416144671.0          },           "key": 2015,           "doc_count": 334        }      ],       "sum_other_doc_count": 2174,       "doc_count_error_upper_bound": 0    }  },   "timed_out": false}

Profile

[  {    "query": [      {        "query_type": "MatchAllDocsQuery",        "lucene": "*:*",        "time": "0.2003040000ms",        "breakdown": {          "score": 0,          "create_weight": 9025,          "next_doc": 162380,          "match": 0,          "build_scorer": 28899,          "advance": 0        }      }    ],    "rewrite_time": 2523,    "collector": [      {        "name": "MultiCollector",        "reason": "search_multi",        "time": "2.325354000ms",        "children": [          {            "name": "TotalHitCountCollector",            "reason": "search_count",            "time": "0.2740410000ms"          },          {            "name": "BucketCollector: [[max_all_times, ipo_year]]",            "reason": "aggregation",            "time": "1.295439000ms"          }        ]      }    ]  }]

SELECT INSIDE 多次

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);    WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols         GROUP BY ipo_year LIMIT 2);    WITH per_sector AS (SELECT sector, MAX(market_cap) AS max_this_sector INSIDE per_ipo_year         GROUP BY sector LIMIT 2)EOF

这个和GROUP BY ipo_year, sector其实是差不多的，区别在于对每一层下钻都可以选取这一层的指标出来。

{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 2660000000.0}{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 5530000000.0}{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 2740000000.0}{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 5416144671.0}

Elasticsearch

{  "aggs": {    "max_all_times": {      "max": {        "field": "market_cap"      }    },     "ipo_year": {      "terms": {        "field": "ipo_year",         "size": 2      },       "aggs": {        "sector": {          "terms": {            "field": "sector",             "size": 2          },           "aggs": {            "max_this_sector": {              "max": {                "field": "market_cap"              }            }          }        },         "max_this_year": {          "max": {            "field": "market_cap"          }        }      }    }  },   "size": 0}

{  "hits": {    "hits": [],     "total": 6714,     "max_score": 0.0  },   "_shards": {    "successful": 1,     "failed": 0,     "total": 1  },   "took": 8,   "aggregations": {    "max_all_times": {      "value": 522690000000.0    },     "ipo_year": {      "buckets": [        {          "sector": {            "buckets": [              {                "max_this_sector": {                  "value": 2660000000.0                },                 "key": "Health Care",                 "doc_count": 104              },               {                "max_this_sector": {                  "value": 5530000000.0                },                 "key": "Finance",                 "doc_count": 70              }            ],             "sum_other_doc_count": 216,             "doc_count_error_upper_bound": 0          },           "max_this_year": {            "value": 54171930444.0          },           "key": 2014,           "doc_count": 390        },         {          "sector": {            "buckets": [              {                "max_this_sector": {                  "value": 2740000000.0                },                 "key": "Finance",                 "doc_count": 92              },               {                "max_this_sector": {                  "value": 5416144671.0                },                 "key": "Health Care",                 "doc_count": 92              }            ],             "sum_other_doc_count": 150,             "doc_count_error_upper_bound": 0          },           "max_this_year": {            "value": 5416144671.0          },           "key": 2015,           "doc_count": 334        }      ],       "sum_other_doc_count": 2174,       "doc_count_error_upper_bound": 0    }  },   "timed_out": false}

Profile

[  {    "query": [      {        "query_type": "MatchAllDocsQuery",        "lucene": "*:*",        "time": "0.2576120000ms",        "breakdown": {          "score": 0,          "create_weight": 63193,          "next_doc": 165400,          "match": 0,          "build_scorer": 29019,          "advance": 0        }      }    ],    "rewrite_time": 3205,    "collector": [      {        "name": "MultiCollector",        "reason": "search_multi",        "time": "6.292688000ms",        "children": [          {            "name": "TotalHitCountCollector",            "reason": "search_count",            "time": "0.2599140000ms"          },          {            "name": "BucketCollector: [[max_all_times, ipo_year]]",            "reason": "aggregation",            "time": "5.172211000ms"          }        ]      }    ]  }]

相信看过前面例子，你应该理解了什么叫下钻了。每下钻一层，前面的一行就会被再次分裂到多个桶里。每一层都可以搞自己的指标计算。

转载地址：http://fjlul.baihongyu.com/

你可能感兴趣的文章