@dungan 2019-12-24T10:37:06.000000Z 字数 9321 阅读 161

Elasticsearch 搜索

Elasticsearch

词项查询

词项查询 能够让你在结构化的数据中精确查找某个值，与全文查询不同，词项查询不分析搜索词。相反，词项查询会精确匹配字段中的字词。

term

term 用来从倒排索引中查找包含某个精确值的文档，非常适合用来查找类型是 keyword(string),数字,日期 的字段。

可以通过 boost 来修改关键词的权重，这样该关键词所在的文档就会优先排在结果集的最前面，boost 的默认值是1.0。

{
  "query": {
    "bool": {
      "should": [
        {
          "term": {
            "status": {
              "value": "urgent",
              "boost": 2.0 
            }
          }
        },
        {
          "term": {
            "status": "normal" 
          }
        }
      ]
    }
  }
}

terms

terms 用来查询指定字段中包含词项组中任意一个词项的文档。

{
    "query": {
        "terms": {
            "user": [
                "张三",
                "李四",
                "王五"
            ]
        }
    }
}

terms-set

terms-set 用于要求文档至少匹配词项组中若干个词项的场景，最少匹配数量由文档自身的某个数值字段指定，因此在构建索引时要设置一个字段用来保存这个最少匹配数量。

PUT /my-index
{
    "mappings": {
        "_doc": {
            "properties": {
                "required_matches": {
                    "type": "long"
                }
            }
        }
    }
}
# 字段值是数组格式
PUT /my-index/_doc/1?refresh
{
    "codes": ["ghi", "jkl"],
    "required_matches": 2
}
PUT /my-index/_doc/2?refresh
{
    "codes": ["def", "ghi"],
    "required_matches": 2
}
# 查询
{
    "query": {
        "terms_set": {
            "codes" : {
                "terms" : ["abc", "def", "ghi"],
                "minimum_should_match_field": "required_matches"
            }
        }
    }
}
# 可以看到 id = 2 的这条文档被查询出来,因为它的元素值满足 terms-set 条件
{
  "took": 13,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped" : 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.5753642,
    "hits": [
      {
        "_index": "my-index",
        "_type": "_doc",
        "_id": "2",
        "_score": 0.5753642,
        "_source": {
          "codes": ["def", "ghi"],
          "required_matches": 2
        }
      }
    ]
  }
}

只获取特定的字段

通过 _source 可以设置哪些字段出现在结果集中，类似于 mysql 的 select field1,field2 ... from table 。

{
    "query": {
        "term": {
            "title": "php"
        }
    },
    "_source": [
        "title",
        "author"
    ]
}

范围查询

{
    "query": {
        "range": {
            "postdate": {
                "gte": "2017-01-01",
                "lte": "2017-12-31",
                "format": "yyyy-MM-dd"
            }
        }
    }
}

查询最近一个月发的帖子。

{
    "query": {
        "filter": {
            "range": {
                "postDate": {
                    "gte": "now-30d"
                }
            }
        }
    }
}

exists

查询字段中不含 null 或 [] 的文档。

# user 字段不含 null 或 [] 的文档会被查出来 。
{
    "query": {
        "exists": {
            "field": "user"
        }
    }
}

反之，如果想查询字段中包含 null 或 [] 的文档，可以这样。

{
    "query": {
        "bool": {
            "must_not": {
                "exists": {
                    "field": "user"
                }
            }
        }
    }
}

通配符查询(Wildcard)

通配符 * 表示匹配零个或多个字符。

# 查找 user_name 包含以ki开头和y结尾的文档, (kiy, kity,kimchy) 这些项都会被匹配到
{
    "query": {
        "wildcard": {
            "user_name": "ki*y"
        }
    }
}

正则查询(Regexp)

正则查询 让你可以使用比 通配符查询 更复杂的模式进行查询。

{
    "query": {
        "regexp": {
            "user_name": "ki*y"
        }
    }
}

更多关于 ES 正则表达式的语法见这里。

模糊查询 (Fuzzy)

因为要查询的输入词有可能有拼写错误，因此这种场景就可以使用模糊查询来修正错误词。

可以通过设置 fuzziness 来控制查询词和文档中词项之间允许相差的字符数(即编辑距离)，fuzziness 的默认值为 AUTO，当查询词长度大于 5 个字符时，AUTO 的模糊值等同于指定值 2 。

# 插入数据
POST /my_index/my_type/_bulk
{ "index": { "_id": 1 }}
{ "text": "Surprise me!"}
{ "index": { "_id": 2 }}
{ "text": "That was surprising."}
{ "index": { "_id": 3 }}
{ "text": "I wasn't surprised."}
# 为词 surprize 运行一个 fuzzy 查询
GET /my_index/my_type/_search
{
  "query": {
    "fuzzy": {
      "text": "surprize",
      "fuzziness": AUTO
    }
  }
}

上面的例子中，查询词 surprize 与 surprise 和 surprised 的编辑距离都在 2 以内，所以文档 1 和 3 匹配。如果将 fuzziness 设置为 1，你会发现只有文档 1 才被匹配出来。

Ids

从文档的元数据字段 _id 中去匹配给定的 id。

{
    "query": {
        "ids": {
            "values": [
                "2",
                "3"
            ]
        }
    }
}

分数过滤

{
  "min_score":"3.0",
  "query": {
    "term": {
        "content": "测试"
    }
  }
}

高亮关键字

# 将 content 中的关键词 '测试' 高亮
{
    "query": {
        "match": {
            "content": "小米 测试"
        }
    },
    "highlight": {
        "fields": {
            "content": {}
        }
    }
}
# 结果
{
    ...
    "highlight": {
        "content": [
            "<em>测试</em><em>测试</em><em>测试</em><em>测试</em>"
        ]
    }
    ...
}

es 默认使用 em 标签来高亮关键词，但也支持使用 pre_tags 和 post_tags 来自定义标签。

{
    "query": {
        "match": {
            "content": "小米 测试"
        }
    },
    "highlight": {
        "fields": {
            "content": {
                "pre_tags" : ["<span font-color='red'>"],
                "post_tags": ["</span>"]
            }
        }
    }
}
# 结果
{
    ...
    "highlight": {
        "content": [
            "<span font-color='red'>测试</span> hello"
        ]
    }
    ...
}

默认情况下只有包含查询匹配的字段才会高亮显示，但是如果其他字段(例如 title)也有我们要查找的关键字，这时就要对 require_field_match 属性进行设置，这样就能对多个匹配的字段高亮。

require_field_match 默认值为 true，即只对查询条件中匹配到的字段进行高亮；设置为 false 后，fields 中列出的其他字段只要包含关键词也会被高亮。

{
    "query": {
        "match": {
            "content": "小米 测试"
        }
    },
    "highlight": {
        "require_field_match": false,
        "fields": {
            "content": {},
            "title": {}
        }
    }
}
# 结果
{
    ... 
    "highlight": {
        "title": [
            "<em>小米</em> 8 青春版"
        ],
        "content": [
            "<em>测试</em> hello"
        ]
    }
    ...
}

复合查询

Bool 查询

must 等同于 AND，must_not 等同于 NOT，should 等同于 OR。

# 书名包含 ElasticSearch 或者（OR） Solr,并且（AND）它的作者是 Clinton Gormley 不是（NOT）Radu Gheorge
{
    "query": {
        "bool": {
            "must": {
                "bool": {
                    "should": [
                        {
                            "match": {
                                "title": "Elasticsearch"
                            }
                        },
                        {
                            "match": {
                                "title": "Solr"
                            }
                        }
                    ],
                    "must": {
                        "match": {
                            "authors": "clinton gormely"
                        }
                    },
                    "must_not": {
                        "match": {
                            "authors": "radu gheorge"
                        }
                    }
                }
            }
        }
    }
}

假设我们要从文章表中查找满足以下条件的所有文章：已审核，并且文章类型是 科技类，并且发布日期在 9月份 以后，并且作者是 (张三 或 李四 或 王五) 这几人中的任意一人。

在 mysql 中我们的查询条件如下：

select * from article where audit=1 and article_type=2 and publish_at >= '2019-09-01' and (author='张三' or author='李四' or author='王五');

在ES中的 Bool 查询构造如下：

{
    "query": {
        "bool": {
            "must": [
                {
                    "term": {
                        "audit": 1
                    }
                },
                {
                    "term": {
                        "article_type": 2
                    }
                },
                {
                    "range": {
                        "publish_at": {
                            "gte": "2019-09-01",
                            "format": "yyyy-MM-dd"
                        }
                    }
                },
                {
                    "bool": {
                        "should": [
                            {
                                "match_phrase": {
                                    "author": "张三"
                                }
                            },
                            {
                                "match_phrase": {
                                    "author": "李四"
                                }
                            },
                            {
                                "match_phrase": {
                                    "author": "王五"
                                }
                            }
                        ]
                    }
                }
            ]
        }
    }
}

可以看到 Bool 查询的本质就是让我们组装出复杂的查询条件。

Bool + Filter 查询

过滤(filter)会先执行，以缩小需要查询的文档范围；并且过滤结果在第一次使用后会被缓存，从而大大提高性能。

{
    "query": {
        "filter": {
            "query": {
                "multi_match": {
                    "query": "elasticsearch",
                    "fields": [
                        "title",
                        "summary"
                    ]
                }
            },
            "filter": {
                "bool": {
                    "must": {
                        "range": {
                            "num_reviews": {
                                "gte": 20
                            }
                        }
                    },
                    "must_not": {
                        "range": {
                            "publish_date": {
                                "lte": "2014-12-31"
                            }
                        }
                    },
                    "should": {
                        "term": {
                            "publisher": "oreilly"
                        }
                    }
                }
            }
        }
    }
}

多重过滤：filter 嵌套 bool 查询可以实现多重过滤(可以看做是括号中的子查询)。

全文查询

全文检索

搜索标题中含有 java 或 elasticsearch 的 blog。

{
    "query": {
        "match": {
            "title": "java elasticsearch"
        }
    }
}
# 等同于
{
    "bool": {
        "should": [
            {
                "term": {
                    "title": "java"
                }
            },
            {
                "term": {
                    "title": "elasticsearch"
                }
            }
        ]
    }
}

控制搜索精度
如果想搜索标题中同时含有 java 和 elasticsearch 两个关键字的blog，可以通过 operator 实现。

注意：这种属于同时满足多个关键字的查询，这点要和短语查询有所区分。

{
    "query": {
        "match": {
            "title": {
                "query": "java elasticsearch",
                "operator": "and"
            }
        }
    }
}
# 等同于
{
    "bool": {
        "must": [
            {
                "term": {
                    "title": "java"
                }
            },
            {
                "term": {
                    "title": "elasticsearch"
                }
            }
        ]
    }
}

如果要搜索至少包含三个关键字的blog，可以通过设置 minimum_should_match 实现。

{
  "query": {
    "match": {
      "title": {
        "query": "java elasticsearch spark hadoop",
        "minimum_should_match": 3
      }
    }
  }
}
# 等同于
{
    "bool": {
        "should": [
            {
                "term": {
                    "title": "java"
                }
            },
            {
                "term": {
                    "title": "elasticsearch"
                }
            },
            {
                "term": {
                    "title": "spark"
                }
            },
            {
                "term": {
                    "title": "hadoop"
                }
            }
        ],
        "minimum_should_match": 3
    }
}

基于boost的细粒度搜索条件权重控制

假设我们要搜索标题中包含 java 的帖子，同时如果标题中还包含 hadoop 或 elasticsearch 就优先搜索出来；并且，如果一个帖子包含 java hadoop，另一个帖子包含 java elasticsearch，那么包含 hadoop 的帖子要比包含 elasticsearch 的帖子优先搜索出来。

{
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "title": "java"
                    }
                }
            ],
            "should": [
                {
                    "match": {
                        "title": {
                            "query": "hadoop",
                            "boost": 5
                        }
                    }
                },
                {
                    "match": {
                        "title": {
                            "query": "elasticsearch",
                            "boost": 3
                        }
                    }
                },
                {
                    "match": {
                        "title": {
                            "query": "spark",
                            "boost": 1
                        }
                    }
                }
            ]
        }
    }
}

短语查询(Match Phrase)

默认情况下，查询项之间必须紧密相连，但可以设置 slop 值来指定查询项之间可以 间隔多少个词，这样即使短语之间有其他词，也可以被查找出来。

加上slop的 match-phrase就是近似匹配了(proximity-match)，近似匹配可以搜索到很多结果，但是距离越近的会优先返回，也就是相关度分数就会越高。

{
    "query": {
        "match_phrase": {
            "title": "hello world",
            "slop":1
        }
    }
}

前缀查询(Prefix)

前缀查询能够进行即时搜索类型的匹配，或者说提供一个查询时的初级自动补全功能，无需以任何方式准备你的数据。和 match_phrase 查询类似，它也支持 slop 参数。

# 查询姓名以 tc 开头的用户
{
    "query": {
        "match_phrase_prefix": {
            "user_name": "tc"
        }
    }
}

前缀查询也支持给关键字设置权重(boost)。

{
    "query": {
        "match_phrase_prefix": {
            "user_name": {
                "value": "tcl",
                "boost": 2
            }
        }
    }
}

多字段查找(Multi Match)

multi_match 是在多个字段上运行相同 match 查询的一种简写方式。fields 指定从哪些字段查找。

在 fields 中我们可以修改字段的权重(boost)，从而让字段所在的行出现在结果集的前面。

# 这里我们将 title 字段的分数提高三倍,这样 title 字段所在的结果行会出现在前面
{
    "query": {
        "multi_match": {
            "query": "小米 测试",
            "fields": ["title^3", "content"]
        }
    }
}

multi_match 默认以 best_fields 类型执行，此外还支持 most_fields，phrase(多字段短语查找) 等类型查询。

best_fields 类型调优

如果一个字段中同时包含了我们要查找的多个关键词，我们希望这个字段所在的文档(我们叫它最佳匹配)出现在结果集的最前面，那么这时候使用 dis_max 就能实现这种效果。

假设我们有一个让用户搜索博客文章的网站。其中有两个文档如下：

PUT /test_index/_create/1
{
    "title": "Quick brown rabbits",
    "body":  "Brown rabbits are commonly seen."
}
PUT /test_index/_create/2
{
    "title": "Keeping pets healthy",
    "body":  "My quick brown fox eats rabbits on a regular basis."
}

进行查询：

GET /test_index/_search
{
    "query": {
        "bool": {
            "should": [
                { "match": { "title": "Brown fox" }},
                { "match": { "body":  "Brown fox" }}
            ]
        }
    }
}
# 输出
{
    ...
    "hits": [
        {
            "_index": "test_index",
            "_type": "_doc",
            "_id": "2",
            "_score": 0.77041256,
            "_source": {
                "title": "Keeping pets healthy",
                "body": "My quick brown fox eats rabbits on a regular basis."
            }
        },
        {
            "_index": "test_index",
            "_type": "_doc",
            "_id": "1",
            "_score": 0.6931472,
            "_source": {
                "title": "Quick brown rabbits",
                "body": "Brown rabbits are commonly seen."
            }
        }
    ]
    ...
}

由于 ES 的文档打分机制，导致文档1出现在结果集的最前面，但文档1的任一个字段都没有同时包含 Brown fox 这两个关键词，相反，文档2的 body 字段却同时包含了这两个关键词，因此对于我们来说文档2才是最佳匹配。

现在换成 dis_max 查询，可以看到文档2出现在了前面：

{
    "query": {
        "dis_max": {
            "queries": [
                {
                    "match": {
                        "title": "Brown fox"
                    }
                },
                {
                    "match": {
                        "body": "Brown fox"
                    }
                }
            ]
        }
    }
}
# 输出
{
    ...
    "hits": [
        {
            "_index": "test_index",
            "_type": "_doc",
            "_id": "2",
            "_score": 0.77041256,
            "_source": {
                "title": "Keeping pets healthy",
                "body": "My quick brown fox eats rabbits on a regular basis."
            }
        },
        {
            "_index": "test_index",
            "_type": "_doc",
            "_id": "1",
            "_score": 0.6931472,
            "_source": {
                "title": "Quick brown rabbits",
                "body": "Brown rabbits are commonly seen."
            }
        }
    ]
    ...
}

most_fields 类型
most_fields 会使得匹配的字段数量最多的那个文档出现在最前面，这点和前面的 best_fields 不一样。

# 创建数据
PUT /test_index/_create/1
{
    "street":   "5 Poland Street",
    "city":     "Poland",
    "country":  "United W1V",
    "postcode": "W1V 3DG"
}
PUT /test_index/_create/2
{
    "street":   "5 Poland Street W1V",
    "city":     "London",
    "country":  "United Kingdom",
    "postcode": "3DG"
}
# 查询
{
  "query": {
    "multi_match": {
      "query": "Poland Street W1V",
      "type": "most_fields", 
      "fields": ["street", "city", "country", "postcode"]
    }
  }
}
# 输出
# 如果用 best_fields, 那么 doc2 会在 doc1 的前面.
{
    ...
    "hits": [
        {
            "_index": "test_index",
            "_type": "_doc",
            "_id": "1",
            "_score": 2.3835402,
            "_source": {
                "street": "5 Poland Street",
                "city": "Poland",
                "country": "United W1V",
                "postcode": "W1V 3DG"
            }
        },
        {
            "_index": "test_index",
            "_type": "_doc",
            "_id": "2",
            "_score": 0.99938464,
            "_source": {
                "street": "5 Poland Street W1V",
                "city": "London",
                "country": "United Kingdom",
                "postcode": "3DG"
            }
        }
    ]
    ...
}

most_fields 的问题

它被设计用来找到匹配任意单词的多数字段，而不是找到跨越所有字段的最匹配的单词。

它不能使用operator或者minimum_should_match参数来减少低相关度结果带来的长尾效应。

cross_fields 类型
cross_fields 可以将所有的字段联合成一个大的字段，然后在这个大字段中搜索每个词条，通过混合各字段的逆文档频率(IDF)来解决词频问题，从而完美解决了 most_fields 的问题。

{
  "query": {
    "multi_match": {
      "query": "Poland Street W1V",
      "type": "cross_fields", 
      "operator": "and", 
      "fields": ["street", "city", "country", "postcode"]
    }
  }
}

phrase 类型
phrase 类型能够让我们在多个字段间进行短语查找。

{
    "query": {
        "multi_match" : {
            "query": "hello world",
            "fields": ["title", "content"],
            "type": "phrase",
            "slop": 1
        }
    }
}