多字段特性及Mapping中配置自定义Analyzer

最新推荐文章于 2023-10-08 23:30:00 发布

原创 · 最新推荐文章于 2023-10-08 23:30:00 发布 · 182 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#elasticsearch

ES学习笔记专栏收录该内容

39 篇文章

订阅专栏

本文通过示例展示了如何在Elasticsearch中配置和使用自定义Analyzer，包括char_filter的映射替换、正则表达式处理、停用词过滤等，详细解析了Analyzer在文本分析过程中的作用。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

// Strip HTML markup with the html_strip char filter; the keyword tokenizer
// then emits the remaining text as one single token.
POST _analyze
{
  "tokenizer": "keyword",
  "char_filter": [
    "html_strip"
  ],
  "text": "<b>hello world</b>"
}

// Use a mapping char filter to substitute characters before tokenizing:
// every "-" in the input is rewritten to "_".
POST _analyze
{
  "tokenizer": "standard",
  "char_filter": [
    {
      "type": "mapping",
      "mappings": [
        "- => _"
      ]
    }
  ],
  "text": "123-456,I-test! test-990 650-555-1234"
}

// Use a mapping char filter to replace emoticons with words
// (":)" becomes "happy", ":(" becomes "sad").
POST _analyze
{
  "tokenizer": "standard",
  "char_filter": [
    {
      "type": "mapping",
      "mappings": [
        ":) => happy",
        ":( => sad"
      ]
    }
  ],
  "text": [
    "I am felling :)",
    "Felling :( today"
  ]
}

// Regular-expression char filter: the pattern captures everything after the
// "http://" scheme, and the "$1" replacement keeps only that captured group,
// so the scheme is removed before the text is tokenized.
// The sample text must start with "http://" for the pattern to match.
GET _analyze
{
  "tokenizer": "standard",
  "char_filter": [
    {
      "type": "pattern_replace",
      "pattern": "http://(.*)",
      "replacement": "$1"
    }
  ],
  "text": "http://www.elastic.co"
}

// Tokenize a file-system style path with the path_hierarchy tokenizer.
POST _analyze
{
  "tokenizer": "path_hierarchy",
  "text": "/user/ymruan/a/b/c/d/e"
}

// whitespace tokenizer combined with the stop token filter
// (no lowercase filter is applied in this request).
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    "stop"
  ],
  "text": [
    "The rain in Spain falls mainly on the plain."
  ]
}

// With the lowercase filter placed ahead of stop, "The" is lowercased first
// and is then removed as a stopword.
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    "lowercase",
    "stop"
  ],
  "text": [
    "The girls in China are playing this game!"
  ]
}

// Drop any existing my_index so it can be recreated with fresh settings.
DELETE my_index

// Create my_index with a custom analyzer, my_custom_analyzer, built from:
//   - char filter "emoticons": maps ":)" / ":(" to "_happy_" / "_sad_"
//   - tokenizer "punctuation": pattern tokenizer using "[ .,!?]"
//   - token filters: lowercase, then "english_stop" (stop filter, _english_ list)
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": [
            "emoticons"
          ],
          "tokenizer": "punctuation",
          "filter": [
            "lowercase",
            "english_stop"
          ]
        }
      },
      "tokenizer": {
        "punctuation": {
          "type": "pattern",
          "pattern": "[ .,!?]"
        }
      },
      "char_filter": {
        "emoticons": {
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}