/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;

import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Map;

/**
 * Factory for the {@code synonym} token filter. Builds a Lucene {@link SynonymMap}
 * from rules supplied either inline via the {@code synonyms} setting or from a file
 * via {@code synonyms_path}, and wraps token streams in a {@link SynonymFilter}.
 */
@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;
    private final boolean ignoreCase;

    @Inject
    public SynonymTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env,
                                     IndicesAnalysisService indicesAnalysisService,
                                     Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                     @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);

        // Reader over the raw synonym rules; one of the two settings must be present.
        Reader rulesReader;
        if (settings.getAsArray("synonyms", null) != null) {
            // Inline rules: join the configured lines into a single in-memory reader.
            List<String> rules = Analysis.getWordList(env, settings, "synonyms");
            StringBuilder sb = new StringBuilder();
            for (String line : rules) {
                sb.append(line).append(System.lineSeparator());
            }
            rulesReader = new FastStringReader(sb.toString());
        } else if (settings.get("synonyms_path") != null) {
            // File-based rules resolved via the environment.
            rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
        } else {
            throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }

        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        // Tokenizer used to analyze the synonym rules themselves (defaults to whitespace).
        String tokenizerName = settings.get("tokenizer", "whitespace");
        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
        if (tokenizerFactoryFactory == null) {
            // Fall back to the node-level (pre-built) tokenizers.
            tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
        }
        if (tokenizerFactoryFactory == null) {
            throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
        }
        final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName,
                Settings.builder().put(indexSettingsService.getSettings()).put(settings).build());

        // Analyzer applied to each rule; lowercases when ignore_case is set so rules
        // match the case-folded token stream.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };

        try {
            // Declared as the common Parser supertype so no casts are needed to parse.
            SynonymMap.Parser parser;
            if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
                parser = new WordnetSynonymParser(true, expand, analyzer);
            } else {
                // Default format is Solr.
                parser = new SolrSynonymParser(true, expand, analyzer);
            }
            parser.parse(rulesReader);
            synonymMap = parser.build();
        } catch (Exception e) {
            throw new IllegalArgumentException("failed to build synonyms", e);
        } finally {
            // BUGFIX: the rules reader (possibly a file reader) was previously never closed.
            try {
                rulesReader.close();
            } catch (IOException ignored) {
                // best-effort close; the rules have already been consumed (or parsing failed)
            }
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // A null fst means the rule set produced no mappings; pass the stream through untouched.
        return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
    }
}
/**
 * Resolves the value of {@code settingPrefix} as a file path under the environment's
 * config directory and opens a buffered UTF-8 reader over it.
 *
 * @return <code>null</code> if no value is set for {@code settingPrefix}.
 * @throws IllegalArgumentException if the reader can not be instantiated.
 */
public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) {
    String filePath = settings.get(settingPrefix, null);
    if (filePath == null) {
        return null;
    }
    final Path path = env.configFile().resolve(filePath);
    try {
        return FileSystemUtils.newBufferedReader(path.toUri().toURL(), Charsets.UTF_8);
    } catch (IOException ioe) {
        String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
        // BUGFIX: chain the original IOException as the cause instead of discarding it,
        // so callers keep the full stack trace of the underlying failure.
        throw new IllegalArgumentException(message, ioe);
    }
}
package org.apache.lucene.analysis.synonym;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
* Parser for wordnet prolog format
* <p>
* See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
* @lucene.experimental SynonymMap 解析子类
*/ // TODO: allow you to specify syntactic categories (e.g. just nouns, etc) public class WordnetSynonymParser extends SynonymMap.Parser { private final boolean expand; public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { super(dedup, analyzer); this.expand = expand; } @Override public void parse(Reader in) throws IOException, ParseException { //一行一行解析 LineNumberReader br = new LineNumberReader(in); try { String line = null; String lastSynSetID = ""; CharsRef synset[] = new CharsRef[8]; int synsetSize = 0; while ((line = br.readLine()) != null) { String synSetID = line.substring(2, 11); if (!synSetID.equals(lastSynSetID)) { addInternal(synset, synsetSize); synsetSize = 0; } if (synset.length <= synsetSize+1) { synset = Arrays.copyOf(synset, synset.length * 2); } synset[synsetSize] = parseSynonym(line, new CharsRefBuilder()); synsetSize++; lastSynSetID = synSetID; } // final synset in the file addInternal(synset, synsetSize); } catch (IllegalArgumentException e) { ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); ex.initCause(e); throw ex; } finally { br.close(); } } private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException { if (reuse == null) { reuse = new CharsRefBuilder(); } int start = line.indexOf('\'')+1; int end = line.lastIndexOf('\''); String text = line.substring(start, end).replace("''", "'"); return analyze(text, reuse); } private void addInternal(CharsRef synset[], int size) { if (size <= 1) { return; // nothing to do } if (expand) { for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { add(synset[i], synset[j], false); } } } else { for (int i = 0; i < size; i++) { add(synset[i], synset[0], false); } } } }
相关推荐
Elasticsearch(简称ES)是一个开源的分布式搜索和分析引擎,最初由Elastic公司创建。它属于Elastic Stack(ELK Stack)的核心组件之一,用于实时地存储、检索和分析大量数据。
这个"SpringBoot整合Elasticsearch完整源码"提供了实现这一目标的详细步骤和代码实例。以下是对相关知识点的详细说明: 1. **Spring Boot**: Spring Boot是由Pivotal Team创建的,旨在简化Spring应用的初始搭建...
**Elasticsearch实战源码详解** Elasticsearch是一款开源的、分布式的全文搜索引擎,由Java编写,设计用于处理海量数据的快速检索。黄申翻译的《Elasticsearch实战》一书,深入浅出地介绍了Elasticsearch的核心概念...
在本例中,我们将详细探讨如何使用Intellij IDEA来编译和调试Elasticsearch 6.1.0源码。Elasticsearch是一个基于Lucene构建的开源搜索引擎,它允许用户快速执行全文搜索、结构化搜索甚至是复杂分析。Intellij IDEA是...
在Elasticsearch 5.0.1的源码中,我们可以看到以下几个重要的知识领域: 1. **Lucene库**:Elasticsearch的核心是基于Apache Lucene构建的,Lucene是一个高性能、全功能的文本搜索库。在源码中,你会看到Lucene如何...
**Elasticsearch(ES)详解** Elasticsearch是一款开源、分布式、实时的全文搜索引擎,它基于Lucene构建,被广泛用于大数据分析、日志聚合、实时搜索和索引等场景。其强大的功能和易用性使得它在IT行业中备受青睐。...
文档中提到的知识点涵盖Elasticsearch源码调试环境的搭建,包括所需的硬件环境、前置软件安装与配置、Elasticsearch源码和安装包的下载、以及项目初始化的详细步骤。 首先,文档指出深入理解一个系统离不开对其源码...
Elasticsearch源码分析 Elasticsearch是一款基于Lucene的分布式、RESTful搜索和数据分析引擎。它的源码解析对于我们深入理解其内部工作原理至关重要。在深入探讨之前,我们需要知道几个核心概念:分布式、RESTful ...
修改源码,打开server/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java 找到 if (Natives.definitelyRunningAsRoot()) 代码,将throw new RuntimeException替换为logger.warn,将异常改为警告日志即可。 ...
**Elasticsearch实战练习源码解析** Elasticsearch(简称ES)是一款强大的开源搜索引擎,它以其分布式、实时、可扩展的特性,在大数据处理和实时分析领域广泛应用。本篇将通过"**ElasticSearchTest**"这个源码文件...
Scrapy + Elasticsearch + Django打造全文搜索引擎源码 Scrapy + Elasticsearch + Django打造全文搜索引擎源码 Scrapy + Elasticsearch + Django打造全文搜索引擎源码 Scrapy + Elasticsearch + Django打造...
《Mastering Elasticsearch Second Edition》是Elasticsearch领域的权威著作,其源码提供了深入理解这个分布式搜索引擎内部机制的宝贵资源。Elasticsearch是一个基于Lucene的开源全文搜索和分析引擎,广泛应用于日志...
这个"SpringBoot整合Elasticsearch完整源码"提供了实现这一目标的详细步骤和代码实例。以下是对相关知识点的详细说明: 1. **Spring Boot**: Spring Boot是由Pivotal Team创建的,旨在简化Spring应用的初始搭建...
**Elasticsearch 6.7.2 源码解析** Elasticsearch 是一个流行的开源搜索引擎,基于 Lucene 库,广泛应用于日志分析、实时数据分析、全文搜索等多个领域。其6.7.2版本是它发展过程中的一个重要里程碑,包含了许多...
**Elasticsearch Analysis IK 5.3.2 源码分析** Elasticsearch是一款流行的开源全文搜索引擎,它提供了一种分布式、实时的搜索和分析引擎。在处理中文文本时,由于中文的特殊性(词语边界不明显),需要借助特定的...
Elasticsearch是一款强大的开源搜索引擎,基于Lucene库构建,广泛应用于大数据分析、日志收集、实时搜索等领域。本文将深入解析Elasticsearch的核心概念、架构原理,并结合实战案例,帮助你全面掌握这一技术。 首先...
Elasticsearch 6.6.1源码分析 Elasticsearch是一个开源的、分布式、全文检索的搜索引擎,它以其高效、灵活和可扩展性而受到广大开发者和企业的青睐。6.6.1版本是Elasticsearch发展过程中的一个重要里程碑,它在6.x...
基于SSM架构结合全文搜索引擎ElasticSearch的电影搜索系统项目源码.zip 基于SSM架构结合全文搜索引擎ElasticSearch的电影搜索系统项目源码.zip 基于SSM架构结合全文搜索引擎ElasticSearch的电影搜索系统项目源码.zip...
ES(elasticSearch6.4.0)之java API源码demo-完整注释版,本版本为上一demo版本升级版,封装了ES的javaAPI,支持了模糊查询,排序查询,解析,分页查询等功能,如果有问题请留言。我会及时回复。