Elasticsearch SynonymTokenFilterFactory source code

The factory below (org.elasticsearch.index.analysis.SynonymTokenFilterFactory) reads synonym rules from either the inline synonyms setting or a synonyms_path file, parses them into a Lucene SynonymMap, and wraps token streams with SynonymFilter at analysis time.
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;

import java.io.Reader;
import java.util.List;
import java.util.Map;

@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;
    private final boolean ignoreCase;

    @Inject
    public SynonymTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, IndicesAnalysisService indicesAnalysisService, Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                     @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);
        // reader over the raw synonym rules
        Reader rulesReader = null;
        // inline rules configured via the "synonyms" setting
        if (settings.getAsArray("synonyms", null) != null) {
            List<String> rules = Analysis.getWordList(env, settings, "synonyms");
            StringBuilder sb = new StringBuilder();
            for (String line : rules) {
                sb.append(line).append(System.getProperty("line.separator"));
            }
            rulesReader = new FastStringReader(sb.toString());
        // rules loaded from the file configured via "synonyms_path"
        } else if (settings.get("synonyms_path") != null) {
            // open a reader over the file at the configured path
            rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
        } else {
            throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }

        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        // tokenizer used to analyze the rule text itself (defaults to whitespace)
        String tokenizerName = settings.get("tokenizer", "whitespace");


        // look up the TokenizerFactoryFactory, falling back to the built-in analysis registry
        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
        if (tokenizerFactoryFactory == null) {
            tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
        }
        if (tokenizerFactoryFactory == null) {
            throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
        }

        final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName, Settings.builder().put(indexSettingsService.getSettings()).put(settings).build());

        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };

        try {
            SynonymMap.Builder parser = null;

            if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
                parser = new WordnetSynonymParser(true, expand, analyzer);
                // parse the synonym rules in WordNet prolog format
                ((WordnetSynonymParser) parser).parse(rulesReader);
            } else {
                parser = new SolrSynonymParser(true, expand, analyzer);
                ((SolrSynonymParser) parser).parse(rulesReader);
            }

            synonymMap = parser.build();
        } catch (Exception e) {
            throw new IllegalArgumentException("failed to build synonyms", e);
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // a null FST means no synonym rules were registered, so return the stream untouched;
        // otherwise wrap it with Lucene's SynonymFilter
        return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
    }
}
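To see the same flow outside of Elasticsearch, here is a minimal sketch that mirrors the non-wordnet branch of the factory: it parses Solr-format rules into a SynonymMap and wraps a token stream with SynonymFilter. The class name, rule strings and field name are made up, and it assumes the same Lucene 5.x-era classes imported above.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SolrSynonymSketch {

    public static void main(String[] args) throws Exception {
        // Solr-format rules, the same format the inline "synonyms" setting accepts (made-up rules).
        String rules = "i-pod, ipod => ipod\nuniverse, cosmos\n";

        // Analyzer used to tokenize the rule text; the factory defaults to a whitespace tokenizer.
        Analyzer analyzer = new WhitespaceAnalyzer();

        // dedup = true and expand = true, matching the factory's defaults.
        SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
        parser.parse(new StringReader(rules));
        SynonymMap synonymMap = parser.build();

        // Wrap a token stream the same way SynonymTokenFilterFactory#create does
        // (ignoreCase = false, the factory's default).
        try (TokenStream out = new SynonymFilter(
                analyzer.tokenStream("field", "the cosmos is big"), synonymMap, false)) {
            CharTermAttribute term = out.addAttribute(CharTermAttribute.class);
            out.reset();
            while (out.incrementToken()) {
                System.out.println(term.toString());
            }
            out.end();
        }
    }
}

With expand = true, both "cosmos" and the injected "universe" should come through at the same position, which is what lets either term match at index and query time.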

The synonyms_path branch above delegates to Analysis.getReaderFromFile, which resolves the configured path against the node's config directory and returns a UTF-8 reader over the file:
    /**
     * Opens a reader over the synonym rules file referenced by the given setting.
     *
     * @return <code>null</code> if no value is set for "settingPrefix"
     * @throws IllegalArgumentException if the reader can not be instantiated
     */
    public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) {
        String filePath = settings.get(settingPrefix, null);

        if (filePath == null) {
            return null;
        }

        final Path path = env.configFile().resolve(filePath);

        try {
            return FileSystemUtils.newBufferedReader(path.toUri().toURL(), Charsets.UTF_8);
        } catch (IOException ioe) {
            String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
            throw new IllegalArgumentException(message);
        }
    }
If the format setting is "wordnet", the rules are parsed by Lucene's WordnetSynonymParser instead of SolrSynonymParser; its source follows:
package org.apache.lucene.analysis.synonym;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * Parser for the WordNet prolog format; a SynonymMap.Parser subclass.
 * <p>
 * See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
 * @lucene.experimental
 */
// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
public class WordnetSynonymParser extends SynonymMap.Parser {
  private final boolean expand;
  
  public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
  }

  @Override
  public void parse(Reader in) throws IOException, ParseException {
    // parse the input line by line
    LineNumberReader br = new LineNumberReader(in);
    try {
      String line = null;
      String lastSynSetID = "";
      CharsRef synset[] = new CharsRef[8];
      int synsetSize = 0;
      
      while ((line = br.readLine()) != null) {
        String synSetID = line.substring(2, 11);

        if (!synSetID.equals(lastSynSetID)) {
          addInternal(synset, synsetSize);
          synsetSize = 0;
        }

        if (synset.length <= synsetSize+1) {
          synset = Arrays.copyOf(synset, synset.length * 2);
        }
        
        synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
        synsetSize++;
        lastSynSetID = synSetID;
      }
      
      // final synset in the file
      addInternal(synset, synsetSize);
    } catch (IllegalArgumentException e) {
      ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
      ex.initCause(e);
      throw ex;
    } finally {
      br.close();
    }
  }
 
  private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException {
    if (reuse == null) {
      reuse = new CharsRefBuilder();
    }
    
    int start = line.indexOf('\'')+1;
    int end = line.lastIndexOf('\'');
    
    String text = line.substring(start, end).replace("''", "'");
    return analyze(text, reuse);
  }
  
  private void addInternal(CharsRef synset[], int size) {
    if (size <= 1) {
      return; // nothing to do
    }
    
    if (expand) {
      for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
          add(synset[i], synset[j], false);
        }
      }
    } else {
      for (int i = 0; i < size; i++) {
        add(synset[i], synset[0], false);
      }
    }
  }
}
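For reference, here is a small sketch of the input this parser expects; the synset IDs and words below are invented and the class name is illustrative. Each wordnet prolog s(...) fact carries a 9-digit synset ID starting at offset 2, which is exactly what line.substring(2, 11) extracts, and the word sits between the first and last single quote.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;

public class WordnetSketch {

    public static void main(String[] args) throws Exception {
        // Three entries of one synset followed by a single-entry synset (made-up IDs and words).
        // The 9-digit synset ID sits at offsets 2..10 of each line.
        String wordnet =
              "s(100000001,1,'car',n,1,0).\n"
            + "s(100000001,2,'auto',n,1,0).\n"
            + "s(100000001,3,'automobile',n,1,0).\n"
            + "s(100000002,1,'tree',n,1,0).\n";

        Analyzer analyzer = new WhitespaceAnalyzer();

        // dedup = true, expand = true: every member of a synset maps to every other member.
        WordnetSynonymParser parser = new WordnetSynonymParser(true, true, analyzer);
        parser.parse(new StringReader(wordnet));
        SynonymMap map = parser.build();

        // A non-null FST means at least one rule was registered (compare create() in the factory above).
        System.out.println(map.fst != null);   // true
    }
}

Lines sharing a synset ID are collected into one synset; once the ID changes, addInternal registers every member against every other member when expand is true, or against the first member only otherwise, and single-word synsets (like "tree" above) are skipped.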

 
