Lucene-2.2.0 源代码阅读学习(28)

pavel

浏览: 942123 次
性别:
来自: 北京

最近访客更多访客>>

macmilan

just_Word

沈寅麟

spedit

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

lucene

lucene Apache 网络协议虚拟机 F#

关于检索的核心IndexSearcher类。

IndexSearcher是Lucene的检索实现的最核心的实现类，它继承自抽象类Searcher，该抽象类中包含了用于检索的一些核心的方法的实现。而Searcher抽象类又实现了Searchable接口，Searchable接口是实现检索的抽象网络协议，可以基于此协议来实现对远程服务器上的索引目录的访问。这一点，可以从Searchable接口所继承的java.rmi.Remote接口来说明。

java.rmi.Remote接口在JDK中给出了说明，如下所示：

也就是说，继承java.rmi.Remote的接口具有的特性是：

1、Remote接口用来标识那些方法可以被非本地虚拟机调用的接口，即继承了java.rmi.Remote的接口；

2、继承java.rmi.Remote的接口类具有远程可用的特性；

3、实现了java.rmi.Remote接口的子接口的实现类，可以对远程对象进行管理。

下面就对与检索相关的一些接口及一些抽象类做一个概览，有助于后面对这些接口的实现类进行学习研究：

Searchable接口类

Searchable接口的实现如下所示：

package org.apache.lucene.search;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.CorruptIndexException;

import java.io.IOException;

// Search contract that extends java.rmi.Remote, so that it can be used as a
// remote interface (callable from a non-local virtual machine).
public interface Searchable extends java.rmi.Remote {
/* Core search method, taking a Weight and a Filter. Because it returns void, the
 * matching Documents are delivered to the supplied HitCollector instead of being
 * returned. NOTE(review): the collector is said to receive the Documents whose
 * score is greater than 0 - confirm against the implementing class. */
void search(Weight weight, Filter filter, HitCollector results)
throws IOException;

// Releases the resources associated with this searcher.
void close() throws IOException;

// Returns the number of Documents that contain the given term.
int docFreq(Term term) throws IOException;

// Returns, for each term in the array, the number of Documents that contain it.
int[] docFreqs(Term[] terms) throws IOException;

// Returns an int: one greater than the largest possible Document number.
int maxDoc() throws IOException;

// Search method that returns the top n hits (the n best-scoring Documents).
TopDocs search(Weight weight, Filter filter, int n) throws IOException;

// Returns the Document numbered i. Note that i is the internal document number;
// e.g. running System.out.println(searcher.doc(24)); in the author's test program printed
// Document<stored/uncompressed,indexed<path:E:\Lucene\txt1\mytxt\FAQ.txt> stored/uncompressed,indexed<modified:200604130754>>
Document doc(int i) throws CorruptIndexException, IOException;

// Returns the Document at position n. FieldSelector acts much like a file filter;
// it has one method: FieldSelectorResult accept(String fieldName);
// NOTE(review): presumably the selector decides which fields get loaded - confirm.
Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;

// Rewrites the given Query and returns the rewritten Query.
Query rewrite(Query query) throws IOException;

// Returns an Explanation relating to how the score of Document doc is computed
// for the given Weight.
Explanation explain(Weight weight, int doc) throws IOException;

// Like the top-n search above, but with an explicit Sort order; returns the
// top n Documents under that ordering.
TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort)
throws IOException;

}

Searcher抽象类

package org.apache.lucene.search;

import java.io.IOException;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.document.Document;

/**
 * Abstract base class for searchers. It implements {@link Searchable} and adds
 * Query-based convenience overloads that convert the Query into a Weight (via
 * {@link #createWeight(Query)}) and delegate to the Weight-based methods, which
 * subclasses must provide.
 */
public abstract class Searcher implements Searchable {

  /**
   * The Similarity used by this searcher. Initialised to the global default so
   * that {@link #getSimilarity()} has a value before {@link #setSimilarity} is called.
   */
  private Similarity similarity = Similarity.getDefault();

  /**
   * Returns the Hits for the documents matching {@code query}, without a filter.
   *
   * @param query the query to run
   * @return a new Hits instance bound to this searcher
   * @throws IOException declared for implementations that read the index
   */
  public final Hits search(Query query) throws IOException {
    // The cast selects the (Query, Filter) overload rather than (Query, Sort).
    return search(query, (Filter) null);
  }

  /** Returns the Hits for the documents matching {@code query} and {@code filter}. */
  public Hits search(Query query, Filter filter) throws IOException {
    return new Hits(this, query, filter);
  }

  /** Returns the Hits for the documents matching {@code query}, ordered by {@code sort}. */
  public Hits search(Query query, Sort sort) throws IOException {
    return new Hits(this, query, null, sort);
  }

  /** Returns the Hits for the documents matching {@code query} and {@code filter}, ordered by {@code sort}. */
  public Hits search(Query query, Filter filter, Sort sort) throws IOException {
    return new Hits(this, query, filter, sort);
  }

  /**
   * Returns the top {@code n} documents for {@code query} under the given sort order.
   * Delegates to the abstract
   * {@code TopFieldDocs search(Weight, Filter, int, Sort)} declared by Searchable.
   */
  public TopFieldDocs search(Query query, Filter filter, int n, Sort sort) throws IOException {
    return search(createWeight(query), filter, n, sort);
  }

  /** Runs {@code query} without a filter, reporting matches to {@code results}. */
  public void search(Query query, HitCollector results) throws IOException {
    search(query, (Filter) null, results);
  }

  /**
   * Runs {@code query} restricted by {@code filter}, reporting matches to
   * {@code results}. Delegates to the abstract Weight-based search method.
   */
  public void search(Query query, Filter filter, HitCollector results) throws IOException {
    search(createWeight(query), filter, results);
  }

  /**
   * Returns the top {@code n} documents for {@code query} restricted by
   * {@code filter}. Delegates to the abstract
   * {@code TopDocs search(Weight, Filter, int)}.
   */
  public TopDocs search(Query query, Filter filter, int n) throws IOException {
    return search(createWeight(query), filter, n);
  }

  /**
   * Returns the Explanation for document {@code doc} against {@code query},
   * by delegating to the Weight-based {@code explain}.
   */
  public Explanation explain(Query query, int doc) throws IOException {
    return explain(createWeight(query), doc);
  }

  /** Sets the Similarity used by this searcher. */
  public void setSimilarity(Similarity similarity) {
    this.similarity = similarity;
  }

  /** Returns the Similarity used by this searcher. */
  public Similarity getSimilarity() {
    return this.similarity;
  }

  /**
   * Creates the Weight for {@code query} by calling {@code query.weight(this)}.
   * The Weight is what the low-level search methods operate on.
   */
  protected Weight createWeight(Query query) throws IOException {
    return query.weight(this);
  }

  /**
   * Implements {@link Searchable#docFreqs(Term[])} by calling
   * {@link #docFreq(Term)} once per term, preserving the input order.
   *
   * @param terms the terms to look up
   * @return an array of the same length as {@code terms} holding each term's docFreq
   */
  public int[] docFreqs(Term[] terms) throws IOException {
    int[] result = new int[terms.length];
    for (int i = 0; i < terms.length; i++) {
      result[i] = docFreq(terms[i]);
    }
    return result;
  }

  // Abstract re-declarations of methods listed in the Searchable interface;
  // concrete searchers must implement them.
  abstract public void search(Weight weight, Filter filter, HitCollector results) throws IOException;
  abstract public void close() throws IOException;
  abstract public int docFreq(Term term) throws IOException;
  abstract public int maxDoc() throws IOException;
  abstract public TopDocs search(Weight weight, Filter filter, int n) throws IOException;
  abstract public Document doc(int i) throws CorruptIndexException, IOException;
  abstract public Query rewrite(Query query) throws IOException;
  abstract public Explanation explain(Weight weight, int doc) throws IOException;
  abstract public TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort) throws IOException;
}

Weight接口类

创建一个Weight的目的是，使得一个已经定制的Query实例不在检索过程中被修改，从而该Query实例可以被重用，而无需重复创建。

一个Query实例是独立于IndexSearcher检索器的；而Query中依赖于IndexSearcher的那部分状态，应该被记录在一个Weight中。

Weight接口的源代码如下所示：

package org.apache.lucene.search;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;

// Serializable per-query weight object; the methods below expose the Query it
// belongs to, its weight value and normalization, and create the Scorer.
public interface Weight extends java.io.Serializable {
// Returns the Query instance this Weight is associated with.
Query getQuery();

// Returns the weight value of the Query.
float getValue();

/** The sum of squared weights of contained query clauses. */
float sumOfSquaredWeights() throws IOException;

// Sets the normalization factor for the Query.
void normalize(float norm);

// Creates a Scorer for this Weight (the Scorer deals with Document scores).
Scorer scorer(IndexReader reader) throws IOException;

// Returns an Explanation recording the score computed for the Document numbered doc.
Explanation explain(IndexReader reader, int doc) throws IOException;
}

HitCollector抽象类

package org.apache.lucene.search;

// Abstract class used to collect the Documents found by a search.
public abstract class HitCollector {
// Called with a Document's number and its score; implementations decide here
// which of the reported Documents to keep.
public abstract void collect(int doc, float score);
}

Scorer抽象类

package org.apache.lucene.search;

import java.io.IOException;

/**
 * Base class for stepping through the Documents that match a Query and
 * producing a score for each of them.
 */
public abstract class Scorer {
  private Similarity similarity;

  /**
   * Constructs a Scorer.
   *
   * @param similarity the Similarity later returned by {@link #getSimilarity()}
   */
  protected Scorer(Similarity similarity) {
    this.similarity = similarity;
  }

  /** Returns the Similarity this Scorer was constructed with. */
  public Similarity getSimilarity() {
    return similarity;
  }

  /**
   * Reports every remaining match to the collector: calls {@link #next()}
   * repeatedly and, for as long as it returns true, passes the current
   * document number and score to {@code hc}.
   *
   * @param hc the collector receiving each (doc, score) pair
   */
  public void score(HitCollector hc) throws IOException {
    for (boolean hasMatch = next(); hasMatch; hasMatch = next()) {
      hc.collect(doc(), score());
    }
  }

  /**
   * Reports matches whose document number is below {@code max}. Unlike
   * {@link #score(HitCollector)}, this reads {@link #doc()} before the first
   * call to {@link #next()}, so it starts from the current document.
   *
   * @param hc  the collector receiving each (doc, score) pair
   * @param max exclusive upper bound on the document numbers to collect
   * @return false if {@link #next()} ran out of matches, true if the loop
   *         stopped because the current document number reached {@code max}
   */
  protected boolean score(HitCollector hc, int max) throws IOException {
    boolean more = true;
    // Short-circuit keeps doc() from being called once next() has returned false.
    while (more && doc() < max) {
      hc.collect(doc(), score());
      more = next();
    }
    return more;
  }

  /** Advances to the next document matching the query. */
  public abstract boolean next() throws IOException;

  /** Returns the number of the current document. */
  public abstract int doc();

  /** Returns the score of the current matching document. */
  public abstract float score() throws IOException;

  /** Skips to the first match beyond the current whose document number is
   * greater than or equal to a given target.
   * When this method is used the {@link #explain(int)} method should not be used.
   * @param target The target document number.
   * @return true iff there is such a match.
   * Behaves as if written: <pre>
   * boolean skipTo(int target) {
   *   do {
   *     if (!next())
   *       return false;
   *   } while (target > doc());
   *   return true;
   * }
   * </pre>Most implementations are considerably more efficient than that.
   */
  public abstract boolean skipTo(int target) throws IOException;

  /**
   * Returns an Explanation for the given document number.
   * NOTE(review): presumably describes how that document's score is computed - confirm.
   */
  public abstract Explanation explain(int doc) throws IOException;

}

Similarity抽象类

关于该抽象类的说明，可以参考源代码说明，如下所示：

org.apache.lucene.search.Similarity

Expert: Scoring API.

Subclasses implement search scoring.

The score of query q for document d correlates to the cosine-distance or dot-product between document and query vectors in a Vector Space Model (VSM) of Information Retrieval. A document whose vector is closer to the query vector in that model is scored higher. The score is computed as follows:

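$$\mathrm{score}(q,d) = \mathrm{coord}(q,d) \cdot \mathrm{queryNorm}(q) \cdot \sum_{t \in q} \Big( \mathrm{tf}(t \text{ in } d) \cdot \mathrm{idf}(t)^2 \cdot t.\mathrm{getBoost}() \cdot \mathrm{norm}(t,d) \Big)$$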

where

tf(t in d) correlates to the term's frequency, defined as the number of times term t appears in the currently scored document d. Documents that have more occurrences of a given term receive a higher score. The default computation for tf(t in d) in DefaultSimilarity is:

$$\mathrm{tf}(t \text{ in } d) = \mathit{frequency}^{1/2}$$
idf(t) stands for Inverse Document Frequency. This value correlates to the inverse of docFreq (the number of documents in which the term t appears). This means rarer terms give higher contribution to the total score. The default computation for idf(t) in DefaultSimilarity is:

$$\mathrm{idf}(t) = 1 + \log\left( \frac{\mathit{numDocs}}{\mathit{docFreq} + 1} \right)$$
coord(q,d) is a score factor based on how many of the query terms are found in the specified document. Typically, a document that contains more of the query's terms will receive a higher score than another document with fewer query terms. This is a search time factor computed in coord(q,d) by the Similarity in effect at search time.
queryNorm(q) is a normalizing factor used to make scores between queries comparable. This factor does not affect document ranking (since all ranked documents are multiplied by the same factor), but rather just attempts to make scores from different queries (or even different indexes) comparable. This is a search time factor computed by the Similarity in effect at search time. The default computation in DefaultSimilarity is:

$$\mathrm{queryNorm}(q) = \mathrm{queryNorm}(\mathit{sumOfSquaredWeights}) = \frac{1}{\mathit{sumOfSquaredWeights}^{1/2}}$$

The sum of squared weights (of the query terms) is computed by the query org.apache.lucene.search.Weight object. For example, a boolean query computes this value as:

$$\mathit{sumOfSquaredWeights} = q.\mathrm{getBoost}()^2 \cdot \sum_{t \in q} \big( \mathrm{idf}(t) \cdot t.\mathrm{getBoost}() \big)^2$$
t.getBoost() is a search time boost of term t in the query q as specified in the query text (see query syntax), or as set by application calls to setBoost(). Notice that there is really no direct API for accessing a boost of one term in a multi term query, but rather multi terms are represented in a query as multi TermQuery objects, and so the boost of a term in the query is accessible by calling the sub-query getBoost().
norm(t,d) encapsulates a few (indexing time) boost and length factors:
- Document boost - set by calling doc.setBoost() before adding the document to the index.
- Field boost - set by calling field.setBoost() before adding the field to a document.
- lengthNorm(field) - computed when the document is added to the index in accordance with the number of tokens of this field in the document, so that shorter fields contribute more to the score. LengthNorm is computed by the Similarity class in effect at indexing.
When a document is added to the index, all the above factors are multiplied. If the document has multiple fields with the same name, all their boosts are multiplied together:

$$\mathrm{norm}(t,d) = doc.\mathrm{getBoost}() \cdot \mathrm{lengthNorm}(\mathit{field}) \cdot \prod_{\text{field } f \text{ in } d \text{ named as } t} f.\mathrm{getBoost}()$$

However the resulted norm value is encoded as a single byte before being stored. At search time, the norm byte value is read from the index directory and decoded back to a float norm value. This encoding/decoding, while reducing index size, comes with the price of precision loss - it is not guaranteed that decode(encode(x)) = x. For instance, decode(encode(0.89)) = 0.75. Also notice that search time is too late to modify this norm part of scoring, e.g. by using a different Similarity for search.

See Also:

setDefault(Similarity)

org.apache.lucene.index.IndexWriter.setSimilarity(Similarity)

Searcher.setSimilarity(Similarity)

该抽象类的源代码如下所示：

package org.apache.lucene.search;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.SmallFloat;

import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;

/**
 * Scoring API: subclasses supply the individual scoring factors (tf, idf,
 * coord, lengthNorm, queryNorm, sloppyFreq). This base class holds the
 * process-wide default implementation and the byte/float norm codec.
 */
public abstract class Similarity implements Serializable {
  /** The process-wide default; DefaultSimilarity is a subclass of Similarity. */
  private static Similarity defaultImpl = new DefaultSimilarity();

  /** Replaces the process-wide default Similarity. */
  public static void setDefault(Similarity similarity) {
    Similarity.defaultImpl = similarity;
  }

  /** Returns the process-wide default Similarity. */
  public static Similarity getDefault() {
    return Similarity.defaultImpl;
  }

  /** Decoded norm value for each of the 256 possible encoded bytes. */
  private static final float[] NORM_TABLE = new float[256];

  static {
    // Filled once at class-initialisation time, so decoding is a table lookup.
    for (int code = 0; code < NORM_TABLE.length; code++) {
      NORM_TABLE[code] = SmallFloat.byte315ToFloat((byte) code);
    }
  }

  /**
   * Decodes a normalization factor (byte to float) by table lookup.
   *
   * @param b the encoded norm byte
   * @return the float stored in the table for that byte
   */
  public static float decodeNorm(byte b) {
    // Mask to 0..255 so that negative bytes index the upper half of the table.
    final int index = b & 0xFF;
    return NORM_TABLE[index];
  }

  /**
   * Returns the norm decoding table itself (the internal array, not a copy).
   */
  public static float[] getNormDecoder() {
    return NORM_TABLE;
  }

  /**
   * Computes the length normalization factor for the Field named
   * {@code fieldName}, given the number of tokens {@code numTokens} it contains.
   */
  public abstract float lengthNorm(String fieldName, int numTokens);

  /**
   * Computes the normalization factor for a Query, given the sum of the
   * squared weights of its terms.
   */
  public abstract float queryNorm(float sumOfSquaredWeights);

  /**
   * Encodes a normalization factor for storage in the index (float to byte),
   * using {@code SmallFloat.floatToByte315}.
   */
  public static byte encodeNorm(float f) {
    final byte encoded = SmallFloat.floatToByte315(f);
    return encoded;
  }

  /**
   * Convenience overload of {@link #tf(float)}: widens {@code freq} to float
   * and delegates.
   */
  public float tf(int freq) {
    final float widened = freq;
    return tf(widened);
  }

  /** Computes the amount of a sloppy phrase match, based on an edit distance.
   * This value is summed for each sloppy phrase match in a document to form
   * the frequency that is passed to {@link #tf(float)}.
   *
   * A phrase match with a small edit distance to a document passage more
   * closely matches the document, so implementations of this method usually
   * return larger values when the edit distance is small and smaller values
   * when it is large.
   *
   * @see PhraseQuery#setSlop(int)
   * @param distance the edit distance of this sloppy phrase match
   * @return the frequency increment for this match
   */
  public abstract float sloppyFreq(int distance);

  /** Computes a score factor based on a term or phrase's frequency in a
   * document. This value is multiplied by the {@link #idf(Term, Searcher)}
   * factor for each term in the query and these products are then summed to
   * form the initial score for a document.
   *
   * Terms and phrases repeated in a document indicate the topic of the
   * document, so implementations of this method usually return larger values
   * when <code>freq</code> is large, and smaller values when <code>freq</code>
   * is small.
   *
   * @param freq the frequency of a term within a document
   * @return a score factor based on a term's within-document frequency
   */
  public abstract float tf(float freq);

  /** Computes a score factor for a simple term.
   *
   * The default implementation is:<pre>
   * return idf(searcher.docFreq(term), searcher.maxDoc());
   * </pre>
   *
   * Note that {@link Searcher#maxDoc()} is used instead of
   * {@link org.apache.lucene.index.IndexReader#numDocs()} because it is proportional to
   * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate,
   * so is the other, and in the same direction.
   *
   * @param term the term in question
   * @param searcher the document collection being searched
   * @return a score factor for the term
   */
  public float idf(Term term, Searcher searcher) throws IOException {
    final int termDocFreq = searcher.docFreq(term);
    final int docCount = searcher.maxDoc();
    return idf(termDocFreq, docCount);
  }

  /**
   * Computes a score factor for a phrase: the sum of
   * {@link #idf(Term, Searcher)} over every Term in {@code terms}.
   *
   * @param terms the Terms of the phrase (each element is cast to Term)
   * @param searcher the document collection being searched
   * @return the summed idf, or 0 for an empty collection
   */
  public float idf(Collection terms, Searcher searcher) throws IOException {
    float total = 0.0f;
    for (Iterator it = terms.iterator(); it.hasNext();) {
      final Term term = (Term) it.next();
      total += idf(term, searcher);
    }
    return total;
  }

  /** Computes a score factor based on a term's document frequency (the number
   * of documents which contain the term). This value is multiplied by the
   * {@link #tf(int)} factor for each term in the query and these products are
   * then summed to form the initial score for a document.
   */
  public abstract float idf(int docFreq, int numDocs);

  /** Computes a score factor based on the fraction of all query terms that a
   * document contains. This value is multiplied into scores.
   */
  public abstract float coord(int overlap, int maxOverlap);

  /**
   * Calculate a scoring factor based on the data in the payload. Overriding implementations
   * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
   * what is in the byte array.
   *
   * This base implementation ignores its arguments and returns 1.
   */
  public float scorePayload(byte[] payload, int offset, int length) {
    return 1.0f;
  }

}

分享到：

Lucene-2.2.0 源代码阅读学习(29) | Lucene-2.2.0 源代码阅读学习(27)

2009-02-06 14:48
浏览 1315
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论