Lucene-2.2.0 源代码阅读学习(17)

pavel

浏览: 941414 次
性别:
来自: 北京

最近访客更多访客>>

macmilan

just_Word

沈寅麟

spedit

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

lucene

lucene Apache thread

根据 Lucene-2.2.0 源代码阅读学习(16) 中对IndexFileDeleter类和CommitPoint类的源代码的阅读学习，在此进行总结：

一个提交点所具有的信息如下所示：

     long gen;    // 下次提交索引段segments_N的版本
    List files;    // 属于当前索引目录的索引段的一个列表
    String segmentsFileName;    // 一个索引段
    boolean deleted;    // 删除标志

一个提交点具有的行为：

1、通过getSegmentsFileName()方法，得到一个索引段文件的名称；

2、通过delete()方法，将deleted标志为false(即还没有被删除)的提交点加入到commitsToDelete列表中；真正的删除操作是在CommitPoint类的外部类IndexFileDeleter类的deleteCommits()方法中完成的；

3、该类的compareTo()实现了自然排序的功能，排序是根据gen = segmentInfos.getGeneration();返回的整数值进行实现的。也就是说，如果把一个个的CommitPoint加入到列表中的时候，它是有序的，可以很方便地获取最早的提交点和最近提交点。

在IndexFileDeleter类和CommitPoint类中，都涉及到了关于索引段Segment的内容，研究一下SegmentInfos类和SegmentInfo类。

先看一下SegmentInfo类的结构，然后再学习代码：

SegmentInfos类实现的源代码：

package org.apache.lucene.index;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Vector;

final class SegmentInfos extends Vector {

// Format marker written as the leading int of the segments.gen file by write().
// NOTE(review): presumably identifies the lock-less commit format introduced in Lucene 2.1 - confirm against the file-formats doc.
public static final int FORMAT_LOCKLESS = -2;

// Format marker that CURRENT_FORMAT currently points at.
// NOTE(review): presumably the format in which norms are stored in a single file - confirm against the file-formats doc.
public static final int FORMAT_SINGLE_NORM_FILE = -3;

// Format marker written at the head of every segments_N file by write(); read() and readCurrentVersion()
// reject any negative marker smaller than this value as unknown. See
// http://lucene.apache.org/java/2_2_0/fileformats.html#Segments%20File
private static final int CURRENT_FORMAT = FORMAT_SINGLE_NORM_FILE;

public int counter = 0;    // persisted in segments_N by write() and restored by read(); presumably used to name new segments - TODO confirm
/**
   * Version stamp of this SegmentInfos. It starts out as the current time in
   * milliseconds and is incremented by write() each time a segments_N file is written.
   */
private long version = System.currentTimeMillis();

private long generation = 0; // the N of "segments_N"; write() advances it before writing, read(Directory) resets it to -1
private long lastGeneration = 0; // the N of the "segments_N" last written by write() or last opened by read()

/**
* Diagnostic stream: when non-null, message() prints thread-prefixed lines to it.
*/
private static PrintStream infoStream;

/**
 * Returns the element stored at position {@code i} of this vector, cast to SegmentInfo.
 *
 * @param i zero-based index into this vector
 * @return the SegmentInfo at that index
 */
public final SegmentInfo info(int i) {
  final Object element = elementAt(i);
  return (SegmentInfo) element;
}

/**
 * Finds the highest generation among the segments files named in {@code files}.
 *
 * @param files file names to inspect; may be null
 * @return the largest N found in any "segments_N" name, or -1 when {@code files}
 *         is null or contains no segments file
 */
public static long getCurrentSegmentGeneration(String[] files) {
  long newest = -1;
  if (files != null) {
    for (int idx = 0; idx < files.length; idx++) {
      final String name = files[idx];
      // Only names with the segments prefix count, and segments.gen is not a commit file.
      if (!name.startsWith(IndexFileNames.SEGMENTS) || name.equals(IndexFileNames.SEGMENTS_GEN)) {
        continue;
      }
      newest = Math.max(newest, generationFromSegmentsFileName(name));
    }
  }
  return newest;
}

/**
 * Lists {@code directory} and returns the highest segments generation found in it.
 *
 * @param directory the index directory to list
 * @return the result of {@link #getCurrentSegmentGeneration(String[])} on the listing
 * @throws IOException if {@code directory.list()} returns null
 */
public static long getCurrentSegmentGeneration(Directory directory) throws IOException {
  final String[] listing = directory.list();
  if (listing != null) {
    return getCurrentSegmentGeneration(listing);
  }
  throw new IOException("cannot read directory " + directory + ": list() returned null");
}

/**
 * Builds the name of the current segments file from a list of file names.
 *
 * @param files file names to inspect
 * @return the name produced by IndexFileNames.fileNameFromGeneration for the
 *         highest generation found in {@code files}
 */
public static String getCurrentSegmentFileName(String[] files) throws IOException {
  final long currentGeneration = getCurrentSegmentGeneration(files);
  return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", currentGeneration);
}

/**
 * Builds the name of the current segments file of an index directory.
 *
 * @param directory the index directory to inspect
 * @return the name produced by IndexFileNames.fileNameFromGeneration for the
 *         highest generation found in {@code directory}
 * @throws IOException if the directory cannot be listed
 */
public static String getCurrentSegmentFileName(Directory directory) throws IOException {
  final long currentGeneration = getCurrentSegmentGeneration(directory);
  return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", currentGeneration);
}

  /**
   * Builds the segments file name that corresponds to {@code lastGeneration}, i.e. the
   * generation recorded by the most recent read or write on this instance.
   *
   * @return the name produced by IndexFileNames.fileNameFromGeneration for lastGeneration
   */
public String getCurrentSegmentFileName() {
  final String currentName =
      IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", lastGeneration);
  return currentName;
}

/**
 * Extracts the generation N from a segments file name.
 *
 * @param fileName a segments file name
 * @return 0 when the name is exactly the bare segments name; otherwise the text after
 *         the prefix and its one-character separator, parsed in radix Character.MAX_RADIX
 * @throws IllegalArgumentException if the name does not start with the segments prefix
 */
public static long generationFromSegmentsFileName(String fileName) {
  if (!fileName.startsWith(IndexFileNames.SEGMENTS)) {
    throw new IllegalArgumentException("fileName \"" + fileName + "\" is not a segments file");
  }
  if (fileName.equals(IndexFileNames.SEGMENTS)) {
    // Bare name with no generation suffix.
    return 0;
  }
  // Skip the prefix plus the single separator character in front of N.
  final String suffix = fileName.substring(IndexFileNames.SEGMENTS.length() + 1);
  return Long.parseLong(suffix, Character.MAX_RADIX);
}

/**
 * Builds the name of the segments file that the next write will create.
 *
 * @return the segments file name for generation 1 when the current generation is -1,
 *         otherwise for the current generation plus one
 */
public String getNextSegmentFileName() {
  final long nextGeneration = (generation == -1) ? 1 : generation + 1;
  return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", nextGeneration);
}

/**
 * Reads the given segments file and appends one SegmentInfo per recorded segment
 * to this vector.
 *
 * On success {@code generation} and {@code lastGeneration} hold the generation parsed
 * from {@code segmentFileName}, and {@code version} / {@code counter} hold the values
 * stored in the file. On any failure the input is closed and this vector is cleared.
 *
 * @param directory       directory that contains the file
 * @param segmentFileName name of the segments file to read
 * @throws CorruptIndexException if the file carries a format marker below CURRENT_FORMAT
 * @throws IOException           if the file cannot be opened or read
 */
public final void read(Directory directory, String segmentFileName) throws CorruptIndexException, IOException {
  boolean success = false;

  IndexInput input = directory.openInput(segmentFileName);

  try {
    // Parsed inside the try block so that a malformed file name cannot leak the open input.
    generation = generationFromSegmentsFileName(segmentFileName);
    lastGeneration = generation;

    int format = input.readInt();
    if (format < 0) {
      // A negative leading int is an explicit format marker.
      if (format < CURRENT_FORMAT) {
        throw new CorruptIndexException("Unknown format version: " + format);
      }
      version = input.readLong();
      counter = input.readInt();
    } else {
      // No format marker: the leading int is the counter itself.
      counter = format;
    }

    // Next int is the number of segments, followed by that many SegmentInfo records.
    for (int i = input.readInt(); i > 0; i--) {
      addElement(new SegmentInfo(directory, format, input));
    }

    if (format >= 0) {
      // Files without a format marker may carry the version at the very end.
      if (input.getFilePointer() >= input.length()) {
        version = System.currentTimeMillis(); // nothing left to read: no stored version
      } else {
        version = input.readLong();
      }
    }
    success = true;
  } finally {
    try {
      input.close();
    } finally {
      // Runs even when close() throws, so a failed read never leaves partial contents behind.
      if (!success) {
        clear();
      }
    }
  }
}

/**
 * Locates the current segments file in {@code directory} and reads it into this instance.
 *
 * Both generation fields are reset to -1 first. The lookup is delegated to
 * FindSegmentsFile.run(), whose callback calls {@link #read(Directory, String)}
 * with the file name it is given.
 *
 * @param directory the index directory to read from
 */
public final void read(Directory directory) throws CorruptIndexException, IOException {

  lastGeneration = -1;
  generation = lastGeneration;

  final FindSegmentsFile finder = new FindSegmentsFile(directory) {

    protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException {
      // Delegate to the two-argument read of the enclosing SegmentInfos.
      read(directory, segmentFileName);
      return null;
    }
  };
  finder.run();
}

/**
 * Writes this SegmentInfos to the next segments_N file in {@code directory} and then
 * records the new generation in the segments.gen file.
 *
 * The generation is always advanced before writing. If writing or closing segments_N
 * fails, the partial file is removed (best effort) and the original exception propagates.
 * A failure to write segments.gen is ignored.
 *
 * @param directory the index directory to write into
 * @throws IOException if segments_N cannot be created, written or closed
 */
public final void write(Directory directory) throws IOException {

  String segmentFileName = getNextSegmentFileName();

  // Always advance the generation on write:
  if (generation == -1) {
    generation = 1;
  } else {
    generation++;
  }

  IndexOutput output = directory.createOutput(segmentFileName);

  boolean success = false;

  try {
    try {
      output.writeInt(CURRENT_FORMAT); // format marker
      output.writeLong(++version);     // bumped version stamp
      output.writeInt(counter);        // counter restored by read()
      output.writeInt(size());         // number of SegmentInfo records that follow
      for (int i = 0; i < size(); i++) {
        info(i).write(output);
      }
    } finally {
      output.close();
    }
    // Reached only when every write AND the close completed normally.
    success = true;
  } finally {
    if (!success) {
      // Try not to leave a truncated segments_N file in the index.
      try {
        directory.deleteFile(segmentFileName);
      } catch (IOException ignored) {
        // Suppressed so the exception that caused the failure keeps propagating.
      }
    }
  }

  try {
    output = directory.createOutput(IndexFileNames.SEGMENTS_GEN);
    try {
      // The generation is written twice after the format marker.
      output.writeInt(FORMAT_LOCKLESS);
      output.writeLong(generation);
      output.writeLong(generation);
    } finally {
      output.close();
    }
  } catch (IOException e) {
    // It's OK if we fail to write this file since it's
    // used only as one of the retry fallbacks.
  }

  lastGeneration = generation;
}

/**
 * Returns a copy of this SegmentInfos in which every contained SegmentInfo has been
 * replaced by its own clone.
 *
 * @return the copied SegmentInfos
 */

public Object clone() {
  final SegmentInfos copy = (SegmentInfos) super.clone();
  final int count = copy.size();
  for (int idx = 0; idx < count; idx++) {
    final SegmentInfo element = (SegmentInfo) copy.elementAt(idx);
    copy.setElementAt(element.clone(), idx);
  }
  return copy;
}

/**
 * Returns the version stamp currently held by this SegmentInfos.
 *
 * @return the value of the {@code version} field
 */
public long getVersion() {
  return this.version;
}
/**
 * Returns the segments generation currently held by this SegmentInfos.
 *
 * @return the value of the {@code generation} field
 */
public long getGeneration() {
  return this.generation;
}

/**
 * Reads the version stamp stored in the current segments file of {@code directory}.
 *
 * When the file starts with a negative format marker, the version is the long that
 * follows it. Otherwise the whole file is loaded through a temporary SegmentInfos
 * and that instance's version is returned.
 *
 * @param directory the index directory to inspect
 * @return the version read from the segments file
 * @throws CorruptIndexException if the format marker is below CURRENT_FORMAT
 */
public static long readCurrentVersion(Directory directory)
    throws CorruptIndexException, IOException {

  final FindSegmentsFile finder = new FindSegmentsFile(directory) {
    protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException {

      IndexInput input = directory.openInput(segmentFileName);

      int formatMarker = 0;
      long storedVersion = 0;
      try {
        formatMarker = input.readInt();
        if (formatMarker < 0) {
          if (formatMarker < CURRENT_FORMAT) {
            throw new CorruptIndexException("Unknown format version: " + formatMarker);
          }
          storedVersion = input.readLong();   // version follows the marker
        }
      } finally {
        input.close();
      }

      if (formatMarker >= 0) {
        // We cannot be sure about the format of the file.
        // Therefore we have to read the whole file and cannot simply seek to the version entry.
        SegmentInfos sis = new SegmentInfos();
        sis.read(directory, segmentFileName);
        return new Long(sis.getVersion());
      }
      return new Long(storedVersion);
    }
  };

  final Long result = (Long) finder.run();
  return result.longValue();
}

/**
 * Sets the stream that {@code message} prints diagnostic lines to.
 *
 * @param infoStream the stream to print to, or null to disable the output
 */
public static void setInfoStream(PrintStream infoStream) {
  SegmentInfos.infoStream = infoStream;
}

/* Advanced configuration of retry logic in loading
segments_N file */
// How many times to try loading the segments.gen contents.
private static int defaultGenFileRetryCount = 10;
// Pause, in milliseconds, between attempts to load segments.gen.
private static int defaultGenFileRetryPauseMsec = 50;
// How many times to try incrementing the generation when loading the segments file.
private static int defaultGenLookaheadCount = 10;

/**
 * Advanced: sets the number of attempts made to load the contents of the
 * segments.gen file when determining the current segment generation. That file
 * is consulted only when the primary method (listing the directory) fails.
 *
 * @param count the new number of attempts
 */
public static void setDefaultGenFileRetryCount(int count) {
  SegmentInfos.defaultGenFileRetryCount = count;
}

/**
 * Returns the configured number of attempts to load the segments.gen file.
 *
 * @return the current value of defaultGenFileRetryCount
 */
public static int getDefaultGenFileRetryCount() {
  return SegmentInfos.defaultGenFileRetryCount;
}

/**
 * Advanced: sets the pause, in milliseconds, between successive attempts to
 * load the segments.gen file.
 *
 * @param msec the new pause in milliseconds
 */
public static void setDefaultGenFileRetryPauseMsec(int msec) {
  SegmentInfos.defaultGenFileRetryPauseMsec = msec;
}

/**
 * Returns the configured pause, in milliseconds, between attempts to load segments.gen.
 *
 * @return the current value of defaultGenFileRetryPauseMsec
 */
public static int getDefaultGenFileRetryPauseMsec() {
  return SegmentInfos.defaultGenFileRetryPauseMsec;
}

/**
 * Advanced: sets how many times the generation is incremented while trying to
 * load the segments file. This step only runs when both the primary method
 * (listing the directory) and the secondary method (opening the segments.gen
 * file) fail to find the segments file.
 *
 * @param count the new number of look-ahead attempts
 */
public static void setDefaultGenLookaheadCount(int count) {
  SegmentInfos.defaultGenLookaheadCount = count;
}

/**
 * Returns the configured number of generation look-ahead attempts.
 *
 * The method name is misspelled ("Lookahed"); it is kept so existing callers keep
 * compiling. New code should call {@code getDefaultGenLookaheadCount()}.
 *
 * @return the current value of defaultGenLookaheadCount
 */
public static int getDefaultGenLookahedCount() {
  return getDefaultGenLookaheadCount();
}

/**
 * Returns the configured number of generation look-ahead attempts; correctly
 * spelled counterpart of {@code setDefaultGenLookaheadCount(int)}.
 *
 * @return the current value of defaultGenLookaheadCount
 */
public static int getDefaultGenLookaheadCount() {
  return defaultGenLookaheadCount;
}

/**
 * Returns the stream that diagnostic messages are printed to.
 *
 * @return the current infoStream, or null when none is set
 */
public static PrintStream getInfoStream() {
  return SegmentInfos.infoStream;
}

/**
 * Prints {@code message} to infoStream, prefixed with the current thread's name
 * and ": ". Does nothing when infoStream is null.
 *
 * @param message the text to print
 */
private static void message(String message) {
  if (infoStream == null) {
    return;
  }
  infoStream.println(Thread.currentThread().getName() + ": " + message);
}

////********这里是FindSegmentsFile抽象静态内部类的定义，可以参考Lucene实现源代码********////
}

从SegmentInfos类的实现过程可以看出，该类主要是对SegmentInfo进行管理的。在每次执行打开索引目录、打开索引文件、写入文件等等，都需要对SegmentInfos进行维护。

因为SegmentInfos记录了对索引文件进行操作(如：建立索引、删除索引)而生成的一些索引文件格式、版本号的信息，所以每当索引文件有操作需求，都要从SegmentInfos中获取当前的一些详细记录，SegmentInfos是操作索引文件的依据，同时操作索引文件结束后，要及时更新SegmentInfos的记录信息，为下次操作索引文件提供准确的信息。

SegmentInfos类主要通过两个文件来维护这些信息：segments_N文件和segments.gen文件。

segments_N文件存储的是当前正处于激活状态的索引文件的信息，也就是当前操作的索引文件的维护信息。

segments.gen文件是专门用于管理segments_N文件的。这里，segments_N文件是动态变化的，比如每次写入新的索引文件或者删除索引文件都涉及到当前索引文件的版本问题。segments.gen主要管理的是操作索引文件的版本信息。

在处理提交点的时候，也要参考索引文件的版本，这些版本信息都需要从segments.gen中读取；根据实际的操作，还要在操作结束的时候更新segments.gen文件，保证下次操作的正确性。

分享到：

Lucene-2.2.0 源代码阅读学习(18) | Lucene-2.2.0 源代码阅读学习(16)

2009-02-05 17:00
浏览 1197
评论(0)
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论