用Java实现搜索引擎布尔运算

chen_yongkai

浏览: 62189 次
性别:
来自: 福州

最近访客 | 更多访客>>

法萨芬

longxm

lingqiang522

xubukang

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (10)

社区版块

存档分类

用Java实现搜索引擎布尔运算

java 搜索引擎布尔运算

索引类：


import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * A small in-memory inverted index that supports boolean AND / OR queries.
 *
 * <p>Every key (for example a word) is mapped to a {@link BitSet}; bit {@code i}
 * being set means that the record with id {@code i} contains the key. Queries
 * combine these bit sets and return the matching ids in ascending order.
 *
 * <p>The class performs no synchronization of its own.
 *
 * <p>Serialized form: a single {@code byte[]} holding the GZIP-compressed Java
 * serialization of the key-to-bit-set map.
 */
public class Index implements Serializable {
	private static final long serialVersionUID = 7362753433812661741L;

	/** Key -&gt; ids of the records containing that key. */
	private Map<String, BitSet> indexMap;

	/**
	 * Writes the map GZIP-compressed into a byte array and stores that array
	 * as the only field of the serialized form.
	 */
	private void writeObject(ObjectOutputStream out) throws IOException {
		ByteArrayOutputStream buf = new ByteArrayOutputStream();
		GZIPOutputStream gzip = new GZIPOutputStream(buf);
		try {
			ObjectOutputStream objOut = new ObjectOutputStream(gzip);
			objOut.writeObject(indexMap);
			objOut.flush();
		} finally {
			// Closing finishes the GZIP trailer and releases the deflater,
			// also when writing the map failed.
			gzip.close();
		}
		out.writeObject(buf.toByteArray());
	}

	/**
	 * Reads the byte array produced by {@code writeObject} and inflates the map.
	 *
	 * <p>NOTE(review): this is plain Java deserialization; only feed it data
	 * from a trusted source.
	 */
	@SuppressWarnings("unchecked")
	private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
		byte[] buf = (byte[]) in.readObject();
		GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(buf));
		try {
			ObjectInputStream objIn = new ObjectInputStream(gzip);
			indexMap = (Map<String, BitSet>) objIn.readObject();
		} finally {
			// Releases the inflater even if decoding fails half-way.
			gzip.close();
		}
	}

	/**
	 * Creates an index pre-sized for roughly {@code indexSize} distinct keys.
	 *
	 * @param indexSize expected number of distinct keys
	 * @throws IllegalArgumentException if {@code indexSize} is negative
	 */
	public Index(int indexSize) {
		if (indexSize < 0) {
			throw new IllegalArgumentException("Illegal index size: " + indexSize);
		}
		// size / 0.75 (HashMap's default load factor); computed in long so
		// that very large sizes cannot overflow into a negative capacity.
		long initialCapacity = (long) indexSize * 4 / 3;
		indexMap = new HashMap<String, BitSet>((int) Math.min(initialCapacity, Integer.MAX_VALUE));
	}

	/** Creates an index pre-sized for 12 distinct keys. */
	public Index() {
		this(12);
	}

	/**
	 * Records that the record {@code id} contains every key in {@code c}.
	 *
	 * @param c  keys found in the record
	 * @param id non-negative record id
	 */
	public void setId(Collection<String> c, int id) {
		for (String key : c) {
			mark(key, id);
		}
	}

	/**
	 * Records that the record {@code id} contains every key in {@code c}.
	 *
	 * @param c  keys found in the record
	 * @param id non-negative record id
	 */
	public void setId(String[] c, int id) {
		for (String key : c) {
			mark(key, id);
		}
	}

	/** Sets bit {@code id} for {@code key}, creating the bit set on first use. */
	private void mark(String key, int id) {
		BitSet bit = indexMap.get(key);
		if (bit == null) {
			bit = new BitSet();
			indexMap.put(key, bit);
		}
		bit.set(id);
	}

	/**
	 * Returns the ids of the records that contain <em>all</em> of the given keys.
	 *
	 * @param keys at least two keys
	 * @return matching ids in ascending order; an empty array if some (but not
	 *         all) keys are unknown to the index, because no record can then
	 *         contain every key; {@code null} if none of the keys is indexed
	 * @throws NullPointerException     if {@code keys} is {@code null}
	 * @throws IllegalArgumentException if fewer than two keys are given
	 */
	public int[] getIdSetWithAnd(String... keys) {
		checkKeys(keys);
		BitSet result = null;
		boolean missing = false;
		for (String key : keys) {
			BitSet bit = indexMap.get(key);
			if (bit == null) {
				missing = true;
			} else if (result == null) {
				// Clone so the stored bit set is never modified by the query.
				result = (BitSet) bit.clone();
			} else {
				result.and(bit);
			}
		}
		if (result == null) {
			return null;
		}
		if (missing) {
			// An unknown key matches no record, so the conjunction is empty.
			return new int[0];
		}
		return getIdSet(result);
	}

	/**
	 * Returns the ids of the records that contain <em>at least one</em> of the
	 * given keys. Keys unknown to the index are ignored.
	 *
	 * @param keys at least two keys
	 * @return matching ids in ascending order, or {@code null} if none of the
	 *         keys is indexed
	 * @throws NullPointerException     if {@code keys} is {@code null}
	 * @throws IllegalArgumentException if fewer than two keys are given
	 */
	public int[] getIdSetWithOr(String... keys) {
		checkKeys(keys);
		BitSet result = null;
		for (String key : keys) {
			BitSet bit = indexMap.get(key);
			if (bit == null) {
				continue;
			}
			if (result == null) {
				// Clone so the stored bit set is never modified by the query.
				result = (BitSet) bit.clone();
			} else {
				result.or(bit);
			}
		}
		return result == null ? null : getIdSet(result);
	}

	/** Rejects a {@code null} key array and arrays with fewer than two keys. */
	private static void checkKeys(String... keys) {
		if (keys == null)
			throw new NullPointerException("keys is null.");
		if (keys.length < 2) {
			throw new IllegalArgumentException("keys' length is less than 2.");
		}
	}

	/**
	 * Returns the ids of the records that contain {@code key}.
	 *
	 * @param key the key to look up
	 * @return matching ids in ascending order, or {@code null} if the key is
	 *         not indexed
	 */
	public int[] getIdSet(String key) {
		BitSet bit = indexMap.get(key);
		return bit == null ? null : getIdSet(bit);
	}

	/** Converts the set bits of {@code bit} into an ascending array of ids. */
	private int[] getIdSet(BitSet bit) {
		int[] ids = new int[bit.cardinality()];
		int j = 0;
		for (int i = bit.nextSetBit(0); i >= 0; i = bit.nextSetBit(i + 1)) {
			ids[j++] = i;
			if (i == Integer.MAX_VALUE) {
				break; // i + 1 would overflow
			}
		}
		return ids;
	}
}

小小测试：


import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import bluechip.io.SerializeUtils;
import bluechip.io.file.AbstractFileProcessor;
import bluechip.io.file.FileProcessor;

/**
 * Small manual driver for {@code Index}: loads a serialized index from disk
 * (building it from a text file when loading fails), runs one AND query and
 * prints the matching line numbers, their count and the elapsed time.
 */
public class IndexTest {

	/**
	 * Runs the demo.
	 *
	 * @param args unused
	 * @throws Exception if building or storing the index fails
	 */
	public static void main(String[] args) throws Exception {
		// Measure the total running time.
		long time = System.currentTimeMillis();
		File file = new File("d:/index.dat");

		Index data = null;
		try {
			// Try to read a previously serialized index from the file.
			data = SerializeUtils.readObject(file);
		} catch (Exception ex) {
			// Deliberate best-effort: any failure to load the stored index
			// (presumably a missing file on the first run) falls back to
			// rebuilding it from the source text.
			final Index index = new Index(4000);
			final Pattern pattern = Pattern.compile("\\s+");// naive tokenizer: split on whitespace
			FileProcessor fp = new AbstractFileProcessor(new File("D:/英文版世界名著[下]/罪与罚.txt")) {

				@Override
				protected void processLine(String line) throws IOException {
					String[] words = pattern.split(line);
					// One line is one record; the line number is the record id.
					index.setId(words, this.getLineNumber());
				}
			};

			fp.process();
			data = index;
			// Serialize the freshly built index to the file.
			SerializeUtils.writeObject(data, file);
		}
		// Look up the line numbers that contain all of the following words.
		int[] ids = data.getIdSetWithAnd("his", "and", "was", "were", "as", "to");
		System.out.println(Arrays.toString(ids));
		// getIdSetWithAnd returns null when none of the words is indexed;
		// report zero matches instead of failing with a NullPointerException.
		System.out.println(ids == null ? 0 : ids.length);
		System.out.println(System.currentTimeMillis() - time);
	}

}

分享到：