Lucene 字符编码问题

全部 Hibernate Spring Struts iBATIS 企业应用 Lucene SOA Java综合 Tomcat 设计模式 OO JBoss

浏览 3611 次

锁定老帖子主题：Lucene 字符编码问题精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者	正文
liuxinglanyue 等级: 初级会员性别: 文章: 25 积分: 60 来自: 杭州	发表时间：2010-12-27 相关推荐: php可以用lucene吗,php – Lucene外国字符问题 Lucene之超链接传中文乱码 Lucene 源码解析 lucene Lucene课程更多相关推荐 Java综合 Lucene 现在如果一个txt文件中包含了ANSI编码的文本文件和Unicode编码的文本文件，如下图这种：当用Lucene来建索引搜索时，这个文档中的内容是搜索不到的。需要搜索的文本在附件中提供。创建索引的源代码： import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class IndexFiles { // 主要代码索引docDir文件夹下文档，索引文件在INDEX_DIR文件夹中 @SuppressWarnings("deprecation") public static void main(String[] args) { File indexDir = new File("e:\\Lucene\\index"); File docDir = new File("e:\\Lucene\\content"); try { // 索引器 IndexWriter standardWriter = new IndexWriter(FSDirectory .open(indexDir), new StandardAnalyzer( Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED); // 不建立复合式索引文件，默认的情况下是复合式的索引文件 standardWriter.setUseCompoundFile(false); String[] files = docDir.list(); for (String fileStr : files) { File file = new File(docDir, fileStr); if (!file.isDirectory()) { Document doc = new Document(); // 文件名称，可查询，不分词 String fileName = file.getName().substring(0, file.getName().indexOf(".")); System.out.println("fileName:"+fileName); doc.add(new Field("name", fileName, Field.Store.YES, Field.Index.NOT_ANALYZED)); // 文件路径，可查询，不分词 String filePath = file.getPath(); doc.add(new Field("path", filePath, Field.Store.YES, Field.Index.NOT_ANALYZED)); // 文件内容，需要检索 doc.add(new Field("content", new FileReader(file))); standardWriter.addDocument(doc); } } standardWriter.optimize(); // 关闭索引器 standardWriter.close(); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } } } 搜索的源代码： import java.io.BufferedReader; import java.io.File; import java.io.IOException; import 
java.io.InputStreamReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * 检索索引 / public class SearchFiles { /* Simple command-line based search demo. */ @SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { String index = "E:\\Lucene\\index"; String field = "content"; String queries = null; boolean raw = false; // 要显示条数 int hitsPerPage = 10; // searching, so read-only=true IndexReader reader = IndexReader.open( FSDirectory.open(new File(index)), true); // only Searcher searcher = new IndexSearcher(reader); Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, standardAnalyzer); while (true) { if (queries == null) // prompt the user System.out.println("Enter query: "); String line = in.readLine(); if (line == null \|\| line.length() == -1) break; line = line.trim(); if (line.length() == 0) break; Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null); } reader.close(); } public static void doPagingSearch(BufferedReader in, Searcher searcher, Query query, int hitsPerPage, boolean raw, boolean interactive) throws IOException { TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, false); searcher.search(query, collector); ScoreDoc[] hits = 
collector.topDocs().scoreDocs; int end, numTotalHits = collector.getTotalHits(); System.out.println(numTotalHits + " total matching documents"); int start = 0; end = Math.min(hits.length, start + hitsPerPage); for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); String path = doc.get("path"); if (path != null) { System.out.println((i + 1) + ". " + path); } else { System.out .println((i + 1) + ". " + "No path for this document"); } } } } 大小: 24.1 KB 需要搜索的文本.rar (255.1 KB) 下载次数: 25 查看图片附件声明：ITeye文章版权属于作者，受法律保护。没有作者书面许可不得转载。推荐链接
返回顶楼

ralfbawg 等级: 初级会员性别: 文章: 13 积分: 60 来自: 广州	发表时间：2010-12-29 doc.add(new Field("content", new FileReader(file))); 这个方法换成 doc.add(new Field("content", new InputStreamReader(new FileInputStream(file.getCanonicalPath()), charset))); 其中 charset 是该文件的实际编码。FileReader 用的是系统默认的编码，这样就导致一种编码方式的文件可能以另一种编码方式读取进来进行索引，结果导致在检索时检索不到。
返回顶楼	回帖地址 0 0 请登录后投票

论坛首页 → Java企业应用版

跳转论坛: