如何将Lucene索引写入Hadoop？

全部 Ruby Python PHP Flash C++ .net Rails Flex C C# Django

浏览 13444 次

锁定老帖子主题：如何将Lucene索引写入Hadoop？精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者	正文
qindongliang1922 等级: 性别: 文章: 170 积分: 840 来自: 北京	发表时间：2014-07-03 相关推荐: 如何将Lucene索引写入Hadoop1.x的HDFS系统如何将Lucene索引写入Hadoop2.x？ Hadoop数据传输：如何将数据移入和移出Hadoop？什么是hadoop? 基于hadoop创建lucene索引（一）编程模型一更多相关推荐 Lucene Hadoop 全文检索 Mapreduce Solr Hadoop最初是Lucene的子项目，现在发展得如火如荼。如何利用Hadoop的分布式处理能力，来提高Lucene建索引的效率呢？如此一来，便能充分利用HDFS的所有优点。但众所周知，HDFS对随机读支持得并不友好，而像Lucene这种全文检索框架，几乎所有的检索操作都离不开随机读写，那么如何才能使Lucene结合hadoop完美地工作呢？其实在hadoop的发行版里，有一个contrib工具包，里面带了Lucene索引的工具类，不过貌似用的人很少，散仙没有用过这个，在这里就不多评价了。在solr4.4之后的项目里，已经集成了向HDFS写入索引的jar包，如果你用的是solr，那么很容易就能把索引建在HDFS上，只需要在solrconfig.xml里面配置Directory的实现类为HdfsDirectory即可。但是solr4.4里面的jar仅仅支持最新版的hadoop，也就是2.0之后的版本，直接在1.x的hadoop里使用会出现异常，这是由于2.x和1.x的hadoop的API发生了变化。散仙改了部分源码后，可以支持对1.x的hadoop进行索引、查询操作。在文末，散仙会把这几个类上传上来，用时只需把这几个类导入工程即可。下面看下散仙的测试demo的源码： <pre name="code" class="java">package indexhadoop; import hdfs.HdfsDirectory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Version; /** * * @author qindongliang * 将索引存储在HDFS上的demo * 支持hadoop1.x的版本 * * / public class MyIndex { public static void main(String[] args)throws Exception { //long a=System.currentTimeMillis(); //add(); // long b=System.currentTimeMillis(); // 
System.out.println("耗时: "+(b-a)+"毫秒"); query("中国"); //delete("3");//删除指定ID的数据 } /* * 得到HDFS的writer * * / public static IndexWriter getIndexWriter() throws Exception{ Analyzer analyzer=new SmartChineseAnalyzer(Version.LUCENE_46); IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_46, analyzer); Configuration conf=new Configuration(); //Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt"); //Path path=new Path("hdfs://10.2.143.5:9090/root/myfile"); Path path=new Path("hdfs://192.168.75.130:9000/root/index"); HdfsDirectory directory=new HdfsDirectory(path, conf); IndexWriter writer=new IndexWriter(directory, config); return writer; } / * 建索引的方法 * * / public static void add()throws Exception{ IndexWriter writer=getIndexWriter(); // doc.add(new StringField("id", "3", Store.YES)); // doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架", Store.YES)); // doc.add(new TextField("content", "今天发工资了吗", Store.YES)); // Document doc2=new Document(); // doc.add(new StringField("id", "4", Store.YES)); // doc2.add(new StringField("name", "今天天气不错呀", Store.YES)); // doc2.add(new TextField("content", "钱存储在银行靠谱吗", Store.YES)); // Document doc3=new Document(); // doc3.add(new StringField("id", "5", Store.YES)); // doc3.add(new StringField("name", "没有根的野草，飘忽的命途！", Store.YES)); // doc3.add(new TextField("content", "你工资多少呀！", Store.YES)); // writer.addDocument(doc); // writer.addDocument(doc2); // writer.addDocument(doc3); for(int i=6;i<10000;i++){ Document doc=new Document(); doc.add(new StringField("id", i+"", Store.YES)); doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架"+i, Store.YES)); doc.add(new TextField("content", "今天发工资了吗"+i, Store.YES)); writer.addDocument(doc); if(i%1000==0){ writer.commit(); } } writer.forceMerge(1); writer.commit(); System.out.println("索引10000条数据添加成功!"); writer.close(); } /* * 添加索引 * * / public static void add(Document d)throws Exception{ IndexWriter writer=getIndexWriter(); writer.addDocument(d); writer.forceMerge(1); 
writer.commit(); System.out.println("索引10000条数据添加成功!"); writer.close(); } / * 根据指定ID * 删除HDFS上的一些数据 * * * / public static void delete(String id)throws Exception{ IndexWriter writer=getIndexWriter(); writer.deleteDocuments(new Term("id", id));//删除指定ID的数据 writer.forceMerge(1);//清除已经删除的索引空间 writer.commit();//提交变化 System.out.println("id为"+id+"的数据已经删除成功........."); } / * 检索的方法 * * **/ public static void query(String queryTerm)throws Exception{ System.out.println("本次检索内容: "+queryTerm); Configuration conf=new Configuration(); //Path p1 =new Path("hdfs://10.2.143.5:9090/root/myfile/my.txt"); // Path path=new Path("hdfs://192.168.75.130:9000/root/index"); Path path=new Path("hdfs://192.168.75.130:9000/root/output/map1"); Directory directory=new HdfsDirectory(path, conf); IndexReader reader=DirectoryReader.open(directory); System.out.println("总数据量: "+reader.numDocs()); long a=System.currentTimeMillis(); IndexSearcher searcher=new IndexSearcher(reader); QueryParser parse=new QueryParser(Version.LUCENE_46, "city", new SmartChineseAnalyzer(Version.LUCENE_46)); Query query=parse.parse(queryTerm); TopDocs docs=searcher.search(query, 100); System.out.println("本次命中结果: "+docs.totalHits+" 条" ); // for(ScoreDoc sc:docs.scoreDocs){ // // System.out.println("评分: "+sc.score+" id : "+searcher.doc(sc.doc).get("id")+" name: "+searcher.doc(sc.doc).get("name")+" 字段内容: "+searcher.doc(sc.doc).get("content")); // // } long b=System.currentTimeMillis(); System.out.println("第一次耗时:"+(b-a)+" 毫秒"); System.out.println("============================================"); long c=System.currentTimeMillis(); query=parse.parse(queryTerm); docs=searcher.search(query, 100); System.out.println("本次命中结果: "+docs.totalHits+" 条" ); // for(ScoreDoc sc:docs.scoreDocs){ // // System.out.println("评分: "+sc.score+" id : "+searcher.doc(sc.doc).get("id")+" name: "+searcher.doc(sc.doc).get("name")+" 字段内容: "+searcher.doc(sc.doc).get("content")); // // } long d=System.currentTimeMillis(); System.out.println("第二次耗时:"+(d-c)+" 
毫秒"); reader.close(); directory.close(); System.out.println("检索完毕..............."); } } </pre> 上面是散仙测试的例子，经测试，对HDFS上的lucene索引的增删改查都没问题。但有一点需要注意：lucene结合hadoop，确实能大大提升建索引的速度，但是在检索上却没有任何优势，虽然也可以检索，但是速度比较慢。目前的存储实现利用了block cache的缓存特性，能使得检索性能差强人意，但是数据量大的时候，检索性能非常糟糕，这一点到现在还没有任何比较好的解决方法，除非以后给lucene或solr增加类似HBase的数据结构，如此一来，检索上可能会好很多。上面的代码能够将索引写入1.x的hadoop中，后续散仙会给出在hadoop2.x中建索引的例子，以及如何使用MapReduce并行建索引。 hdfs.zip (9.3 KB) 下载次数: 137 solrblockcache.zip (21.1 KB) 下载次数: 113 声明：ITeye文章版权属于作者，受法律保护。没有作者书面许可不得转载。推荐链接
返回顶楼

论坛首页 → 编程语言技术版

跳转论坛: