
Getting cluster (facet) counts with Lucene

 

1. First, define a BaseCollector

 

public abstract class BaseCollector extends TopDocsCollector<BaseScoreDoc> {

	BaseScoreDoc pqTop;
	int docBase = 0;
	Scorer scorer;
	private Comparable cpb = Comparable.DEFAULT_COMPARABLE;

	protected BaseCollector(int numHits, Comparable cpb) {
		super(new HitQueue(numHits, true, cpb));
		if (cpb != null) {
			this.cpb = cpb;
		}
		pqTop = pq.top();
	}

	protected BaseCollector(int numHits) {
		super(new HitQueue(numHits, true));
		pqTop = pq.top();
	}
	
	/**
	 * Core collect logic; modify with care.
	 */
	public void collect(int doc) throws IOException {
		// This collector cannot handle these scores:
		float score = scorer.score();
		assert score != Float.NEGATIVE_INFINITY;
		assert !Float.isNaN(score);
		// Pass the index-wide doc id so subclasses can index per-document caches directly.
		BaseScoreDoc csb = new BaseScoreDoc(doc + docBase, score);
		process(csb);
		if (csb.f < 0) {
			// process() flagged this hit as filtered out.
			return;
		}

		totalHits++;
		if (cpb.lessThan(csb, pqTop)) {
			return;
		}

		pqTop.f = csb.f;
		pqTop.sortValue = csb.sortValue;
		pqTop.doc = doc + docBase;
		pqTop.score = score;
		pqTop = pq.updateTop();
	}

	public abstract void process(BaseScoreDoc csb);

	@Override
	protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
		if (results == null) {
			return EMPTY_TOPDOCS;
		}
		float maxScore = Float.NaN;
		if (start == 0) {
			maxScore = results[0].score;
		} else {
			for (int i = pq.size(); i > 1; i--) {
				pq.pop();
			}
			maxScore = pq.pop().score;
		}

		return new TopDocs(totalHits, results, maxScore);
	}

	@Override
	public void setNextReader(IndexReader reader, int base) {
		docBase = base;
	}

	@Override
	public void setScorer(Scorer scorer) throws IOException {
		this.scorer = scorer;
	}

	@Override
	public boolean acceptsDocsOutOfOrder() {
		return false;
	}
}
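
BaseScoreDoc, the custom Comparable, and HitQueue are not shown in the post. Judging only from the members referenced above (doc, score, f, sortValue, lessThan, DEFAULT_COMPARABLE, DESC_COMPARABLE), they might look roughly like the sketch below; this is an assumed reconstruction, not the author's code, and HitQueue is presumably a custom PriorityQueue<BaseScoreDoc> driven by this Comparable:

import org.apache.lucene.search.ScoreDoc;

// Hypothetical reconstruction; each class would live in its own source file.
public class BaseScoreDoc extends ScoreDoc {

    float f = 0f;         // process() sets this negative to drop a hit from the queue
    Object sortValue;     // optional secondary sort key filled in by subclasses (type assumed)

    public BaseScoreDoc(int doc, float score) {
        super(doc, score);
    }
}

// Hypothetical reconstruction of the custom comparator consulted by BaseCollector.
abstract class Comparable {

    public static final Comparable DEFAULT_COMPARABLE = new Comparable() {
        public boolean lessThan(BaseScoreDoc a, BaseScoreDoc b) {
            // Relevance order: lower score, or larger doc id on ties, counts as "less".
            return a.score < b.score || (a.score == b.score && a.doc > b.doc);
        }
    };

    // Ordering used by AnimalManagementCollector; shown identical here because the
    // real rule is not given in the post.
    public static final Comparable DESC_COMPARABLE = DEFAULT_COMPARABLE;

    public abstract boolean lessThan(BaseScoreDoc a, BaseScoreDoc b);
}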

 

 

2. Write your own collector; it maintains two facets, one by unit name and one by region

 

public class AnimalManagementCollector extends BaseCollector {
    private Map<String, Integer> unitMap = new HashMap<String, Integer>(); // unit name
    private Map<String, Integer> zoneMap = new HashMap<String, Integer>(); // region

    public AnimalManagementCollector(int numHits) {
        super(numHits, Comparable.DESC_COMPARABLE);
    }

    @Override
    public void process(BaseScoreDoc csb) {
        int doc = csb.doc;
        String unit_cache = AnimalManagementFields.UNIT_CACHE[doc];
        String zone_cache = AnimalManagementFields.ZONE_CACHE[doc];

        if (!(StringUtil.isEmpty(unit_cache))) {
            if (unitMap.containsKey(unit_cache)) {
                unitMap.put(unit_cache, unitMap.get(unit_cache) + 1);
            } else {
                unitMap.put(unit_cache, 1);
            }
        }

        if (!(StringUtil.isEmpty(zone_cache))) {
            if (zoneMap.containsKey(zone_cache)) {
                zoneMap.put(zone_cache, zoneMap.get(zone_cache) + 1);
            } else {
                zoneMap.put(zone_cache, 1);
            }
        }
    }

    public Map<String, Integer> getUnitMap() {
        return unitMap;
    }

    public void setUnitMap(Map<String, Integer> unitMap) {
        this.unitMap = unitMap;
    }

    public Map<String, Integer> getZoneMap() {
        return zoneMap;
    }

    public void setZoneMap(Map<String, Integer> zoneMap) {
        this.zoneMap = zoneMap;
    }
}
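
With a search run through this collector, the number of clusters from the post's title is just the size of each map, and the values are the per-cluster document counts. A minimal usage sketch, with illustrative variable names:

// After dxSearcher.search(term, collector) has completed:
Map<String, Integer> unitFacets = collector.getUnitMap();
System.out.println(unitFacets.size() + " unit-name clusters");
for (Map.Entry<String, Integer> e : unitFacets.entrySet()) {
    System.out.println(e.getKey() + " -> " + e.getValue() + " docs");
}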

 

3. Build the custom field caches

 

public class AnimalManagementFields {
    public static String[] UNIT_CACHE;      // unit name, indexed by doc id
    public static String[] ZONE_CACHE;      // region, indexed by doc id

    public synchronized void init(IndexReader ir) {
        readCache(ir);
    }

    public static void readCache(IndexReader ir) {
        int maxDoc = ir.maxDoc();
        final String[] tempUnit = new String[maxDoc + 1];
        final String[] tempZone = new String[maxDoc + 1];

        FieldExtractor.extract(ir, "unit1", new FieldExtractor.FieldWalker() {
            @Override
            public void stroll(int doc, String value) {
                try {
                    tempUnit[doc] = value;
                } catch (Exception e) {
                    // ignore bad entries; the cache slot simply stays null
                }
            }
        });

        FieldExtractor.extract(ir, "zone", new FieldExtractor.FieldWalker() {
            @Override
            public void stroll(int doc, String value) {
                try {
                    tempZone[doc] = value;
                } catch (Exception e) {
                    // ignore bad entries; the cache slot simply stays null
                }
            }
        });

        UNIT_CACHE = tempUnit;
        ZONE_CACHE = tempZone;
    }
}
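
The post does not include FieldExtractor either. Assuming it simply walks every term of one field and reports which documents contain each value, a rough reconstruction against the Lucene 3.x API (the same generation the Collector signatures above belong to) might look like this; treat it as a sketch, not the original class:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;

// Hypothetical reconstruction of the FieldExtractor used above.
public class FieldExtractor {

    public interface FieldWalker {
        void stroll(int doc, String value);
    }

    // Walks every term of the given field and calls back with (docId, termText).
    public static void extract(IndexReader ir, String field, FieldWalker walker) {
        try {
            TermEnum terms = ir.terms(new Term(field, ""));
            TermDocs termDocs = ir.termDocs();
            try {
                do {
                    Term term = terms.term();
                    if (term == null || !term.field().equals(field)) {
                        break; // ran past the last term of this field
                    }
                    termDocs.seek(term);
                    while (termDocs.next()) {
                        walker.stroll(termDocs.doc(), term.text());
                    }
                } while (terms.next());
            } finally {
                terms.close();
                termDocs.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}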

 

 

4. Register the initialization servlet in web.xml

 

    <servlet>
        <servlet-name>Init</servlet-name>
        <servlet-class>com.dayainfo.action.InitServlet</servlet-class>
        <load-on-startup>1</load-on-startup>
    </servlet>

 

 

5. Initialize the caches in InitServlet

 

public class InitServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;
    private Logger logger = Logger.getLogger(InitServlet.class);

    public void init(ServletConfig config) throws ServletException {
        try {
            long beginTime1 = System.currentTimeMillis();
            AnimalManagementFields animalManagementFields = new AnimalManagementFields();
            animalManagementFields.init(SQLCreatReader.getReader(SystemConstant.ANIMAL_MANAGEMENT_LICENCE_INDEX_KEY));
            long endTime1 = System.currentTimeMillis();
            logger.info("Initialising animal management licence facet caches took "
                    + StringUtil.millsecondChange(endTime1 - beginTime1, 1) + " s");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
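
SQLCreatReader.getReader is also left out of the post. A plausible reading is that it keeps one shared IndexReader per index key; the sketch below assumes the key resolves directly to the index directory path, which the original code may well do differently:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

// Hypothetical reconstruction of SQLCreatReader: one shared reader per index key.
public class SQLCreatReader {

    private static final Map<String, IndexReader> READERS = new HashMap<String, IndexReader>();

    // The key is assumed here to be the index directory path; the real mapping is not shown.
    public static synchronized IndexReader getReader(String indexKey) throws IOException {
        IndexReader reader = READERS.get(indexKey);
        if (reader == null) {
            reader = IndexReader.open(FSDirectory.open(new File(indexKey)), true); // read-only
            READERS.put(indexKey, reader);
        }
        return reader;
    }
}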

 

6. Use the collector in the search service

public class AnimalManagementSearchService {
    private AnimalManagementReturnParam animalManagementReturnParam = new AnimalManagementReturnParam();
    private int totalData;
    private ScoreDoc[] scoreDocs;

    public void handleInstrumentSearch(AnimalManagementReceiveParam animalManagementReceiveParam) throws IOException {
        long beginTime = System.currentTimeMillis();
        int numHits = animalManagementReceiveParam.getPageSize() * (animalManagementReceiveParam.getCurrentPage());
        AnimalManagementCollector animalManagementCollector = new AnimalManagementCollector(numHits);
        List<AnimalManagementLicenceBean> instrumentBeanListWithPage = luceneSearch(animalManagementCollector, animalManagementReceiveParam);
        animalManagementReturnParam.setAnimalManagementLicenceList(instrumentBeanListWithPage);

        animalManagementReturnParam.setUnitMap(animalManagementCollector.getUnitMap()); // unit name facet
        animalManagementReturnParam.setZoneMap(animalManagementCollector.getZoneMap()); // region facet

        long endtime = System.currentTimeMillis();
        animalManagementReturnParam.setTotalTime(StringUtil.millsecondChange(endtime - beginTime, 1));
    }

    // Search the index
    public List<AnimalManagementLicenceBean> luceneSearch(AnimalManagementCollector animalManagementCollector, AnimalManagementReceiveParam animalManagementReceiveParam) throws IOException {

        QueryTerm term = new FuzzyQueryTerm();
        DXSearcher dxSearcher = new DXSearcher(SQLCreatReader.getReader(SystemConstant.ANIMAL_MANAGEMENT_LICENCE_INDEX_KEY));

        if ("1".equals(animalManagementReceiveParam.getFlag())) {       //分类检索
            term.addTerm("flag", "1", false);
        } else {
            if ((!StringUtil.isEmpty(animalManagementReceiveParam.getUnit()))) {
                term.addTerm("unit1", animalManagementReceiveParam.getUnit(), false);
            }
            if ((!StringUtil.isEmpty(animalManagementReceiveParam.getZone()))) {
                term.addTerm("zone", animalManagementReceiveParam.getZone(), false);
            }

            if (!StringUtil.isEmpty(animalManagementReceiveParam.getField())) {
                if ("1".equals(animalManagementReceiveParam.getField())) {          //全部字段
                    QueryTerm term1 = new FuzzyQueryTerm();
                    term1.addTerm("lic_number", animalManagementReceiveParam.getSw(), 2);
                    term1.addTerm("unit", animalManagementReceiveParam.getSw(), 2);
                    term.addTerm(term1, 1);
                } else if ("2".equals(animalManagementReceiveParam.getField())) {    //许可证编号
                    term.addTerm("lic_number", animalManagementReceiveParam.getSw());
                } else if ("3".equals(animalManagementReceiveParam.getField())) {    //单位名称
                    term.addTerm("unit", animalManagementReceiveParam.getSw());
                }
            }
        }
        dxSearcher.search(term, animalManagementCollector);
        if (term.getQuery() != null) {
            System.out.println("搜索字段:" + term.getQuery().toString());
        }
        int begin = animalManagementReceiveParam.getPageSize() * (animalManagementReceiveParam.getCurrentPage() - 1);
        int howMany = animalManagementReceiveParam.getPageSize();
        scoreDocs = animalManagementCollector.topDocs(begin, howMany).scoreDocs; // topDocs(start, howMany)
        totalData = animalManagementCollector.getTotalHits();
        animalManagementReturnParam.setTotalData(totalData);

        List<AnimalManagementLicenceBean> instrumentBeanList = new ArrayList<AnimalManagementLicenceBean>();
        FieldHighlighter fieldHighlighter = new FieldHighlighter(animalManagementReceiveParam.getSw());
        for (int i = 0; i < scoreDocs.length; i++) {
            ScoreDoc scoreDoc = scoreDocs[i];
            int docID = scoreDoc.doc;
            Document doc = dxSearcher.doc(docID);
            AnimalManagementLicenceBean animalManagementLicenceBean = new AnimalManagementLicenceBean();
            if (!StringUtil.isEmpty(doc.get("dxid"))) {
                animalManagementLicenceBean.setDxid(doc.get("dxid"));
            }
            if (!StringUtil.isEmpty(doc.get("title"))) {
                animalManagementLicenceBean.setTitle(fieldHighlighter.getTextFragment(doc.get("title"), false));
            }
            if (!StringUtil.isEmpty(doc.get("type"))) {
                animalManagementLicenceBean.setType(fieldHighlighter.getTextFragment(doc.get("type"), false));
            }
            if (!StringUtil.isEmpty(doc.get("lic_number"))) {
                animalManagementLicenceBean.setLic_number(fieldHighlighter.getTextFragment(doc.get("lic_number"), false));
            }
            if (!StringUtil.isEmpty(doc.get("unit"))) {
                animalManagementLicenceBean.setUnit(fieldHighlighter.getTextFragment(doc.get("unit"), false));
            }
            if (!StringUtil.isEmpty(doc.get("unit1"))) {
                animalManagementLicenceBean.setUnit1(fieldHighlighter.getTextFragment(doc.get("unit1"), false));
            }
            if (!StringUtil.isEmpty(doc.get("enable_range"))) {
                animalManagementLicenceBean.setEnable_range(fieldHighlighter.getTextFragment(doc.get("enable_range"), false));
            }
            if (!StringUtil.isEmpty(doc.get("zone"))) {
                animalManagementLicenceBean.setZone(fieldHighlighter.getTextFragment(doc.get("zone"), false));
            }
            if (!StringUtil.isEmpty(doc.get("url"))) {
                animalManagementLicenceBean.setUrl(fieldHighlighter.getTextFragment(doc.get("url"), false));
            }
            instrumentBeanList.add(animalManagementLicenceBean);
        }
        return instrumentBeanList;
    }

    public AnimalManagementReturnParam getAnimalManagementReturnParam() {
        return animalManagementReturnParam;
    }

    public void setAnimalManagementReturnParam(AnimalManagementReturnParam animalManagementReturnParam) {
        this.animalManagementReturnParam = animalManagementReturnParam;
    }
}
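
For display, the facet maps usually need to be ordered by document count. The helper below is purely illustrative (FacetSorter is not part of the original post) and shows one way to sort the entries of getUnitMap() or getZoneMap() in descending count order:

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

// Illustrative helper: order facet entries by descending document count for display.
public class FacetSorter {

    public static List<Map.Entry<String, Integer>> sortByCount(Map<String, Integer> facet) {
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(facet.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                return b.getValue().compareTo(a.getValue()); // larger counts first
            }
        });
        return entries;
    }
}

Usage would then be something like FacetSorter.sortByCount(animalManagementReturnParam.getUnitMap()) before rendering the facet list in the page.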

 

 

 

 
