Elasticsearch聚合功能Facet

donlianli

浏览: 341486 次
性别:
来自: 北京

最近访客更多访客>>

asia007

clive_hua

liuxuqing2010

FixedStar2K

博主相关

博客

微博

相册

留言

关于我

博客专栏

: Elasticsearch...
浏览量：219127

文章分类

社区版块

存档分类

博客分类：

ElasticSearch

elasticsearch facet 分组

在常规数据库中，我们都知道SQL里有一个GROUP BY子句，即分组。如果主表中只有一个列记录分组的ID，那么还好统计，比如图书表book有一个分类列catId，记录这本书属于哪一类，那么直接按照catId进行分组即可。可是在实际应用中，并非如此简单。一本书往往属于多个分类，比如：某本书既属于科技类书，又属于儿童类书，要求按照这两种条件进行筛选，都能筛选出来，如果要求按照分类进行统计数量，数据库怎么group?我们且抛开种种解决方案，来看看Elasticsearch里面对这种需求，是多么的容易统计。

首先，我们需要造些数据，需要用到一个模型，这个模型定义了一个type，就算类型吧，我们用这个属性来演示常规的group。还有一个catIds的列表模型，这个来解决我们上面描述的一本书对应多个分类的需求。模型定义如下：

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import com.donlianli.es.ESUtils;
/**
 * Sample document model used to demonstrate facet (grouping) queries.
 * <p>
 * {@code type} holds a single value per document and illustrates plain
 * grouping; {@code catIds} holds several category IDs per document and
 * illustrates grouping on a one-to-many field.
 * @author donlian
 */
public class FacetTestModel implements Serializable {
	private static final long serialVersionUID = 3174577828007649745L;
	/**
	 * Arbitrary sample values; {@code type} is always one of these.
	 * The list is identical for every instance, so it is a class constant
	 * rather than per-instance (and serialized) state.
	 */
	private static final String[] TYPES = new String[]{
			"type1","type2","type3","type4","type5","type6","type7",
			"type11","type12","type13","type14","type15","type16","type17"
	};
	// primary ID
	private long id;
	// type, always one of TYPES
	private String type;
	/**
	 * Category IDs this document belongs to. Values generated by the
	 * constructor fall in the range 0-50.
	 */
	private List<Integer> catIds;
	
	/**
	 * Fills every field with random sample data: a type picked from
	 * {@link #TYPES}, a non-negative ID, and one to three consecutive
	 * category IDs.
	 */
	public FacetTestModel(){
		Random r = new Random();
		// Clear the sign bit instead of calling Math.abs(): Math.abs(Integer.MIN_VALUE)
		// is still negative, which would produce a negative array index below.
		int n = r.nextInt() & Integer.MAX_VALUE;
		this.type = TYPES[n % TYPES.length];
		// Same reasoning for Long.MIN_VALUE; keeps the ID non-negative.
		this.id = r.nextLong() & Long.MAX_VALUE;
		
		n = n % 50;
		catIds = new ArrayList<Integer>();
		catIds.add(n);
		// Append n % 3 follow-up IDs (zero, one or two of them) directly after n.
		int extra = n % 3;
		for(int i = 1; i <= extra; i++){
			catIds.add(n + i);
		}
	}
	/**
	 * Prints ten randomly generated models as JSON for a quick visual check.
	 *
	 * @param argv unused
	 */
	public static void main(String[] argv){
		for(int i=0;i<10;i++){
			FacetTestModel f = new FacetTestModel();
			System.out.println(ESUtils.toJson(f));
		}
	}
	/** @return the primary ID */
	public long getId() {
		return id;
	}
	/** @param id the primary ID to set */
	public void setId(long id) {
		this.id = id;
	}
	/** @return the type value */
	public String getType() {
		return type;
	}
	/** @param type the type value to set */
	public void setType(String type) {
		this.type = type;
	}
	/** @return the category IDs (the internal list, not a copy) */
	public List<Integer> getCatIds() {
		return catIds;
	}
	/** @param catIds the category IDs to set */
	public void setCatIds(List<Integer> catIds) {
		this.catIds = catIds;
	}
}

接着就是初始化数据。

import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.Client;

import com.donlianli.es.ESUtils;
import com.donlianli.es.model.FacetTestModel;

/**
 * Seeds index "test" (type "test") with ten randomly generated
 * {@code FacetTestModel} documents, sent as a single bulk request.
 */
public class BulkIndexTest {
	
	/**
	 * Builds and executes the bulk request, printing the failure message
	 * if any item failed.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Client client = ESUtils.getClient();
		try {
			BulkRequestBuilder bulkRequest = client.prepareBulk();
			for(int i=0;i<10;i++){
				String json = ESUtils.toJson(new FacetTestModel());
				// the loop counter serves as a unique document ID
				IndexRequestBuilder indexRequest = client.prepareIndex("test", "test")
						.setSource(json).setId(String.valueOf(i));
				// queue the index request on the bulk builder
				bulkRequest.add(indexRequest);
			}
			
			BulkResponse bulkResponse = bulkRequest.execute().actionGet();
			if (bulkResponse.hasFailures()) {
				System.out.println(bulkResponse.buildFailureMessage());
			}
		} finally {
			// close the client whether or not the bulk request succeeded
			client.close();
		}
	}
}

接下来，我们首先对type进行统计。在elasticsearch中，分组的功能叫facet，不知道为啥起这个名称。总之，就是对type的每一个值的数量进行统计，注意，要设置里面的size条件，否则默认只返回10个。

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.search.facet.Facets;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.TermsFacetBuilder;

import com.donlianli.es.ESUtils;

/**
 * Counts documents per distinct value of the single-valued {@code type}
 * field using a terms facet and prints one line per value.
 */
public class GroupTest {
	/**
	 * Runs the facet query against index "test" / type "test" and prints
	 * each term with its document count.
	 *
	 * @param argv unused
	 */
	public static void  main(String[] argv){
		Client client = ESUtils.getClient();
		try {
			TermsFacetBuilder facetBuilder = FacetBuilders.termsFacet("typeFacetName");
			// raise the size limit so the facet is not cut off at its default number of terms
			facetBuilder.field("type").size(Integer.MAX_VALUE);
			facetBuilder.facetFilter(FilterBuilders.matchAllFilter());
			SearchResponse response = client.prepareSearch("test")
					.setTypes("test")
					.addFacet(facetBuilder)
					.setFilter(FilterBuilders.matchAllFilter())
					.execute()
					.actionGet();
			Facets f = response.getFacets();
			// look the facet up by the same name it was registered under
			TermsFacet facet = (TermsFacet)f.getFacets().get("typeFacetName");
			for(TermsFacet.Entry tf :facet.getEntries()){
				System.out.println(tf.getTerm()+"\t:\t" + tf.getCount());
			}
		} finally {
			// close the client even when the search throws
			client.close();
		}
	}
}

运行程序后，大概得到如下结果：

type3	:	4
type7	:	1
type6	:	1
type4	:	1
type13	:	1
type12	:	1
type11	:	1

正好10个，和初始化代码能对得上。

下面，我们就要对catIds进行统计了，在统计之前，我们先看看es里面都存储的是哪些数据。

{id=3683174899323317453, catIds=[4, 5], type=type3}
{id=271209313870366004, catIds=[26, 27, 28], type=type3}
{id=348654892174153835, catIds=[41, 42, 43], type=type4}
{id=6826187683023110944, catIds=[46, 47], type=type7}
{id=3437591661789488747, catIds=[22, 23], type=type3}
{id=6365837443081614150, catIds=[37, 38], type=type11}
{id=2387331048448677498, catIds=[20, 21, 22], type=type3}
{id=5595404824923951817, catIds=[31, 32], type=type13}
{id=3593797446463621044, catIds=[30], type=type12}
{id=5824112111832084165, catIds=[1, 2], type=type6}

怎么对catIds进行统计呢，代码跟上面进行单个统计一样。

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.search.facet.Facets;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.TermsFacetBuilder;

import com.donlianli.es.ESUtils;

/**
 * Counts documents per category ID of the multi-valued {@code catIds}
 * field using a terms facet and prints one line per ID.
 */
public class GroupTest2 {
	/**
	 * Runs the facet query against index "test" / type "test" and prints
	 * each category ID with its document count.
	 *
	 * @param argv unused
	 */
	public static void  main(String[] argv){
		Client client = ESUtils.getClient();
		try {
			TermsFacetBuilder facetBuilder = FacetBuilders.termsFacet("catIdName");
			// raise the size limit so the facet is not cut off at its default number of terms
			facetBuilder.field("catIds").size(Integer.MAX_VALUE);
			facetBuilder.facetFilter(FilterBuilders.matchAllFilter());
			SearchResponse response = client.prepareSearch("test")
					.setTypes("test")
					.addFacet(facetBuilder)
					.setFilter(FilterBuilders.matchAllFilter())
					.execute()
					.actionGet();
			// getter-style accessors, matching the single-field GroupTest example
			Facets f = response.getFacets();
			// look the facet up by the same name it was registered under
			TermsFacet facet = (TermsFacet)f.getFacets().get("catIdName");
			for(TermsFacet.Entry tf :facet.getEntries()){
				System.out.println("键:"+tf.getTerm()+"\t;数量:\t" + tf.getCount());
			}
		} finally {
			// close the client even when the search throws
			client.close();
		}
	}
}

运行结果：

键:22	;数量:	2
键:47	;数量:	1
键:46	;数量:	1
键:43	;数量:	1
键:42	;数量:	1
键:41	;数量:	1
键:38	;数量:	1
键:37	;数量:	1
键:32	;数量:	1
键:31	;数量:	1
键:30	;数量:	1
键:28	;数量:	1
键:27	;数量:	1
键:26	;数量:	1
键:23	;数量:	1
键:21	;数量:	1
键:20	;数量:	1
键:5	;数量:	1
键:4	;数量:	1
键:2	;数量:	1
键:1	;数量:	1

再和上面的数据对对，是不是除了22，其他的都是一个？

在分组这方面，ES真的很强大，除了上面的支持列表分组外，还支持范围分组rangeFacet，多个分组可以一次全部发送给ES等等，更多功能，大家还是自己多多验证。

对这类话题感兴趣？欢迎发送邮件至donlianli@126.com

关于我：邯郸人，擅长Java，Javascript，Extjs，oracle sql。

更多我之前的文章，可以访问我的空间

1
顶

0
踩

分享到：

典型80后的5年工作总结 | Mongodb使用总结

2013-07-15 21:44
浏览 12498
评论(3)
分类:开源软件
查看更多

3 楼 xwstonny10 2014-05-12

有一个问题，不知道你用的是什么版本，在我的版本中为什么每次查询出来的都不一样的啊？

2 楼 donlianli 2013-07-16

reinhardv 写道

ES的Facet有一个很大的问题，其实不光是ES，大多数基于Lucene的引擎技术都有这个问题，就是当数据量很大的时候，Facet非常消耗内存，不知道你是怎么解决的？

没有解决。

1 楼 reinhardv 2013-07-16

发表评论

您还没有登录,请您登录后再发表评论