深度分析pGBRT源码 -

dajiangxiaoyan

浏览: 21050 次
性别:
来自: 北京

最近访客更多访客>>

super_2009

yekui

qqpkat2

xiumoo

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

深度分析pGBRT源码

博客分类：

机器学习、数据挖掘

pGBRT GBDT Boosting tree 回归树 mpi

一、算法介绍

GBDT(Gradient boosting decision tree)包括两种树分类树和回归树。今天主要介绍MPI版回归树pGBRT。

几篇比较好介绍GBRT算法的博文：

http://hi.baidu.com/hehehehello/item/96cc42e45c16e7265a2d64ee

http://blog.csdn.net/puqutogether/article/details/44781035

pGBRT源码下载：

http://machinelearning.wustl.edu/pmwiki.php/Main/Pgbrt

pGBRT版本是基于最小二乘（残差）梯度回归树

二、Gradient Boosting 和 decision tree

Boosting的步骤如上：

三、源码介绍

单颗回归树的构建过程

class StaticNode {
public:
	int feature; //属性全局index
    float split; //属性分割点
    double label;//均值
    double loss; //最小化均方误差
    
    //temp var
    int m_infty, m_s;   //总个数， leftNode个数
    float s;		    //当前分隔点
    double l_infty, l_s;//总残差和， leftNode残差和
};

void StaticTree::updateBestSplits(FeatureData* data, int f) {
    // compute global feature index
    int globalf = data->globalFeatureIndex(f);
    
    // reset counts at nodes
    for (int n=0; n<nodes; n++) {
        StaticNode* node = layers[layer][n];
        node->m_s = 0;
        node->l_s = 0.0;
    }
    
    // iterate over feature
    for (int j=0; j<data->getN(); j++) {
        // get current value
        float v = data->getSortedFeature(f,j);
        int i = data->getSortedIndex(f,j);
        int n = data->getNode(i);
        float l = data->getResidual(i);
        
        // get node
        StaticNode* node = layers[layer][n];
        
        // if not first instance at node and greater than split point, consider new split at v
        if (node->m_s > 0 and v > node->s) {
        	//最小化均方误差
            double loss_i = pow(node->l_s,2.0) / (double) node->m_s + pow(node->l_infty - node->l_s,2.0) / (double) (node->m_infty - node->m_s);
            if (node->loss < 0 or loss_i > node->loss) {
                node->loss = loss_i;
                node->feature = globalf;
                node->split = (node->s + v) / 2.f;
                
                // TODO : create a lookup table for these child values at tree construction, store in static tree or store each in static node
                StaticNode* child1 = layers[layer+1][2*n];
                child1->label = node->l_s / (double) node->m_s;
                StaticNode* child2 = layers[layer+1][2*n+1];
                child2->label = (node->l_infty - node->l_s) / (double) (node->m_infty - node->m_s);
                
                // if (child2->label > 5.0)
                //     printf("### %f %d %d %f %d %f %d %f %d %d %d %f %d\n", v, i, n, l, node->m_s, node->l_s, node->m_infty, node->l_infty, globalf, f, j, loss_i, layer);
            }
        }
        
        // update variables
        node->m_s += 1;
        node->l_s += l;
        node->s = v;
    }
}

void StaticTree::exchangeBestSplits() {
    // instantiate buffer
    int buffersize = nodes*5;
    double* buffer = new double[buffersize];
    
    // write layer of tree to buffer
    for (int n=0; n<nodes; n++) {
        StaticNode* node = layers[layer][n];
        buffer[n*5 + 0] = node->loss;
        buffer[n*5 + 1] = node->feature;
        buffer[n*5 + 2] = node->split;
        
        StaticNode* child1 = layers[layer+1][n*2];
        buffer[n*5 + 3] = child1->label;
        
        StaticNode* child2 = layers[layer+1][n*2+1];
        buffer[n*5 + 4] = child2->label;
    }
    
    // get myid and numprocs
    int myid;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    int numprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    
    // determine isRoot
    int root = numprocs-1;
    bool isRoot = (myid == root);
    
    // exchange buffers
    double* rbuffer = (isRoot ? new double[numprocs*buffersize] : NULL);
    MPI_Gather(buffer, buffersize, MPI_DOUBLE, rbuffer, buffersize, MPI_DOUBLE, root, MPI_COMM_WORLD);
    
    // save best global splits
    if (isRoot) for (int n=0; n<nodes; n++) {
        // reset loss at node and get pointers
        StaticNode* node = layers[layer][n];
        node->loss = -1;
        StaticNode* child1 = layers[layer+1][n*2];
        StaticNode* child2 = layers[layer+1][n*2+1];
        
        // consider loss from all processors
        for (int p=0; p<numprocs; p++) {
            int offset = p*buffersize + n*5;
            double loss = rbuffer[offset + 0];
            
            // update if better than current
            if (node->loss < 0 or loss > node->loss) {
                node->loss = loss;
                node->feature = (int) rbuffer[offset + 1];
                node->split = (float) rbuffer[offset + 2];
                child1->label = rbuffer[offset + 3];
                child2->label = rbuffer[offset + 4];
            }
        }
    }
    
    // buffer best global splits
    if (isRoot) for (int n=0; n<nodes; n++) {
        StaticNode* node = layers[layer][n];
        buffer[n*5 + 0] = node->loss;
        buffer[n*5 + 1] = node->feature;
        buffer[n*5 + 2] = node->split;
        
        StaticNode* child1 = layers[layer+1][n*2];
        buffer[n*5 + 3] = child1->label;
        
        StaticNode* child2 = layers[layer+1][n*2+1];
        buffer[n*5 + 4] = child2->label;
    }
    
    // broadcast best splits
    MPI_Bcast(buffer, nodes*5, MPI_DOUBLE, root, MPI_COMM_WORLD);
    
    // update tree with best global splits
    for (int n=0; n<nodes; n++) {
        StaticNode* node = layers[layer][n];
        node->loss = buffer[n*5 + 0];
        node->feature = (int) buffer[n*5 + 1];
        node->split = (float) buffer[n*5 + 2];
        
        StaticNode* child1 = layers[layer+1][n*2];
        child1->label = buffer[n*5 + 3];
        
        StaticNode* child2 = layers[layer+1][n*2+1];
        child2->label = buffer[n*5 + 4];
    }
    
    // delete buffers
    delete [] buffer;
    delete [] rbuffer;
}

四、缺点

MPI每个节点按列读取数据，所以每个节点至少读取一列的数据，导致数据行数扩展有限，在数据量大时单节点内存狂涨。后续会介绍如何按行列分隔数据。