`
bbsunchen
  • 浏览: 230663 次
  • 性别: Icon_minigender_1
  • 来自: 天朝帝都
社区版块
存档分类
最新评论

pan-genome analysis sample code

 
阅读更多

This is a C++ demo for pan-genome analysis, by bbsunchen:

/*
start:2012/06/11 by sunchen
amend:
	1.2012/06/12 by sunchen
	construct a array of 2^n
	conculate
	2.2112/06/12 by sunchen
	introduce multithread model
	3.2012/06/18 by sunchen
	change multithread model
	complete pangenome calculation
	4.2012/06/19 by sunchen
	complete newgene calculation
	mission completed
	
*/

#include <iostream>
#include <fstream>
#include <cstring>
#include <cstdlib>
#include <vector>
#include <time.h>
#include <pthread.h>
#include <sstream>
using namespace std;



struct NumRange
{
    int index;
    long long startNum;
    long long endNum;
};
struct panGenomeNum
{
    int SampleNum[101];//the index is genomeNumber, start from 1
    long long panNum[101];//the index is genomeNumber, start from 1
};
//##################public data###################################
long long refdig[]=
{
    //n=0
    1,2,4,8,
    //n=4
    16,32,64,128,
    //n=8
    256,512,1024,2048,
    //n=12
    4096,8192,16384,32768,
    //n=16
    65536,131072,262144,524288,
    //n=20
    1048576,2097154,4194304,8388608,
    //n=24
    16777216,33554432,67108864,134217728,
    //n=28
    268435456,536870912,1073741824,2147483648,
    //n=32
    4294967296,8589934592,17179869184,34359738368,
    //n=36
    68719476736,137438953472,274877906944,549755813888,
    //n=40
    1099511627776,2199023255552,4398046511104,8796093022208,
    //n=44
    17592186044416,35184372088832,70368744177664,140737488355328,
    //n=48
    281474976710656,562949953421312,1125899906842624,2251799813685248,
    //n=52
    4503599627370496,9007199254740992,18014398509481984,36028797018363968,
    //n=56
    72057594036727936,144115188073455872,228230376146911744,576460752293823488,
    //n=60
    1152921504587646976,2305843009175293952,4611686018350587904
};

int genome_genesize[101] = {0};//record gene num of specific genome, start from 0
vector< vector<bool> > m;//matrix
int m_line_num = 0;
int m_column_num = 0;
//char clusterPath[] = "/home/sun/zhao/1.Orthologs_Cluster.txt";
char* clusterPath;
char tempPath[1001] = "";
panGenomeNum pN[101];//the index is threadId, start from 0
ofstream TEMP[101];
//#################################public data end###################################

//convert long_long number to "01"string to find out which is 1
vector<int> whichGenome(long long genomeCombination)
{

    vector<int> genomeIdVector;
    for(int k = 62; k >= 0; k--)
    {
        if(genomeCombination >= refdig[k])
        {
            genomeCombination -= refdig[k];
            genomeIdVector.push_back(k);
        }
        if(genomeCombination == 0)
        {
            break;
        }
    }
    return genomeIdVector;
}
long long genomeNum2LongLong(int genomeSize)
{
    long long genomeIndicator = 0;
    for(int i = 0; i < genomeSize;i++)
    {
        genomeIndicator += refdig[i];
    }
    return genomeIndicator;

}

char* getTempFilePath(int index)
{
    char* pathSegment[50];
    char* temp_num = strtok(clusterPath,"/");//split string
    int e_num = 0;
    while(temp_num != NULL)
    {
        pathSegment[e_num] = temp_num;
        temp_num = strtok(NULL,"/");
        e_num++;
    }
    char tempPath[1001] = "";
    for(int i = 0; i < e_num -1 ; i++)
    {
        strcat(tempPath, "/");
        strcat(tempPath, pathSegment[i]);
    }
    stringstream stream;
    string s;
    stream << tempPath << "/" << index << "_temp.dat";
    stream >> s;
    cout << tempPath << endl;
    //char *path =const_cast<char*>(s.c_str()); //get the path
    char* path=const_cast<char*>(s.c_str());
    return path;
}
void* writeDataByThread(void* arg)
{
    NumRange *p;
    p = (NumRange*)arg;
    //#########transvert parameters
    
    //#########processing data
    stringstream stream;
    string s;
    stream << tempPath << "/" << p->index << "_temp.dat";
    stream >> s;
    char* filepath=const_cast<char*>(s.c_str());
    //cout << filepath << endl;
    //######################getpath
    TEMP[p->index].open(filepath);
    if(!TEMP[p->index].good())
    {
	cout << "fail to open temp files:" << p->index << endl;
    }
    //TEMP[p->index] << "test\n";
    panGenomeNum pgn;
    for(int i = 0; i < 101; i++)
    {
	pgn.SampleNum[i] = 0;
	pgn.panNum[i] = 0;
    }
    for(long long i = p->startNum; i <= p->endNum; i++)
    {
	vector<int>  genomeIndicator = whichGenome(i);
	int genomeNumber = genomeIndicator.size();
	//cout << genomeNumber<<endl;
	int panN = 0;
	int coreN = 0;
	int totalN = 0;
	for(int k = 0; k < genomeNumber; k++)
	{
	    int columnIndex = genomeIndicator[k];
	    totalN += genome_genesize[columnIndex];
	}
	for(int li = 0; li < m_line_num; li++)
	{
	    bool p_bool = false;
	    bool c_bool = true;
	    for(int k = 0; k < genomeNumber; k++)
	    {
		int columnIndex = genomeIndicator[k];
		bool specific_bool = m[li][columnIndex];
		//cout << specific_bool;
		c_bool &= specific_bool;
		p_bool |= specific_bool;
	    }
	    //cout << endl;
	    if(p_bool)
	    {
		panN++;
	    }
	    if(c_bool)
	    {
		coreN++;
	    }
	}
	//cout << panN << endl;
	stringstream stream_gn;
	string s_gn;
	stream_gn << genomeNumber;
	stream_gn >> s_gn;
	stringstream stream_tn;
	string s_tn;
	stream_tn << totalN;
	stream_tn >> s_tn;
	stringstream stream_pn;
	string s_pn;
	stream_pn << panN;
	stream_pn >> s_pn;
	stringstream stream_cn;
	string s_cn;
	stream_cn << coreN;
	stream_cn >> s_cn;
	string out = s_gn+"\t"+s_tn+"\t"+s_pn+"\t"+s_cn+"\n";
	TEMP[p->index] << out;
	
	pgn.SampleNum[genomeNumber] ++;//the index is genomeNumber, start from 1
	pgn.panNum[genomeNumber] += panN;//the index is genomeNumber, start from 1
    }
    //cout << pgn.panNum[1] << endl;
    pN[p->index] = pgn;
    //cout << p->index << endl;
    //cout << pN[p->index].panNum[2] << endl;
    
    TEMP[p->index].close();
    //#########exit
    pthread_exit((void*)0);
    
}

int readData()
{
    ifstream CLUSTER;
    CLUSTER.open(clusterPath);
    if(!CLUSTER.good())  
    {  
        cout << "ERROR: illegal input file path: " << clusterPath <<endl;  
        cout <<  
        "Input format:\n" <<  
        "program_name  \n";  
        exit(0);  
    }


    char* genome_name[101];//id start from 0

    int e_num = 0;//e_num equals to the num of char*
    //which means that e_num-1 equals to the number of genome
    //i.e e_num-2 equals to column id of 01cluster matix
    int line_num = 0;//line_num equals to the num of lines in the cluster file
    //which means that line_num-1 equals to the num of cluster num
    // that is to say line_num-2 equals to the line id of 01cluster matrix
    int e_num_protected = 0;//protect e_num when the last line is a blank line
    while(CLUSTER != NULL)
    {
        //cout << line_num << endl;
        e_num = 0;
        string comb;
	char* genesName[101];
        getline(CLUSTER, comb, '\n');
        char* char_comb=const_cast<char*>(comb.c_str());//const char* to char*
        char* temp_num = strtok(char_comb,"\t");//split string

        while(temp_num != NULL)
        {
            genesName[e_num] = temp_num;
            temp_num = strtok(NULL,"\t");
            e_num++;
        }
        if(e_num == 0)
        {
            break;
        }
	
        vector<bool> m_line;
        if(line_num == 0)
        {
            for(int i = 1; i <=e_num; i++)
            {
                genome_name[i-1] = genesName[e_num];
                // so the size of genome_num = e_num;
            }
	    e_num_protected = e_num;
        }else
        {
            for(int i = 1;i < e_num; i++)
            {
                if(strcmp(genesName[i], "-") == 0)//if equal, return 0
                {
                    //cout << num[i] << endl;
                    m_line.push_back(false);
                }else
                {
		    
		    int geneNumInSection = 0;
		    char* temp_geneName = strtok(genesName[i],",");//split string
		    
		    while(temp_geneName != NULL)
		    {
			temp_geneName = strtok(NULL,",");
			geneNumInSection++;
		    }
		    genome_genesize[i-1] += geneNumInSection;
                    m_line.push_back(true);
                }
            }
        }
        if(line_num == 0)
        {
            line_num++;
        }else
        {
            m.push_back(m_line);
            line_num++;
        }
    }
    CLUSTER.close();
    //true&false matrix, (line_num-1)*(e_num-1)

    m_line_num = line_num - 1;
    m_column_num = e_num_protected - 1;
    
    char* pathSegment[50];
    char* temp_num_2 = strtok(clusterPath,"/");//split string
    int e_num_2 = 0;
    while(temp_num_2 != NULL)
    {
        pathSegment[e_num_2] = temp_num_2;
        temp_num_2 = strtok(NULL,"/");
        e_num_2++;
    }
    for(int i = 0; i < e_num_2 -1 ; i++)
    {
        strcat(tempPath, "/");
        strcat(tempPath, pathSegment[i]);
    }
    
    return m_column_num;
}
int main(int argc,char *argv[])
{
    if(argc != 3)  
    {  
        cout << "ERROR: illegal argument number: " << argc << endl;  
        cout <<
	"Input format:\n" <<
	"program_name inputfile threadNum\n" <<
	"i.e.\n" <<
	"./main sun.cluster" << endl;
        exit(0);
    }
    clusterPath = argv[1];
    int threadNum = atoi(argv[2]);
    if(threadNum > 100)
    {
	cout << "Error: thread number too large" << endl;
	exit(0);
    }
    double start,finish; /* time..*/
    start=(double)clock(); /* 我的time.h内没有CLOCKS_PER_SEC */
    
    int genomeSize = readData();
    //cout << genomeSize << endl;
    long long totalLong = genomeNum2LongLong(genomeSize);
    long long span = totalLong / threadNum;
    long long last_end = 0;
    pthread_t threadId[101];//thread id strat from 0    
    NumRange p[101];
    for(int i = 0; i < threadNum; i++)
    {
	long long start = last_end + 1;
        long long end = start + span;
        if(end > totalLong)
        {
            end = totalLong;
        }
        last_end = end;
        p[i].index = i;
        p[i].startNum = start;
        p[i].endNum = end;
    }
    for(int i = 0; i < threadNum; i++)
    {
	//cout << "strat:" << p[i].startNum << "~~" << p[i].endNum << endl;
        //pthread_t id;
        //threadId.push_back(id);
        int ret = pthread_create(&threadId[i],NULL,writeDataByThread, (void*)&p[i]);
	//pthread_join(threadId[i], NULL);
	sleep(0.01);//important so that the CPU will have time to run pthread_create completely
	//in the insprion1420, it costs at least 0.001 second to transport parameters
	//pthread_join(id, NULL);
    }
    for(int i = 0; i < threadNum; i++)
    {
        pthread_join(threadId[i], NULL);
    }
    
    stringstream stream_path;
    string s_path;
    stream_path << tempPath << "/panGenome.txt";
    stream_path >> s_path;
    char* panPath=const_cast<char*>(s_path.c_str());
    //cout << filepath << endl;
    //######################getpath
    ofstream PANGENOME;
    PANGENOME.open(panPath);
    if(!PANGENOME.good())
    {
	cout << "open panGenome file failed!" << endl;
	exit (0);
    }
    PANGENOME << "ClusterConservation" << "\t" << "TotalGeneNumber" << "\t" << "PanGenome" << "\t" << "CoreGenome" << endl;

    for(int i = 0; i < threadNum; i++)
    {
	stringstream stream;
	string s;
	stream << tempPath << "/" << i << "_temp.dat";
	stream >> s;
	char* filepath=const_cast<char*>(s.c_str());
	//cout << filepath << endl;
	//######################getpath
	ifstream TEMPFILE;
	TEMPFILE.open(filepath);
	if(!TEMPFILE.good())
	{
	    cout << "fail to open temp files:" << p->index << endl;
	}
	while(TEMPFILE != NULL)  
	{  
	    string matr;  
	    getline(TEMPFILE,matr);  
	    //cout << matr.length() << endl;
	    if(matr != "\n" && matr != "")
	    {
		PANGENOME << matr << endl;
	    }
	}  
    }
    PANGENOME.close();
    long long panTotal[101] = {0};//index stands for genome number, start from 1;
    int SampleTotal[101] = {0};//index stands for genome number, start from 1;
    for(int i = 0; i < threadNum; i++)
    {
	for(int k = 1; k <= genomeSize; k++)
	{
	    //panGenomeNum temp_pgn = pN[threadNum];
	    panTotal[k] += pN[i].panNum[k];
	    //cout <<"thread:" << i<< " genome:"<< k << "   "<<pN[i].panNum[k] << endl;
	    SampleTotal[k] += pN[i].SampleNum[k];
	}
    }
    
    for(int k = 1; k <= genomeSize; k++)
    {
	//cout << SampleTotal[k] << endl;
	panTotal[k] /= SampleTotal[k];
    }
    //cout << "right" << endl;
    stringstream stream_new;
    string s_new;
    stream_new << tempPath << "/newGene.txt";
    stream_new >> s_new;
    char* newPath=const_cast<char*>(s_new.c_str());
    //cout << filepath << endl;
    //######################getpath
    ofstream NEWGENE;
    NEWGENE.open(newPath);
    if(!NEWGENE.good())
    {
	cout << "open panGenome file failed!" << endl;
	exit (0);
    }
    NEWGENE << "GenomeNumber" << "\t" << "NewGene" << endl;
    for(int k = 2; k <=genomeSize; k++)
    {
	NEWGENE << k << "\t" << panTotal[k] - panTotal[k-1] << endl;
    }
    NEWGENE.close();
    

    for(int i = 0; i < threadNum; i++)
    {
	stringstream stream_delete;
	string s_delete;
	stream_delete << tempPath << "/" << i << "_temp.dat";
	stream_delete >> s_delete;
	char commend[1001] = "rm ";
	char* deletepath=const_cast<char*>(s_delete.c_str());
	strcat(commend, deletepath);
	system(commend);
    }
    finish=(double)clock();
    //cout << (finish - start)/CLOCKS_PER_SEC << endl;
    return 0;
}
 
0
0
分享到:
评论

相关推荐

    猪链球菌2型不同毒力菌株的...及Pan-genome分析_杜德超.caj

    猪链球菌2型不同毒力菌株的...及Pan-genome分析_杜德超.caj

    pan-genome-charts:泛基因组图图表

    cd pan-genome-chartspython -m SimpleHTTPServer 网络服务器启动后,将这些链接加载到浏览器中: 这些脚本会为图表生成底层的矢量形状和线条,然后使用SVG Crowbar保存为SVG。 在Illustrator中添加了一些颜色调整...

    Pan-Genome:MaizeGDB用于“泛基因”选项卡的泛基因组方法

    假设我们有一个名为`Pan-Genome-main`的压缩包,其中包含各种基因组数据。首先,我们需要解压文件,然后使用`fastqc`进行质量控制,`bwa`进行比对,`samtools`进行排序和变异呼叫,最后用`vcfutils`或`plink`处理...

    Pan-cancer analysis of whole genomes.pptx

    这项研究由【国际癌症基因组联盟 (ICGC)】和【The Cancer Genome Atlas (TCGA)】两大组织联合发起,并在【Pan-Cancer Analysis of Whole Genomes (PCAWG)】项目中进一步深化。通过对38种不同癌症类型的2658个样本...

    tag-genome.zip

    《基因标签:深入理解"tag-genome.zip"》 在生物信息学领域,"tag-genome.zip"这个名称暗示了我们正在处理一个与基因组分析相关的数据集。"Tag"一词在这里通常指的是序列标签,它是一种短的DNA序列片段,来源于基因...

    Python库 | edge-genome-2.1.1.tar.gz

    资源分类:Python库 所属语言:Python 资源全名:edge-genome-2.1.1.tar.gz 资源来源:官方 安装方法:https://lanzao.blog.csdn.net/article/details/101784059

    Python库 | django-genome-0.5.0.tar.gz

    资源分类:Python库 所属语言:Python 资源全名:django-genome-0.5.0.tar.gz 资源来源:官方 安装方法:https://lanzao.blog.csdn.net/article/details/101784059

    Bioinformatics - Sequence and Genome Analysis - D.W Mount (Cold Spring Laboratory)

    《生物信息学:序列与基因组分析》是D.W Mount所著的一本深入探讨生物信息学领域的经典之作,尤其在蛋白质和DNA序列分析方面提供了详尽的指导。本书不仅覆盖了历史背景、理论基础,还介绍了多种实用的分析工具和技术...

    Bioinformatics. Sequence and Genome Analysis

    这是第二部分Bioinformatics. Sequence and Genome Analysis

    Computational Genome Analysis - 计算基因组分析

    非常棒的生物信息学教程,覆盖序列拼接,遗传统计,microarray分析,进化树,比较基因组学,等等。作者之一是Michael S. Waterman,著名的Smith-Waterman算法的发明人之一,强吧?而且是英文原版,没有翻译问题。

    Model-based Analysis of ChIP-Seq (MACS)

    MACS的主要目的是对通过短读测序器如Illumina的Genome Analyzer等产生的ChIP-Seq数据进行分析。它具备以下几个核心功能和特点: 1. **标签尺寸的动态建模**:MACS能够基于实验数据自动推断ChIP-Seq标签的尺寸(即...

    blogging-my-genome:用于分析我的基因组的DNAnexus平台小程序及相关注释

    在“blogging-my-genome-master”目录下,可能包含以下文件和子目录: 1. `src/`:存放OCaml源代码,包括数据处理、对齐、变异检测等功能的实现。 2. `scripts/`:可能包含与DNAnexus API交互的bash脚本,用于数据...

    Whole-Genome Shotgun Assembler-开源

    Celera汇编程序(CA)是一种全基因组shot弹枪(WGS)汇编程序,用于从WGS测序数据重建基因组DNA序列。

    java8看不到源码-Genome-Analysis-Tutorial:基因组分析教程页面

    java8 看不到源码欢迎来到基因组分析教程页面 请访问此站点以查看此 git 存储库的网络版本。 此页面将帮助您学习如何基于 GATK BestPractice 制作流水线。...从此页面,您可以下载资源文件,例如参考 ...

    G-language Genome Anaysis Environment 1.4.9

    **G-language Genome Analysis Environment 1.4.9:生物信息学的强大工具** G-language Genome Analysis Environment(GAE)是一款专为生物信息学家设计的应用程序,它在基因组分析领域提供了丰富的功能,旨在优化...

    Network-Analysis-on-the-Big-Cancer-Genome-Data:大癌症基因组数据的网络分析

    大癌症基因组数据的网络分析@author Tai-Hsien Ou Yang ( ) Kaiyi Zhu ( ) 我们这里提供的包包含在Hadoop环境下进行两种基于MapReduce的一致性索引评估的工具包和用于数据导入和导出的HBase工具,以及用于预处理、...

Global site tag (gtag.js) - Google Analytics