- 浏览: 230663 次
- 性别:
- 来自: 天朝帝都
文章分类
最新评论
-
hanmiao:
CSDN 博客地址是这個?http://blog.csdn.n ...
将博客搬至CSDN -
chenwq:
下载了,谢谢分享!
R语言学习入门 -
bbsunchen:
今天跟英姐聊天,她verbal考了151,不够啊,数学也不高。 ...
跟我一起考GRE(三) -
bbsunchen:
qinger说得对我今年只做三件事情:考好GRE,考好TOEF ...
IT行业成功必备的素质 -
bbsunchen:
还有8天就考试了,哥还在过单词啊
跟我一起考GRE(三)
This is a C++ demo for pan-genome analysis, by bbsunchen:
/* start:2012/06/11 by sunchen amend: 1.2012/06/12 by sunchen construct a array of 2^n conculate 2.2112/06/12 by sunchen introduce multithread model 3.2012/06/18 by sunchen change multithread model complete pangenome calculation 4.2012/06/19 by sunchen complete newgene calculation mission completed */ #include <iostream> #include <fstream> #include <cstring> #include <cstdlib> #include <vector> #include <time.h> #include <pthread.h> #include <sstream> using namespace std; struct NumRange { int index; long long startNum; long long endNum; }; struct panGenomeNum { int SampleNum[101];//the index is genomeNumber, start from 1 long long panNum[101];//the index is genomeNumber, start from 1 }; //##################public data################################### long long refdig[]= { //n=0 1,2,4,8, //n=4 16,32,64,128, //n=8 256,512,1024,2048, //n=12 4096,8192,16384,32768, //n=16 65536,131072,262144,524288, //n=20 1048576,2097154,4194304,8388608, //n=24 16777216,33554432,67108864,134217728, //n=28 268435456,536870912,1073741824,2147483648, //n=32 4294967296,8589934592,17179869184,34359738368, //n=36 68719476736,137438953472,274877906944,549755813888, //n=40 1099511627776,2199023255552,4398046511104,8796093022208, //n=44 17592186044416,35184372088832,70368744177664,140737488355328, //n=48 281474976710656,562949953421312,1125899906842624,2251799813685248, //n=52 4503599627370496,9007199254740992,18014398509481984,36028797018363968, //n=56 72057594036727936,144115188073455872,228230376146911744,576460752293823488, //n=60 1152921504587646976,2305843009175293952,4611686018350587904 }; int genome_genesize[101] = {0};//record gene num of specific genome, start from 0 vector< vector<bool> > m;//matrix int m_line_num = 0; int m_column_num = 0; //char clusterPath[] = "/home/sun/zhao/1.Orthologs_Cluster.txt"; char* clusterPath; char tempPath[1001] = ""; panGenomeNum pN[101];//the index is threadId, start from 0 ofstream TEMP[101]; //#################################public data end################################### //convert long_long number to "01"string to find out which is 1 vector<int> whichGenome(long long genomeCombination) { vector<int> genomeIdVector; for(int k = 62; k >= 0; k--) { if(genomeCombination >= refdig[k]) { genomeCombination -= refdig[k]; genomeIdVector.push_back(k); } if(genomeCombination == 0) { break; } } return genomeIdVector; } long long genomeNum2LongLong(int genomeSize) { long long genomeIndicator = 0; for(int i = 0; i < genomeSize;i++) { genomeIndicator += refdig[i]; } return genomeIndicator; } char* getTempFilePath(int index) { char* pathSegment[50]; char* temp_num = strtok(clusterPath,"/");//split string int e_num = 0; while(temp_num != NULL) { pathSegment[e_num] = temp_num; temp_num = strtok(NULL,"/"); e_num++; } char tempPath[1001] = ""; for(int i = 0; i < e_num -1 ; i++) { strcat(tempPath, "/"); strcat(tempPath, pathSegment[i]); } stringstream stream; string s; stream << tempPath << "/" << index << "_temp.dat"; stream >> s; cout << tempPath << endl; //char *path =const_cast<char*>(s.c_str()); //get the path char* path=const_cast<char*>(s.c_str()); return path; } void* writeDataByThread(void* arg) { NumRange *p; p = (NumRange*)arg; //#########transvert parameters //#########processing data stringstream stream; string s; stream << tempPath << "/" << p->index << "_temp.dat"; stream >> s; char* filepath=const_cast<char*>(s.c_str()); //cout << filepath << endl; //######################getpath TEMP[p->index].open(filepath); if(!TEMP[p->index].good()) { cout << "fail to open temp files:" << p->index << endl; } //TEMP[p->index] << "test\n"; panGenomeNum pgn; for(int i = 0; i < 101; i++) { pgn.SampleNum[i] = 0; pgn.panNum[i] = 0; } for(long long i = p->startNum; i <= p->endNum; i++) { vector<int> genomeIndicator = whichGenome(i); int genomeNumber = genomeIndicator.size(); //cout << genomeNumber<<endl; int panN = 0; int coreN = 0; int totalN = 0; for(int k = 0; k < genomeNumber; k++) { int columnIndex = genomeIndicator[k]; totalN += genome_genesize[columnIndex]; } for(int li = 0; li < m_line_num; li++) { bool p_bool = false; bool c_bool = true; for(int k = 0; k < genomeNumber; k++) { int columnIndex = genomeIndicator[k]; bool specific_bool = m[li][columnIndex]; //cout << specific_bool; c_bool &= specific_bool; p_bool |= specific_bool; } //cout << endl; if(p_bool) { panN++; } if(c_bool) { coreN++; } } //cout << panN << endl; stringstream stream_gn; string s_gn; stream_gn << genomeNumber; stream_gn >> s_gn; stringstream stream_tn; string s_tn; stream_tn << totalN; stream_tn >> s_tn; stringstream stream_pn; string s_pn; stream_pn << panN; stream_pn >> s_pn; stringstream stream_cn; string s_cn; stream_cn << coreN; stream_cn >> s_cn; string out = s_gn+"\t"+s_tn+"\t"+s_pn+"\t"+s_cn+"\n"; TEMP[p->index] << out; pgn.SampleNum[genomeNumber] ++;//the index is genomeNumber, start from 1 pgn.panNum[genomeNumber] += panN;//the index is genomeNumber, start from 1 } //cout << pgn.panNum[1] << endl; pN[p->index] = pgn; //cout << p->index << endl; //cout << pN[p->index].panNum[2] << endl; TEMP[p->index].close(); //#########exit pthread_exit((void*)0); } int readData() { ifstream CLUSTER; CLUSTER.open(clusterPath); if(!CLUSTER.good()) { cout << "ERROR: illegal input file path: " << clusterPath <<endl; cout << "Input format:\n" << "program_name \n"; exit(0); } char* genome_name[101];//id start from 0 int e_num = 0;//e_num equals to the num of char* //which means that e_num-1 equals to the number of genome //i.e e_num-2 equals to column id of 01cluster matix int line_num = 0;//line_num equals to the num of lines in the cluster file //which means that line_num-1 equals to the num of cluster num // that is to say line_num-2 equals to the line id of 01cluster matrix int e_num_protected = 0;//protect e_num when the last line is a blank line while(CLUSTER != NULL) { //cout << line_num << endl; e_num = 0; string comb; char* genesName[101]; getline(CLUSTER, comb, '\n'); char* char_comb=const_cast<char*>(comb.c_str());//const char* to char* char* temp_num = strtok(char_comb,"\t");//split string while(temp_num != NULL) { genesName[e_num] = temp_num; temp_num = strtok(NULL,"\t"); e_num++; } if(e_num == 0) { break; } vector<bool> m_line; if(line_num == 0) { for(int i = 1; i <=e_num; i++) { genome_name[i-1] = genesName[e_num]; // so the size of genome_num = e_num; } e_num_protected = e_num; }else { for(int i = 1;i < e_num; i++) { if(strcmp(genesName[i], "-") == 0)//if equal, return 0 { //cout << num[i] << endl; m_line.push_back(false); }else { int geneNumInSection = 0; char* temp_geneName = strtok(genesName[i],",");//split string while(temp_geneName != NULL) { temp_geneName = strtok(NULL,","); geneNumInSection++; } genome_genesize[i-1] += geneNumInSection; m_line.push_back(true); } } } if(line_num == 0) { line_num++; }else { m.push_back(m_line); line_num++; } } CLUSTER.close(); //true&false matrix, (line_num-1)*(e_num-1) m_line_num = line_num - 1; m_column_num = e_num_protected - 1; char* pathSegment[50]; char* temp_num_2 = strtok(clusterPath,"/");//split string int e_num_2 = 0; while(temp_num_2 != NULL) { pathSegment[e_num_2] = temp_num_2; temp_num_2 = strtok(NULL,"/"); e_num_2++; } for(int i = 0; i < e_num_2 -1 ; i++) { strcat(tempPath, "/"); strcat(tempPath, pathSegment[i]); } return m_column_num; } int main(int argc,char *argv[]) { if(argc != 3) { cout << "ERROR: illegal argument number: " << argc << endl; cout << "Input format:\n" << "program_name inputfile threadNum\n" << "i.e.\n" << "./main sun.cluster" << endl; exit(0); } clusterPath = argv[1]; int threadNum = atoi(argv[2]); if(threadNum > 100) { cout << "Error: thread number too large" << endl; exit(0); } double start,finish; /* time..*/ start=(double)clock(); /* 我的time.h内没有CLOCKS_PER_SEC */ int genomeSize = readData(); //cout << genomeSize << endl; long long totalLong = genomeNum2LongLong(genomeSize); long long span = totalLong / threadNum; long long last_end = 0; pthread_t threadId[101];//thread id strat from 0 NumRange p[101]; for(int i = 0; i < threadNum; i++) { long long start = last_end + 1; long long end = start + span; if(end > totalLong) { end = totalLong; } last_end = end; p[i].index = i; p[i].startNum = start; p[i].endNum = end; } for(int i = 0; i < threadNum; i++) { //cout << "strat:" << p[i].startNum << "~~" << p[i].endNum << endl; //pthread_t id; //threadId.push_back(id); int ret = pthread_create(&threadId[i],NULL,writeDataByThread, (void*)&p[i]); //pthread_join(threadId[i], NULL); sleep(0.01);//important so that the CPU will have time to run pthread_create completely //in the insprion1420, it costs at least 0.001 second to transport parameters //pthread_join(id, NULL); } for(int i = 0; i < threadNum; i++) { pthread_join(threadId[i], NULL); } stringstream stream_path; string s_path; stream_path << tempPath << "/panGenome.txt"; stream_path >> s_path; char* panPath=const_cast<char*>(s_path.c_str()); //cout << filepath << endl; //######################getpath ofstream PANGENOME; PANGENOME.open(panPath); if(!PANGENOME.good()) { cout << "open panGenome file failed!" << endl; exit (0); } PANGENOME << "ClusterConservation" << "\t" << "TotalGeneNumber" << "\t" << "PanGenome" << "\t" << "CoreGenome" << endl; for(int i = 0; i < threadNum; i++) { stringstream stream; string s; stream << tempPath << "/" << i << "_temp.dat"; stream >> s; char* filepath=const_cast<char*>(s.c_str()); //cout << filepath << endl; //######################getpath ifstream TEMPFILE; TEMPFILE.open(filepath); if(!TEMPFILE.good()) { cout << "fail to open temp files:" << p->index << endl; } while(TEMPFILE != NULL) { string matr; getline(TEMPFILE,matr); //cout << matr.length() << endl; if(matr != "\n" && matr != "") { PANGENOME << matr << endl; } } } PANGENOME.close(); long long panTotal[101] = {0};//index stands for genome number, start from 1; int SampleTotal[101] = {0};//index stands for genome number, start from 1; for(int i = 0; i < threadNum; i++) { for(int k = 1; k <= genomeSize; k++) { //panGenomeNum temp_pgn = pN[threadNum]; panTotal[k] += pN[i].panNum[k]; //cout <<"thread:" << i<< " genome:"<< k << " "<<pN[i].panNum[k] << endl; SampleTotal[k] += pN[i].SampleNum[k]; } } for(int k = 1; k <= genomeSize; k++) { //cout << SampleTotal[k] << endl; panTotal[k] /= SampleTotal[k]; } //cout << "right" << endl; stringstream stream_new; string s_new; stream_new << tempPath << "/newGene.txt"; stream_new >> s_new; char* newPath=const_cast<char*>(s_new.c_str()); //cout << filepath << endl; //######################getpath ofstream NEWGENE; NEWGENE.open(newPath); if(!NEWGENE.good()) { cout << "open panGenome file failed!" << endl; exit (0); } NEWGENE << "GenomeNumber" << "\t" << "NewGene" << endl; for(int k = 2; k <=genomeSize; k++) { NEWGENE << k << "\t" << panTotal[k] - panTotal[k-1] << endl; } NEWGENE.close(); for(int i = 0; i < threadNum; i++) { stringstream stream_delete; string s_delete; stream_delete << tempPath << "/" << i << "_temp.dat"; stream_delete >> s_delete; char commend[1001] = "rm "; char* deletepath=const_cast<char*>(s_delete.c_str()); strcat(commend, deletepath); system(commend); } finish=(double)clock(); //cout << (finish - start)/CLOCKS_PER_SEC << endl; return 0; }
发表评论
-
绦虫基因组研究方法
2012-12-21 21:21 961今天跟一个同学讨论了绦虫基因组研究方法,同时我也看到一些同学在 ... -
PyDev,在Eclipse中运行python
2012-04-20 10:38 2387最近学python做高精度运算。 虽然网上有很多高精度运算的 ... -
并行计算的强大
2012-04-17 10:36 1542最近在处理一批数据,10的8次方,处理完毕大概要一个月,并且这 ... -
生物信息学工具使用的经验之谈
2012-01-16 18:08 1547荣耀归于上帝, ... -
如何保持开放的头脑
2012-01-07 15:14 1211世界向我打开一扇大门,我却选择转过身,背对这个世界。 ... -
ortholog/inparalog/coortholog
2012-01-04 16:52 1811Homologs which originat ... -
非root权限用户安装perl模块
2012-01-04 09:36 3089网上有很多说非root权限怎么安装perl模块的帖子,我觉 ... -
非root权限安装perl
2012-01-03 21:18 1657在使用Linux或是unix ... -
运行interproscan/iprscan会遇到的问题
2012-01-01 21:28 16341. 运行iprscan的时候,一般需要根据机器的能力和安装i ... -
Interproscan性能测试
2011-12-28 20:50 1627interproscan的安装和运行,很多网站都有介绍,这里主 ... -
华大的生物信息培训教材
2011-12-19 15:45 1258LOL... -
PAML中文文档/计算分子进化
2011-12-12 16:14 2780先说PAML中文文档,PA ... -
Qt程序在windows下的发布
2011-12-02 14:21 1563这个问题,其实 Qt 的 manual 中解释的已经比较 ... -
【原创】用C++(QT)写跨平台GUI详解
2011-12-02 10:53 2394你还不知道什么是Qt?... ...什么?你还不知道C++能快 ... -
数据可视化之美
2011-12-01 20:08 2294最近越来越对数据可视化感兴趣了,正因为此我学习了R,excel ... -
Perl也可以读写excel哦
2011-12-01 15:32 1684perl 里面用Spreadsheet::WriteExcel ... -
R语言学习入门
2011-12-01 15:28 2884R语言是很多统计学和数据可视化的常用工具。 R语言也是生物信 ... -
R语言绘制heatmap热图
2011-11-22 10:40 16087介绍如何使用 R 绘制 heatmap 的文章。 今天无意间 ... -
使用Vienna RNA进行RNA二级结构预测
2011-11-07 15:50 3186现在比较准确,比较流行的RNA二级结构预测软件就是Vi ... -
perl实现蛋白质翻译以及蛋白质个数统计
2011-10-27 15:09 1961这个程序,用perl语言实现了RNA序列翻译蛋白质序列的过程。 ...
相关推荐
猪链球菌2型不同毒力菌株的...及Pan-genome分析_杜德超.caj
cd pan-genome-chartspython -m SimpleHTTPServer 网络服务器启动后,将这些链接加载到浏览器中: 这些脚本会为图表生成底层的矢量形状和线条,然后使用SVG Crowbar保存为SVG。 在Illustrator中添加了一些颜色调整...
假设我们有一个名为`Pan-Genome-main`的压缩包,其中包含各种基因组数据。首先,我们需要解压文件,然后使用`fastqc`进行质量控制,`bwa`进行比对,`samtools`进行排序和变异呼叫,最后用`vcfutils`或`plink`处理...
这项研究由【国际癌症基因组联盟 (ICGC)】和【The Cancer Genome Atlas (TCGA)】两大组织联合发起,并在【Pan-Cancer Analysis of Whole Genomes (PCAWG)】项目中进一步深化。通过对38种不同癌症类型的2658个样本...
《基因标签:深入理解"tag-genome.zip"》 在生物信息学领域,"tag-genome.zip"这个名称暗示了我们正在处理一个与基因组分析相关的数据集。"Tag"一词在这里通常指的是序列标签,它是一种短的DNA序列片段,来源于基因...
资源分类:Python库 所属语言:Python 资源全名:edge-genome-2.1.1.tar.gz 资源来源:官方 安装方法:https://lanzao.blog.csdn.net/article/details/101784059
资源分类:Python库 所属语言:Python 资源全名:django-genome-0.5.0.tar.gz 资源来源:官方 安装方法:https://lanzao.blog.csdn.net/article/details/101784059
《生物信息学:序列与基因组分析》是D.W Mount所著的一本深入探讨生物信息学领域的经典之作,尤其在蛋白质和DNA序列分析方面提供了详尽的指导。本书不仅覆盖了历史背景、理论基础,还介绍了多种实用的分析工具和技术...
这是第二部分Bioinformatics. Sequence and Genome Analysis
非常棒的生物信息学教程,覆盖序列拼接,遗传统计,microarray分析,进化树,比较基因组学,等等。作者之一是Michael S. Waterman,著名的Smith-Waterman算法的发明人之一,强吧?而且是英文原版,没有翻译问题。
MACS的主要目的是对通过短读测序器如Illumina的Genome Analyzer等产生的ChIP-Seq数据进行分析。它具备以下几个核心功能和特点: 1. **标签尺寸的动态建模**:MACS能够基于实验数据自动推断ChIP-Seq标签的尺寸(即...
在“blogging-my-genome-master”目录下,可能包含以下文件和子目录: 1. `src/`:存放OCaml源代码,包括数据处理、对齐、变异检测等功能的实现。 2. `scripts/`:可能包含与DNAnexus API交互的bash脚本,用于数据...
Celera汇编程序(CA)是一种全基因组shot弹枪(WGS)汇编程序,用于从WGS测序数据重建基因组DNA序列。
java8 看不到源码欢迎来到基因组分析教程页面 请访问此站点以查看此 git 存储库的网络版本。 此页面将帮助您学习如何基于 GATK BestPractice 制作流水线。...从此页面,您可以下载资源文件,例如参考 ...
**G-language Genome Analysis Environment 1.4.9:生物信息学的强大工具** G-language Genome Analysis Environment(GAE)是一款专为生物信息学家设计的应用程序,它在基因组分析领域提供了丰富的功能,旨在优化...
大癌症基因组数据的网络分析@author Tai-Hsien Ou Yang ( ) Kaiyi Zhu ( ) 我们这里提供的包包含在Hadoop环境下进行两种基于MapReduce的一致性索引评估的工具包和用于数据导入和导出的HBase工具,以及用于预处理、...