Bioinfo~~~
Convert a GenBank-format file to a FASTA-format file
.gb -> .fasta
/**
 * Converts GenBank flat-file records (.gb) into FASTA records (.fasta).
 *
 * <p>For every complete record one header line and one sequence line are written:
 * <pre>
 * &gt;Genu_spec.ACCESSION ( VERSION GI:gi ) { DEFINITION } [ ORGANISM ]
 * SEQUENCEINUPPERCASE
 * </pre>
 * The {@code GI:} part is omitted when the VERSION line carries no GI number.
 * Only the first line of DEFINITION / ORGANISM is used; continuation lines are ignored.
 * Records lacking any of ORGANISM, ACCESSION, VERSION, DEFINITION or ORIGIN are skipped.
 *
 * <p>Instances hold per-record parsing state and are not safe for concurrent use.
 */
public class G2F {
    /** Width of the leading keyword column; field data starts right after it. */
    private static final int HEADER_WIDTH = 12;
    /** Width of the position-number prefix on each sequence line below ORIGIN. */
    private static final int SEQUENCE_PREFIX_WIDTH = 10;
    /** Placeholder used when a genus/species word is missing or shorter than 3 characters. */
    private static final String UNKNOWN_WORD = "XXX";

    private static final String STATE_BEGIN = "begin";
    private static final String STATE_DONE = "done";

    // Keywords are stored without padding and compared against the trimmed keyword column,
    // so matching does not depend on the exact amount of surrounding whitespace.
    private static final String LOCUS = "LOCUS";
    private static final String ORGANISM = "ORGANISM";
    private static final String ACCESSION = "ACCESSION";
    private static final String VERSION = "VERSION";
    private static final String DEFINITION = "DEFINITION";
    private static final String ORIGIN = "ORIGIN";
    private static final String END_DATA = "//";

    // Per-record progress flags: STATE_BEGIN until the matching field has been read.
    private String LOCUS_S = STATE_BEGIN,
            ORGANISM_S = STATE_BEGIN,
            ACCESSION_S = STATE_BEGIN,
            VERSION_S = STATE_BEGIN,
            DEFINITION_S = STATE_BEGIN,
            ORIGIN_S = STATE_BEGIN;

    // Values collected for the record currently being parsed.
    String short_name = null, accession_name = null,
            version_name = null, definition_name = null,
            organism_name = null, warning_mess = null,
            origin = null,
            genbank_name = null,
            sequence = "",
            firstline = null, secondline = null;

    /** Marks every field of the current record as not yet seen. */
    public void resetState() {
        LOCUS_S = STATE_BEGIN;
        ORGANISM_S = STATE_BEGIN;
        ACCESSION_S = STATE_BEGIN;
        VERSION_S = STATE_BEGIN;
        DEFINITION_S = STATE_BEGIN;
        ORIGIN_S = STATE_BEGIN;
    }

    /** Clears every value collected for the current record. */
    public void resetName() {
        short_name = null;
        accession_name = null;
        version_name = null;
        definition_name = null;
        organism_name = null;
        warning_mess = null;
        genbank_name = null;
        origin = null;
        sequence = "";
        firstline = null;
        secondline = null;
    }

    /**
     * Reads all GenBank records from {@code inputfile} and writes one FASTA entry
     * (header line + sequence line) per complete record to {@code outputfile}.
     *
     * @param inputfile  path of the GenBank file to read
     * @param outputfile path of the FASTA file to create or overwrite
     * @throws IOException if reading the input or writing the output fails
     */
    public void scan1squence(String inputfile, String outputfile) throws IOException {
        // Start from a clean slate so a reused instance never carries over stale data.
        resetName();
        resetState();

        try (BufferedReader in = new BufferedReader(new FileReader(inputfile));
             PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outputfile)))) {
            String sline;
            while ((sline = in.readLine()) != null) {
                if (sline.trim().isEmpty()) {
                    continue;
                }
                if (isEndOfRecord(sline)) {
                    // Record ended without an ORIGIN section: discard whatever was collected.
                    resetName();
                    resetState();
                    continue;
                }

                String keyword = keywordOf(sline);
                String resub = valueOf(sline);

                if (keyword.equals(ORGANISM)) {
                    organism_name = resub;
                    String[] words = resub.trim().split("\\s+");
                    String genus = words[0];
                    String species = words.length > 1 ? words[1] : null;
                    short_name = abbreviate(genus) + "_" + abbreviate(species);
                    ORGANISM_S = STATE_DONE;
                } else if (keyword.equals(ACCESSION)) {
                    accession_name = resub;
                    ACCESSION_S = STATE_DONE;
                } else if (keyword.equals(VERSION)) {
                    String[] parts = resub.split("GI:");
                    version_name = parts[0].trim();
                    // The GI number is optional; a VERSION line may carry only the version.
                    genbank_name = parts.length > 1 ? parts[1].trim() : null;
                    VERSION_S = STATE_DONE;
                } else if (keyword.equals(DEFINITION)) {
                    definition_name = resub;
                    DEFINITION_S = STATE_DONE;
                } else if (keyword.equals(ORIGIN)) {
                    // Consumes the sequence lines up to and including the "//" terminator.
                    readSequence(in);
                    ORIGIN_S = STATE_DONE;
                    writeRecordIfComplete(out);
                    // Always reset here: the terminator has already been consumed, and an
                    // incomplete record must not leak its sequence into the next one.
                    resetName();
                    resetState();
                }
            }
            // PrintWriter swallows I/O errors; surface them to the caller.
            if (out.checkError()) {
                throw new IOException("Failed to write FASTA output to " + outputfile);
            }
        }
    }

    /** Returns the trimmed content of the keyword column (the whole line if it is shorter). */
    private static String keywordOf(String line) {
        String column = line.length() > HEADER_WIDTH ? line.substring(0, HEADER_WIDTH) : line;
        return column.trim();
    }

    /** Returns the text following the keyword column, or "" when the line has none. */
    private static String valueOf(String line) {
        return line.length() > HEADER_WIDTH ? line.substring(HEADER_WIDTH) : "";
    }

    /** Tells whether the line is the record terminator, ignoring surrounding whitespace. */
    private static boolean isEndOfRecord(String line) {
        return line.trim().equals(END_DATA);
    }

    /** Returns up to the first four characters of the word, or "XXX" if it is missing or too short. */
    private static String abbreviate(String word) {
        if (word == null || word.length() < 3) {
            return UNKNOWN_WORD;
        }
        return word.substring(0, Math.min(4, word.length()));
    }

    /** Appends the upper-cased residues of all lines up to the record terminator or end of file. */
    private void readSequence(BufferedReader in) throws IOException {
        StringBuilder residues = new StringBuilder(sequence);
        String line;
        while ((line = in.readLine()) != null && !isEndOfRecord(line)) {
            if (line.length() > SEQUENCE_PREFIX_WIDTH) {
                residues.append(line.substring(SEQUENCE_PREFIX_WIDTH)
                        .replace(" ", "")
                        .toUpperCase(java.util.Locale.ROOT));
            }
        }
        sequence = residues.toString();
    }

    /** Writes the FASTA header and sequence when every required field of the record was seen. */
    private void writeRecordIfComplete(PrintWriter out) {
        boolean complete = ORGANISM_S.equals(STATE_DONE) && ACCESSION_S.equals(STATE_DONE)
                && VERSION_S.equals(STATE_DONE) && DEFINITION_S.equals(STATE_DONE)
                && ORIGIN_S.equals(STATE_DONE);
        if (!complete) {
            return;
        }
        StringBuilder header = new StringBuilder(">")
                .append(short_name).append('.').append(accession_name)
                .append(" ( ").append(version_name);
        if (genbank_name != null) {
            header.append(" GI:").append(genbank_name);
        }
        header.append(" ) { ").append(definition_name)
                .append(" } [ ").append(organism_name).append(" ]");

        firstline = header.toString();
        secondline = sequence;
        out.println(firstline);
        out.println(secondline);
    }
}
/**
 * Converts GenBank flat-file records (.gb) into FASTA records (.fasta).
 *
 * <p>For every complete record one header line and one sequence line are written:
 * <pre>
 * &gt;Genu_spec.ACCESSION ( VERSION GI:gi ) { DEFINITION } [ ORGANISM ]
 * SEQUENCEINUPPERCASE
 * </pre>
 * The {@code GI:} part is omitted when the VERSION line carries no GI number.
 * Only the first line of DEFINITION / ORGANISM is used; continuation lines are ignored.
 * Records lacking any of ORGANISM, ACCESSION, VERSION, DEFINITION or ORIGIN are skipped.
 *
 * <p>Instances hold per-record parsing state and are not safe for concurrent use.
 */
public class G2F {
    /** Width of the leading keyword column; field data starts right after it. */
    private static final int HEADER_WIDTH = 12;
    /** Width of the position-number prefix on each sequence line below ORIGIN. */
    private static final int SEQUENCE_PREFIX_WIDTH = 10;
    /** Placeholder used when a genus/species word is missing or shorter than 3 characters. */
    private static final String UNKNOWN_WORD = "XXX";

    private static final String STATE_BEGIN = "begin";
    private static final String STATE_DONE = "done";

    // Keywords are stored without padding and compared against the trimmed keyword column,
    // so matching does not depend on the exact amount of surrounding whitespace.
    private static final String LOCUS = "LOCUS";
    private static final String ORGANISM = "ORGANISM";
    private static final String ACCESSION = "ACCESSION";
    private static final String VERSION = "VERSION";
    private static final String DEFINITION = "DEFINITION";
    private static final String ORIGIN = "ORIGIN";
    private static final String END_DATA = "//";

    // Per-record progress flags: STATE_BEGIN until the matching field has been read.
    private String LOCUS_S = STATE_BEGIN,
            ORGANISM_S = STATE_BEGIN,
            ACCESSION_S = STATE_BEGIN,
            VERSION_S = STATE_BEGIN,
            DEFINITION_S = STATE_BEGIN,
            ORIGIN_S = STATE_BEGIN;

    // Values collected for the record currently being parsed.
    String short_name = null, accession_name = null,
            version_name = null, definition_name = null,
            organism_name = null, warning_mess = null,
            origin = null,
            genbank_name = null,
            sequence = "",
            firstline = null, secondline = null;

    /** Marks every field of the current record as not yet seen. */
    public void resetState() {
        LOCUS_S = STATE_BEGIN;
        ORGANISM_S = STATE_BEGIN;
        ACCESSION_S = STATE_BEGIN;
        VERSION_S = STATE_BEGIN;
        DEFINITION_S = STATE_BEGIN;
        ORIGIN_S = STATE_BEGIN;
    }

    /** Clears every value collected for the current record. */
    public void resetName() {
        short_name = null;
        accession_name = null;
        version_name = null;
        definition_name = null;
        organism_name = null;
        warning_mess = null;
        genbank_name = null;
        origin = null;
        sequence = "";
        firstline = null;
        secondline = null;
    }

    /**
     * Reads all GenBank records from {@code inputfile} and writes one FASTA entry
     * (header line + sequence line) per complete record to {@code outputfile}.
     *
     * @param inputfile  path of the GenBank file to read
     * @param outputfile path of the FASTA file to create or overwrite
     * @throws IOException if reading the input or writing the output fails
     */
    public void scan1squence(String inputfile, String outputfile) throws IOException {
        // Start from a clean slate so a reused instance never carries over stale data.
        resetName();
        resetState();

        try (BufferedReader in = new BufferedReader(new FileReader(inputfile));
             PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outputfile)))) {
            String sline;
            while ((sline = in.readLine()) != null) {
                if (sline.trim().isEmpty()) {
                    continue;
                }
                if (isEndOfRecord(sline)) {
                    // Record ended without an ORIGIN section: discard whatever was collected.
                    resetName();
                    resetState();
                    continue;
                }

                String keyword = keywordOf(sline);
                String resub = valueOf(sline);

                if (keyword.equals(ORGANISM)) {
                    organism_name = resub;
                    String[] words = resub.trim().split("\\s+");
                    String genus = words[0];
                    String species = words.length > 1 ? words[1] : null;
                    short_name = abbreviate(genus) + "_" + abbreviate(species);
                    ORGANISM_S = STATE_DONE;
                } else if (keyword.equals(ACCESSION)) {
                    accession_name = resub;
                    ACCESSION_S = STATE_DONE;
                } else if (keyword.equals(VERSION)) {
                    String[] parts = resub.split("GI:");
                    version_name = parts[0].trim();
                    // The GI number is optional; a VERSION line may carry only the version.
                    genbank_name = parts.length > 1 ? parts[1].trim() : null;
                    VERSION_S = STATE_DONE;
                } else if (keyword.equals(DEFINITION)) {
                    definition_name = resub;
                    DEFINITION_S = STATE_DONE;
                } else if (keyword.equals(ORIGIN)) {
                    // Consumes the sequence lines up to and including the "//" terminator.
                    readSequence(in);
                    ORIGIN_S = STATE_DONE;
                    writeRecordIfComplete(out);
                    // Always reset here: the terminator has already been consumed, and an
                    // incomplete record must not leak its sequence into the next one.
                    resetName();
                    resetState();
                }
            }
            // PrintWriter swallows I/O errors; surface them to the caller.
            if (out.checkError()) {
                throw new IOException("Failed to write FASTA output to " + outputfile);
            }
        }
    }

    /** Returns the trimmed content of the keyword column (the whole line if it is shorter). */
    private static String keywordOf(String line) {
        String column = line.length() > HEADER_WIDTH ? line.substring(0, HEADER_WIDTH) : line;
        return column.trim();
    }

    /** Returns the text following the keyword column, or "" when the line has none. */
    private static String valueOf(String line) {
        return line.length() > HEADER_WIDTH ? line.substring(HEADER_WIDTH) : "";
    }

    /** Tells whether the line is the record terminator, ignoring surrounding whitespace. */
    private static boolean isEndOfRecord(String line) {
        return line.trim().equals(END_DATA);
    }

    /** Returns up to the first four characters of the word, or "XXX" if it is missing or too short. */
    private static String abbreviate(String word) {
        if (word == null || word.length() < 3) {
            return UNKNOWN_WORD;
        }
        return word.substring(0, Math.min(4, word.length()));
    }

    /** Appends the upper-cased residues of all lines up to the record terminator or end of file. */
    private void readSequence(BufferedReader in) throws IOException {
        StringBuilder residues = new StringBuilder(sequence);
        String line;
        while ((line = in.readLine()) != null && !isEndOfRecord(line)) {
            if (line.length() > SEQUENCE_PREFIX_WIDTH) {
                residues.append(line.substring(SEQUENCE_PREFIX_WIDTH)
                        .replace(" ", "")
                        .toUpperCase(java.util.Locale.ROOT));
            }
        }
        sequence = residues.toString();
    }

    /** Writes the FASTA header and sequence when every required field of the record was seen. */
    private void writeRecordIfComplete(PrintWriter out) {
        boolean complete = ORGANISM_S.equals(STATE_DONE) && ACCESSION_S.equals(STATE_DONE)
                && VERSION_S.equals(STATE_DONE) && DEFINITION_S.equals(STATE_DONE)
                && ORIGIN_S.equals(STATE_DONE);
        if (!complete) {
            return;
        }
        StringBuilder header = new StringBuilder(">")
                .append(short_name).append('.').append(accession_name)
                .append(" ( ").append(version_name);
        if (genbank_name != null) {
            header.append(" GI:").append(genbank_name);
        }
        header.append(" ) { ").append(definition_name)
                .append(" } [ ").append(organism_name).append(" ]");

        firstline = header.toString();
        secondline = sequence;
        out.println(firstline);
        out.println(secondline);
    }
}
分享到:
相关推荐
在进行基因序列分析时,经常需要将GenBank格式的文件转换成FASTA格式,因为FASTA格式更简洁,易于处理。`gb2fasta`就是一个Perl脚本,专门设计来完成这个任务。 Perl是一种强大的脚本语言,因其灵活性和在文本处理...
gbmunge 将Munge GenBank文件转换成FASTA序列和制表符分隔的元数据。 这个小C程序将从GenBank文件中提取以下信息: 名称加入长度提交日期主持人国家采集日期除了提取此信息外,还对日期进行了重新格式化,例如31-DEC...
DNA序列查看器,支持自定义,GenBank,FASTA,NCBI登录和iGEM输入特征SeqViz目标是成为具有简单API和易定制性的DNA序列查看器。 目前提供: 多种输入格式顺序加入(NCBI或iGEM) 文件(FASTA,GenBank,SBOL,...
FASTApple用Applescript编写,是针对普通计算生物学家的一套实用程序,可快速轻松地处理FASTA文件。... GenBank重命名器会以Genus_species_accession的格式自动重命名从GenBank下载的FASTA文件中的分类单元。
fastaToJson //handles fasta files (.fa, .fasta) genbankToJson //handles genbank files (.gb, .gbk) ab1ToJson //handles .ab1 sequencing read files sbolXmlToJson //handles .sbol files snapgeneToJson //...
教你读懂Genbank数据,作用:了解序列数据库的格式,有助于更好地提高数据库检索的效率和准确性。 DDBJ数据库的内容和格式与GenBank相同,此处不作详细介绍。 分别介绍EMBL和GenBank的数据库结构
perl的cpan库支持对基因库文件的解析,这个perl的脚本文件实现了对genbank类型的基因库中基因数据的提取和解析。用户使用的时候需要手动修改代码中的genbank文件的路径。
2. `GenBank2Fasta_UniExtractor_126.tcl`:这个文件名表明它是一个TCL脚本,可能是一个程序,用于将GenBank格式的数据转换为FASTA格式。GenBank是一个公共数据库,存储了大量生物分子序列及其相关注释。 3. `Cich_...
一组将常见的生物序列格式(例如EMBL,SWISS-PROT,UniProtKB,GenBank和RefSeq)转换为fasta序列格式的工具。
本篇将深入探讨如何利用BioPython来处理`ls_orchid.fasta`和`ls_orchid.gbk`这两个文件,它们是BioPython学习过程中的经典示例。 `ls_orchid.fasta`文件是FASTA格式的序列文件,通常用于存储DNA、RNA或蛋白质序列。...
本地序列库(例如Genbank)到FASTA文件的转换, 从本地序列库格式以BLAST格式准备数据库, 对Genbank,Refseq,Embl,Genpept,Swissprot,TrEmbl,Fasta,Silva和BOLD文件建立索引,从而可以通过序列标识符进行...
LoopMatcher是一种生物信息学工具,可在环中具有特定共有序列的cDNA / mRNA序列(FASTA,GenBank或Vienna格式)中搜索发夹结构。 它使用RNAfold预测序列结构,使用UShuffle生成具有定义的k个核苷酸频率的随机...
GenBank数据库检索及其应用 GenBank数据库是由美国国立生物技术信息中心(NCBI)维护的一级核酸序列数据库。该数据库的数据来源有三种:直接来源于测序工作者提交的序列;与其它数据机构协作交换的数据;美国专利局...
从GenBank获取基因序列及PCR引物设计的方法
尽管 MATLAB 生物信息学工具箱具有内生 GenBank 文件读取器 genbankread(),但有时难以读取这些具有意外但并非非正统格式的平面文件。 该程序 gbread() 旨在用更通用的替代方案替换 genbankread()。 与期望字段保持...
项目简介考虑到以下强制性要求,以您选择的任何脚本语言编写一个计算机程序,以将分配给您的DNA序列(以.fasta格式;请参阅附录)转换为蛋白质序列: 蛋白质的最小长度应为44(包含44个氨基酸)。 蛋白质的最大长度...
《PyPI官网下载:polish_genbank-0.0.2.tar.gz——深入解析Python库》 在Python的生态系统中,PyPI(Python Package Index)是最重要的资源库,它为开发者提供了一个分享和获取Python软件包的平台。本文将详细探讨...
GenBank是一个全球性的核酸序列数据库,它包含了各种生物体的DNA和RNA序列信息,由美国国立生物技术信息中心(NCBI)维护。GenBank数据库的检索对于科研人员来说至关重要,因为它提供了广泛的生命科学数据,支持遗传...