#!/usr/bin/env python # -*- coding: utf-8 -*- # '''Extract certain species sequences based on the taxid from NCBI pre-formated database. For example, extract all human sequences from the refseq_rna database Usage: ./ExtractByTaxonomyID.py -i refseq_rna -b ~/bif/ncbi-blast-2.2.22+/bin -t rna by Wubin Qu Copyright @ 2010, All Rights Reserved. ''' Author = 'Wubin Qu , BIRM, China' Date = '2010-2-26' License = 'GPL v3' Version = '1.0' import sys, re, os import subprocess from optparse import OptionParser taxid_dict = { '3702' : 'Arabidopsis_thaliana', '11103' : 'Hepatitis_C_virus', '4754' : 'Pneumocystis_carinii', '9913' : 'Bos_taurus', '9606' : 'human', '10116' : 'rat', '6239' : 'Caenorhabditis_elegans', '148305' : 'Magnaporthe_grisea', '4932' : 'Saccharomyces_cerevisiae', '3055' : 'Chlamydomonas_reinhardtii', '10090' : 'mouse', '9031' : 'chicken', '4896' : 'Schizosaccharomyces_pombe', '7955' : 'zebrafish', '2104' : 'Mycoplasma_pneumoniae', '31033' : 'Takifugu_rubripes', '44689' : 'Dictyostelium_discoideum', '5141' : 'Neurospora_crassa', '8355' : 'Xenopus_laevis', '7227' : 'Drosophila_melanogaster', '4530' : 'Oryza_sativa', '4577' : 'rice', '562' : 'Escherichia_coli', '5833' : 'Plasmodium_falciparum', } def get_opt(): '''Options and args''' usage = '''Usage: %prog [options] -i infile -b blastplus_bin_direcotry -t type Example: ./ExtractByTaxonomyID.py -i refseq_rna -b ~/bif/ncbi-blast-2.2.22+/bin -t rna ''' version = '%prog' + ' %s [%s]' % (Version, Date) parser = OptionParser(usage=usage, version=version) parser.add_option('-i', '--infile', dest='infile', help='File to be extracted. [String]') parser.add_option('-b', '--blastplus_bin', dest='blastplus_bin', help='The path of the blast+ bin direcotry, such as "/bif/ncbi-blast2.2.22+/bin". [String]') parser.add_option('-t', '--type', dest='type', help='The database type: rna or genomic. Default is rna. [String]', default='rna') (options, args) = parser.parse_args() if len(args) > 1: parser.error('Incorrect input arguments') if not options.infile: parser.error('Input file needed, add -h for help') if not options.blastplus_bin: parser.error('BLAST+ bin direcotry needed, add -h for help') if not options.type: parser.error('The type for the output files is needed: "genomic or rna", add -h for help') return options def extract_by_taxid(options, tmp_taxid_file): '''Extract the sequence by taxid''' taxid_gi_dict = {} for line in open(tmp_taxid_file): gi, taxid = line.split() if taxid in taxid_gi_dict: taxid_gi_dict[taxid].append(gi) else: taxid_gi_dict[taxid] = [] taxid_gi_dict[taxid].append(gi) for taxid in taxid_dict.iterkeys(): if taxid in taxid_gi_dict: gi_list = taxid_gi_dict[taxid] else: print '%s (%s) is not found in %s database' % (taxid_dict[taxid], taxid, options.infile) continue out_file = '%s.%s' % (taxid_dict[taxid], options.type) cmd = '%s%sblastdbcmd -db %s -entry_batch - -out %s' % (options.blastplus_bin, os.sep, options.infile, out_file) try: out, err = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE).communicate(os.linesep.join(gi_list)) except: print >> sys.stderr, 'Error: %s' % cmd exit() if err: print >> sys.stderr, err exit() def main (): '''Main''' options = get_opt() tmp_taxid_file = os.path.join(os.curdir, 'tmp_taxid_file.txt') cmd = '%s%sblastdbcmd -db %s -entry all -outfmt "%%g %%T" -out %s' % (options.blastplus_bin, os.sep, options.infile, tmp_taxid_file) try: subprocess.Popen(cmd, shell=True).wait() except: print >> sys.stderr, 'Error: %s' % cmd exit() extract_by_taxid(options, tmp_taxid_file) if __name__ == '__main__': main()