#!/home/quwb/bin/ActivePython-2.6/bin/python
# -*- coding:utf-8 -*-
'''Extract genome sequences for E. coli and concatenate them.

Genome sequences of many different strains of E. coli are available on
the NCBI ftp site. The goal of this script is to extract the genome
sequences of the different strains of E. coli and concatenate them into
one file in fasta format.

1. Download all bacteria genome sequences from the NCBI ftp site:
       wget -c ftp://ftp.ncbi.nih.gov/genomes/Bacteria/all.fna.tar.gz
2. Uncompress it using:
       tar zxvf all.fna.tar.gz
3. Run this script:
       ./merge_E.coli_genome.py -i . -o Escherichia_coli.genome
4. Done.
5. Extension: pass another directory-name prefix with -p/--prefix (or
   change DEFAULT_PREFIX below) to concatenate the genome sequences of
   another species.

by Wubin Qu , Copyright @ 2010, All Rights Reserved.
'''

Author = 'Wubin Qu CZlab, BIRM, China'
Date = 'May-27-2010 15:50:05'
License = 'GPL v3'
Version = '1.0'

import sys
import os
from optparse import OptionParser
import shutil

# Sub-directories of the input directory whose names start with this
# prefix are merged when no other prefix is requested.
DEFAULT_PREFIX = 'Escherichia_coli_'


def get_opt():
    '''Parse the command line and return the optparse options object.

    The returned object carries ``indir``, ``outfile`` and ``prefix``.
    ``parser.error`` is called (it prints the usage and exits) when
    ``-i`` or ``-o`` is missing or when more than one positional
    argument is given.
    '''
    usage = 'Usage: %prog [options]'
    version = '%prog Version: ' + '%s [%s]' % (Version, Date)
    parser = OptionParser(usage=usage, version=version)
    parser.add_option('-i', '--indir', dest='indir',
                      help='The directory which containing bacteria genome sequences. [String]')
    parser.add_option('-o', '--outfile', dest='outfile',
                      help='Output file name. [String]')
    parser.add_option('-p', '--prefix', dest='prefix', default=DEFAULT_PREFIX,
                      help='Prefix of the sub-directory names to merge. [String, default: %default]')
    options, args = parser.parse_args()

    # NOTE(review): positional arguments are never used, yet a single
    # stray one is tolerated here; kept as in the original - confirm
    # whether "> 0" was intended.
    if len(args) > 1:
        parser.error('Incorrect argument, add" "-h" for help.')
    if not options.indir:
        parser.error('The directory which containing bacteria genome sequences was needed, add" "-h" for help.')
    if not options.outfile:
        parser.error('The output file name needed, add" "-h" for help.')
    return options


def merge_cat(indir, outfile, prefix=DEFAULT_PREFIX):
    '''Concatenate every file found in the matching sub-directories.

    indir   -- directory that holds one sub-directory per genome.
    outfile -- path of the merged file; it is created or overwritten.
    prefix  -- only sub-directories whose names start with this string
               are merged.

    Sub-directories and the files inside them are visited in sorted
    order so that the merged file is the same on every run. Entries
    that match the prefix but are not directories, and entries inside a
    genome directory that are not regular files, are skipped.

    Exits with status 1 (SystemExit) when outfile cannot be opened for
    writing. Errors from listing indir propagate to the caller.
    '''
    # List first: an unreadable indir must fail before outfile is
    # created or truncated.
    entries = sorted(os.listdir(indir))

    try:
        out_fh = open(outfile, 'wb')
    except (IOError, OSError):
        sys.stderr.write('Error: can not open %s for writing\n' % outfile)
        sys.exit(1)

    # The with-blocks close both handles even if a copy fails midway.
    with out_fh:
        for entry in entries:
            genome_dir = os.path.join(indir, entry)
            if not entry.startswith(prefix) or not os.path.isdir(genome_dir):
                continue
            for name in sorted(os.listdir(genome_dir)):
                path = os.path.join(genome_dir, name)
                if not os.path.isfile(path):
                    continue
                with open(path, 'rb') as in_fh:
                    shutil.copyfileobj(in_fh, out_fh)


def main():
    '''Parse the command line and run the merge.'''
    options = get_opt()
    merge_cat(options.indir, options.outfile, options.prefix)


if __name__ == '__main__':
    main()