[pygr-notify] [pygr commit] r147 - contrib/compare
codesite-noreply at google.com
codesite-noreply at google.com
Wed Feb 11 11:37:56 PST 2009
Author: istvan.albert
Date: Wed Feb 11 11:31:33 2009
New Revision: 147
Added:
contrib/compare/
contrib/compare/benchmark.py
contrib/compare/results.txt
Log:
framework comparison tests
Added: contrib/compare/benchmark.py
==============================================================================
--- (empty file)
+++ contrib/compare/benchmark.py Wed Feb 11 11:31:33 2009
@@ -0,0 +1,223 @@
+"""
+Benchmarking various python based sequence manipulation
+frameworks
+"""
+import time, shelve, glob, os
+from pygr import seqdb, logger
+from Bio import SeqIO
+from cogent import LoadSeqs, DNA
+from bx.seq import fasta
+
+NSLICE = 10**5
+
+class Timer(object):
+    """Timer decorator, prints timing information on a function call.
+
+    Wraps ``func``: every call is forwarded unchanged and the elapsed
+    wall-clock time is printed next to the function's name.
+    """
+    def __init__(self, func):
+        # the decorated function whose calls are timed
+        self.func = func
+
+    def __call__(self, *args, **kwds):
+        """Calls the wrapped function and returns its result.
+
+        The timing line is printed from the ``finally`` clause, so it is
+        emitted even when the wrapped function raises.
+        """
+        start = time.time()
+        try:
+            # positional and keyword arguments are passed through to the
+            # tested function untouched
+            return self.func(*args, **kwds)
+        finally:
+            end = time.time()
+            print '%20s\t%3.1fs' % ( self.func.__name__, end-start)
+
+def get_index( fname, tag='index', flag='c' ):
+    """Opens and returns the shelve database named "<fname>-<tag>.db".
+
+    ``flag`` is handed straight to ``shelve.open``. The shelf is returned
+    open; closing it is left to the caller.
+    """
+    fp = shelve.open( "%s-%s.db" % (fname, tag), flag=flag )
+    return fp
+
+def cleanup():
+    "Cleans up index data"
+    # the input fasta files are kept; every other path matched by data/*
+    # is removed
+    protected = [ 'dm.fasta', 'sacCer.fasta', '100K.fasta' ]
+    for fname in glob.glob("data/*"):
+        base = os.path.basename(fname)
+        if base not in protected:
+            os.remove(fname)
+
+@Timer
+def pygr_parse_fasta( fname ):
+    "Times the construction of a pygr SequenceFileDB on fname"
+    fasta = seqdb.SequenceFileDB( fname )
+
+@Timer
+def bio_parse_fasta( fname ):
+    """Times parsing fname with Bio.SeqIO and storing every record in
+    the 'bio' shelve index, keyed by the record id."""
+    handle = open( fname )
+    index = get_index( fname, tag='bio', flag='c')
+    for rec in SeqIO.parse(handle, "fasta") :
+        index[rec.id] = rec
+    index.close()
+    handle.close()
+
+
+@Timer
+def cogent_parse_fasta( fname ):
+    """Times loading fname with cogent's LoadSeqs (DNA, unaligned) and
+    storing every sequence in the 'cogent' shelve index, keyed by its id."""
+    index = get_index( fname, tag='cogent', flag='c')
+    fasta = LoadSeqs(fname, moltype=DNA, aligned=False)
+    for id, seq in fasta.items():
+        index[id] = seq
+    index.close()
+
+@Timer
+def bx_parse_fasta( fname ):
+    """Times reading fname with bx's FastaReader and storing every record
+    in the 'bx' shelve index, keyed by the record name."""
+    handle = open( fname )
+    index = get_index( fname, tag='bx', flag='c')
+    for rec in fasta.FastaReader( handle ) :
+        # nuke open file handle (presumably so that the record can be
+        # stored in the shelve)
+        rec.file = None
+        index[rec.name] = rec
+    index.close()
+    handle.close()
+
+
+@Timer
+def pygr_iter( fname ):
+    "Times looping over a pygr SequenceFileDB and fetching each sequence"
+    fasta = seqdb.SequenceFileDB( fname )
+    for rec in fasta:
+        seq = fasta[rec]
+
+@Timer
+def bio_iter( fname ):
+    "Times looping over the 'bio' shelve index and loading each record"
+    fasta = get_index( fname, tag='bio', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+    fasta.close()
+
+@Timer
+def bx_iter( fname ):
+    "Times looping over the 'bx' shelve index and loading each record"
+    fasta = get_index( fname, tag='bx', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+    fasta.close()
+
+@Timer
+def cogent_iter( fname ):
+    "Times looping over the 'cogent' shelve index and loading each record"
+    fasta = get_index( fname, tag='cogent', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+    fasta.close()
+
+
+@Timer
+def pygr_slice( fname ):
+    """Times NSLICE slices of the first 100 positions of one pygr
+    sequence, converting each slice to a string."""
+    fasta = seqdb.SequenceFileDB( fname )
+    for rec in fasta:
+        seq = fasta[rec]
+        for i in range(NSLICE):
+            sub = str(seq[:100])
+        # only the first sequence yielded by the loop is benchmarked
+        break
+
+@Timer
+def bio_slice( fname ):
+    """Times NSLICE slices of the first 100 positions of one Biopython
+    sequence taken from the 'bio' shelve index."""
+    fasta = get_index( fname, tag='bio', flag='c')
+    for rec in fasta:
+        seq = fasta[rec].seq
+        for i in range(NSLICE):
+            sub = seq[:100].tostring()
+        # only the first record yielded by the loop is benchmarked
+        break
+    fasta.close()
+
+@Timer
+def bx_slice( fname ):
+    """Times NSLICE calls of get(0, 100) on one bx record taken from the
+    'bx' shelve index."""
+    fasta = get_index( fname, tag='bx', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+        for i in range(NSLICE):
+            sub = seq.get(0, 100)
+        # only the first record yielded by the loop is benchmarked
+        break
+    fasta.close()
+
+@Timer
+def cogent_slice( fname ):
+    """Times NSLICE slices of the first 100 positions of one PyCogent
+    sequence taken from the 'cogent' shelve index, converting each slice
+    to a string."""
+    fasta = get_index( fname, tag='cogent', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+        for i in range(NSLICE):
+            sub = str( seq[:100] )
+        # only the first record yielded by the loop is benchmarked
+        break
+    fasta.close()
+
+@Timer
+def pygr_reverse_comp( fname ):
+    """Times reverse complementing every sequence of a pygr
+    SequenceFileDB, visiting the keys in sorted order."""
+    fasta = seqdb.SequenceFileDB( fname )
+    keys = fasta.keys()
+    keys.sort()
+    for rec in keys:
+        # force full reverse complement
+        seq = str(-fasta[rec])
+        sub = seq[:10]
+
+@Timer
+def bio_reverse_comp( fname ):
+    """Times reverse complementing every sequence in the 'bio' shelve
+    index, visiting the keys in sorted order."""
+    fasta = get_index( fname, tag='bio', flag='c')
+    keys = fasta.keys()
+    keys.sort()
+    for rec in keys:
+        seq = fasta[rec].seq.reverse_complement()
+        sub = seq.tostring()
+    fasta.close()
+
+@Timer
+def cogent_reverse_comp( fname ):
+    """Times reverse complementing every sequence in the 'cogent' shelve
+    index, visiting the keys in sorted order."""
+    fasta = get_index( fname, tag='cogent', flag='c')
+    keys = fasta.keys()
+    keys.sort()
+    for rec in keys:
+        seq = fasta[rec].reversecomplement()
+        seq = str(seq)
+        sub = seq[:10]
+    fasta.close()
+
+@Timer
+def bx_reverse_comp( fname ):
+    """Times reverse complementing the text of every record in the 'bx'
+    shelve index. Unlike the other reverse complement tests the keys are
+    visited in the shelve's own order, not sorted."""
+    fasta = get_index( fname, tag='bx', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+        rev = seq.reverse_complement( seq.text )
+        rev = str(rev)
+
+    fasta.close()
+
+def run_benchmarks( fname ):
+    """Runs every benchmark group on fname and prints a timing table.
+
+    The groups run in the order parse, iterate, slice, reverse
+    complement, separated by a dashed line. Each group sits under an
+    ``if 1:`` guard (apparently a manual on/off switch). The last three
+    groups open the shelve indices written by the parse group.
+    """
+    print '%20s\t%s' % ( 'Test', 'Time')
+
+    logger.disable('DEBUG')
+
+    if 1:
+        # remove leftover files from data/ before the indices are built
+        cleanup()
+        pygr_parse_fasta(fname )
+        bio_parse_fasta(fname)
+        bx_parse_fasta(fname)
+        cogent_parse_fasta(fname )
+
+    print '-' * 20
+
+    if 1:
+        pygr_iter(fname )
+        bio_iter(fname)
+        bx_iter(fname)
+        cogent_iter(fname)
+
+    print '-' * 20
+
+    if 1:
+        pygr_slice(fname )
+        bio_slice(fname)
+        bx_slice(fname)
+        cogent_slice(fname)
+
+    print '-' * 20
+
+    if 1:
+        pygr_reverse_comp(fname )
+        bio_reverse_comp(fname)
+        bx_reverse_comp(fname)
+        cogent_reverse_comp(fname)
+
+if __name__ == '__main__':
+    # NOTE: the second assignment overrides the first, so data/dm.fasta
+    # is the file that gets benchmarked
+    fname = 'data/100K.fasta'
+
+    fname = 'data/dm.fasta'
+
+    print '*** benchmarking=%s' % fname
+    run_benchmarks( fname )
\ No newline at end of file
Added: contrib/compare/results.txt
==============================================================================
--- (empty file)
+++ contrib/compare/results.txt Wed Feb 11 11:31:33 2009
@@ -0,0 +1,77 @@
+Speed comparisons when parsing FASTA files, test frameworks:
+
+Pygr: http://code.google.com/p/pygr/
+Biopython: http://biopython.org/wiki/Main_Page
+BX Python: http://bx-python.trac.bx.psu.edu/
+PyCogent: http://pycogent.sourceforge.net/
+
+
+The parse_fasta tests read the file and store its records in an index. For
+tools that do not have their own fast storage, I stored the records in a bsdb
+storage via shelve, keyed by the fasta id.
+
+The iter tests measure the time it takes to loop over the sequences in the
+storage.
+
+The slice tests measure the performance of slicing 10,000 times on a
+sequence.
+
+Reverse complement measures the performance of iterating through the
+database, reverse complementing each entire sequence.
+
+Two test files were used in the tests: a generated fasta file with 100,000
+entries, each 100 bp long, and a file containing the drosophila genome. The
+first test file contains many short reads; the second contains far fewer but
+much longer sequences.
+
+RESULTS
+
+========================================
+
+*** benchmarking=data/100K.fasta
+
+ Test Time
+ pygr_parse_fasta 5.7s
+ bio_parse_fasta 24.8s
+ bx_parse_fasta 28.8s
+ cogent_parse_fasta 56.1s
+--------------------
+ pygr_iter 14.5s
+ bio_iter 6.7s
+ bx_iter 5.8s
+ cogent_iter 8.2s
+--------------------
+ pygr_slice 11.8s
+ bio_slice 1.4s
+ bx_slice 0.8s
+ cogent_slice 11.9s
+--------------------
+ pygr_reverse_comp 35.6s
+ bio_reverse_comp 12.4s
+ bx_reverse_comp 8.1s
+ cogent_reverse_comp 44.4s
+
+========================================
+
+*** benchmarking=data/dm.fasta
+
+ Test Time
+ pygr_parse_fasta 36.6s
+ bio_parse_fasta 17.7s
+ bx_parse_fasta 29.1s
+ cogent_parse_fasta 40.2s
+--------------------
+ pygr_iter 0.0s
+ bio_iter 3.0s
+ bx_iter 2.9s
+ cogent_iter 2.9s
+--------------------
+ pygr_slice 11.5s
+ bio_slice 1.4s
+ bx_slice 0.8s
+ cogent_slice 12.1s
+--------------------
+ pygr_reverse_comp 137.8s
+ bio_reverse_comp 5.3s
+ bx_reverse_comp 31.7s
+ cogent_reverse_comp 26.6s
+
More information about the pygr-notify
mailing list