[pygr-notify] [pygr commit] r147 - contrib/compare

Wed Feb 11 11:37:56 PST 2009

Author: istvan.albert
Date: Wed Feb 11 11:31:33 2009
New Revision: 147

Added:
    contrib/compare/
    contrib/compare/benchmark.py
    contrib/compare/results.txt

Log:
framework comparison tests

Added: contrib/compare/benchmark.py
==============================================================================

--- (empty file)
+++ contrib/compare/benchmark.py	Wed Feb 11 11:31:33 2009
@@ -0,0 +1,223 @@
+"""
+Benchmarking various python based sequence manipulation
+frameworks
+"""
+import time, shelve, glob, os
+from pygr import seqdb, logger
+from Bio import SeqIO
+from cogent import LoadSeqs, DNA
+from bx.seq import fasta
+
+NSLICE = 10**5
+
+class Timer(object):
+    """Timer decorator, prints timing information on a function call"""
+    def __init__(self, func):
+        self.func = func
+
+    def __call__(self, *args, **kwds):
+        start = time.time()
+        try:
+            # first argument is the tested function, the others are arguments
+            return self.func(*args, **kwds)
+        finally:
+            end = time.time()
+            print '%20s\t%3.1fs' % ( self.func.__name__, end-start)
+
+def get_index( fname, tag='index', flag='c' ):
+    fp = shelve.open( "%s-%s.db" % (fname, tag), flag=flag )
+    return fp
+
+def cleanup():
+    "Cleans up index data"
+    protected = [ 'dm.fasta', 'sacCer.fasta', '100K.fasta' ]
+    for fname in glob.glob("data/*"):
+        base = os.path.basename(fname)
+        if base not in protected:
+            os.remove(fname)
+
+@Timer
+def pygr_parse_fasta( fname ):
+    fasta = seqdb.SequenceFileDB( fname )
+
+@Timer
+def bio_parse_fasta( fname ):
+    handle = open( fname )
+    index = get_index( fname, tag='bio', flag='c')
+    for rec in SeqIO.parse(handle, "fasta") :
+        index[rec.id] = rec
+    index.close()
+    handle.close()
+
+
+@Timer
+def cogent_parse_fasta( fname ):
+    index = get_index( fname, tag='cogent', flag='c')
+    fasta = LoadSeqs(fname, moltype=DNA, aligned=False)
+    for id, seq in fasta.items():
+        index[id] =  seq
+    index.close()
+
+@Timer
+def bx_parse_fasta( fname ):
+    handle = open( fname )
+    index = get_index( fname, tag='bx', flag='c')
+    for rec in fasta.FastaReader( handle ) :
+        # nuke open file handle
+        rec.file = None
+        index[rec.name] = rec
+    index.close()
+    handle.close()
+
+
+@Timer
+def pygr_iter( fname ):
+    fasta = seqdb.SequenceFileDB( fname )
+    for rec in fasta:
+        seq = fasta[rec]
+
+@Timer
+def bio_iter( fname ):
+    fasta = get_index( fname, tag='bio', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+    fasta.close()
+
+@Timer
+def bx_iter( fname ):
+    fasta = get_index( fname, tag='bx', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+    fasta.close()
+
+@Timer
+def cogent_iter( fname ):
+    fasta = get_index( fname, tag='cogent', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+    fasta.close()
+
+
+@Timer
+def pygr_slice( fname ):
+    fasta = seqdb.SequenceFileDB( fname )
+    for rec in fasta:
+        seq = fasta[rec]
+        for i in range(NSLICE):
+            sub = str(seq[:100])
+        break
+
+@Timer
+def bio_slice( fname ):
+    fasta = get_index( fname, tag='bio', flag='c')
+    for rec in fasta:
+        seq = fasta[rec].seq
+        for i in range(NSLICE):
+            sub = seq[:100].tostring()
+        break
+    fasta.close()
+
+@Timer
+def bx_slice( fname ):
+    fasta = get_index( fname, tag='bx', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+        for i in range(NSLICE):
+            sub = seq.get(0, 100)
+        break
+    fasta.close()
+
+@Timer
+def cogent_slice( fname ):
+    fasta = get_index( fname, tag='cogent', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+        for i in range(NSLICE):
+            sub = str( seq[:100] )
+        break
+    fasta.close()
+
+@Timer
+def pygr_reverse_comp( fname ):
+    fasta = seqdb.SequenceFileDB( fname )
+    keys = fasta.keys()
+    keys.sort()
+    for rec in keys:
+        # force full reverse complement
+        seq = str(-fasta[rec])
+        sub = seq[:10]
+
+@Timer
+def bio_reverse_comp( fname ):
+    fasta = get_index( fname, tag='bio', flag='c')
+    keys = fasta.keys()
+    keys.sort()
+    for rec in keys:
+        seq = fasta[rec].seq.reverse_complement()
+        sub = seq.tostring()
+    fasta.close()
+
+@Timer
+def cogent_reverse_comp( fname ):
+    fasta = get_index( fname, tag='cogent', flag='c')
+    keys = fasta.keys()
+    keys.sort()
+    for rec in keys:
+        seq = fasta[rec].reversecomplement()
+        seq = str(seq)
+        sub = seq[:10]
+    fasta.close()
+
+@Timer
+def bx_reverse_comp( fname ):
+    fasta = get_index( fname, tag='bx', flag='c')
+    for rec in fasta:
+        seq = fasta[rec]
+        rev = seq.reverse_complement( seq.text )
+        rev = str(rev)
+
+    fasta.close()
+
+def run_benchmarks( fname ):
+    print '%20s\t%s' % ( 'Test', 'Time')
+
+    logger.disable('DEBUG')
+
+    if 1:
+        cleanup()
+        pygr_parse_fasta(fname )
+        bio_parse_fasta(fname)
+        bx_parse_fasta(fname)
+        cogent_parse_fasta(fname )
+
+    print '-' * 20
+
+    if 1:
+        pygr_iter(fname )
+        bio_iter(fname)
+        bx_iter(fname)
+        cogent_iter(fname)
+
+    print '-' * 20
+
+    if 1:
+        pygr_slice(fname )
+        bio_slice(fname)
+        bx_slice(fname)
+        cogent_slice(fname)
+
+    print '-' * 20
+
+    if 1:
+        pygr_reverse_comp(fname )
+        bio_reverse_comp(fname)
+        bx_reverse_comp(fname)
+        cogent_reverse_comp(fname)
+
+if __name__ == '__main__':
+    fname = 'data/100K.fasta'
+
+    fname = 'data/dm.fasta'
+
+    print '*** benchmarking=%s' % fname
+    run_benchmarks( fname )
\ No newline at end of file

Added: contrib/compare/results.txt
==============================================================================
--- (empty file)
+++ contrib/compare/results.txt	Wed Feb 11 11:31:33 2009
@@ -0,0 +1,77 @@
+Speed comparisons when parsing FASTA files, test frameworks:
+
+Pygr: http://code.google.com/p/pygr/
+Biopython: http://biopython.org/wiki/Main_Page
+BX Python: http://bx-python.trac.bx.psu.edu/
+PyCogent: http://pycogent.sourceforge.net/
+
+
+The parse_fasta tests read the file and store its records in an index. For tools that
+do not have their own fast storage, I stored the records in a bsddb storage via
+shelve, keyed by the fasta id.
+
+The iter tests measure the time it takes to loop over the sequences in the
+storage.
+
+The slice tests measure the performance of taking a 100 bp slice 100,000 times (NSLICE = 10**5) from a single sequence.
+
+Reverse complement measures the performance of iterating through the database,
+reverse complementing each entire sequence.
+
+Two test files were used in the tests: a generated fasta file with 100,000 entries,
+each 100 bp long, and a file containing the Drosophila genome. The first
+test file contains many short reads; the second has far fewer but much longer
+sequences.
+
+RESULTS
+
+========================================
+
+*** benchmarking=data/100K.fasta
+
+				Test	Time
+    pygr_parse_fasta	5.7s
+     bio_parse_fasta	24.8s
+      bx_parse_fasta	28.8s
+  cogent_parse_fasta	56.1s
+--------------------
+           pygr_iter	14.5s
+            bio_iter	6.7s
+             bx_iter	5.8s
+         cogent_iter	8.2s
+--------------------
+          pygr_slice	11.8s
+           bio_slice	1.4s
+            bx_slice	0.8s
+        cogent_slice	11.9s
+--------------------
+   pygr_reverse_comp	35.6s
+    bio_reverse_comp	12.4s
+     bx_reverse_comp	8.1s
+ cogent_reverse_comp	44.4s
+
+========================================
+
+*** benchmarking=data/dm.fasta
+
+				Test	Time
+    pygr_parse_fasta	36.6s
+     bio_parse_fasta	17.7s
+      bx_parse_fasta	29.1s
+  cogent_parse_fasta	40.2s
+--------------------
+           pygr_iter	0.0s
+            bio_iter	3.0s
+             bx_iter	2.9s
+         cogent_iter	2.9s
+--------------------
+          pygr_slice	11.5s
+           bio_slice	1.4s
+            bx_slice	0.8s
+        cogent_slice	12.1s
+--------------------
+   pygr_reverse_comp	137.8s
+    bio_reverse_comp	5.3s
+     bx_reverse_comp	31.7s
+ cogent_reverse_comp	26.6s
+