[Avida-SVN] r3543 - development/source/main

dk at myxo.css.msu.edu dk at myxo.css.msu.edu
Mon Nov 30 10:10:27 PST 2009


Author: dk
Date: 2009-11-30 13:10:27 -0500 (Mon, 30 Nov 2009)
New Revision: 3543

Modified:
   development/source/main/cGenomeUtil.cc
   development/source/main/cGenomeUtil.h
   development/source/main/cPopulationInterface.cc
Log:
Added tracking of the beginning of an HGT match.

Modified: development/source/main/cGenomeUtil.cc
===================================================================
--- development/source/main/cGenomeUtil.cc	2009-11-24 20:25:26 UTC (rev 3542)
+++ development/source/main/cGenomeUtil.cc	2009-11-30 18:10:27 UTC (rev 3543)
@@ -31,7 +31,7 @@
 #include "cInstSet.h"
 #include "functions.h"
 #include <algorithm>
-// #include <strings.h>
+#include <string.h>
 
 
 using namespace std;
@@ -245,49 +245,42 @@
 }
 
 
-/*! Return all matches of substring within base.
+/*! Find (one of) the best substring matches of substring in base.
  
- The return value here is somewhat incomplete.  Eventually, the idea is that the
- list of matches include the starting position of the match, the overall length (extent)
- of the match, and the cost of the match.  Right now, however, it only includes the cost
- and ending position (inclusive). 
+ The algorithm here is based on the well-known dynamic programming approach to
+ finding a substring match.  Here, it has been extended to track the beginning and
+ ending locations of that match.  Specifically, [begin,end) of the returned substring_match
+ denotes the matched region in the base string.
  */
-cGenomeUtil::substring_match_list_type cGenomeUtil::FindSubstringMatches(const cGenome& base, const cGenome& substring) {
-	substring_match_list_type ssml(base.GetSize());
+cGenomeUtil::substring_match cGenomeUtil::FindSubstringMatch(const cGenome& base, const cGenome& substring) {
 	const int rows=substring.GetSize()+1;
 	const int cols=base.GetSize()+1;
-	int* c = new int[cols]; // current row
-	int* p = new int[cols]; // previous row
-	for (int j=0; j<cols; ++j) {
-	  c[j] = 0;
-	  p[j] = 0;
+	substring_match m[2][cols];
+	substring_match* c=m[0];
+	substring_match* p=m[1];
+	
+	for(int j=1; j<cols; ++j) {
+		p[j].begin = j;
 	}
 	
 	for(int i=1; i<rows; ++i) {
-		c[0] = p[0]+1;
+		c[0].cost = i;
+		c[0].begin = 0;
 		for(int j=1; j<cols; ++j) {
-			int l[3] = {p[j-1], p[j], c[j-1]};
-			c[j] = *std::min_element(l,l+3) + (substring[i-1] != base[j-1]);
-			ssml[j-1].cost = c[j];
-			ssml[j-1].position = j-1;
+			substring_match l[3] = {p[j-1], p[j], c[j-1]};
+			substring_match* s = std::min_element(l,l+3);
+			
+			c[j].begin = s->begin;
+			c[j].end = j;
+			c[j].cost = s->cost + (substring[i-1] != base[j-1]);
 		}
 		std::swap(c,p);
 	}
 	
-	return ssml;
+	return *std::min_element(p, p+cols);
 }
 
 
-/*! Return the best match of substring within base.
- 
- \todo Ties for the value of the best match should be broken randomly.
- */
-cGenomeUtil::substring_match cGenomeUtil::FindBestSubstringMatch(const cGenome& base, const cGenome& substring) {
-	substring_match_list_type ssml = FindSubstringMatches(base, substring);
-	return *std::min_element(ssml.begin(), ssml.end());
-}
-
-
 cGenome cGenomeUtil::Crop(const cGenome & in_genome, int start, int end)
 {
   assert(end > start);                // Must have a positive length clip!

Modified: development/source/main/cGenomeUtil.h
===================================================================
--- development/source/main/cGenomeUtil.h	2009-11-24 20:25:26 UTC (rev 3542)
+++ development/source/main/cGenomeUtil.h	2009-11-30 18:10:27 UTC (rev 3543)
@@ -60,30 +60,25 @@
 
 	/*! Substring match record.
 	 
-	 The intent behind a substring match record is that it specifies where (position) a match
-	 between two genomes was found, the length (extent) of that match in the base string,
-	 and the edit distance (cost) of that match.
+	 The intent behind a substring match record is that it specifies where a match
+	 between two genomes was found as well as the edit distance (cost) of that match.
 	 
-	 The position and extent are inclusive, and describe the matching region in the
-	 base string like so: [position-extent, position]
+	 The begin, end members follow iterator semantics.  I.e., the matched region in
+	 the base string is [begin, end).  (End points to the element immediately following
+	 the match).
 	 */
 	struct substring_match {
 		//! Default constructor.
-		substring_match() : position(0), //extent(0),
-		cost(0) { }
-		//! Operator< overload.
+		substring_match() : begin(0), end(0), cost(0) { }
+		//! Operator< overload to support std::min_element.
 		bool operator<(const substring_match& sm) { return cost < sm.cost; }
-		int position; //!< Final position in the base string of this match.
-		//int extent; //!< Length of the match in the base string.
+		int begin; //!< Beginning of the substring match.
+		int end; //!< Ending of the substring match.
 		int cost; //!< Cost (edit distance) of this match.
 	};
 	
-	//! Type for a list of substring matches.
-	typedef std::vector<substring_match> substring_match_list_type;
-	//! Return all matches of substring within base.
-	static substring_match_list_type FindSubstringMatches(const cGenome& base, const cGenome& substring);
-	//! Return the best match of substring within base.
-	static substring_match FindBestSubstringMatch(const cGenome& base, const cGenome& substring);
+	//! Find (one of) the best substring matches of substring in base.
+	static substring_match FindSubstringMatch(const cGenome& base, const cGenome& substring);
 
   // ===== Construction methods =====
   static cGenome Crop(const cGenome& genome, int start, int end);

Modified: development/source/main/cPopulationInterface.cc
===================================================================
--- development/source/main/cPopulationInterface.cc	2009-11-24 20:25:26 UTC (rev 3542)
+++ development/source/main/cPopulationInterface.cc	2009-11-30 18:10:27 UTC (rev 3543)
@@ -658,29 +658,41 @@
 	}
 	
 	// find the location within the offspring's genome that best matches the selected fragment:
-	cGenomeUtil::substring_match ssm = cGenomeUtil::FindBestSubstringMatch(circ, *f);
-	
-	if(ssm.position > offspring.GetSize()) {
-		// we matched on the circular portion of the genome, so we have to modify the
-		// matched position to reflect this;
-		ssm.position -= offspring.GetSize();
-	} 
-	
+	cGenomeUtil::substring_match ssm = cGenomeUtil::FindSubstringMatch(circ, *f);
+
+	// did we match any part of the circular portion of the genome?  if so, adjust
+	// begin & end to suit.
+	if(ssm.begin > offspring.GetSize()) { ssm.begin -= offspring.GetSize(); }
+	if(ssm.end > offspring.GetSize()) { ssm.end -= offspring.GetSize(); }
+
 	// there are (currently) two supported types of HGT mutations: insertions & replacements.
 	// which one are we doing?
 	if(ctx.GetRandom().P(m_world->GetConfig().HGT_INSERTION_MUT_P.Get())) {
 		// insertion: insert the fragment just after the final location of the match:
-		offspring.Insert(ssm.position+1, *f);
+		offspring.Insert(ssm.end, *f);
 	} else {
-		// replacement: replace up to fragment size instructions in the genome.
-		// note that replacement can wrap around from front->back.  replacement counts
-		// forward, so we have to find the start position first.
-		int start = ssm.position - f->GetSize() + 1;
-		//int start = ssm.position - ssm.extent + 1; // this isn't turned on yet.
-		if(start < 0) { start += offspring.GetSize(); }
-		for(int i=0; i<f->GetSize(); ++i) {
-			offspring[start] = (*f)[i];
-			if(++start >= offspring.GetSize()) { start = 0; }
+		// replacement: replace [begin,end) instructions in the genome with the fragment.
+		if(ssm.begin <= ssm.end) {
+			// match didn't wrap around the end, things are easy:
+			offspring.Replace(ssm.begin, std::max(ssm.end-ssm.begin, 1), *f);
+		} else {
+			// match wrapped around the end.  two different replacements to do now:
+			// [ssm.begin, offspring.end) and [0, ssm.end).
+						
+			// first, replace the [ssm.begin, offspring.end) region; we'll try to
+			// preserve the size of this region:
+			int tail_size = std::min(offspring.GetSize()-ssm.begin, f->GetSize());
+			cGenome tail(&(*f)[0], &(*f)[0]+tail_size);
+			offspring.Replace(ssm.begin, tail_size, tail);
+			
+			// now, replace the [0, ssm.end) region or remove it if the whole fragment
+			// was already copied in:
+			if(tail_size != f->GetSize()) {
+				cGenome head(&(*f)[0]+tail_size, &(*f)[0]+f->GetSize());
+				offspring.Replace(0, ssm.end, head);
+			} else {
+				offspring.Remove(0, ssm.end);
+			}
 		}
 	}
 	




More information about the Avida-cvs mailing list