[Avida-SVN] r3543 - development/source/main
dk at myxo.css.msu.edu
dk at myxo.css.msu.edu
Mon Nov 30 10:10:27 PST 2009
Author: dk
Date: 2009-11-30 13:10:27 -0500 (Mon, 30 Nov 2009)
New Revision: 3543
Modified:
development/source/main/cGenomeUtil.cc
development/source/main/cGenomeUtil.h
development/source/main/cPopulationInterface.cc
Log:
Added tracking of the beginning of an HGT match.
Modified: development/source/main/cGenomeUtil.cc
===================================================================
--- development/source/main/cGenomeUtil.cc 2009-11-24 20:25:26 UTC (rev 3542)
+++ development/source/main/cGenomeUtil.cc 2009-11-30 18:10:27 UTC (rev 3543)
@@ -31,7 +31,7 @@
#include "cInstSet.h"
#include "functions.h"
#include <algorithm>
-// #include <strings.h>
+#include <string.h>
using namespace std;
@@ -245,49 +245,42 @@
}
-/*! Return all matches of substring within base.
+/*! Find (one of) the best substring matches of substring in base.
- The return value here is somewhat incomplete. Eventually, the idea is that the
- list of matches include the starting position of the match, the overall length (extent)
- of the match, and the cost of the match. Right now, however, it only includes the cost
- and ending position (inclusive).
+ The algorithm here is based on the well-known dynamic programming approach to
+ finding a substring match. Here, it has been extended to track the beginning and
+ ending locations of that match. Specifically, [begin,end) of the returned substring_match
+ denotes the matched region in the base string.
*/
-cGenomeUtil::substring_match_list_type cGenomeUtil::FindSubstringMatches(const cGenome& base, const cGenome& substring) {
- substring_match_list_type ssml(base.GetSize());
+cGenomeUtil::substring_match cGenomeUtil::FindSubstringMatch(const cGenome& base, const cGenome& substring) {
const int rows=substring.GetSize()+1;
const int cols=base.GetSize()+1;
- int* c = new int[cols]; // current row
- int* p = new int[cols]; // previous row
- for (int j=0; j<cols; ++j) {
- c[j] = 0;
- p[j] = 0;
+ substring_match m[2][cols];
+ substring_match* c=m[0];
+ substring_match* p=m[1];
+
+ for(int j=1; j<cols; ++j) {
+ p[j].begin = j;
}
for(int i=1; i<rows; ++i) {
- c[0] = p[0]+1;
+ c[0].cost = i;
+ c[0].begin = 0;
for(int j=1; j<cols; ++j) {
- int l[3] = {p[j-1], p[j], c[j-1]};
- c[j] = *std::min_element(l,l+3) + (substring[i-1] != base[j-1]);
- ssml[j-1].cost = c[j];
- ssml[j-1].position = j-1;
+ substring_match l[3] = {p[j-1], p[j], c[j-1]};
+ substring_match* s = std::min_element(l,l+3);
+
+ c[j].begin = s->begin;
+ c[j].end = j;
+ c[j].cost = s->cost + (substring[i-1] != base[j-1]);
}
std::swap(c,p);
}
- return ssml;
+ return *std::min_element(p, p+cols);
}
-/*! Return the best match of substring within base.
-
- \todo Ties for the value of the best match should be broken randomly.
- */
-cGenomeUtil::substring_match cGenomeUtil::FindBestSubstringMatch(const cGenome& base, const cGenome& substring) {
- substring_match_list_type ssml = FindSubstringMatches(base, substring);
- return *std::min_element(ssml.begin(), ssml.end());
-}
-
-
cGenome cGenomeUtil::Crop(const cGenome & in_genome, int start, int end)
{
assert(end > start); // Must have a positive length clip!
Modified: development/source/main/cGenomeUtil.h
===================================================================
--- development/source/main/cGenomeUtil.h 2009-11-24 20:25:26 UTC (rev 3542)
+++ development/source/main/cGenomeUtil.h 2009-11-30 18:10:27 UTC (rev 3543)
@@ -60,30 +60,25 @@
/*! Substring match record.
- The intent behind a substring match record is that it specifies where (position) a match
- between two genomes was found, the length (extent) of that match in the base string,
- and the edit distance (cost) of that match.
+ The intent behind a substring match record is that it specifies where a match
+ between two genomes was found as well as the edit distance (cost) of that match.
- The position and extent are inclusive, and describe the matching region in the
- base string like so: [position-extent, position]
+ The begin, end members follow iterator semantics. I.e., the matched region in
+ the base string is [begin, end). (End points to the element immediately following
+ the match).
*/
struct substring_match {
//! Default constructor.
- substring_match() : position(0), //extent(0),
- cost(0) { }
- //! Operator< overload.
+ substring_match() : begin(0), end(0), cost(0) { }
+ //! Operator< overload to support std::min_element.
bool operator<(const substring_match& sm) { return cost < sm.cost; }
- int position; //!< Final position in the base string of this match.
- //int extent; //!< Length of the match in the base string.
+ int begin; //!< Beginning of the substring match.
+ int end; //!< Ending of the substring match.
int cost; //!< Cost (edit distance) of this match.
};
- //! Type for a list of substring matches.
- typedef std::vector<substring_match> substring_match_list_type;
- //! Return all matches of substring within base.
- static substring_match_list_type FindSubstringMatches(const cGenome& base, const cGenome& substring);
- //! Return the best match of substring within base.
- static substring_match FindBestSubstringMatch(const cGenome& base, const cGenome& substring);
+ //! Find (one of) the best substring matches of substring in base.
+ static substring_match FindSubstringMatch(const cGenome& base, const cGenome& substring);
// ===== Construction methods =====
static cGenome Crop(const cGenome& genome, int start, int end);
Modified: development/source/main/cPopulationInterface.cc
===================================================================
--- development/source/main/cPopulationInterface.cc 2009-11-24 20:25:26 UTC (rev 3542)
+++ development/source/main/cPopulationInterface.cc 2009-11-30 18:10:27 UTC (rev 3543)
@@ -658,29 +658,41 @@
}
// find the location within the offspring's genome that best matches the selected fragment:
- cGenomeUtil::substring_match ssm = cGenomeUtil::FindBestSubstringMatch(circ, *f);
-
- if(ssm.position > offspring.GetSize()) {
- // we matched on the circular portion of the genome, so we have to modify the
- // matched position to reflect this;
- ssm.position -= offspring.GetSize();
- }
-
+ cGenomeUtil::substring_match ssm = cGenomeUtil::FindSubstringMatch(circ, *f);
+
+ // did we match any part of the circular portion of the genome? if so, adjust
+ // begin & end to suit.
+ if(ssm.begin > offspring.GetSize()) { ssm.begin -= offspring.GetSize(); }
+ if(ssm.end > offspring.GetSize()) { ssm.end -= offspring.GetSize(); }
+
// there are (currently) two supported types of HGT mutations: insertions & replacements.
// which one are we doing?
if(ctx.GetRandom().P(m_world->GetConfig().HGT_INSERTION_MUT_P.Get())) {
// insertion: insert the fragment just after the final location of the match:
- offspring.Insert(ssm.position+1, *f);
+ offspring.Insert(ssm.end, *f);
} else {
- // replacement: replace up to fragment size instructions in the genome.
- // note that replacement can wrap around from front->back. replacement counts
- // forward, so we have to find the start position first.
- int start = ssm.position - f->GetSize() + 1;
- //int start = ssm.position - ssm.extent + 1; // this isn't turned on yet.
- if(start < 0) { start += offspring.GetSize(); }
- for(int i=0; i<f->GetSize(); ++i) {
- offspring[start] = (*f)[i];
- if(++start >= offspring.GetSize()) { start = 0; }
+ // replacement: replace [begin,end) instructions in the genome with the fragment.
+ if(ssm.begin <= ssm.end) {
+ // match didn't wrap around the end, things are easy:
+ offspring.Replace(ssm.begin, std::max(ssm.end-ssm.begin, 1), *f);
+ } else {
+ // match wrapped around the end. two different replacements to do now:
+ // [ssm.begin, offspring.end) and [0, ssm.end).
+
+ // first, replace the [ssm.begin, offspring.end) region; we'll try to
+ // preserve the size of this region:
+ int tail_size = std::min(offspring.GetSize()-ssm.begin, f->GetSize());
+ cGenome tail(&(*f)[0], &(*f)[0]+tail_size);
+ offspring.Replace(ssm.begin, tail_size, tail);
+
+ // now, replace the [0, ssm.end) region or remove it if the whole fragment
+ // was already copied in:
+ if(tail_size != f->GetSize()) {
+ cGenome head(&(*f)[0]+tail_size, &(*f)[0]+f->GetSize());
+ offspring.Replace(0, ssm.end, head);
+ } else {
+ offspring.Remove(0, ssm.end);
+ }
}
}
More information about the Avida-cvs
mailing list