[Avida-SVN] r3464 - in development/source: cpu main

dk at myxo.css.msu.edu dk at myxo.css.msu.edu
Sat Oct 10 11:41:39 PDT 2009


Author: dk
Date: 2009-10-10 14:41:38 -0400 (Sat, 10 Oct 2009)
New Revision: 3464

Modified:
   development/source/cpu/cHardwareBase.cc
   development/source/cpu/cHardwareCPU.cc
   development/source/cpu/cTestCPUInterface.h
   development/source/main/cAvidaConfig.h
   development/source/main/cGenomeUtil.cc
   development/source/main/cGenomeUtil.h
   development/source/main/cOrgInterface.h
   development/source/main/cOrganism.cc
   development/source/main/cOrganism.h
   development/source/main/cPopulation.cc
   development/source/main/cPopulationInterface.cc
   development/source/main/cPopulationInterface.h
Log:
Updates to HGT code, including substring matching for genomes.

Modified: development/source/cpu/cHardwareBase.cc
===================================================================
--- development/source/cpu/cHardwareBase.cc	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/cpu/cHardwareBase.cc	2009-10-10 18:41:38 UTC (rev 3464)
@@ -238,8 +238,15 @@
                                                   m_organism->GetDivSlipProb() / mut_multiplier);
     for (int i = 0; i < num_mut; i++) doSlipMutation(ctx, offspring_genome);
   }
+	
+	// HGT Mutations - NOT COUNTED
+	// These are location-dependent, hence they are implemented in cPopulationInterface.
+	if(m_world->GetConfig().ENABLE_HGT.Get()
+		 && (m_world->GetConfig().HGT_MUTATION_P.Get() > 0.0)
+		 && (ctx.GetRandom().P(m_world->GetConfig().HGT_MUTATION_P.Get()))) {
+		m_organism->GetOrgInterface().DoHGTMutation(ctx, offspring_genome);
+	}
   
-  
   // Divide Mutations
   if (m_organism->TestDivideMut(ctx) && totalMutations < maxmut) {
     const unsigned int mut_line = ctx.GetRandom().GetUInt(offspring_genome.GetSize());

Modified: development/source/cpu/cHardwareCPU.cc
===================================================================
--- development/source/cpu/cHardwareCPU.cc	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/cpu/cHardwareCPU.cc	2009-10-10 18:41:38 UTC (rev 3464)
@@ -6194,7 +6194,6 @@
     } else 
       doSlipMutation(ctx, m_memory, write_head.GetPosition());
   }
-	m_organism->AttemptHGTInsertion(ctx);
 
   read_head.Advance();
   write_head.Advance();

Modified: development/source/cpu/cTestCPUInterface.h
===================================================================
--- development/source/cpu/cTestCPUInterface.h	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/cpu/cTestCPUInterface.h	2009-10-10 18:41:38 UTC (rev 3464)
@@ -112,6 +112,9 @@
 	void CreateLinkByXY(int x, int y, double weight=1.0) { }
 	//! Link this organism's cell to the cell with index idx.
 	void CreateLinkByIndex(int idx, double weight=1.0) { }
+	
+	//! HGT mutation (does nothing).
+	void DoHGTMutation(cAvidaContext& ctx, cGenome& offspring) { }
 };
 
 

Modified: development/source/main/cAvidaConfig.h
===================================================================
--- development/source/main/cAvidaConfig.h	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cAvidaConfig.h	2009-10-10 18:41:38 UTC (rev 3464)
@@ -660,16 +660,16 @@
 	CONFIG_ADD_VAR(DEME_NETWORK_REQUIRES_CONNECTEDNESS, int, 1, "Whether the deme's network must be connected before an actual fitness is calculated.");
 	CONFIG_ADD_VAR(DEME_NETWORK_TOPOLOGY_FITNESS, int, 0, "Network measure used to determine fitness; see cDemeTopologyNetwork.h.");
 	
-	// -------- Horizontal gene transfer config options --------
+	// -------- Horizontal Gene Transfer (HGT) config options --------
 	CONFIG_ADD_GROUP(HGT_GROUP, "Horizontal gene transfer settings");
 	CONFIG_ADD_VAR(ENABLE_HGT, int, 0, "Whether HGT is enabled; 0=false (default),\n 1=true.");
 	CONFIG_ADD_VAR(HGT_FRAGMENT_SIZE_MEAN, double, 10, "Mean size of fragments (drawn from a normal\ndist., default=10).");
 	CONFIG_ADD_VAR(HGT_FRAGMENT_SIZE_VARIANCE, double, 2, "Variance of fragments (drawn from a normal\ndist., default=2).");
 	CONFIG_ADD_VAR(HGT_MAX_FRAGMENTS_PER_CELL, int, 100, "Max. allowed number of fragments\nper cell (default=100).");
 	CONFIG_ADD_VAR(HGT_DIFFUSION_METHOD, int, 0, "Method to use for diffusion of genome\nfragments (0=none [default]).");
-	CONFIG_ADD_VAR(HGT_INSERTION_PROB, double, 0.0, "Probability that a genome fragment\nwill be inserted during a copy (default=0.0).");
-	CONFIG_ADD_VAR(HGT_LOOKAHEAD_LENGTH, int, 4, "Number of instructions forward from the\nread head that will be used to calculate\nthe liklihood of HGT.");	
-  
+	CONFIG_ADD_VAR(HGT_MUTATION_P, double, 0.0, "Probability that an HGT mutation will occur on divide (default=0.0).");
+	CONFIG_ADD_VAR(HGT_INSERTION_MUT_P, double, 0.5, "Probability that an HGT mutation will result in an insertion (default=0.5); replacement if false.");
+
   CONFIG_ADD_GROUP(INST_RES_GROUP, "Resource-Dependent Instructions Settings");
   CONFIG_ADD_VAR(INST_RES, cString, "", "Resource upon which the execution of certain instruction depends");
   CONFIG_ADD_VAR(INST_RES_FLOOR, double, 0.0, "Assumed lower level of resource in environment.  Used for probability dist.");

Modified: development/source/main/cGenomeUtil.cc
===================================================================
--- development/source/main/cGenomeUtil.cc	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cGenomeUtil.cc	2009-10-10 18:41:38 UTC (rev 3464)
@@ -30,6 +30,8 @@
 #include "cInitFile.h"
 #include "cInstSet.h"
 #include "functions.h"
+#include <algorithm>
+#include <strings.h>
 
 
 using namespace std;
@@ -242,6 +244,84 @@
 }
 
 
+/*! Return all matches of substring within base.
+ 
+ The return value here is somewhat incomplete.  Eventually, the idea is that the
+ list of matches include the starting position of the match, the overall length (extent)
+ of the match, and the cost of the match.  Right now, however, it only includes the cost.
+ 
+ \todo Convert this over to using two rows instead of len(substring)+1 rows (easy, but I'm being lazy).
+ */
+cGenomeUtil::substring_match_list_type cGenomeUtil::FindSubstringMatches(const cGenome& base, const cGenome& substring) {
+	substring_match_list_type ssml(base.GetSize());
+	const int rows=substring.GetSize()+1;
+	const int cols=base.GetSize()+1;
+	int costmat[rows][cols];	
+	bzero(costmat, sizeof(int)*rows*cols);
+	
+	// initialize the first row, first column, and match list starts.
+	for(int i=0; i<rows; ++i) { costmat[i][0] = i; }
+	for(int j=0; j<cols; ++j) { costmat[0][j] = 0; }
+	//	for(int j=0; j<base.GetSize(); ++j) { ssml[j].start = j; ssml[j].extent = 0; }
+		
+	// now do all the rest...
+	for(int i=1; i<rows; ++i) {
+		for(int j=1; j<cols; ++j) {
+			// we're only doing cost at the moment, so we can get away with something simple (i know, i know; space & time, etc., etc.):
+			int l[3] = { costmat[i-1][j-1], costmat[i-1][j], costmat[i][j-1] };
+			costmat[i][j] = *std::min_element(l,l+3) + (substring[i-1] == base[j-1]);
+// if we need to know the relationship among all three elements for tracking extents,
+// then this may be a good starting point:
+// A=above left, B=above, C=left
+//			if(costmat[i-1][j-1] < costmat[i-1][j]) {
+//				// A < B
+//				if(costmat[i-1][j-1] < costmat[i][j-1]) {
+//					// A < C --> A < (B,C)
+//					costmat[i][j] = costmat[i-1][j-1];
+//					++ssml[j-1].extent;
+//				} else {
+//					// A >= C --> C <= (A,B)
+//					costmat[i][j] = costmat[i][j-1];
+//					++ssml[j-1].extent;
+//				}
+//			} else {
+//				// A >= B
+//				if(costmat[i-1][j] < costmat[i][j-1]) {
+//					// B < C --> B <= (A,C)
+//					costmat[i][j] = costmat[i-1][j];
+//					// extent unchanged
+//				} else {
+//					// B >= C --> C <= (A,B)
+//					costmat[i][j] = costmat[i][j-1];
+//					++ssml[j-1].extent;
+//				}
+//			}
+//			
+//			// add the cost of a mismatch:
+//			costmat[i][j] += (substring[j] == base[i]);
+		}
+	}
+	
+	// copy match costs from the last row, skipping the null column (it can't ever be least):
+	for(int j=0; j<base.GetSize(); ++j) {
+		ssml[j].cost = costmat[rows-1][j+1];
+		ssml[j].position = j;
+	}
+	
+	return ssml;
+}
+
+
+/*! Return the best match of substring within base.
+ 
+ \todo Ties for the value of the best match should be broken randomly.
+ */
+cGenomeUtil::substring_match cGenomeUtil::FindBestSubstringMatch(const cGenome& base, const cGenome& substring) {
+	substring_match_list_type ssml = FindSubstringMatches(base, substring);
+	return *std::min_element(ssml.begin(), ssml.end());
+}
+
+
 cGenome cGenomeUtil::Crop(const cGenome & in_genome, int start, int end)
 {
   assert(end > start);                // Must have a positive length clip!

Modified: development/source/main/cGenomeUtil.h
===================================================================
--- development/source/main/cGenomeUtil.h	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cGenomeUtil.h	2009-10-10 18:41:38 UTC (rev 3464)
@@ -29,6 +29,7 @@
 #ifndef cGenome_h
 #include "cGenome.h"
 #endif
+#include <vector>
 
 class cAvidaContext;
 class cInstruction;
@@ -57,6 +58,24 @@
   static int FindSlidingDistance(const cGenome& gen1, const cGenome& gen2);
   static int FindEditDistance(const cGenome& gen1, const cGenome& gen2);
 
+	//! Substring match record.
+	struct substring_match {
+		//! Default constructor.
+		substring_match() : position(0), extent(0), cost(0) { }
+		//! Operator< overload.
+		bool operator<(const substring_match& sm) { return cost < sm.cost; }
+		int position; //!< Final position in the base string of this match.
+		int extent; //!< Length of the match in the base string.
+		int cost; //!< Cost (edit distance) of this match.
+	};
+	
+	//! Type for a list of substring matches.
+	typedef std::vector<substring_match> substring_match_list_type;
+	//! Return all matches of substring within base.
+	static substring_match_list_type FindSubstringMatches(const cGenome& base, const cGenome& substring);
+	//! Return the best match of substring within base.
+	static substring_match FindBestSubstringMatch(const cGenome& base, const cGenome& substring);
+
   // ===== Construction methods =====
   static cGenome Crop(const cGenome& genome, int start, int end);
   static cGenome Cut(const cGenome& genome, int start, int end);

Modified: development/source/main/cOrgInterface.h
===================================================================
--- development/source/main/cOrgInterface.h	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cOrgInterface.h	2009-10-10 18:41:38 UTC (rev 3464)
@@ -117,6 +117,7 @@
 	virtual void CreateLinkByFacing(double weight=1.0) = 0;
 	virtual void CreateLinkByXY(int x, int y, double weight=1.0) = 0;
 	virtual void CreateLinkByIndex(int idx, double weight=1.0) = 0;
+	virtual void DoHGTMutation(cAvidaContext& ctx, cGenome& offspring) = 0;
 };
 
 #endif

Modified: development/source/main/cOrganism.cc
===================================================================
--- development/source/main/cOrganism.cc	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cOrganism.cc	2009-10-10 18:41:38 UTC (rev 3464)
@@ -1231,48 +1231,3 @@
 	return val;
 	
 }
-
-/*! Tests for and attempts to perform an insertion of an HGT genome fragment 
- into this organism's genome.  Returns true if a genome fragment was inserted, 
- false otherwise.
- */
-bool cOrganism::AttemptHGTInsertion(cAvidaContext& ctx) {
-	if(!m_world->GetConfig().ENABLE_HGT.Get()
-		 || (m_world->GetConfig().HGT_INSERTION_PROB.Get()==0.0)) {
-		return false;
-	}
-	
-	int cellid = GetCellID();
-	if(cellid == -1) { return false; } // test cpu, nothing to do.
-	cPopulationCell& cell = m_world->GetPopulation().GetCell(cellid);
-	
-	// need to calculate the probability of inserting any of the fragments that live in
-	// this organism's cell.  this P() is partially based on the instructions near the
-	// read head, so let's grab that fragment:
-	cGenome nearby = m_hardware->GetGenomeFragment(m_world->GetConfig().HGT_LOOKAHEAD_LENGTH.Get());
-
-	// get the list of genome fragments currently in this cell:
-	typedef cPopulationCell::fragment_list_type fragment_list_type;
-	fragment_list_type& fragments = cell.GetFragments();
-	
-	// now, the probability of selecting a fragment for insertion is a combination
-	// of how long it's been in contact with this organism, a background rate, and
-	// the similarity of this fragment to the nearby fragment.  the oldest genomes
-	// have the greatest chance of being inserted, because they've gone through this
-	// loop multiple times.
-	//
-	// when a fragment is inserted, we also need to adjust the amount of hgt resource
-	// present in the population, and do some stat tracking.
-	for(fragment_list_type::iterator i=fragments.begin(); i!=fragments.end(); ++i) {
-		int d = std::max(cGenomeUtil::FindHammingDistance(nearby, *i), 1);
-		if(m_world->GetRandom().P(m_world->GetConfig().HGT_INSERTION_PROB.Get() / d)) {
-			m_hardware->InsertGenomeFragment(*i);
-			m_world->GetPopulation().AdjustHGTResource(-i->GetSize());
-			m_world->GetStats().GenomeFragmentInserted(this, *i);
-			fragments.erase(i);
-			return true;
-		}
-	}
-	
-	return false;
-}

Modified: development/source/main/cOrganism.h
===================================================================
--- development/source/main/cOrganism.h	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cOrganism.h	2009-10-10 18:41:38 UTC (rev 3464)
@@ -646,11 +646,6 @@
   
   /*! The main DoOutput function.  The DoOutputs above all forward to this function. */
   void doOutput(cAvidaContext& ctx, tBuffer<int>& input_buffer, tBuffer<int>& output_buffer, const bool on_divide);
-	
-	// -------- HGT methods --------
-public:
-	//! Tests for and attempts to perform an insertion of an HGT genome fragment into this organism's genome.
-	bool AttemptHGTInsertion(cAvidaContext& ctx);
 };
 
 

Modified: development/source/main/cPopulation.cc
===================================================================
--- development/source/main/cPopulation.cc	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cPopulation.cc	2009-10-10 18:41:38 UTC (rev 3464)
@@ -294,7 +294,7 @@
 	
 	// if HGT is on, make sure there's a resource for it:
 	if(m_world->GetConfig().ENABLE_HGT.Get() && (m_hgt_resid == -1)) {
-		m_world->GetDriver().RaiseFatalException(-1, "HGT is enabled, but no HGT resource is defined; add hgt=1 to a single resource in the environment file.");
+		m_world->GetDriver().NotifyWarning("HGT is enabled, but no HGT resource is defined; add hgt=1 to a single resource in the environment file.");
 	}
   
 }
@@ -5893,7 +5893,9 @@
 /*!	Modify current level of the HGT resource.
  */
 void cPopulation::AdjustHGTResource(double delta) {
-	resource_count.Modify(m_hgt_resid, delta);
+	if(m_hgt_resid != -1) {
+		resource_count.Modify(m_hgt_resid, delta);
+	}
 }
 
 /*! Mix all organisms in the population.

Modified: development/source/main/cPopulationInterface.cc
===================================================================
--- development/source/main/cPopulationInterface.cc	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cPopulationInterface.cc	2009-10-10 18:41:38 UTC (rev 3464)
@@ -28,6 +28,7 @@
 #include "cDeme.h"
 #include "cEnvironment.h"
 #include "cGenotype.h"
+#include "cGenomeUtil.h"
 #include "cHardwareManager.h"
 #include "cOrganism.h"
 #include "cOrgSinkMessage.h"
@@ -38,6 +39,8 @@
 #include "cTestCPU.h"
 
 #include <cassert>
+#include <algorithm>
+#include <iterator>
 
 #ifndef NULL
 #define NULL 0
@@ -620,3 +623,67 @@
 	cPopulationCell& that_cell = deme->GetCell(idx % deme->GetSize());
 	deme->GetNetwork().Connect(*this_cell, that_cell, weight);
 }
+
+
+/*! Perform an HGT mutation on this offspring. 
+ 
+ HGT mutations are location-dependent, hence they are implemented here as opposed to
+ the CPU or organism.
+ 
+ If this method is called, an HGT mutation of some kind is imminent.  All that's left
+ is to actually *do* the mutation.  We only do *one* HGT mutation each time this method
+ is called.
+ 
+ \todo HGT should prefer more similar and older fragments.
+ \todo Enable replacement mutations.
+ */
+void cPopulationInterface::DoHGTMutation(cAvidaContext& ctx, cGenome& offspring) {
+	// get this organism's cell:
+	cPopulationCell& cell = m_world->GetPopulation().GetCell(m_cell_id);
+	
+	// do we have any fragments available?
+	if(cell.CountGenomeFragments() == 0) { return; }
+	
+	// randomly select the genome fragment for HGT:
+	typedef cPopulationCell::fragment_list_type fragment_list_type;
+	fragment_list_type& fragments = cell.GetFragments();
+	fragment_list_type::iterator f=fragments.begin();
+	std::advance(f, ctx.GetRandom().GetInt(fragments.size()));
+	
+	// find the location within this organism's genome that best matches the selected fragment:
+	cGenomeUtil::substring_match ssm = cGenomeUtil::FindBestSubstringMatch(cell.GetOrganism()->GetGenome(), *f);
+	
+	// there are (currently) two supported types of HGT mutations: insertions & replacements.
+	// which one are we doing?
+	if(ctx.GetRandom().P(m_world->GetConfig().HGT_INSERTION_MUT_P.Get())) {
+		// insertion: insert the fragment at the final location of the match:
+		offspring.Insert(ssm.position, *f);
+	} else {
+		// replacement is more complicated... for now, disallow it.
+		assert(false);
+		//		offspring.Replace(std::max(ssm.position-f->GetSize(), 0), f->GetSize(), *f); // pos, number of sites to replace, genom
+	}
+	
+	// resource utilization, cleanup, and stats tracking:
+	m_world->GetPopulation().AdjustHGTResource(-f->GetSize());
+	m_world->GetStats().GenomeFragmentInserted(cell.GetOrganism(), *f);
+	fragments.erase(f);
+	
+	// old code from a previous version of hgt, kept around for reference:
+	//	// calculate the best match for each fragment:
+	//	cGenomeUtil::substring_match_list_type match_list;
+	//	for(fragment_list_type::iterator i=fragments.begin(); i!=fragments.end(); ++i) {
+	//		match_list.push_back(cGenomeUtil::FindBestSubstringMatch(cell.GetOrganism()->GetGenome(), *i));
+	//	}	
+	
+	//	for(fragment_list_type::iterator i=fragments.begin(); i!=fragments.end(); ++i) {
+	//		int d = std::max(cGenomeUtil::FindHammingDistance(nearby, *i), 1);
+	//		if(m_world->GetRandom().P(m_world->GetConfig().HGT_INSERTION_PROB.Get() / d)) {
+	//			m_hardware->InsertGenomeFragment(*i);
+	//			m_world->GetPopulation().AdjustHGTResource(-i->GetSize());
+	//			m_world->GetStats().GenomeFragmentInserted(this, *i);
+	//			fragments.erase(i);
+	//			return true;
+	//		}
+	//	}
+}

Modified: development/source/main/cPopulationInterface.h
===================================================================
--- development/source/main/cPopulationInterface.h	2009-10-09 19:45:03 UTC (rev 3463)
+++ development/source/main/cPopulationInterface.h	2009-10-10 18:41:38 UTC (rev 3464)
@@ -36,12 +36,13 @@
 #include "cWorldDriver.h"
 #endif
 
+class cAvidaContext;
 class cDeme;
+class cGenome;
 class cPopulation;
 class cPopulationCell;
 class cOrgMessage;
 
-
 class cPopulationInterface : public cOrgInterface
 {
 private:
@@ -137,6 +138,11 @@
 protected:
 	//! Internal-use method to consolidate message-sending code.
 	bool SendMessage(cOrgMessage& msg, cPopulationCell& rcell);
+	
+	// -------- HGT support --------
+public:
+	//! Perform an HGT mutation on this offspring.
+	void DoHGTMutation(cAvidaContext& ctx, cGenome& offspring);
 };
 
 




More information about the Avida-cvs mailing list