[Avida-SVN] r1406 - development/source/actions

matt at myxo.css.msu.edu matt at myxo.css.msu.edu
Sun Mar 18 18:37:00 PDT 2007


Author: matt
Date: 2007-03-18 21:37:00 -0400 (Sun, 18 Mar 2007)
New Revision: 1406

Modified:
   development/source/actions/PrintActions.cc
Log:
Added two additional actions; added an additional constructor to tArray to initialize them to a default value; added a static method to cInstruction to convert genomic symbols to integer values.

Modified: development/source/actions/PrintActions.cc
===================================================================
--- development/source/actions/PrintActions.cc	2007-03-16 02:58:31 UTC (rev 1405)
+++ development/source/actions/PrintActions.cc	2007-03-19 01:37:00 UTC (rev 1406)
@@ -26,6 +26,8 @@
 
 #include "cAction.h"
 #include "cActionLibrary.h"
+#include "cAnalyze.h"
+#include "cAnalyzeGenotype.h"
 #include "cClassificationManager.h"
 #include "cCPUTestInfo.h"
 #include "cEnvironment.h"
@@ -44,6 +46,9 @@
 #include "cStats.h"
 #include "cWorld.h"
 #include "cWorldDriver.h"
+#include "tVector.h"
+#include <cmath>
+#include <cerrno>
 
 
 #define STATS_OUT_FILE(METHOD, DEFAULT)                                                   /*  1 */ \
@@ -680,6 +685,265 @@
 
 
 /*
+ @MRR March 2007
+ This function prints out fitness data. The main point is that it
+ calculates the average fitness from info from the testCPU + the actual
+ merit of the organisms, and assigns zero fitness to those organisms
+ that will never reproduce.
+ 
+ The function also determines the maximum fitness genotype, and can
+ produce fitness histograms.
+ 
+ This version of the DetailedFitnessData prints the information as a log histogram.
+ 
+ Parameters:
+ filename   (cString)     Where the fitness histogram should be written.
+ fit_mode   (cString)     Either {Current, Actual, TestCPU}, where
+															   Current is the current value in the grid.  [Default]
+                                 Actual uses the current merit, but the true gestation time.
+                                 TestCPU determined.
+ hist_fmin  (double)      The minimum fitness value for the fitness histogram.  [Default: -3]
+ hist_fmax  (double)      The maximum fitness value for the fitness histogram.  [Default: 12]
+ hist_fstep (double)      The width of the individual bins in the histogram.    [Default: 0.5]
+ */
+class cActionPrintLogFitnessHistogram : public cAction
+{
+private:
+
+  double m_hist_fmin;
+  double m_hist_fstep;
+	double m_hist_fmax;
+	string m_mode;
+  cString m_filename;
+	
+public:
+		cActionPrintLogFitnessHistogram(cWorld* world, const cString& args)
+    : cAction(world, args)
+	{
+			cString largs(args);
+		  m_filename   = (largs.GetSize()) ? largs.PopWord()           : "fitness_log_hist.dat";
+			m_mode       = (largs.GetSize()) ? largs.PopWord().ToUpper() : "CURRENT";
+			m_hist_fmin  = (largs.GetSize()) ? largs.PopWord().AsDouble(): -3.0;
+			m_hist_fstep = (largs.GetSize()) ? largs.PopWord().AsDouble(): 0.5;
+			m_hist_fmax  = (largs.GetSize()) ? largs.PopWord().AsDouble(): 12;
+	}
+  
+  static const cString GetDescription() { return  "Parameters:\n\tfilename   (cString) [fitness_log_hist.dat]    Where the fitness histogram should be written.\n\tfit_mode   (cString)     Either {Current, Actual, TestCPU}, where\n\t\t Current is the current value in the grid.  [Default]\n\tActual uses the current merit, but the true gestation time.\n\tTestCPU determined.\n\thist_fmin  (double)      The minimum fitness value for the fitness histogram.  [Default: -3]\n\thist_fmax  (double)      The maximum fitness value for the fitness histogram.  [Default: 12]\n\thist_fstep (double)      The width of the individual bins in the histogram.    [Default: 0.5]\n\n";}
+  
+  void Process(cAvidaContext& ctx)
+  {
+		
+		//Verify input parameters
+		if ( (m_mode != "ACTUAL" && m_mode != "CURRENT" && m_mode != "TESTCPU") ||
+				m_hist_fmin > m_hist_fmax)
+		{
+			cerr << "cActionPrintFitnessHistogram: Please check arguments.  Abort.\n";
+			cerr << "Parameters: " << m_filename << ", " << m_mode << ", " << m_hist_fmin << ":" << m_hist_fstep << ":" << m_hist_fmax << endl;
+			return;
+		}
+		
+		//Gather data objects
+    cPopulation& pop        = m_world->GetPopulation();
+    const int    update     = m_world->GetStats().GetUpdate();
+    const double generation = m_world->GetStats().SumGeneration().Average();
+    
+    //Set up histogram; extra columns prepended (non-viable, < m_hist_fmin) and appended ( > f_hist_fmax)
+		//If the bin size is not a multiple of the step size, the last bin is expanded to make it a multiple.
+		//All bins are [min, max)
+    tArray<int> histogram;
+		int num_bins = static_cast<int>(ceil( (m_hist_fmax - m_hist_fmin) / m_hist_fstep)) + 3;
+		m_hist_fmax  = m_hist_fmin + (num_bins - 3) * m_hist_fstep;
+		histogram.Resize(num_bins, 0);
+		
+    
+    cTestCPU* testcpu = m_world->GetHardwareManager().CreateTestCPU();
+		
+    for (int i = 0; i < pop.GetSize(); i++)
+		{
+      if (pop.GetCell(i).IsOccupied() == false) continue;  //Skip unoccupied cells
+      
+      cOrganism* organism = pop.GetCell(i).GetOrganism();
+      cGenotype* genotype = organism->GetGenotype();
+      
+      cCPUTestInfo test_info;
+      testcpu->TestGenome(ctx, test_info, genotype->GetGenome());
+			
+      // We calculate the fitness based on the current merit,
+      // but with the true gestation time. Also, we set the fitness
+      // to zero if the creature is not viable.
+      const double fitness = (m_mode == "CURRENT") ? organism->GetPhenotype().GetFitness() :
+				                     (m_mode == "ACUTAL")  ? 
+																((test_info.IsViable()) ? 
+																 organism->GetPhenotype().GetMerit().CalcFitness(test_info.GetTestPhenotype().GetGestationTime()) : 0.0) :
+				                     test_info.GetColonyFitness();
+		
+			//Update the histogram
+			const double log_fitness = log10(fitness);
+			
+			int update_bin = (errno == ERANGE) ? 0 :       //If we have a log of zero, this will be true.
+				static_cast<int>((log_fitness - m_hist_fmin) / m_hist_fstep);
+			if (update_bin < 0)
+				update_bin = 1;
+			else if (update_bin >= num_bins - 3)
+				update_bin = num_bins - 1;
+			else
+				update_bin = update_bin + 2;
+		
+			histogram[update_bin]++;
+		}
+			
+		delete testcpu;
+    
+		//Output histogram
+		cDataFile& hdf = m_world->GetDataFile(m_filename);
+		hdf.Write(update, "Update");
+		hdf.Write(generation, "Generation");
+		
+		for (int k = 0; k < histogram.GetSize(); k++)
+		{ 
+			if (k == 0)
+				hdf.Write(histogram[k], "Inviable");
+			else if (k == 1)
+				hdf.Write(histogram[k], cString("< ") + cStringUtil::Convert(m_hist_fmin));
+			else if (k < histogram.GetSize() - 1)
+				hdf.Write(histogram[k], cString("(") + cStringUtil::Convert(m_hist_fmin+m_hist_fstep*(k-2)) 
+																+ cString(", ") + cStringUtil::Convert(m_hist_fmin+m_hist_fstep*(k-1)) +
+																+ cString("]"));
+			else
+				hdf.Write(histogram[k], cString("> ") + cStringUtil::Convert(m_hist_fmax));
+		}
+		hdf.Endl();
+	}
+};
+
+
+
+/*
+ @MRR March 2007
+ This function will take the initial genotype for each organism in the
+ population/batch, align them, and calculate the per-site entropy of the
+ aligned sequences.  Please note that there may be a variable number
+ of columns in each line if the runs are not fixed length.  The site
+ entropy will be measured in mers, normalized by the instruction set size.
+ This is a population/batch measure of entropy, not a mutation-selection balance
+ measure.  
+*/
+class cActionPrintGenomicSiteEntropy : public cAction
+{
+	private:
+		cString m_filename;
+		bool    m_use_gap;
+	
+	public:
+		cActionPrintGenomicSiteEntropy(cWorld* world, const cString& args) : cAction(world, args)
+		{
+			cString largs = args;
+			m_filename = (largs.GetSize()) ? largs.PopWord() : "GenomicSiteEntropy.dat";
+		}
+
+		static const cString GetDescription() { return "Arguments: [filename = \"GenomicSiteEntropyData\"] [use_gap = false]";}
+		
+		void Process(cAvidaContext& ctx)
+		{
+			
+			const int        update     = m_world->GetStats().GetUpdate();
+			const double     generation = m_world->GetStats().SumGeneration().Average();
+			const int        num_insts  = m_world->GetNumInstructions();
+			tArray<cString> aligned;  //This will hold all of our aligned sequences
+			
+			if (ctx.GetAnalyzeMode()) //We're in analyze mode, so process the current batch
+			{
+				cAnalyze& analyze = m_world->GetAnalyze();  
+				if (!analyze.GetCurrentBatch().IsAligned()) analyze.CommandAlign(""); //Let analyze take charge of aligning this batch
+				tListIterator<cAnalyzeGenotype> batch_it(m_world->GetAnalyze().GetCurrentBatch().List());
+				cAnalyzeGenotype* genotype = NULL;
+				while(genotype = batch_it.Next())
+				{
+					aligned.Push(genotype->GetAlignedSequence());
+				}
+			}
+			else //We're not in analyze mode, process the population
+			{
+				cPopulation& pop = m_world->GetPopulation();
+				for (int i = 0; i < pop.GetSize(); i++)
+				{
+					if (pop.GetCell(i).IsOccupied() == false) continue;  //Skip unoccupied cells
+					aligned.Push(pop.GetCell(i).GetOrganism()->GetGenome().AsString());
+				}
+				AlignStringArray(aligned);  //Align our population genomes
+			}
+			
+			//With all sequences aligned and stored, we can proceed to calculate per-site entropies
+			if (!aligned.GetSize())
+			{
+				m_world->GetDriver().NotifyComment("cActionPrintGenomicSiteEntropy: No sequences available.  Abort.");
+				return;
+			}
+			
+			const int gen_size = aligned[0].GetSize();
+			tArray<double> site_entropy(gen_size, 0.0);
+			tArray<int> inst_count( (m_use_gap) ? num_insts + 1 : num_insts, 0);  //Add an extra place if we're using gaps
+			for (int pos = 0; pos < gen_size; pos++)
+			{
+				inst_count.SetAll(0);  //Reset the counter for each aligned position
+				int total_count = 0;
+				for (int seq = 0; seq < aligned.GetSize(); seq++)
+				{
+					char ch = aligned[seq][pos];
+					if (ch == '_' && !m_use_gap) continue;                  //Skip gaps when applicable
+					else if (ch == '_') site_entropy[num_insts]++;          //Update gap count at end
+					else inst_count[ cInstruction::ConvertSymbol(ch) ]++;   //Update true instruction count
+					total_count++;
+				}
+				for (int c = 0; c < inst_count.GetSize(); c++)
+				{
+					double p = (inst_count[c] > 0) ? inst_count[c] / static_cast<double>(total_count) : 0.0;
+					site_entropy[pos] += (p > 0.0) ? - p * log(p) / log(inst_count.GetSize()) : 0.0;
+				}
+			}
+		}
+	
+	
+	private:
+		void AlignStringArray(tArray<cString>& unaligned)  //Taken from cAnalyze::CommandAnalyze
+		{
+			// Create an array of all the sequences we need to align.
+			const int num_sequences = unaligned.GetSize();
+			
+			// Move through each sequence an update it.
+			cString diff_info;
+			for (int i = 1; i < num_sequences; i++) {
+				// Track of the number of insertions and deletions to shift properly.
+				int num_ins = 0;
+				int num_del = 0;
+				
+				// Compare each string to the previous.
+				cStringUtil::EditDistance(unaligned[i], unaligned[i-1], diff_info, '_');
+				while (diff_info.GetSize() != 0) {
+					cString cur_mut = diff_info.Pop(',');
+					const char mut_type = cur_mut[0];
+					cur_mut.ClipFront(1); cur_mut.ClipEnd(1);
+					int position = cur_mut.AsInt();
+					if (mut_type == 'M') continue;   // Nothing to do with Mutations
+					
+					if (mut_type == 'I') {           // Handle insertions
+						for (int j = 0; j < i; j++)    // Loop back and insert an '_' into all previous sequences
+								unaligned[j].Insert('_', position + num_del);
+						num_ins++;
+					}
+					
+					else if (mut_type == 'D'){      // Handle Deletions
+						// Insert '_' into the current sequence at the point of deletions.
+						unaligned[i].Insert("_", position + num_ins);
+						num_del++;
+					}
+				}
+			}
+		}
+};
+
+
+/*
  This function goes through all genotypes currently present in the soup,
  and writes into an output file the average Hamming distance between the
  creatures in the population and a given reference genome.
@@ -1523,6 +1787,7 @@
   action_lib->Register<cActionPrintDominantGenotype>("PrintDominantGenotype");
   action_lib->Register<cActionPrintDominantParasiteGenotype>("PrintDominantParasiteGenotype");
   action_lib->Register<cActionPrintDetailedFitnessData>("PrintDetailedFitnessData");
+	action_lib->Register<cActionPrintLogFitnessHistogram>("PrintLogFitnessHistogram");
   action_lib->Register<cActionPrintGeneticDistanceData>("PrintGeneticDistanceData");
   action_lib->Register<cActionPrintPopulationDistanceData>("PrintPopulationDistanceData");
   action_lib->Register<cActionPrintDebug>("PrintDebug");
@@ -1534,6 +1799,8 @@
   action_lib->Register<cActionPrintViableTasksData>("PrintViableTasksData");
   action_lib->Register<cActionPrintTreeDepths>("PrintTreeDepths");
   
+	action_lib->Register<cActionPrintGenomicSiteEntropy>("PrintGenomicSiteEntropy");
+	
   // Grid Information Dumps
   action_lib->Register<cActionDumpMemory>("DumpMemory");
   action_lib->Register<cActionDumpFitnessGrid>("DumpFitnessGrid");




More information about the Avida-cvs mailing list