[Avida-SVN] r3078 - in development/source: analyze cpu tools

jbarrick at myxo.css.msu.edu jbarrick at myxo.css.msu.edu
Sat Dec 27 15:26:20 PST 2008


Author: jbarrick
Date: 2008-12-27 18:26:19 -0500 (Sat, 27 Dec 2008)
New Revision: 3078

Modified:
   development/source/analyze/cAnalyze.cc
   development/source/analyze/cAnalyze.h
   development/source/cpu/cCPUTestInfo.h
   development/source/cpu/cInstSet.cc
   development/source/cpu/cTestCPU.cc
   development/source/tools/cStringUtil.cc
   development/source/tools/cStringUtil.h
Log:
Analysis command SAMPLE_OFFSPRING fills the current batch with the requested number of offspring from each organism in the current batch. Offspring genomes are generated by running the genome with the same mutation rates specified for a run through the test CPU and collecting single-generation descendants.

Beginning to rewrite analysis ALIGN command so that it intelligently places gaps (not completed).



Modified: development/source/analyze/cAnalyze.cc
===================================================================
--- development/source/analyze/cAnalyze.cc	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/analyze/cAnalyze.cc	2008-12-27 23:26:19 UTC (rev 3078)
@@ -1694,7 +1694,94 @@
   }  
 }
 
+// JEB: Creates specified number of offspring by running
+// each organism in the test CPU with mutations on.
+void cAnalyze::SampleOffspring(cString cur_string)
+{
+  int number_to_sample = (cur_string.GetSize()) ? cur_string.PopWord().AsInt() : 1000;
+  
+  // These parameters copied from BatchRecalculate, they could change what kinds of offspring are produced!!
+  tArray<int> manual_inputs;  // Used only if manual inputs are specified  
+  cString msg;                // Holds any information we may want to send the driver to display
+  int use_resources      = (cur_string.GetSize()) ? cur_string.PopWord().AsInt() : 0;
+  int update             = (cur_string.GetSize()) ? cur_string.PopWord().AsInt() : -1;
+  bool use_random_inputs = (cur_string.GetSize()) ? cur_string.PopWord().AsInt() == 1: false;
+  bool use_manual_inputs = false;
+  
+  //Manual inputs will override random input request and must be the last arguments.
+  if (cur_string.CountNumWords() > 0){
+    if (cur_string.CountNumWords() == m_world->GetEnvironment().GetInputSize()){
+      manual_inputs.Resize(m_world->GetEnvironment().GetInputSize());
+      use_random_inputs = false;
+      use_manual_inputs = true;
+      for (int k = 0; cur_string.GetSize(); k++)
+        manual_inputs[k] = cur_string.PopWord().AsInt();
+    } else if (m_world->GetVerbosity() >= VERBOSE_ON){
+      msg.Set("Invalid number of environment inputs requested for recalculation: %d specified, %d required.", 
+              cur_string.CountNumWords(), m_world->GetEnvironment().GetInputSize());
+      m_world->GetDriver().NotifyWarning(msg);
+    }
+  }
+  
+  cCPUTestInfo test_info(1); //we only allow one generation of testing! v. important to get proper offspring
+  if (use_manual_inputs)
+    test_info.UseManualInputs(manual_inputs);
+  else
+    test_info.UseRandomInputs(use_random_inputs); 
+  test_info.SetResourceOptions(use_resources, m_resources, update, m_resource_time_spent_offset);
 
+  if (m_world->GetVerbosity() >= VERBOSE_ON) {
+    msg.Set("Sampling %d offspring from each of the %d organisms in batch %d...", number_to_sample, batch[cur_batch].GetSize(), cur_batch);
+    m_world->GetDriver().NotifyComment(msg);
+  } else{ 
+    msg.Set("Sampling offspring...");
+    m_world->GetDriver().NotifyComment(msg);
+  }
+  
+  // Load the mutation rates from the environment.
+  test_info.MutationRates().Copy(m_world->GetEnvironment().GetMutRates());
+  // Copy them into the organism  
+  tListPlus<cAnalyzeGenotype> offspring_list;
+  tListIterator<cAnalyzeGenotype> batch_it(batch[cur_batch].List());
+  cAnalyzeGenotype* parent_genotype = NULL;
+
+  cTestCPU * test_cpu = m_world->GetHardwareManager().CreateTestCPU();  
+  while ((parent_genotype = batch_it.Next())) {
+    
+    // We keep a hash with genome strings as keys
+    // to save duplication of the same offspring genotype.
+    // NumCPUs is incremented whenever an offspring is
+    // created more than once from the same parent.
+    tDictionary<cAnalyzeGenotype*> genome_hash;
+    
+    for (int i=0; i<number_to_sample; i++) {
+      test_cpu->TestGenome(m_world->GetDefaultContext(), test_info, parent_genotype->GetGenome());
+      cAnalyzeGenotype * offspring_genotype = NULL;
+      bool found = genome_hash.Find(test_info.GetTestOrganism(0)->ChildGenome().AsString(), offspring_genotype);
+      if (found) {
+        offspring_genotype->SetNumCPUs(offspring_genotype->GetNumCPUs() + 1);
+      }
+      else {
+        cAnalyzeGenotype * offspring_genotype = new cAnalyzeGenotype(m_world, test_info.GetTestOrganism(0)->ChildGenome(), inst_set);
+        offspring_genotype->SetID(parent_genotype->GetID());
+        offspring_genotype->SetNumCPUs(1);
+        offspring_list.Push(offspring_genotype);
+        genome_hash.Add(test_info.GetTestOrganism(0)->ChildGenome().AsString(), offspring_genotype);
+      }
+    }
+    batch_it.Remove();
+    delete parent_genotype;
+  }
+  delete test_cpu;
+    
+  // Fill back in the current batch with the new offspring
+  while (offspring_list.GetSize() > 0) {
+    batch[cur_batch].List().PushRear(offspring_list.Pop());
+  }
+
+}
+
+
 //////////////// Output Commands...
 
 void cAnalyze::CommandPrint(cString cur_string)
@@ -1858,6 +1945,9 @@
   if (m_world->GetVerbosity() >= VERBOSE_ON) cout << "Detailing batch " << cur_batch << endl;
   else cout << "Detailing..." << endl;
   
+  // @JEB return if there are no organisms in the current batch
+  if (batch[cur_batch].GetSize() == 0) return;
+  
   // Load in the variables...
   cString filename("detail.dat");
   if (cur_string.GetSize() != 0) filename = cur_string.PopWord();
@@ -2204,7 +2294,7 @@
   }
   
   delete cur_command;
-  }
+}
 
 
 
@@ -5338,11 +5428,28 @@
 // @JEB 9-24-2008
 void cAnalyze::CommandAnalyzeRedundancyByInstFailure(cString cur_string)
 {
+  cout << "Analyzing redundancy by changing instruction failure probability..." << endl;
+
   cString filename("analyze_redundancy_by_inst_failure.dat");
   if (cur_string.GetSize() != 0) filename = cur_string.PopWord();
   int replicates = 1000;
   if (cur_string.GetSize() != 0) replicates = cur_string.PopWord().AsInt();
-
+  double log10_start_pr_fail = -4;
+  
+  // add mode
+  int mode = 0; 
+  // 0 = average log2 fitness
+  // 1 = fitness decreased
+  
+  if (cur_string.GetSize() != 0) log10_start_pr_fail = cur_string.PopWord().AsDouble();
+  double log10_end_pr_fail = 0;
+  if (cur_string.GetSize() != 0) log10_end_pr_fail = cur_string.PopWord().AsDouble();
+  if (log10_end_pr_fail > 0) {
+    m_world->GetDriver().NotifyWarning("ANALYZE_REDUNDANCY_BY_INST_FAILURE: End log value greater than 0 set to 0.");
+  }
+  double log10_step_size_pr_fail = 0.1;
+  if (cur_string.GetSize() != 0) log10_step_size_pr_fail = cur_string.PopWord().AsDouble();
+  
   // Output is one line per organism in the current batch with columns.
   cDataFile & df = m_world->GetDataFile(filename);
   df.WriteComment( "Redundancy calculated by changing the probability of instruction failure" );
@@ -5361,17 +5468,27 @@
       cout << "  Determining redundancy by instruction failure for " << genotype->GetName() << endl;
     }
     
-     cInstSet modify_inst_set = genotype->GetInstructionSet();
+    cInstSet original_inst_set = genotype->GetInstructionSet();
+    cInstSet modify_inst_set = genotype->GetInstructionSet();
       
     // Modify the instruction set to include the current probability of failure.
+    int num_pr_fail_insts = 0;
     for (int j=0; j<modify_inst_set.GetSize(); j++)
     {
       cString inst_name = modify_inst_set.GetName(j);
       cInstruction inst = modify_inst_set.GetInst(inst_name);
+      if ( original_inst_set.GetProbFail(inst) > 0) {
+        num_pr_fail_insts++;
+      }
       modify_inst_set.SetProbFail(inst, 0);
     }
     genotype->SetInstructionSet(modify_inst_set);
   
+    // Avoid unintentional use with no instructions having a chance of failure
+    if (num_pr_fail_insts == 0) {
+      m_world->GetDriver().RaiseFatalException(1,"ANALYZE_REDUNDANCY_BY_INST_FAILURE: No instructions have a chance of failure in default instruction set.");
+    }
+  
     // Recalculate the baseline fitness
     // May need to calculate multiple times to check for stochastic behavior....
     genotype->Recalculate(m_ctx);
@@ -5385,29 +5502,39 @@
       df.Write(baseline_fitness, "fitness");
       
       // Run the organism the specified number of replicates
-      
-      for (double log10_fc=-4.0; log10_fc<=0.0; log10_fc+=0.1)
+      for (double log10_fc=log10_start_pr_fail; log10_fc<=log10_end_pr_fail; log10_fc+=log10_step_size_pr_fail)
       {
         double fc = exp(log10_fc*log(10.0));
         
         // Modify the instruction set to include the current probability of failure.
+        modify_inst_set = genotype->GetInstructionSet();
         for (int j=0; j<modify_inst_set.GetSize(); j++)
         {
           cString inst_name = modify_inst_set.GetName(j);
           cInstruction inst = modify_inst_set.GetInst(inst_name);
-          modify_inst_set.SetProbFail(inst, fc);
+          if ( original_inst_set.GetProbFail(inst) > 0) {
+            modify_inst_set.SetProbFail(inst, fc);
+          }
         }
         genotype->SetInstructionSet(modify_inst_set);
         
         // Recalculate the requested number of times
         double chance = 0;
+        double avg_fitness = 0;
         for (int i=0; i<replicates; i++)
         {
           genotype->Recalculate(m_ctx);
           if (genotype->GetFitness() < baseline_fitness) chance++;
+          avg_fitness += genotype->GetFitness();
         }      
-        s.Set("Inst prob fail %.3g", fc);
-        df.Write(chance/replicates, s);
+        
+        if (mode == 0) {
+          s.Set("Avg fitness when inst prob fail %.3g", fc);
+          df.Write(avg_fitness/replicates, s);
+        } else {
+          s.Set("Fraction of replicates with reduced fitness at inst prob fail %.3g", fc);
+          df.Write(chance/replicates, s);
+        }
       }
       df.Endl();
     }
@@ -6071,6 +6198,7 @@
 {
   // Align does not need any args yet.
   (void) cur_string;
+  int perform_slow_alignment = (cur_string.GetSize()) ? cur_string.PopWord().AsInt() : 0;
   
   cout << "Aligning sequences..." << endl;
   
@@ -6085,7 +6213,7 @@
   const int num_sequences = glist.GetSize();
   cString * sequences = new cString[num_sequences];
   
-  // Move through each sequence an update it.
+  // Move through each sequence and update it.
   batch_it.Reset();
   cString diff_info;
   for (int i = 0; i < num_sequences; i++) {
@@ -6096,7 +6224,29 @@
     int num_del = 0;
     
     // Compare each string to the previous.
-    cStringUtil::EditDistance(sequences[i], sequences[i-1], diff_info, '_');
+    if (perform_slow_alignment == 0) {
+      cStringUtil::EditDistance(sequences[i], sequences[i-1], diff_info, '_');
+    }
+    else
+    if (perform_slow_alignment == 1) {
+      cStringUtil::GapMinimizingEditDistance(sequences[i], sequences[i-1], diff_info, '_');
+    }
+    else {
+      cString best_diff_info;
+      int min_dist = -1;
+      for (int j=0; j<i; j++) {
+        cString test_diff_info;
+        int test_dist = cStringUtil::GapMinimizingEditDistance(sequences[i], sequences[j], test_diff_info, '_');
+        if (min_dist == -1 || test_dist < min_dist) {
+          min_dist = test_dist;
+          best_diff_info = test_diff_info;
+          if (min_dist == 0) j=i;
+        }
+      }
+      
+       diff_info = best_diff_info;
+    }
+    
     while (diff_info.GetSize() != 0) {
       cString cur_mut = diff_info.Pop(',');
       const char mut_type = cur_mut[0];
@@ -9053,7 +9203,7 @@
   AddLibraryDef("LOAD_RESOURCES", &cAnalyze::LoadResources);
   AddLibraryDef("LOAD", &cAnalyze::LoadFile);
   
-  // Reduction commands...
+  // Reduction and sampling commands...
   AddLibraryDef("FILTER", &cAnalyze::CommandFilter);
   AddLibraryDef("FIND_GENOTYPE", &cAnalyze::FindGenotype);
   AddLibraryDef("FIND_ORGANISM", &cAnalyze::FindOrganism);
@@ -9066,7 +9216,8 @@
   AddLibraryDef("KEEP_TOP", &cAnalyze::KeepTopGenotypes);
   AddLibraryDef("TRUNCATELINEAGE", &cAnalyze::TruncateLineage); // Depricate!
   AddLibraryDef("TRUNCATE_LINEAGE", &cAnalyze::TruncateLineage);
-  
+  AddLibraryDef("SAMPLE_OFFSPRING", &cAnalyze::SampleOffspring);
+
   // Direct output commands...
   AddLibraryDef("PRINT", &cAnalyze::CommandPrint);
   AddLibraryDef("TRACE", &cAnalyze::CommandTrace);

Modified: development/source/analyze/cAnalyze.h
===================================================================
--- development/source/analyze/cAnalyze.h	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/analyze/cAnalyze.h	2008-12-27 23:26:19 UTC (rev 3078)
@@ -220,7 +220,7 @@
   void LoadResources(cString cur_string);
   void LoadFile(cString cur_string);
   
-  // Reduction
+  // Reduction and Sampling
   void CommandFilter(cString cur_string);
   void FindGenotype(cString cur_string);
   void FindOrganism(cString cur_string);
@@ -232,8 +232,8 @@
   void SampleGenotypes(cString cur_string);
   void KeepTopGenotypes(cString cur_string);
   void TruncateLineage(cString cur_string);
+  void SampleOffspring(cString cur_string);
   
-  
   // Direct Output Commands...
   void CommandPrint(cString cur_string);
   void CommandTrace(cString cur_string);

Modified: development/source/cpu/cCPUTestInfo.h
===================================================================
--- development/source/cpu/cCPUTestInfo.h	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/cpu/cCPUTestInfo.h	2008-12-27 23:26:19 UTC (rev 3078)
@@ -29,6 +29,9 @@
 #ifndef nHardware_h
 #include "nHardware.h"
 #endif
+#ifndef cMutationRates_h
+#include "cMutationRates.h"
+#endif
 #ifndef cString_h
 #include "cString.h"
 #endif
@@ -65,6 +68,7 @@
 	tArray<int> manual_inputs;  //   if so, use these.
   cHardwareTracer* m_tracer;
   cInstSet* m_inst_set;
+  cMutationRates m_mut_rates;
   
   int m_cur_sg;
 
@@ -104,8 +108,8 @@
     { m_res_method = (eTestCPUResourceMethod)res_method; m_res = res; m_res_update = update; m_res_cpu_cycle_offset = cpu_cycle_offset; }
   
   void SetCurrentStateGridID(int sg) { m_cur_sg = sg; }
+  cMutationRates& MutationRates() { return m_mut_rates; }
 
-
   // Input Accessors
   int GetGenerationTests() const { return generation_tests; }
   bool GetTraceTaskOrder() const { return trace_task_order; }

Modified: development/source/cpu/cInstSet.cc
===================================================================
--- development/source/cpu/cInstSet.cc	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/cpu/cInstSet.cc	2008-12-27 23:26:19 UTC (rev 3078)
@@ -378,8 +378,9 @@
 
     if ((*m_inst_lib)[fun_id].IsNop()) {
       // Assert nops are at the _beginning_ of an inst_set.
-      assert(m_lib_name_map.GetSize() == (m_lib_nopmod_map.GetSize() + 1));
-      
+      if (m_lib_name_map.GetSize() != (m_lib_nopmod_map.GetSize() + 1)) {
+        m_world->GetDriver().RaiseFatalException(1, "No operation instructions (nop-A, nop-B, ...) must be the first instructions in an instruction set file.");
+      }
       m_lib_nopmod_map.Resize(inst_id + 1);
       m_lib_nopmod_map[inst_id] = fun_id;
     }

Modified: development/source/cpu/cTestCPU.cc
===================================================================
--- development/source/cpu/cTestCPU.cc	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/cpu/cTestCPU.cc	2008-12-27 23:26:19 UTC (rev 3078)
@@ -286,6 +286,10 @@
   
   if (test_info.GetInstSet()) organism = new cOrganism(m_world, ctx, genome, test_info.GetInstSet());
   else organism = new cOrganism(m_world, ctx, genome);
+  
+  // Copy the test mutation rates
+  organism->MutationRates().Copy(test_info.MutationRates());
+  
   test_info.org_array[cur_depth] = organism;
   organism->SetOrgInterface(ctx, new cTestCPUInterface(this, test_info));
   organism->GetPhenotype().SetupInject(genome);

Modified: development/source/tools/cStringUtil.cc
===================================================================
--- development/source/tools/cStringUtil.cc	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/tools/cStringUtil.cc	2008-12-27 23:26:19 UTC (rev 3078)
@@ -154,16 +154,15 @@
     for (int j = 1; j < size1; j++) {
       // If the values are equal, keep the value in the upper left.
       if (string1[j] == string2[i]) {
-	cur_row[j] = prev_row[j-1];
+        cur_row[j] = prev_row[j-1];
       }
 
       // Otherwise, set the current position the the minimal of the three
       // numbers to the upper right in the chart plus one.
       else {
-	cur_row[j] =
-	  (prev_row[j] < prev_row[j-1]) ? prev_row[j] : prev_row[j-1];
-	if (cur_row[j-1] < cur_row[j]) cur_row[j] = cur_row[j-1];
-	cur_row[j]++;
+        cur_row[j] = (prev_row[j] < prev_row[j-1]) ? prev_row[j] : prev_row[j-1];
+        if (cur_row[j-1] < cur_row[j]) cur_row[j] = cur_row[j-1];
+        cur_row[j]++;
       }
     }
 
@@ -186,6 +185,143 @@
 }
 
 
+int cStringUtil::GapMinimizingEditDistance(const cString & string1, const cString & string2,
+			      cString & info, const char gap)
+{
+  const int size1 = string1.GetSize();
+  const int size2 = string2.GetSize();
+ 
+  if (!size1) return size2;
+  if (!size2) return size1;
+
+  // Note: For both matrices, the first and last cols/rows 
+  // are for alignments to an end gap.
+  tMatrix<double> dist_matrix(size2+2, size1+2);
+
+  // Keep track of changes in a mut_matrix.
+  //  N=None, M=Mutations, I=Insertion, D=Deletion
+  tMatrix<char> mut_matrix(size2+2, size1+2);
+
+  // Initialize the last row and col to record the difference from nothing.
+  for (int i = 0; i < size1+1; i++) {
+    dist_matrix(size2+1,i) = (double) (size1+1 - i);
+    mut_matrix(size2+1,i) = 'I';
+  }
+  for (int i = 0; i < size2+1; i++) {
+    dist_matrix(i,size1+1) = (double) (size2+1 - i);
+    mut_matrix(i,size1+1) = 'D';
+  }
+  mut_matrix(size2+1,size1+1) = 'N';
+  dist_matrix(size2+1,size1+1) = (double) 0;
+  
+  for (int i = size2; i >= 0; i--) {
+    // Move down the cur_row and fill it out.
+    for (int j = size1; j >= 0; j--) {
+      // Otherwise, set the current position to the minimal of the three
+      // numbers above (insertion), to the left (deletion), or upper left
+      // (mutation/match) plus one if the aligned positions do not match.
+      
+      char c1 = (j-1 >= 0) ? string1[j-1] : '\0';
+      char c2 = (i-1 >= 0) ? string2[i-1] : '\0';
+
+      bool matched = (c1 == c2);
+      if (j-1 < 0 && i-1 <= 0) matched = 0;
+      double match_dist = dist_matrix(i+1,j+1) + ((matched) ? 0 : 1);
+      
+      double ins_dist = dist_matrix(i+1,j) + 1;
+      if (mut_matrix(i+1,j) != 'D') ins_dist += 0.0001;
+      double del_dist = dist_matrix(i,j+1) + 1;
+      if (mut_matrix(i,j+1) != 'I') del_dist += 0.0001;
+
+      if (ins_dist <= del_dist && ins_dist <= match_dist) {     // Insertion!
+        dist_matrix(i,j) = ins_dist;
+        mut_matrix(i,j) = 'D';
+      } else if (match_dist <= del_dist) {                       // Mutation!
+        dist_matrix(i,j) = match_dist;
+        mut_matrix(i,j) = (matched) ? 'N' : 'M';        
+
+      } else {                     // Deletion!
+        dist_matrix(i,j) = del_dist;
+        mut_matrix(i,j) = 'I';
+      }
+    }
+  }
+  mut_matrix(size2+1,size1+1) = 'N';
+  dist_matrix(size2+1,size1+1) = (double) 0;
+  
+  /* For debugging, print out matrices
+  for (int i = 0; i <= size2+1; i++) {
+    for (int j = 0; j <=size1+1; j++) {
+      cout << dist_matrix(i,j) << " ";
+    }
+    cout << endl;
+  }
+
+  for (int i = 0; i <= size2+1; i++) {
+    for (int j = 0; j <=size1+1; j++) {
+      cout << mut_matrix(i,j) << " ";
+    }
+  cout << endl;
+  }
+  */
+
+  // Construct the list of changes
+  int pos1 = 0;
+  int pos2 = 0;
+  info = "";
+
+  // We don't start at 0,0 (which would be an initial gap to an initial gap alignment)
+  // Choose whether there are initial gaps.
+  cString mut_string;
+  bool first_time = true;
+  while (pos1 < size1+1 || pos2 < size2+1) {
+    if (mut_matrix(pos2, pos1) == 'N') {     
+      pos1++; pos2++;
+      continue;
+    }
+
+    // There is a mutation here; determine the type...
+    const char old_char = (pos2-1 < size2 && pos2>0) ? string2[pos2-1] : '\0';
+    const char new_char = (pos1-1 < size1 && pos1>0) ? string1[pos1-1] : '\0';
+
+    if (mut_matrix(pos2, pos1) == 'M') {
+      mut_string.Set("M%d%c%c", pos2-1, old_char, new_char);
+      
+      // "Mutations" in the initial column are really alignments
+      // with gaps so we must substitute insertions or deletions.
+      if (pos1==0) {
+          mut_string.Set("D%d%c", pos1, new_char);
+      }
+      if (pos2==0) {
+            mut_string.Set("I%d%c", pos2, new_char);
+      }
+        pos1++; pos2++;
+    }
+    else if (mut_matrix(pos2, pos1) == 'D') {
+      mut_string.Set("D%d%c", pos2-1, old_char);
+      pos2++;
+    }
+    else { // if (mut_matrix(pos2+1, pos1+1) == 'I') {
+      mut_string.Set("I%d%c", pos1-1, new_char);
+      pos1++;
+    }
+  
+    if (first_time) {
+      first_time = false;
+      continue;
+    }
+    
+    /* For debugging, print out mutations found
+    cout << mut_string << endl;
+    */
+    if (info.GetSize() > 0) mut_string.Insert(",");
+    info += mut_string;
+  } 
+
+  // Now that we are done, return the bottom-right corner of the chart.
+  return (int) dist_matrix(0, 0);
+}
+
 int cStringUtil::EditDistance(const cString & string1, const cString & string2,
 			      cString & info, const char gap)
 {
@@ -217,12 +353,12 @@
     for (int j = 0; j < size1; j++) {
       // If the values are equal, keep the value in the upper left.
       if (string1[j] == string2[i]) {
-	dist_matrix(i+1,j+1) = dist_matrix(i,j);
-	mut_matrix(i+1,j+1) = 'N';
-	continue; // Move on to next entry...
+        dist_matrix(i+1,j+1) = dist_matrix(i,j);
+        mut_matrix(i+1,j+1) = 'N';
+        continue; // Move on to next entry...
       }
 
-      // Otherwise, set the current position the the minimal of the three
+      // Otherwise, set the current position to the minimal of the three
       // numbers above (insertion), to the left (deletion), or upper left
       // (mutation) in the chart, plus one.
       double mut_dist = dist_matrix(i,j) + 1;
@@ -231,14 +367,14 @@
       const double del_dist = dist_matrix(i,j+1) + (string2[i] != gap);
 
       if (mut_dist < ins_dist && mut_dist < del_dist) {  // Mutation!
-	dist_matrix(i+1,j+1) = mut_dist;
-	mut_matrix(i+1,j+1) = 'M';
+        dist_matrix(i+1,j+1) = mut_dist;
+        mut_matrix(i+1,j+1) = 'M';
       } else if (ins_dist < del_dist) {                  // Insertion!
-	dist_matrix(i+1,j+1) = ins_dist;
-	mut_matrix(i+1,j+1) = 'I';
+        dist_matrix(i+1,j+1) = ins_dist;
+        mut_matrix(i+1,j+1) = 'I';
       } else {                                           // Deletion!
-	dist_matrix(i+1,j+1) = del_dist;
-	mut_matrix(i+1,j+1) = 'D';
+        dist_matrix(i+1,j+1) = del_dist;
+        mut_matrix(i+1,j+1) = 'D';
       }
     }
   }
@@ -280,6 +416,8 @@
   return (int) dist_matrix(size2, size1);
 }
 
+
+
 const cString & cStringUtil::Convert(const cString& in_string,
 				     const cString& out_string)
 {

Modified: development/source/tools/cStringUtil.h
===================================================================
--- development/source/tools/cStringUtil.h	2008-12-23 20:43:12 UTC (rev 3077)
+++ development/source/tools/cStringUtil.h	2008-12-27 23:26:19 UTC (rev 3078)
@@ -65,6 +65,7 @@
    **/
   static int EditDistance(const cString& string1, const cString& string2);
   static int EditDistance(const cString& string1, const cString& string2, cString& info, const char gap = ' '); 
+  static int GapMinimizingEditDistance(const cString& string1, const cString& string2, cString& info, const char gap = ' '); 
 
   /**
    * Various, overloaded conversion functions for use in templates.  Note




More information about the Avida-cvs mailing list