[Avida-cvs] [Avida2-svn] r284 - trunk/source/main
huangw10@myxo.css.msu.edu
huangw10 at myxo.css.msu.edu
Sun Aug 14 05:27:17 PDT 2005
Author: huangw10
Date: 2005-08-14 08:27:17 -0400 (Sun, 14 Aug 2005)
New Revision: 284
Modified:
trunk/source/main/analyze.cc
Log:
Analyze Biocomplexity of community.
Modified: trunk/source/main/analyze.cc
===================================================================
--- trunk/source/main/analyze.cc 2005-08-10 18:53:27 UTC (rev 283)
+++ trunk/source/main/analyze.cc 2005-08-14 12:27:17 UTC (rev 284)
@@ -9,6 +9,7 @@
#include <sstream>
#include <string>
#include <queue>
+#include <stack>
#include "analyze.hh"
@@ -2250,20 +2251,28 @@
}
}
+
void cAnalyze::CommunityComplexity(cString cur_string)
{
- cout << "Analyze community complexity for current population ...\n";
- // Get mutation rate
- double mut_rate = cur_string.PopWord().AsDouble();
+ /////////////////////////////////////////////////////////////////////////
+ // Calculate the mutual information between all genotypes and environment
+ /////////////////////////////////////////////////////////////////////////
+ cout << "Analyze biocomplexity of current population about environment ...\n";
- // Create the directory using the string given as the second argument
+ // Get the number of genotypes that are gonna be analyzed.
+ int max_genotypes = cur_string.PopWord().AsInt();
+
+ // Get update
+ int update = cur_string.PopWord().AsInt();
+
+ // Get the directory
cString dir = cur_string.PopWord();
cString defaultDir = "community_cpx/";
cString directory = PopDirectory(dir, defaultDir);
- // Create the file that saves the result
+ // Get the file name that saves the result
cString filename = cur_string.PopWord();
if (filename.IsEmpty()) {
filename = "community.complexity.dat";
@@ -2274,105 +2283,354 @@
cpx_fp << "# Legend:" << endl;
cpx_fp << "# 1: Genotype ID" << endl;
- cpx_fp << "# 2: Parent ID" << endl;
- cpx_fp << "# 3: Fitness" << endl;
- cpx_fp << "# 4: New Information" << endl;
+ cpx_fp << "# 2: Entropy given Known Genotypes" << endl;
+ cpx_fp << "# 3: Entropy given Both Known Genotypes and Env" << endl;
+ cpx_fp << "# 4: New Information about Environment" << endl;
cpx_fp << "# 5: Total Complexity" << endl;
- cpx_fp << "# 6: New Information - Lost Informtion" << endl;
- cpx_fp << "# 7: Net Total Complexity" << endl << endl;
+ cpx_fp << endl;
- ////////////////////////////////////////////////////////////////////////////////
- // Loop through all of the genotypes in this batch and build parent-genotype map
-
- typedef multimap< int, cAnalyzeGenotype *, less<int> > tMultimap;
- tMultimap children;
+ /////////////////////////////////////////////////////////////////////////////////
+ // Loop through all of the genotypes in all batches and build id vs. genotype map
+
+ map<int, cAnalyzeGenotype *> genotype_database;
+ for (int i = 0; i < MAX_BATCHES; ++ i) {
+ tListIterator<cAnalyzeGenotype> batch_it(batch[i].List());
+ cAnalyzeGenotype * genotype = NULL;
+ while ((genotype = batch_it.Next()) != NULL) {
+ genotype_database.insert(make_pair(genotype->GetID(), genotype));
+ }
+ }
+
+ ////////////////////////////////////////////////
+ // Check if all the genotypes having same length
+
+ int length_genome;
+ if (genotype_database.size() > 0) {
+ length_genome = genotype_database.begin()->second->GetLength();
+ }
+ map<int, cAnalyzeGenotype *>::iterator gen_iterator = genotype_database.begin();
+ for (; gen_iterator != genotype_database.end(); ++ gen_iterator) {
+ if (gen_iterator->second->GetLength() != length_genome) {
+ cerr << "Genotype " << gen_iterator->first << " has different genome length." << endl;
+ exit(1);
+ }
+ }
+
+ ///////////////////////
+ // Backup test CPU data
+ bool backupUsage = cTestCPU::UseResources();
+ tArray<double> backupResources(cTestCPU::GetResources());
+ FillResources(update);
+
+ ///////////////////////////////////////////////////////////////////////
+ // Choose the first n most abundant genotypes and put them in community
+
+ vector<cAnalyzeGenotype *> community;
+ cAnalyzeGenotype * genotype = NULL;
tListIterator<cAnalyzeGenotype> batch_it(batch[cur_batch].List());
- cAnalyzeGenotype * genotype = NULL;
+
+ while (((genotype = batch_it.Next()) != NULL) && (community.size() < max_genotypes)) {
+ community.push_back(genotype);
+ }
+
+ ///////////////////////////
+ // Measure hamming distance
+
+ int size_community = community.size();
+ if (size_community == 0) {
+ cerr << "There is no genotype in this community." << endl;
+ cpx_fp.close();
+ exit(1);
+ }
+ typedef pair<int,int> gen_pair;
+ map<gen_pair, int> hamming_dist;
- while ((genotype = batch_it.Next()) != NULL) {
- children.insert(make_pair(genotype->GetParentID(), genotype));
+ for (int i = 0; i< size_community; ++ i) {
+ for (int j = i+1; j < size_community; ++ j) {
+ int dist = cGenomeUtil::FindHammingDistance(community[i]->GetGenome(),
+ community[j]->GetGenome());
+ int id1 = community[i]->GetID();
+ int id2 = community[j]->GetID();
+
+ hamming_dist.insert(make_pair(gen_pair(id1, id2), dist));
+ hamming_dist.insert(make_pair(gen_pair(id2, id1), dist));
+ }
}
- // Put the first genotype (root of the phylogenetic tree) in the queue.
- // Breadth-first traversal of the phylogeny_tree. Accumulate new information
- // in the child compared to its parent.
- typedef pair<cAnalyzeGenotype *, cAnalyzeGenotype *> tGenotypePair; // genotype and its parent
- queue<tGenotypePair> genotype_queue;
+ //////////////////////////////////
+ // Get Most Recent Common Ancestor
- // Look for the root that has multiple childrent.
- int genotype_id = children.begin()->second->GetID();
- while (children.count(genotype_id) == 1 && children.size() > 1) {
- children.erase(children.begin());
- genotype_id = children.begin()->second->GetID();
+ map<gen_pair, cAnalyzeGenotype *> mrca;
+ for (int i = 0; i< size_community; ++ i) {
+ for (int j = i+1; j < size_community; ++ j) {
+
+ cAnalyzeGenotype * lineage1_genotype = community[i];
+ cAnalyzeGenotype * lineage2_genotype = community[j];
+
+ while (lineage1_genotype->GetID() != lineage2_genotype->GetID()) {
+ if (lineage1_genotype->GetID() > lineage2_genotype->GetID()) {
+ int parent_id = lineage1_genotype->GetParentID();
+ lineage1_genotype = genotype_database.find(parent_id)->second;
+ } else {
+ int parent_id = lineage2_genotype->GetParentID();
+ lineage2_genotype = genotype_database.find(parent_id)->second;
+ }
+ }
+
+ int id1 = community[i]->GetID();
+ int id2 = community[j]->GetID();
+ mrca.insert(make_pair(gen_pair(id1, id2), lineage1_genotype));
+ mrca.insert(make_pair(gen_pair(id2, id1), lineage1_genotype));
+ }
}
- if (children.begin() != children.end()) {
- genotype_queue.push( tGenotypePair(children.begin()->second, NULL) );
+ /*
+ ////////////////////////////////////////////////////////////////////////////////////////////
+ // Sort the genotype that is next genotype is the most closest one to all previous genotypes
+
+ vector<cAnalyzeGenotype *> sorted_community;
+ vector<cAnalyzeGenotype *> left_genotypes = community;
+
+ // Put the first genotype in left to sorted.
+ sorted_community.push_back(*left_genotypes.begin());
+ left_genotypes.erase(left_genotypes.begin());
+
+ while (left_genotypes.size() > 0) {
+ int min_total_hamming = size_community * length_genome;
+ int index_next;
+
+ for (int next = 0; next < left_genotypes.size(); ++ next) {
+ int total_hamming = 0;
+ int id1 = left_genotypes[next]->GetID();
+
+ for (int given = 0; given < sorted_community.size(); ++ given) {
+ int id2 = sorted_community[given]->GetID();
+ total_hamming += hamming_dist.find(gen_pair(id1, id2))->second;
+ }
+
+ if (total_hamming < min_total_hamming) {
+ min_total_hamming = total_hamming;
+ index_next = next;
+ }
+ }
+
+ sorted_community.push_back(left_genotypes[index_next]);
+ left_genotypes.erase(left_genotypes.begin() + index_next);
}
- double community_cpx = 0.0;
- double community_cpx_m2 = 0.0; // Add net new information in the phylogeny.
- while (genotype_queue.size() > 0) {
+ */
+ vector<cAnalyzeGenotype *> sorted_community = community;
+ /////////////////////////////////////////////
+ // Loop through genotypes in sorted community
- cAnalyzeGenotype * cur_genotype = genotype_queue.front().first;
- cAnalyzeGenotype * parent_genotype = genotype_queue.front().second;
- genotype_queue.pop();
- cur_genotype->Recalculate();
- cout << cur_genotype->GetID() << endl;
+ double complexity = 0.0;
+ vector<cAnalyzeGenotype *> given_genotypes;
+ int num_insts = inst_set.GetSize();
-
- if (parent_genotype == NULL) { // The root of phylogenetic tree
- double entropy = AnalyzeEntropy(cur_genotype, mut_rate);
- community_cpx = cur_genotype->GetLength() - entropy;
- community_cpx_m2 = community_cpx;
- cpx_fp << cur_genotype->GetID() << " -1 " << cur_genotype->GetFitness() << " 0 "
- << community_cpx << " 0 " << community_cpx_m2 << endl;
- } else {
+ for (int i = 0; i < size_community; ++ i) {
+ genotype = sorted_community[i];
- // Compare with its parent to measure new information
- double newinfo = IncreasedInfo(cur_genotype, parent_genotype, mut_rate);
- community_cpx += newinfo;
- cpx_fp << cur_genotype->GetID() << " " << parent_genotype->GetID() << " "
- << cur_genotype->GetFitness() << " " << newinfo << " " << community_cpx;
+ // Skip the dead organisms
+ genotype->Recalculate();
+ if (genotype->GetFitness() == 0) {
+ continue;
+ }
- // Only if current genotype is viable, compare it with its parent for net new info
- if (cur_genotype->GetFitness() > 0) {
+ vector<double> one_line_prob(num_insts, 0.0);
+ vector< vector<double> > prob(length_genome, one_line_prob);
+
+ cout << endl << genotype->GetID() << endl;
+
+ if (given_genotypes.size() >= 2) {
+
+ ///////////////////////////////////////////////////////////////////
+ // Look for two given genotypes that has minimun depth dist with it
+
+ cAnalyzeGenotype * min_depth_gen = given_genotypes[0];
+ cAnalyzeGenotype * tmrca = mrca.find(gen_pair(genotype->GetID(),
+ given_genotypes[0]->GetID()))->second;
+ int min_depth_dist = genotype->GetDepth() + given_genotypes[0]->GetDepth() - 2 * tmrca->GetDepth();
+
+ cAnalyzeGenotype * second_min_gen = given_genotypes[1];
+ tmrca = mrca.find(gen_pair(genotype->GetID(), given_genotypes[1]->GetID()))->second;
+ int second_min_depth = genotype->GetDepth() + given_genotypes[1]->GetDepth() - 2 * tmrca->GetDepth();
+
+ for (int i = 2; i < given_genotypes.size(); ++ i) {
+ cAnalyzeGenotype * given_genotype = given_genotypes[i];
+ cAnalyzeGenotype * tmrca = mrca.find(gen_pair(genotype->GetID(),
+ given_genotype->GetID()))->second;
+ int dist = genotype->GetDepth() + given_genotype->GetDepth() - 2 * tmrca->GetDepth();
+
+ if (dist < min_depth_dist) {
+ second_min_depth = min_depth_dist;
+ second_min_gen = min_depth_gen;
+ min_depth_dist = dist;
+ min_depth_gen = given_genotype;
+ } else if (dist >= min_depth_dist && dist < second_min_depth) {
+ second_min_depth = dist;
+ second_min_gen = given_genotype;
+ }
+ }
+
+ const cGenome & given_genome1 = min_depth_gen->GetGenome();
+ const cGenome & given_genome2 = second_min_gen->GetGenome();
+ for (int line = 0; line < length_genome; ++ line) {
+ int given_inst = given_genome1[line].GetOp();
+ prob[line][given_inst] += pow(1 - 1.0/length_genome, min_depth_dist);
+ given_inst = given_genome2[line].GetOp();
+ prob[line][given_inst] += pow(1 - 1.0/length_genome, min_depth_dist);
+ }
+
- // Compare with its parent to measure net new information
- double lostinfo = IncreasedInfo(parent_genotype, cur_genotype, mut_rate);
- community_cpx_m2 += newinfo - lostinfo;
- cpx_fp << " " << newinfo - lostinfo << " " << community_cpx_m2 << endl;
+ } else if (given_genotypes.size() == 1) {
+ //////////////////////////////////////////////////////
+ // Calculate the probability of each inst at each line
+ cAnalyzeGenotype * tmrca = mrca.find(gen_pair(genotype->GetID(),
+ given_genotypes[0]->GetID()))->second;
+ int dist = genotype->GetDepth() + given_genotypes[0]->GetDepth() - 2 * tmrca->GetDepth();
+ const cGenome & given_genome = given_genotypes[0]->GetGenome();
+
+ for (int line = 0; line < length_genome; ++ line) {
+ int given_inst = given_genome[line].GetOp();
+ prob[line][given_inst] += pow(1 - 1.0/length_genome, dist);
+ }
- } else {
- cpx_fp << " " << 0 << " " << community_cpx_m2 << endl;
+ }
+
+ /////////////////////////////////////////
+ // Normalize the probability at each line
- // This genotype should not have children
- if (children.count(cur_genotype->GetID()) != 0) {
- cpx_fp << "Error: This genotype has children but it shouldn't." << endl;
- cpx_fp.close();
- return;
+ for (int line = 0; line < length_genome; ++ line) {
+ double cur_total_prob = 0.0;
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ cur_total_prob += prob[line][inst];
+ }
+ if (cur_total_prob > 1) {
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ prob[line][inst] = prob[line][inst] / cur_total_prob;
}
+ cur_total_prob = 1.0;
}
+ double left_prob = 1 - cur_total_prob;
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ prob[line][inst] += left_prob / num_insts;
+ }
+
}
- // Add the children of current genotype to the queue
- typedef pair<tMultimap::iterator, tMultimap::iterator> pairii;
- pairii child_range = children.equal_range(cur_genotype->GetID());
- tMultimap::iterator child_pos = child_range.first;
- for (; child_pos != child_range.second; ++ child_pos) {
- cAnalyzeGenotype * cur_child = child_pos->second;
- genotype_queue.push( tGenotypePair(cur_child, cur_genotype) );
+ /////////////////////////////////
+ // Calculate entropy of each line
+ vector<double> entropy(length_genome, 0.0);
+ for (int line = 0; line < length_genome; ++ line) {
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ if (prob[line][inst] > 0) {
+ entropy[line] -= prob[line][inst] * log(prob[line][inst]) / log(num_insts*1.0);
+ }
+ }
}
- // Delete the children of current genotype from map
- children.erase(child_range.first, child_range.second);
+
+ ///////////////////////////////////////////////////////////////////
+ // Point mutation at all lines of code to look for neutral mutation
+ cout << "Test point mutation." << endl;
+ vector<bool> one_line_neutral(num_insts, false);
+ vector< vector<bool> > neutral_mut(length_genome, one_line_neutral);
+
+ genotype->Recalculate();
+ double base_fitness = genotype->GetFitness();
+ const cGenome & base_genome = genotype->GetGenome();
+ cGenome mod_genome(base_genome);
+
+ for (int line = 0; line < length_genome; ++ line) {
+ int cur_inst = base_genome[line].GetOp();
+
+ for (int mod_inst = 0; mod_inst < num_insts; ++ mod_inst) {
+ mod_genome[line].SetOp(mod_inst);
+ cAnalyzeGenotype test_genotype(mod_genome, inst_set);
+ test_genotype.Recalculate();
+ if (test_genotype.GetFitness() >= base_fitness) {
+ neutral_mut[line][mod_inst] = true;
+ }
+ }
+
+ mod_genome[line].SetOp(cur_inst);
+ }
+
+
+ /////////////////////////////////////////////////////
+ // Redistribute the probability of insts at each line
+
+ for (int line = 0; line < length_genome; ++ line) {
+ double total_prob = 0.0;
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ if (neutral_mut[line][inst] == true) {
+ total_prob += prob[line][inst];
+ }
+ }
+
+
+
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ if (neutral_mut[line][inst] == true) {
+ prob[line][inst] = prob[line][inst] / total_prob;
+ } else {
+ prob[line][inst] = 0.0;
+ }
+ }
+
+ }
+
+ ////////////////////////////////////////////////
+ // Calculate the entropy given environment
+
+ vector<double> entropy_given_env(length_genome, 0.0);
+ for (int line = 0; line < length_genome; ++ line) {
+ for (int inst = 0; inst < num_insts; ++ inst) {
+ if (prob[line][inst] > 0) {
+ entropy_given_env[line] -= prob[line][inst] * log(prob[line][inst]) / log(num_insts*1.0);
+ }
+ }
+ }
+
+
+ ///////////////////////////////////////////////////////////////////////////
+ // Calculate the information between genotype and env given other genotypes
+ double information = 0.0;
+ double entropy_before = 0.0;
+ double entropy_after = 0.0;
+ for (int line = 0; line < length_genome; ++ line) {
+ entropy_before += entropy[line];
+ entropy_after += entropy_given_env[line];
+
+ if (entropy[line] >= entropy_given_env[line]) {
+ information += entropy[line] - entropy_given_env[line];
+ } else {
+ information += 0; // Env did not give more information about this site.
+ }
+
+ }
+ complexity += information;
+
+ cpx_fp << genotype->GetID() << " " << entropy_before << " " << entropy_after << " "
+ << information << " " << complexity << endl;
+
+ /////////////////////////////////////////////////////////////
+ // This genotype becomes the given condition of next genotype
+
+ given_genotypes.push_back(genotype);
+
}
-
+
+
+ // Set the test CPU back to the state it was
+ cTestCPU::UseResources() = backupUsage;
+ cTestCPU::SetupResourceArray(backupResources);
+
cpx_fp.close();
return;
-
+
}
void cAnalyze::CommandLandscape(cString cur_string)
More information about the Avida-cvs
mailing list