#include #include #include #include #include #include #include #include #include #include #include #include /********* HELPER FUNCTIONS *********/ // Functor to compare by the Mth element -- similar to key=lambda in Python template class F = std::less> struct TupleCompare { template bool operator()(T const &t1, T const &t2) { return F::type>()(std::get(t1), std::get(t2)); } }; // Whitespace-trimming functions using namespace std; // trim from start static inline string <rim(string &s) { s.erase(s.begin(), find_if(s.begin(), s.end(), not1(ptr_fun(isspace)))); return s; } // trim from end static inline string &rtrim(string &s) { s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun(isspace))).base(), s.end()); return s; } // trim from both ends static inline string &trim(string &s) { return ltrim(rtrim(s)); } // Used to split on a character/string std::vector &split(const std::string &s, char delim, std::vector &elems) { std::stringstream ss(s); std::string item; while (std::getline(ss, item, delim)) { elems.push_back(item); } return elems; } //--------------------------------------------------------------------------------------------------- // Parameters: String of PDB file name; // Returns: Hashmap of key: Residue Sequence (int)/value: list of x,y,z coords as floats map > parse_pdb(string pdb_file) { // Read in file ifstream file(pdb_file); string line; map > CA_locs; // Iterate through lines of file while(getline(file, line)) { if (line.size() < 50) { continue; } // Set all the values of the line we are interested in string recordName = line.substr(0, 6); recordName = trim(recordName); string atomName = (line.substr(12, 4)); atomName = trim(atomName); if ((recordName.compare("ATOM") != 0) || (atomName.compare("CA") != 0)) { continue; } if (recordName.compare("TER") == 0) { break; } // Get the resSeq value and convert to an int after trimming string resSeq = line.substr(22, 5); int resSeq2 = stoi(trim(resSeq)); // Trim and convert to floats string x_t = line.substr(30, 8); float x = stof(trim(x_t)); string y_t = line.substr(38, 8); float y = stof(trim(y_t)); string z_t = line.substr(46, 8); float z = stof(trim(z_t)); // Append coordinate values to a list vector value; value.push_back(x); value.push_back(y); value.push_back(z); // Insert k:v pair in hashmap CA_locs.insert(pair >(resSeq2, value)); } return CA_locs; } // Parameters: Two list of list of floats representing the two clusters // Returns: Float value of the maximum distance between the clusters float cluster_dist(vector > cluster1, vector > cluster2) { // Concatenates the two vectors -- preallocating memory and inserting saves time vector > points; points.reserve( cluster1.size() + cluster2.size() ); // preallocate memory points.insert( points.end(), cluster1.begin(), cluster1.end() ); points.insert( points.end(), cluster2.begin(), cluster2.end() ); if (points.size() == 1) { return 0.0; } vector distances; for (int p1=0; p1 point1 = points[p1]; vector point2 = points[p2]; // Distance math float distance = abs(sqrt(pow(point1[0] - point2[0], 2.0) + pow(point1[1] - point2[1], 2.0) + pow(point1[2] - point2[2], 2.0))); // Append to list distances.push_back(distance); } } // Return the element of maximum value (takes iterators as arguments) return *max_element(distances.begin(), distances.end()); } // Parameters: List of ints that represent points, Hashmap of k: int/v: list of floats, float of cl_dist // Returns: Hashmap of Minimum cluster sizes and a list of tuples of an int list and a float tuple, vector, float> > > cl_cluster(vector points, map > coords, float cl_dist) { vector > clusters; // Create list of lists of the points for (vector::iterator it = points.begin(); it != points.end(); ++it) { vector temp; temp.push_back(*it); clusters.push_back(temp); } map min_clust_sizes; vector > distances; while (clusters.size() > 1) { // Reinitialize distances without wasting memory distances.clear(); // Iterate through all clusters for (int c1=0; c1 < clusters.size(); c1++) { for (int c2=c1+1; c2 < clusters.size(); c2++) { vector c1_cluster = clusters.at(c1); vector c2_cluster = clusters.at(c2); // Get the value from the coords dictionary of each point in a cluster vector > c1_point; for (vector::iterator it = c1_cluster.begin(); it != c1_cluster.end(); ++it) { if (coords.find(*it) == coords.end()) { //Not found } else { vector value = coords.find(*it)->second; c1_point.push_back(value); } } vector > c2_point; for (vector::iterator it = c2_cluster.begin(); it != c2_cluster.end(); ++it) { if (coords.find(*it) == coords.end()) { //Not found } else { vector value = coords.find(*it)->second; c2_point.push_back(value); } } // Find max distance between the two points float distance = cluster_dist(c1_point, c2_point); // Create tuple and append tuple append_val = make_tuple(c1, c2, distance); distances.push_back(append_val); } } // Sort distances by the last element sort(distances.begin(), distances.end(), TupleCompare<2>()); tuple max_cluster = distances.at(0); // Get the individual values of max_cluster int merge_clust1 = get<0>(max_cluster); int merge_clust2 = get<1>(max_cluster); float separation = get<2>(max_cluster); if (separation > cl_dist) { break; } vector cluster1 = clusters.at(merge_clust1); vector cluster2 = clusters.at(merge_clust2); vector new_cluster; // Merge closest numbers -- concatenating lists new_cluster.reserve( cluster1.size() + cluster2.size() ); // preallocate memory new_cluster.insert( new_cluster.end(), cluster1.begin(), cluster1.end() ); new_cluster.insert( new_cluster.end(), cluster2.begin(), cluster2.end() ); clusters.at(merge_clust1) = new_cluster; int cluster_size = clusters.at(merge_clust1).size(); // Keep track of smallest cluster diameter associated with each cluster of size n residues if ( min_clust_sizes.find(cluster_size) == min_clust_sizes.end() ) { // not found min_clust_sizes[cluster_size] = separation; } else { // found } // Remove old cluster clusters.erase(clusters.begin()+merge_clust2); } // Iterate over clusters and get the values from coords of each point in each cluster vector, float> > clust_inp; for (vector >::iterator it = clusters.begin(); it != clusters.end(); ++it) { vector c = *it; vector > dist_inp; vector > empty; for (vector::iterator it2 = c.begin(); it2 != c.end(); ++it2) { float p = *it2; dist_inp.push_back(coords[p]); } clust_inp.push_back(make_tuple(c, cluster_dist(dist_inp, empty))); } return make_tuple(min_clust_sizes, clust_inp); } int main ( int argc, char *argv[] ) { // Fail if not enough arguments if (argc != 6) { cout<<"Please enter correct number of arguments"< mutations; split(mutation_input, ',', mutations); // comma-separated list of mutated amino acids float cl_dist = stof(argv[3]); // complete-linkage cutoff in Angstroms, after which no new clusters are formed int prot_len = stoi(argv[4]); //number of amino acids in protein (could be more than in model) int iterations = stoi(argv[5]); // number of random clusterings to do in order to calculate p-values // Iterate through each aa in mutations if it's in pdb_coords map > pdb_coords; pdb_coords = parse_pdb(pdb_file); vector aa_to_cluster; for(vector::iterator it = mutations.begin(); it != mutations.end(); ++it) { int input = stoi(*it); if ( pdb_coords.find(input) == pdb_coords.end() ) { } else { // found aa_to_cluster.push_back(input); } } tuple, vector, float> > > output = cl_cluster(aa_to_cluster, pdb_coords, cl_dist); vector, float> > observed_clusters = get<1>(output); // Iterate through oberserved clusters to get the maximum cluster diameter vector observed_cluster_sizes; float max_cluster_diameter = 0.0; for(vector, float> >::iterator it = observed_clusters.begin(); it != observed_clusters.end(); ++it) { vector first = get<0>(*it); float second = get<1>(*it); observed_cluster_sizes.push_back(first.size()); if (second > max_cluster_diameter) { max_cluster_diameter = second; } } // Iterate through observed clusters to get the oberved cluster sizes map > min_dist_lists; for (vector::iterator it = observed_cluster_sizes.begin(); it != observed_cluster_sizes.end(); ++it) { vector value; min_dist_lists.insert(pair >(*it, value)); } // Initialize random number generator srand (time(NULL)); // Iterate through the number of iterations to generate random clusters for(int i=0; i random_mutations; for (int j=0; j random_aa_to_cluster; for (vector::iterator it = random_mutations.begin(); it != random_mutations.end(); ++it) { if ( pdb_coords.find(*it) == pdb_coords.end() ) {/* not found */} else { // found random_aa_to_cluster.push_back(*it); } } // Call cl_cluster and get the first value of the tuple tuple, vector, float> > > clust; clust = cl_cluster(random_aa_to_cluster, pdb_coords, max_cluster_diameter); map rand_min_dist = get<0>(clust); // Iterate through the observed cluster sizes and if in rand_min_dist, append the value and infinity otherwise for (vector::iterator it = observed_cluster_sizes.begin(); it != observed_cluster_sizes.end(); ++it) { if ( rand_min_dist.find(*it) == rand_min_dist.end() ) { // not found min_dist_lists[*it].push_back(99999); } else { // found min_dist_lists[*it].push_back(rand_min_dist[*it]); } } } // Sort in increasing order the distances for (map >::iterator it = min_dist_lists.begin(); it != min_dist_lists.end(); ++it) { int key = it->first; vector value = it->second; sort(value.begin(), value.end()); min_dist_lists[key] = value; } // Iterate through the observed clusters and count the denominator and diameters for (vector, float> >::iterator it = observed_clusters.begin(); it != observed_clusters.end(); ++it) { vector indicies = get<0>(*it); float diameter = get<1>(*it); if (indicies.size() < 2){ continue; } float rank = 0.0; vector diameter_list = min_dist_lists[indicies.size()]; for (vector::iterator it2 = diameter_list.begin(); it2 != diameter_list.end(); ++it2) { float d = *it2; if (diameter <= d) { break; } else { rank++; } } int denominator = 0; if (rank == 0.0) { rank = 1.0; denominator = iterations + 1; } else { denominator = iterations; } for(vector::iterator i = indicies.begin(); i != indicies.end(); ++i) { if (next(i) == indicies.end()) { cout << *i; } else { cout << *i << ','; } } cout << "\t"; cout << diameter; cout << "\t"; cout << rank/denominator << endl; } // TIME // std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); // auto duration = std::chrono::duration_cast( t2 - t1 ).count(); // float duration2 = float(duration)/float(1000000); // cout << "Elapsed time: "; // cout << duration2 << endl; return 0; } }