ECOCPAK v0.9
|
00001 // Copyright (C) 2011 the authors listed below 00002 // http://ecocpak.sourceforge.net 00003 // 00004 // Authors: 00005 // - Dimitrios Bouzas (bouzas at ieee dot org) 00006 // - Nikolaos Arvanitopoulos (niarvani at ieee dot org) 00007 // - Anastasios Tefas (tefas at aiia dot csd dot auth dot gr) 00008 // 00009 // This file is part of the ECOC PAK C++ library. It is 00010 // provided without any warranty of fitness for any purpose. 00011 // 00012 // You can redistribute this file and/or modify it under 00013 // the terms of the GNU Lesser General Public License (LGPL) 00014 // as published by the Free Software Foundation, either 00015 // version 3 of the License or (at your option) any later 00016 // version. 00017 // (see http://www.opensource.org/licenses for more info) 00018 00019 00022 00023 00024 00034 template<typename eT> 00035 inline 00036 eT 00037 op_kmeans::compute_distance 00038 ( 00039 const subview_row<eT>& sample_in, 00040 const subview_row<eT>& centroid_in 00041 ) 00042 { 00043 arma_extra_debug_sigprint(); 00044 00045 Row<eT> temp_vec = sample_in - centroid_in; 00046 return accu(temp_vec % temp_vec); 00047 } 00048 00049 00050 00061 template<typename eT> 00062 inline 00063 void 00064 op_kmeans::compute_distance 00065 ( 00066 Row<eT>& dist_out, 00067 u32& winner, 00068 const Mat<eT>& centroids_in, 00069 const subview_row<eT>& sample_in 00070 ) 00071 { 00072 arma_extra_debug_sigprint(); 00073 00074 const u32 k = centroids_in.n_rows; 00075 const u32 n_cols = centroids_in.n_cols; 00076 00077 dist_out = zeros<Row<eT> >(k); 00078 eT winning_dist = numeric_limits<eT>::max(); 00079 winner = 0; 00080 00081 // Iterate through centroids 00082 for(u32 i = 0; i < k; ++i) 00083 { 00084 Row<eT> temp_row = centroids_in.row(i) - sample_in; 00085 dist_out(i) = accu(temp_row % temp_row); 00086 00087 if(dist_out(i) < winning_dist) 00088 { 00089 winning_dist = dist_out(i); 00090 winner = i; 00091 } 00092 } 00093 00094 } 00095 00096 00097 00107 template<typename eT> 00108 inline 00109 void 00110 op_kmeans::init_centroids 00111 ( 00112 Mat<eT>& centroids_out, 00113 const Mat<eT>& samples_in, 00114 const u32 k 00115 ) 00116 { 00117 arma_extra_debug_sigprint(); 00118 00119 const u32 n_samples = samples_in.n_rows; 00120 const u32 n_cols = samples_in.n_cols; 00121 00122 // Allocate memory for centroids matrix 00123 centroids_out = zeros<Mat<eT> >(k, n_cols); 00124 00125 // --- Kmeans++ algorithm introduced by David Arthur & Sergei Vassilvitskii initialization --- // 00126 00127 // Initialize current potential 00128 eT currentPot = 0; 00129 00130 // Allocate distances vector 00131 Col<eT> closestDistSq = zeros<Col<eT> >(n_samples); 00132 00133 // Choose one random center and set the closestDistSq values 00134 u32 index = (eT(std::rand()) / eT(RAND_MAX)) * n_samples; 00135 centroids_out.row(0) = samples_in.row(index); 00136 00137 // Compute distances and potentials 00138 for(u32 i = 0; i < n_samples; i++) 00139 { 00140 closestDistSq[i] = compute_distance(samples_in.row(i), centroids_out.row(0)); 00141 currentPot += closestDistSq[i]; 00142 } 00143 00144 // Repeat for the rest k - 1 centers to be chosen 00145 for(u32 centerCount = 1; centerCount < k; centerCount++) 00146 { 00147 // Repeat several trials 00148 eT bestNewPot = -1; 00149 u32 bestNewIndex; 00150 00151 // Compute number of local trials 00152 u32 numLocalTries = 2 + std::log(eT(k)); 00153 00154 for(u32 localTrial = 0; localTrial < numLocalTries; localTrial++) 00155 { 00156 // Choose center - have to be slightly careful to return a valid answer even accounting 00157 // for possible rounding errors 00158 eT randVal = (eT(std::rand()) / eT(RAND_MAX)) * currentPot; 00159 00160 for(index = 0; index < n_samples - 1; index++) 00161 { 00162 if (randVal <= closestDistSq[index]) 00163 { 00164 break; 00165 } 00166 else 00167 { 00168 randVal -= closestDistSq[index]; 00169 } 00170 } 00171 00172 // Compute the new potential 00173 eT newPot = 0; 00174 00175 // Iterate through samples 00176 for(u32 i = 0; i < n_samples; i++) 00177 { 00178 newPot += std::min(compute_distance(samples_in.row(i), samples_in.row(index)), closestDistSq[i]); 00179 } 00180 00181 // Store the best result 00182 if(bestNewPot < 0 || newPot < bestNewPot) 00183 { 00184 bestNewPot = newPot; 00185 bestNewIndex = index; 00186 } 00187 00188 } 00189 00190 // Add the appropriate center 00191 centroids_out.row(centerCount) = samples_in.row(bestNewIndex); 00192 currentPot = bestNewPot; 00193 00194 for(u32 i = 0; i < n_samples; i++) 00195 { 00196 closestDistSq[i] = std::min(compute_distance(samples_in.row(i), samples_in.row(bestNewIndex)), closestDistSq[i]); 00197 } 00198 00199 } 00200 00201 } 00202 00203 00204 00217 template<typename eT> 00218 inline 00219 void 00220 op_kmeans::direct_kmeans 00221 ( 00222 Col<u32>& indices_out, 00223 Col<u32>& ranks, 00224 Mat<eT>& centroids_out, 00225 const Mat<eT>& samples_in, 00226 const u32 k, 00227 const eT lr 00228 ) 00229 { 00230 arma_extra_debug_sigprint(); 00231 00232 #ifdef NCURSES_OUTPUT 00233 //cout << endl << "decoding Started on " << ctime(&tbegin); 00234 mvaddstr(3, 0, "Computing K-means:"); 00235 refresh(); 00236 #endif 00237 00238 const u32 n_samples = samples_in.n_rows; 00239 const u32 n_cols = samples_in.n_cols; 00240 00241 // Check wether the user provided any samples 00242 arma_check(n_samples == 0, "kmeans(): Input matrix contains no samples"); 00243 00244 // Check wether the user specified the number of clusters to be greater or equal with the number of samples 00245 // In this case return as centroids the samples 00246 if(k >= n_samples) 00247 { 00248 centroids_out = samples_in; 00249 00250 ranks = zeros<Col<u32> >(k); 00251 00252 if(n_samples > 1) 00253 { 00254 indices_out = conv_to<Col<u32> >::from(linspace(0, n_samples - 1, n_samples)); 00255 } 00256 else 00257 { 00258 indices_out = zeros<Col<u32> >(1); 00259 } 00260 00261 return; 00262 } 00263 00264 // Allocate memory for output arguments 00265 indices_out = zeros<Col<u32> >(n_samples); 00266 00267 // Initialize centroids 00268 init_centroids(centroids_out, samples_in, k); 00269 00270 // Declare distances matrix 00271 Mat<eT> dist_mat; 00272 00273 // Allocate centintel indices 00274 Col<u32> prv_inds = ones<Col<u32> >(n_samples); 00275 Col<u32> more_prv_inds = zeros<Col<u32> >(n_samples); 00276 00277 // Iterations counter 00278 u32 iters = 1; 00279 00280 // Repeat while there's change in centroids 00281 while(accu(prv_inds == indices_out) < n_samples) 00282 { 00283 // Update centintel indices 00284 more_prv_inds = prv_inds; 00285 prv_inds = indices_out; 00286 00287 // Initialize sum of distances and distances matrix 00288 dist_mat = zeros< Mat<eT> >(n_samples, k); 00289 00290 // Allocate ranks vector for each centroid -- i.e., vectors that counts the winning sample for each centroid 00291 ranks = zeros<Col<u32> >(k); 00292 00293 // Sample accumulator -- accumulates winning samples for each centroid for later update 00294 Mat<eT> sample_accu = zeros<Mat<eT> >(k, n_cols); 00295 00296 // Temporary vector holding the distances of the current sample from each one of the centroids 00297 // TODO: Can't pass dist_out.row(i) by reference to compute_distance() that's why we need this 00298 // temporary row vector. 00299 Row<eT> dist_tmp; 00300 00301 // Compute distances of each samples from the available centroids 00302 for(u32 i = 0; i < n_samples; ++i) 00303 { 00304 compute_distance(dist_tmp, indices_out(i), centroids_out, samples_in.row(i)); 00305 dist_mat.row(i) = dist_tmp; 00306 sample_accu.row(indices_out(i)) += samples_in.row(i); 00307 ranks(indices_out(i))++; 00308 00309 // Update winning centroid by moving it towards current sample -- Actually this is the only difference with the batch variant 00310 // centroids_out.row(indices_out(i)) += ((eT(lr) / eT(iters)) * (samples_in.row(i) - centroids_out.row(indices_out(i)))); 00311 centroids_out.row(indices_out(i)) += (eT(lr) * (samples_in.row(i) - centroids_out.row(indices_out(i)))); 00312 00313 #ifdef NCURSES_OUTPUT 00314 mvaddch(3, 20, spin_chars[i & 3]); 00315 refresh(); 00316 #endif 00317 } 00318 00319 // Update centroids 00320 centroids_out = sample_accu; 00321 00322 // Normalize updated centroids according to number of winning samples 00323 for(u32 i = 0; i < k; ++i) 00324 { 00325 centroids_out.row(i) /= eT(ranks(i)); 00326 00327 #ifdef NCURSES_OUTPUT 00328 mvaddch(3, 20, spin_chars[i & 3]); 00329 refresh(); 00330 #endif 00331 } 00332 00333 // Check for empty clusters 00334 if(min(ranks) == 0) 00335 { 00336 // Find the empty cluster -- centroid 00337 00338 Col<u32> empty_indx; 00339 Col<u32> max_indx; 00340 00341 // Iterate while there are empty clusters 00342 empty_indx = sort_index(ranks); 00343 00344 u32 k = 0; 00345 while(min(ranks) == 0) 00346 { 00347 max_indx = sort_index(dist_mat.col(empty_indx(k)),1); 00348 centroids_out.row(empty_indx(k)) = samples_in.row(max_indx(k)); 00349 ranks(empty_indx(k)) = 1; 00350 indices_out(max_indx(k)) = empty_indx(k); 00351 00352 #ifdef NCURSES_OUTPUT 00353 mvaddch(3, 20, spin_chars[k & 3]); 00354 refresh(); 00355 #endif 00356 00357 k++; 00358 } 00359 00360 // Check wether the system oscilates 00361 if(accu(more_prv_inds == indices_out) < n_samples) 00362 { 00363 prv_inds = indices_out; 00364 } 00365 00366 } 00367 00368 #ifdef NCURSES_OUTPUT 00369 mvaddch(3, 20, spin_chars[iters & 3]); 00370 refresh(); 00371 #endif 00372 00373 // Increase number of iterations 00374 iters++; 00375 } 00376 00377 } 00378 00379 00380