ECOCPAK: op_kmeans_meat.hpp Source File

ECOCPAK v0.9
00001 // Copyright (C) 2011 the authors listed below
00002 // http://ecocpak.sourceforge.net
00003 // 
00004 // Authors:
00005 // - Dimitrios Bouzas (bouzas at ieee dot org)
00006 // - Nikolaos Arvanitopoulos (niarvani at ieee dot org)
00007 // - Anastasios Tefas (tefas at aiia dot csd dot auth dot gr)
00008 // 
00009 // This file is part of the ECOC PAK C++ library. It is 
00010 // provided without any warranty of fitness for any purpose.
00011 //
00012 // You can redistribute this file and/or modify it under 
00013 // the terms of the GNU Lesser General Public License (LGPL) 
00014 // as published by the Free Software Foundation, either 
00015 // version 3 of the License or (at your option) any later 
00016 // version.
00017 // (see http://www.opensource.org/licenses for more info)
00018 
00019 
00022 
00023 
00024 
00034 template<typename eT>
00035 inline
00036 eT
00037 op_kmeans::compute_distance
00038   (
00039   const subview_row<eT>& sample_in, 
00040   const subview_row<eT>& centroid_in
00041   )
00042   {
00043   arma_extra_debug_sigprint();
00044 
00045   Row<eT> temp_vec = sample_in - centroid_in;
00046   return accu(temp_vec % temp_vec);   
00047   }
00048 
00049 
00050 
00061 template<typename eT>
00062 inline
00063 void
00064 op_kmeans::compute_distance
00065   (
00066         Row<eT>& dist_out, 
00067             u32& winner, 
00068   const Mat<eT>& centroids_in, 
00069   const subview_row<eT>& sample_in
00070   )
00071   {
00072   arma_extra_debug_sigprint();
00073   
00074   const u32 k = centroids_in.n_rows;
00075   const u32 n_cols = centroids_in.n_cols;
00076 
00077   dist_out = zeros<Row<eT> >(k); 
00078   eT winning_dist = numeric_limits<eT>::max();
00079   winner = 0;
00080 
00081   // Iterate through centroids
00082   for(u32 i = 0; i < k; ++i)
00083     {
00084     Row<eT> temp_row = centroids_in.row(i) - sample_in;      
00085     dist_out(i) =  accu(temp_row % temp_row);
00086         
00087     if(dist_out(i) < winning_dist)
00088       {
00089       winning_dist = dist_out(i);
00090       winner = i;
00091       }       
00092     }
00093 
00094   }  
00095 
00096 
00097 
00107 template<typename eT>
00108 inline
00109 void
00110 op_kmeans::init_centroids
00111   (
00112         Mat<eT>& centroids_out, 
00113   const Mat<eT>& samples_in, 
00114   const u32 k
00115   )
00116   {
00117   arma_extra_debug_sigprint();
00118   
00119   const u32 n_samples = samples_in.n_rows;
00120   const u32 n_cols    = samples_in.n_cols;
00121   
00122   // Allocate memory for centroids matrix
00123   centroids_out = zeros<Mat<eT> >(k, n_cols);
00124 
00125   // --- Kmeans++ algorithm introduced by David Arthur & Sergei Vassilvitskii initialization --- //
00126   
00127   // Initialize current potential      
00128   eT currentPot = 0;
00129   
00130   // Allocate distances vector
00131   Col<eT> closestDistSq = zeros<Col<eT> >(n_samples);
00132 
00133   // Choose one random center and set the closestDistSq values
00134   u32 index = (eT(std::rand()) / eT(RAND_MAX)) * n_samples;
00135   centroids_out.row(0) = samples_in.row(index);
00136   
00137   // Compute distances and potentials
00138   for(u32 i = 0; i < n_samples; i++) 
00139     {
00140     closestDistSq[i] = compute_distance(samples_in.row(i), centroids_out.row(0));
00141     currentPot += closestDistSq[i];
00142     }
00143 
00144   // Repeat for the rest k - 1 centers to be chosen 
00145   for(u32 centerCount = 1; centerCount < k; centerCount++) 
00146     {
00147     // Repeat several trials
00148     eT bestNewPot = -1;
00149     u32 bestNewIndex;
00150     
00151     // Compute number of local trials
00152     u32 numLocalTries = 2 + std::log(eT(k));
00153     
00154     for(u32 localTrial = 0; localTrial < numLocalTries; localTrial++) 
00155       {
00156                 // Choose center - have to be slightly careful to return a valid answer even accounting
00157           // for possible rounding errors
00158       eT randVal =  (eT(std::rand()) / eT(RAND_MAX)) * currentPot;
00159     
00160       for(index = 0; index < n_samples - 1; index++) 
00161         {
00162         if (randVal <= closestDistSq[index])
00163           {
00164           break;
00165           }
00166         else
00167           {
00168           randVal -= closestDistSq[index];
00169           }
00170         }
00171 
00172                   // Compute the new potential
00173       eT newPot = 0;
00174       
00175       // Iterate through samples
00176       for(u32 i = 0; i < n_samples; i++)
00177         {
00178         newPot += std::min(compute_distance(samples_in.row(i), samples_in.row(index)), closestDistSq[i]);
00179         }
00180 
00181       // Store the best result
00182       if(bestNewPot < 0 || newPot < bestNewPot) 
00183         {
00184         bestNewPot = newPot;
00185         bestNewIndex = index;
00186         }
00187             
00188       }
00189 
00190     // Add the appropriate center
00191     centroids_out.row(centerCount) = samples_in.row(bestNewIndex);
00192     currentPot = bestNewPot;
00193 
00194     for(u32 i = 0; i < n_samples; i++)
00195       {
00196       closestDistSq[i] = std::min(compute_distance(samples_in.row(i), samples_in.row(bestNewIndex)), closestDistSq[i]);
00197       }
00198   
00199     }
00200   
00201   }
00202 
00203 
00204 
00217 template<typename eT>
00218 inline
00219 void
00220 op_kmeans::direct_kmeans
00221   (
00222        Col<u32>& indices_out,
00223        Col<u32>& ranks, 
00224         Mat<eT>& centroids_out, 
00225   const Mat<eT>& samples_in,
00226   const u32      k, 
00227   const eT       lr
00228   )
00229   {
00230   arma_extra_debug_sigprint();
00231   
00232   #ifdef NCURSES_OUTPUT
00233   //cout << endl << "decoding Started on " << ctime(&tbegin);
00234   mvaddstr(3, 0, "Computing K-means:");
00235   refresh();
00236   #endif
00237 
00238   const u32 n_samples = samples_in.n_rows;
00239   const u32 n_cols    = samples_in.n_cols;
00240   
00241   // Check wether the user provided any samples
00242   arma_check(n_samples == 0, "kmeans(): Input matrix contains no samples");
00243   
00244   // Check wether the user specified the number of clusters to be greater or equal with the number of samples
00245   // In this case return as centroids the samples
00246   if(k >= n_samples)
00247     {
00248     centroids_out = samples_in;
00249 
00250     ranks = zeros<Col<u32> >(k);
00251     
00252     if(n_samples > 1)
00253       {
00254       indices_out = conv_to<Col<u32> >::from(linspace(0, n_samples - 1, n_samples));
00255       }
00256     else
00257       {
00258       indices_out = zeros<Col<u32> >(1);
00259       }
00260       
00261     return;
00262     }
00263   
00264   // Allocate memory for output arguments
00265   indices_out = zeros<Col<u32> >(n_samples);
00266       
00267   // Initialize centroids
00268   init_centroids(centroids_out, samples_in, k);
00269 
00270   // Declare distances matrix
00271   Mat<eT> dist_mat;
00272 
00273   // Allocate centintel indices  
00274   Col<u32> prv_inds = ones<Col<u32> >(n_samples);
00275   Col<u32> more_prv_inds = zeros<Col<u32> >(n_samples);
00276       
00277   // Iterations counter
00278   u32 iters = 1;
00279   
00280   // Repeat while there's change in centroids
00281   while(accu(prv_inds == indices_out) < n_samples) 
00282     {
00283     // Update centintel indices
00284     more_prv_inds = prv_inds;
00285     prv_inds = indices_out;
00286           
00287     // Initialize sum of distances and distances matrix
00288     dist_mat = zeros< Mat<eT> >(n_samples, k);  
00289 
00290     // Allocate ranks vector for each centroid -- i.e., vectors that counts the winning sample for each centroid
00291     ranks = zeros<Col<u32> >(k);
00292       
00293     // Sample accumulator -- accumulates winning samples for each centroid for later update
00294     Mat<eT> sample_accu = zeros<Mat<eT> >(k, n_cols);
00295     
00296     // Temporary vector holding the distances of the current sample from each one of the centroids
00297     // TODO: Can't pass dist_out.row(i) by reference to compute_distance() that's why we need this
00298     //       temporary row vector.
00299     Row<eT> dist_tmp;
00300      
00301     // Compute distances of each samples from the available centroids
00302     for(u32 i = 0; i < n_samples; ++i)
00303       {
00304       compute_distance(dist_tmp, indices_out(i), centroids_out, samples_in.row(i));
00305       dist_mat.row(i) = dist_tmp;
00306       sample_accu.row(indices_out(i)) += samples_in.row(i);
00307       ranks(indices_out(i))++;  
00308 
00309       // Update winning centroid by moving it towards current sample -- Actually this is the only difference with the batch variant
00310       // centroids_out.row(indices_out(i)) += ((eT(lr) / eT(iters)) * (samples_in.row(i) - centroids_out.row(indices_out(i))));  
00311        centroids_out.row(indices_out(i)) += (eT(lr) * (samples_in.row(i) - centroids_out.row(indices_out(i))));
00312       
00313       #ifdef NCURSES_OUTPUT
00314       mvaddch(3, 20, spin_chars[i & 3]); 
00315       refresh();
00316       #endif
00317       }       
00318 
00319     // Update centroids 
00320     centroids_out = sample_accu;
00321   
00322     // Normalize updated centroids according to number of winning samples
00323     for(u32 i = 0; i < k; ++i)
00324       {
00325       centroids_out.row(i) /= eT(ranks(i));
00326       
00327       #ifdef NCURSES_OUTPUT
00328       mvaddch(3, 20, spin_chars[i & 3]); 
00329       refresh();
00330       #endif
00331       }
00332 
00333     // Check for empty clusters
00334     if(min(ranks) == 0)
00335       {
00336       // Find the empty cluster -- centroid
00337       
00338       Col<u32> empty_indx;
00339       Col<u32> max_indx;
00340       
00341       // Iterate while there are empty clusters       
00342       empty_indx = sort_index(ranks);
00343       
00344       u32 k = 0;
00345       while(min(ranks) == 0)
00346         { 
00347         max_indx = sort_index(dist_mat.col(empty_indx(k)),1);             
00348         centroids_out.row(empty_indx(k)) = samples_in.row(max_indx(k));
00349         ranks(empty_indx(k)) = 1;
00350         indices_out(max_indx(k)) = empty_indx(k); 
00351         
00352         #ifdef NCURSES_OUTPUT
00353         mvaddch(3, 20, spin_chars[k & 3]); 
00354         refresh();
00355         #endif
00356         
00357         k++;             
00358         }
00359         
00360       // Check wether the system oscilates
00361       if(accu(more_prv_inds == indices_out) < n_samples)
00362         {
00363         prv_inds = indices_out;
00364         }
00365    
00366       }
00367      
00368     #ifdef NCURSES_OUTPUT
00369     mvaddch(3, 20, spin_chars[iters & 3]); 
00370     refresh();
00371     #endif
00372     
00373     // Increase number of iterations
00374     iters++;         
00375     } 
00376  
00377   }
00378 
00379 
00380