ECOCPAK v0.9
|
00001 // Copyright (C) 2011 the authors listed below 00002 // http://ecocpak.sourceforge.net 00003 // 00004 // Authors: 00005 // - Dimitrios Bouzas (bouzas at ieee dot org) 00006 // - Nikolaos Arvanitopoulos (niarvani at ieee dot org) 00007 // - Anastasios Tefas (tefas at aiia dot csd dot auth dot gr) 00008 // 00009 // This file is part of the ECOC PAK C++ library. It is 00010 // provided without any warranty of fitness for any purpose. 00011 // 00012 // You can redistribute this file and/or modify it under 00013 // the terms of the GNU Lesser General Public License (LGPL) 00014 // as published by the Free Software Foundation, either 00015 // version 3 of the License or (at your option) any later 00016 // version. 00017 // (see http://www.opensource.org/licenses for more info) 00018 00019 00022 00023 00044 double 00045 criterion_fqmi 00046 ( 00047 const vector<ClassData>& classes_vector, 00048 const ucolvec& indexPtr, 00049 const u32 curSubsetSize, 00050 const double sigma 00051 ) 00052 { 00053 // print header 00054 #ifdef NCURSES_OUTPUT 00055 mvaddstr(3, 0, "Computing FQMI: "); 00056 refresh(); 00057 #endif 00058 00059 if((classes_vector.size() - curSubsetSize) == 0) 00060 { 00061 return 0.0; 00062 } 00063 00064 // data samples 00065 mat samples; 00066 00067 // sample labels 00068 ucolvec labels; 00069 00070 // compute complement subset of current best subset 00071 ucolvec cSet = complement_sffs(indexPtr, curSubsetSize); 00072 00073 // create first bipartition 00074 for(u32 i = 0; i < curSubsetSize; i++) 00075 { 00076 samples = join_cols(samples, classes_vector[indexPtr[i]].Data()); 00077 labels = 00078 join_cols 00079 ( 00080 labels, 00081 ones<ucolvec>(classes_vector[indexPtr[i]].Data().n_rows) 00082 ); 00083 } 00084 00085 // create second bipartition 00086 for(u32 i = 0; i < cSet.n_rows; i++) 00087 { 00088 samples = join_cols(samples, classes_vector[cSet[i]].Data()); 00089 labels = 00090 join_cols 00091 ( 00092 labels, 00093 2 * ones<ucolvec>(classes_vector[cSet[i]].Data().n_rows) 00094 ); 00095 } 00096 00097 return fqmi(samples, labels, sigma); 00098 } 00099 00100 00101 00116 double 00117 criterion_fldr 00118 ( 00119 const vector<ClassData>& classes_vector, 00120 const ucolvec& indexPtr, 00121 const u32 curSubsetSize 00122 ) 00123 { 00124 #ifdef NCURSES_OUTPUT 00125 mvaddstr(3, 0, "Computing FLDR: "); 00126 refresh(); 00127 #endif 00128 00129 if((classes_vector.size() - curSubsetSize) == 0) 00130 { 00131 return 0.0; 00132 } 00133 00134 // size of current subset is 1 00135 if(curSubsetSize == 1) 00136 { 00137 // if the number of samples of the current subset is less than 2 00138 if(classes_vector[indexPtr[0]].Samples() < 2) 00139 { 00140 return 0.0; 00141 } 00142 } 00143 00144 // find the complement set of indexPtr 00145 ucolvec cSet = complement_sffs(indexPtr, curSubsetSize); 00146 00147 // number of classes and current subset size differ only by 1 00148 if((classes_vector.size() - curSubsetSize ) == 1) 00149 { 00150 // if the number of samples of the complement subset is less than 2 00151 if(classes_vector[cSet[0]].Samples() < 2) 00152 { 00153 return 0.0; 00154 } 00155 } 00156 00157 // ClassData vector for partition 1 00158 vector<ClassData> newClassVec1; 00159 00160 // push back the classes that belong to one set 00161 for(u32 i = 0; i < curSubsetSize; i++) 00162 { 00163 newClassVec1.push_back(classes_vector[indexPtr[i]]); 00164 } 00165 00166 // ClassData vector for partition 2 00167 vector<ClassData> newClassVec2; 00168 00169 // push back classes that belong to the complement set 00170 for(u32 i = 0; i < classes_vector.size() - curSubsetSize; i++) 00171 { 00172 newClassVec2.push_back(classes_vector[cSet[i]]); 00173 } 00174 00175 // create data matrices that correspond to the two bi-partitions 00176 mat first_bipartition = create_samples_matrix(newClassVec1); 00177 mat second_bipartition = create_samples_matrix(newClassVec2); 00178 00179 return fldr(first_bipartition, second_bipartition); 00180 } 00181 00182 00183 00199 double 00200 criterion 00201 ( 00202 const vector<ClassData>& class_vector, 00203 const ucolvec& indexPtr, 00204 const u32 curSubSetSize, 00205 const u32 criterion_option, 00206 const double sigma 00207 ) 00208 { 00209 // engage according to user enter criterion option 00210 switch(criterion_option) 00211 { 00212 case FQMI: 00213 { 00214 // call to fqmi procedure 00215 return criterion_fqmi 00216 ( 00217 class_vector, 00218 indexPtr, 00219 curSubSetSize, 00220 sigma 00221 ); 00222 00223 } 00224 00225 case FLDR: 00226 { 00227 // call to fldr procedure 00228 return criterion_fldr(class_vector, indexPtr, curSubSetSize); 00229 } 00230 00231 case CUSTOM_CRITERION: 00232 { 00233 // call to custom criterion procedure 00234 return criterion_custom(class_vector, indexPtr, curSubSetSize); 00235 } 00236 00237 default: 00238 { 00239 #ifdef NCURSES_OUTPUT 00240 endwin(); 00241 #endif 00242 cerr << "Unknown criterion option" << endl; 00243 exit(1); 00244 } 00245 00246 } 00247 00248 } 00249 00250 00251 00321 ucolvec 00322 sffs 00323 ( 00324 const vector<ClassData>& classes_vector, 00325 const u32 crit 00326 ) 00327 { 00328 // sigma for fqmi() 00329 double sigma = 1.0; 00330 00331 // if criterion is FQMI compute sigma 00332 if(crit == 1) 00333 { 00334 // compute sigma 00335 sigma = compute_sigma(classes_vector); 00336 } 00337 00338 // --- declarations of constants --- // 00339 00340 // full feature set size 00341 const u32 n = classes_vector.size(); 00342 00343 // desired feature subset size 00344 const u32 d = n - 1; 00345 00346 // set floating search stopping constraint 00347 int delta = 1; 00348 00349 // set generalization level 00350 int genLevel = 1; 00351 00352 // a multiplier of estimated delta 00353 const double DELTAMUL = 1.0; 00354 00355 // a constant to be added to estimated delta 00356 const int DELTAADD = 1; 00357 00358 // double overflow limit 00359 const double SAFEUP = 5e300; 00360 00361 // ---declarations of variables --- // 00362 00363 // of size n, stores 0/1 information on currently selected features 00364 ucolvec binPtr; 00365 00366 // of size d, it is computed from bin, stores indexPtrs of selected 00367 // features 00368 ucolvec indexPtr; 00369 00370 // temporary best subset array 00371 ucolvec tempBestSetPtr; 00372 00373 // best value 00374 double tempBestValue; 00375 00376 // so far best subsets 00377 ucolvec globalBestSetPtr; 00378 00379 // so far best values 00380 colvec globalBestValuePtr; 00381 00382 // indicates, if a better subset has been found during last step 00383 bool betterFound; 00384 00385 // criterion function return value 00386 double value; 00387 00388 // current subset size 00389 int curSubSetSize; 00390 00391 // current subset size in integers 00392 int curSubSetSizeInt; 00393 00394 // how to change curSubSetSize (when changing direction of search): 00395 // -2,-1,+1,+2 steps 00396 int curSubsetChange; 00397 00398 // initial generalization level 00399 int initGenLevel; 00400 00401 // auxiliary variable 00402 int aux; 00403 00404 // beginning of block of identifiers 00405 int begin; 00406 00407 // "pivot" - last identifier in a block 00408 int pivot; 00409 00410 // identifies end of internal exhaustive search 00411 bool stopExhaustive; 00412 00413 // identifies end of it all 00414 bool stopAll; 00415 00416 // identifies the dimension, for which the algorithm should end 00417 int stopDimension = 0; 00418 00419 // identifier 0 exhanged in case of backward search 00420 int id0 = 0; 00421 00422 // identifier 2 exhanged in case of backward search 00423 int id2 = 2; 00424 00425 // 0=sfs, 1=sbs current search direction 00426 bool sbs = 0; 00427 00428 // stores current backtracking depth (+forward,-backward) 00429 int backtrackDepth = 0; 00430 00431 // predicted delta estimate 00432 float predDeltaEst = 0.0; 00433 00434 // number of direction changes (needed for delta averaging) 00435 int dirCount = 0; 00436 00437 // number of steps 00438 long int stepsNum; 00439 00440 // set stopDimension according to delta value 00441 if(delta == 0) 00442 { 00443 stopDimension = n; 00444 } 00445 else 00446 { 00447 if(delta > 0) 00448 { 00449 stopDimension = d + delta; 00450 00451 if(stopDimension > n) 00452 { 00453 stopDimension = n; 00454 } 00455 00456 } 00457 00458 } 00459 00460 // initialize back tracking depth 00461 backtrackDepth = 0; 00462 00463 // initialize predicted delta estimate 00464 predDeltaEst = 0.0; 00465 00466 // initialize direction counter 00467 dirCount = 0; 00468 00469 // allocate memory for feature code word 00470 binPtr = zeros<ucolvec>(n + 1); 00471 00472 // allocate memory for indices of selected features 00473 indexPtr = zeros<ucolvec>(n); 00474 00475 // allocate memory for current best set 00476 tempBestSetPtr = zeros<ucolvec>(n); 00477 00478 // allocate memory for so far best subsets 00479 globalBestSetPtr = zeros<ucolvec>((n * ( n + 1 ) ) / 2); 00480 00481 // allocate memory for so far best subset values 00482 globalBestValuePtr.set_size(n); 00483 globalBestValuePtr.fill(-SAFEUP); 00484 00485 // set initial generalization level to current generalization level 00486 initGenLevel = genLevel; 00487 00488 // --- Beginning of printing info block according to type of --- // 00489 // --- floating search --- // 00490 00491 // set current subset size equal to initial generalization level 00492 curSubSetSize = initGenLevel; 00493 00494 // initialize stop centinel 00495 stopAll = false; 00496 00497 // do while floating search ends 00498 do 00499 { 00500 // initialize temporary best set 00501 tempBestSetPtr.zeros(); 00502 00503 // initialize best value with underflow limit 00504 tempBestValue = -SAFEUP; 00505 00506 // initialize binPtr for exhaustive step 00507 aux = initGenLevel; 00508 for(u32 i = 0; ((aux > 0) && ( i < n )); i++) 00509 { 00510 if(binPtr[i] == id0) 00511 { 00512 binPtr[i] = id2; 00513 aux--; 00514 } 00515 00516 } 00517 00518 // size of current subset size in number integer (usually 4 byte per 00519 // int) 00520 curSubSetSizeInt = curSubSetSize * sizeof( int ); 00521 00522 // --- estimate the number of steps --- // 00523 stepsNum = 1; 00524 00525 // backward step sbs = 1 00526 if(sbs) 00527 { 00528 aux = curSubSetSize + initGenLevel; 00529 } 00530 else // else forward step 00531 { 00532 aux = n - ( curSubSetSize - initGenLevel ); 00533 } 00534 00535 for(u32 i = 0; i < initGenLevel; i++) 00536 { 00537 stepsNum *= aux - i; 00538 } 00539 00540 for(u32 i = 2; i <= initGenLevel; i++) 00541 { 00542 stepsNum /= i; 00543 } 00544 // --- end of steps estimation --- // 00545 00546 stopExhaustive = false; 00547 00548 // do while internal exhaustive search ends 00549 do 00550 { 00551 // --- convert "binPtr" to "indexPtr" --- // 00552 aux = 0; 00553 for(u32 i = 0; i < curSubSetSize; i++) 00554 { 00555 while(binPtr[aux] <= 0) 00556 { 00557 aux++; 00558 } 00559 00560 indexPtr[i] = aux; 00561 aux++; 00562 } 00563 // --- end of conversion --- // 00564 00565 // evaluate criterion function 00566 value = criterion 00567 ( 00568 classes_vector, 00569 indexPtr, 00570 curSubSetSize, 00571 crit, 00572 sigma 00573 ); 00574 00575 // if the new value is better than the best value already 00576 if(value > tempBestValue) 00577 { 00578 tempBestSetPtr = indexPtr; 00579 tempBestValue = value; 00580 } 00581 00582 // iterate through pBegin to find the beginning of the block of 00583 // selected features 00584 for(begin = 0; binPtr[begin] != id2; begin++) 00585 { 00586 // just iterate 00587 } 00588 00589 // iterate through pBegin to find the pivot 00590 for(pivot = begin; (pivot < n) && (binPtr[pivot] != id0); pivot++) 00591 { 00592 // just iterate 00593 } 00594 00595 if(pivot == n) 00596 { 00597 stopExhaustive = true; 00598 } 00599 else 00600 { 00601 // remember the position of first 0 on the right 00602 aux = pivot; 00603 00604 // find a real pivot 00605 do 00606 { 00607 pivot--; 00608 } 00609 while(binPtr[pivot] != id2); 00610 00611 // shift pivot to the right 00612 binPtr[pivot] = id0; 00613 binPtr[aux] = id2; 00614 00615 aux = 0; 00616 00617 // --- run "aux" from left, "piv" from right. the 0,2 --- // 00618 // --- pairs found are changed to 2,0 --- // 00619 do 00620 { 00621 pivot--; 00622 } 00623 while((pivot > 0) && (binPtr[pivot] != id2)); 00624 00625 while((aux < pivot) && (binPtr[aux] != id0)) 00626 { 00627 aux++; 00628 } 00629 00630 while((pivot - aux) > 0) 00631 { 00632 binPtr[pivot] = id0; 00633 binPtr[aux] = id2; 00634 00635 do 00636 { 00637 pivot--; 00638 } 00639 while((pivot > 0) && (binPtr[pivot] != id2)); 00640 00641 while((aux < pivot) && (binPtr[aux] != id0)) 00642 { 00643 aux++; 00644 } 00645 00646 } 00647 // --- end of run --- // 00648 00649 } 00650 00651 } 00652 while(!stopExhaustive); 00653 00654 // update if best value is found 00655 if(tempBestValue > globalBestValuePtr[curSubSetSize - 1]) 00656 { 00657 memcpy 00658 (&globalBestSetPtr[(curSubSetSize * (curSubSetSize - 1)) / 2], 00659 tempBestSetPtr.memptr(), 00660 curSubSetSizeInt 00661 ); 00662 00663 globalBestValuePtr[curSubSetSize - 1] = tempBestValue; 00664 betterFound = true; 00665 } 00666 else 00667 { 00668 betterFound = false; 00669 } 00670 00671 // if last step was sbs 00672 if(sbs) 00673 { 00674 // a better subset was found 00675 if(betterFound) 00676 { 00677 // --- convert according to format --- // 00678 00679 // convert to sbs format 00680 if(curSubSetSize > genLevel) 00681 { 00682 curSubsetChange = -1; 00683 00684 for(u32 i = 0; i < n; i++) 00685 { 00686 binPtr[i] = -1; 00687 } 00688 00689 // freeze changes 00690 for(u32 i = 0; i < curSubSetSize; i++) 00691 { 00692 binPtr[tempBestSetPtr[i]] = 2; 00693 } 00694 00695 } 00696 else // nothing to remove, convert to sfs format 00697 { 00698 for(u32 i = 0; i < n; i++) 00699 { 00700 binPtr[i] = 0; 00701 } 00702 00703 for(u32 i = 0; i < curSubSetSize; i++) 00704 { 00705 binPtr[tempBestSetPtr[i]] = 1; 00706 } 00707 00708 // change to sfs 00709 sbs = false; 00710 00711 // change "binPtr" identifiers 0 and 2 00712 id0 = 0; 00713 id2 = 2; 00714 00715 curSubsetChange = 1; 00716 } 00717 00718 // --- end of conversion --- // 00719 00720 } 00721 else // no improvement during last step (sbs) 00722 { 00723 // change to sfs 00724 sbs = false; 00725 00726 // change to "binPtr" identifiers 0 and 2 00727 id0 = 0; 00728 id2 = 2; 00729 00730 // forget last step and perform one new sfs step 00731 curSubsetChange = 2; 00732 } 00733 00734 } 00735 else // last step was sfs 00736 { 00737 // if removing is possible, prepare sbs 00738 if(curSubSetSize > genLevel) 00739 { 00740 for(u32 i = 0; i < n; i++) 00741 { 00742 binPtr[i] = -1; 00743 } 00744 00745 for(u32 i = 0; i < curSubSetSize; i++) 00746 { 00747 binPtr[tempBestSetPtr[i]] = 2; 00748 } 00749 00750 // change to sbs 00751 sbs = true; 00752 00753 // change to "binPtr" identifiers 0 and 2 00754 id0 = 2; 00755 id2 = 0; 00756 00757 curSubsetChange = -1; 00758 00759 } 00760 else // otherwise stay by sfs 00761 { 00762 curSubsetChange = 1; 00763 00764 for(u32 i = 0; i < n; i++) 00765 { 00766 binPtr[i] = 0; 00767 } 00768 00769 // freeze changes 00770 for(u32 i = 0; i < curSubSetSize; i++) 00771 { 00772 binPtr[tempBestSetPtr[i]] = 1; 00773 } 00774 00775 } 00776 00777 } 00778 00779 // renew curSubSetSize and initGenLevel 00780 if(curSubsetChange == -1) 00781 { 00782 if((curSubSetSize == d) && (d % genLevel != 0)) 00783 { 00784 curSubSetSize = d - ( d % genLevel ); 00785 initGenLevel = d % genLevel; 00786 } 00787 else 00788 { 00789 curSubSetSize -= genLevel; 00790 initGenLevel = genLevel; 00791 } 00792 00793 if(backtrackDepth < 0) 00794 { 00795 // continue 00796 backtrackDepth -= initGenLevel; 00797 } 00798 else 00799 { 00800 // begin 00801 backtrackDepth = -initGenLevel; 00802 } 00803 00804 } 00805 else // else curSubsetChange == 1 or 2 00806 { 00807 if((curSubSetSize < d) && ((curSubSetSize + genLevel) > d)) 00808 { 00809 curSubSetSize = d; 00810 initGenLevel = d % genLevel; 00811 } 00812 else 00813 { 00814 curSubSetSize += genLevel; 00815 initGenLevel = genLevel; 00816 } 00817 00818 // if end of going down 00819 if(backtrackDepth < 0) 00820 { 00821 if(delta == -1) 00822 { 00823 // averaging 00824 predDeltaEst = 00825 ( ( dirCount * predDeltaEst ) - backtrackDepth ) / 00826 ( dirCount + 1 ); 00827 00828 dirCount++; 00829 } 00830 else // else maximize 00831 { 00832 if((-backtrackDepth) > predDeltaEst) 00833 { 00834 predDeltaEst = -backtrackDepth; 00835 } 00836 00837 } 00838 00839 } 00840 00841 backtrackDepth = 0; 00842 } 00843 00844 // once more and renew "binPtr" 00845 if(curSubsetChange == 2) 00846 { 00847 // change to sfs format now with actualized "curSubSetSize" 00848 for(u32 i = 0; i < n; i++) 00849 { 00850 binPtr[i] = 0; 00851 } 00852 00853 aux = (curSubSetSize * (curSubSetSize - 1)) / 2; 00854 00855 for(u32 i = 0; i < curSubSetSize; i++) 00856 { 00857 binPtr[globalBestSetPtr[aux + i]] = 1; 00858 } 00859 00860 if((curSubSetSize < d) && ((curSubSetSize + genLevel) > d)) 00861 { 00862 curSubSetSize = d; 00863 initGenLevel = d % genLevel; 00864 } 00865 else 00866 { 00867 curSubSetSize += genLevel; 00868 initGenLevel = genLevel; 00869 } 00870 00871 // no direction change 00872 } 00873 00874 if(delta < 0) 00875 { 00876 stopDimension = int(d + (DELTAMUL * predDeltaEst) + DELTAADD); 00877 00878 if(stopDimension > n) 00879 { 00880 stopDimension = n; 00881 } 00882 00883 } 00884 00885 // end if delta reached 00886 if(curSubSetSize > stopDimension) 00887 { 00888 stopAll = true; 00889 } 00890 00891 } 00892 while(!stopAll); 00893 00894 // --- find the best value and best subset from all possible --- // 00895 // --- subsets --- // 00896 double bestValue = globalBestValuePtr[0]; 00897 int bestValueIndex = 0; 00898 00899 // find the index of the best value 00900 for(u32 i = 1; i < n - 1; i++) 00901 { 00902 if(globalBestValuePtr[i] > bestValue) 00903 { 00904 bestValue = globalBestValuePtr[i]; 00905 bestValueIndex = i; 00906 } 00907 00908 } 00909 // --- end of best value and subset --- // 00910 00911 // auxiliary variable 00912 aux = ((bestValueIndex + 1) * bestValueIndex) / 2; 00913 00914 return globalBestSetPtr.rows(aux, aux + bestValueIndex); 00915 } 00916 00917 00918