ECOCPAK v0.9
fn_load_dataset_from_file.hpp
Go to the documentation of this file.
00001 // Copyright (C) 2011 the authors listed below
00002 // http://ecocpak.sourceforge.net
00003 // 
00004 // Authors:
00005 // - Dimitrios Bouzas (bouzas at ieee dot org)
00006 // - Nikolaos Arvanitopoulos (niarvani at ieee dot org)
00007 // - Anastasios Tefas (tefas at aiia dot csd dot auth dot gr)
00008 // 
00009 // This file is part of the ECOC PAK C++ library. It is 
00010 // provided without any warranty of fitness for any purpose.
00011 //
00012 // You can redistribute this file and/or modify it under 
00013 // the terms of the GNU Lesser General Public License (LGPL) 
00014 // as published by the Free Software Foundation, either 
00015 // version 3 of the License or (at your option) any later 
00016 // version.
00017 // (see http://www.opensource.org/licenses for more info)
00018 
00019 
00022 
00023 
00024 
00040 void 
00041 load_dataset_from_file
00042   (
00043   const char* filename,
00044   mat& samples,
00045   icolvec& labels,
00046   u32& n_classes
00047   )
00048   {
00049   // open file
00050   FILE *fp = fopen(filename, "r");
00051 
00052   // in case the file opening failed exit with error message
00053   if(fp == 0) 
00054     {
00055     #ifdef NCURSES_OUTPUT
00056     endwin();
00057     #endif
00058     
00059     fprintf(stderr, "error: can't open input file %s\n\n", filename);
00060     
00061     #ifdef ECOCPAK_PROGRAM
00062     exit_with_help();
00063     #endif
00064     
00065     exit(1);
00066     }
00067   
00068   // temporary attributes
00069   u32 tmp_attr = 0;
00070   
00071   // final attributes
00072   u32 attr = 0;
00073 
00074   // number of samples in dataset
00075   u32 n_samples = 0;
00076   
00077   // number of elements in dataset
00078   u32 elements = 0;
00079   
00080   // initialize number of classes
00081   n_classes = 0;
00082 
00083   // temporary vector for parsing the number of dataset's attributes
00084   vector<int> tmp_vec;
00085 
00086   while(true) 
00087     {
00088     int c = fgetc(fp);
00089     switch(c) 
00090       {
00091       case '\n':
00092         {
00093         ++n_samples;
00094           
00095         tmp_attr = 0;
00096         // fall through,
00097         // count the '-1' element
00098         break;
00099         }
00100       case ':':
00101         {       
00102         int i = 1;
00103         int j = tmp_vec.size() - 1;
00104         tmp_attr = 0;
00105 
00106                                 // until we find a space character
00107         while(!isspace(tmp_vec[j]))
00108           {
00109           // compute the number of the current attribute
00110           tmp_attr += ((tmp_vec[j] - 48) * i);
00111           i *= 10;
00112           j--;
00113           }
00114 
00115                                 // update the number of attributes
00116         if(tmp_attr > attr)
00117           {
00118           attr = tmp_attr;
00119           }
00120         
00121         // clear temporary vector
00122         tmp_vec.clear();
00123         
00124         // increase the number of elements
00125         ++elements;
00126         break;
00127         }
00128       
00129       // end of file reached
00130       case EOF:
00131         {
00132         goto out;
00133         }
00134         
00135       default:
00136         {
00137         tmp_vec.push_back(c);
00138         }
00139         
00140       }
00141       
00142    }
00143 
00144   out: //
00145   
00146   // rewind file
00147   rewind(fp);
00148 
00149   attr;
00150   labels.zeros(n_samples);
00151   samples.zeros(n_samples, attr);
00152   
00153   vector<int> distinct_labels;
00154 
00155   for(u32 i = 0; i < n_samples; i++) 
00156     {
00157     // read label
00158     fscanf(fp, "%d", &labels[i]);
00159 
00160     // if distinct_labels vector is empty insert its first element
00161     if(distinct_labels.empty() == true)
00162       {
00163       distinct_labels.push_back(labels[i]);
00164       n_classes++;
00165       }
00166     else
00167       {
00168       bool found = false;       
00169       
00170       // check wether current label has been stored in distinct_labels vector
00171       for(u32 j = 0; j < distinct_labels.size(); j++)
00172         {
00173         if(distinct_labels[j] == labels[i])
00174           {
00175           found = true;
00176           }
00177           
00178         }
00179 
00180       // curent label is seen for first time store it and increase classes number
00181       if(found == false)
00182         {
00183         distinct_labels.push_back(labels[i]);
00184         n_classes++;
00185         }
00186         
00187       }
00188     
00189     // read rest of line which contains the sample vector
00190     while(true)
00191       {
00192       int c;
00193       do 
00194         {
00195         c = getc(fp);
00196         if(c == '\n') 
00197           {
00198           // line end reached go to next line
00199           goto out2;
00200           }
00201         } 
00202       while(isspace(c));
00203 
00204       ungetc(c, fp);
00205 
00206       u32 cur_index;
00207       double cur_value;
00208       if(fscanf(fp, "%d:%lf", &cur_index, &cur_value) < 2 ) 
00209         {
00210         #ifdef NCURSES_OUTPUT
00211         endwin();
00212         #endif
00213         
00214         fprintf(stderr, "Wrong input format at line %d\n", i + 1);
00215         exit(1);
00216         }
00217 
00218       // Copy value to dataMat
00219       samples(i, cur_index - 1) = cur_value;
00220       
00221       }
00222 
00223     out2:;
00224     }
00225   
00226   fclose(fp);
00227   }
00228  
00229  
00230   
00244 void 
00245 load_dataset_from_file
00246   (
00247   const char* filename,
00248   mat& samples,
00249   icolvec& labels
00250   )
00251   {
00252   u32 n_classes = 0;
00253   
00254   // open file
00255   FILE *fp = fopen(filename, "r");
00256 
00257   // in case the file opening failed exit with error message
00258   if(fp == 0) 
00259     {
00260     #ifdef NCURSES_OUTPUT
00261     endwin();
00262     #endif
00263     
00264     fprintf(stderr, "error: can't open input file %s\n\n", filename);
00265     
00266     #ifdef ECOCPAK_PROGRAM
00267     exit_with_help();
00268     #endif
00269     
00270     exit(1);
00271     }
00272   
00273   // temporary attributes
00274   u32 tmp_attr = 0;
00275   
00276   // final attributes
00277   u32 attr = 0;
00278 
00279   // number of samples in dataset
00280   u32 n_samples = 0;
00281   
00282   // number of elements in dataset
00283   u32 elements = 0;
00284   
00285   // initialize number of classes
00286   n_classes = 0;
00287 
00288   // temporary vector for parsing the number of dataset's attributes
00289   vector<int> tmp_vec;
00290 
00291   while(true) 
00292     {
00293     int c = fgetc(fp);
00294     switch(c) 
00295       {
00296       case '\n':
00297         {
00298         ++n_samples;
00299           
00300         tmp_attr = 0;
00301         // fall through,
00302         // count the '-1' element
00303         break;
00304         }
00305       case ':':
00306         {       
00307         int i = 1;
00308         int j = tmp_vec.size() - 1;
00309         tmp_attr = 0;
00310 
00311                                 // until we find a space character
00312         while(!isspace(tmp_vec[j]))
00313           {
00314           // compute the number of the current attribute
00315           tmp_attr += ((tmp_vec[j] - 48) * i);
00316           i *= 10;
00317           j--;
00318           }
00319 
00320                                 // update the number of attributes
00321         if(tmp_attr > attr)
00322           {
00323           attr = tmp_attr;
00324           }
00325         
00326         // clear temporary vector
00327         tmp_vec.clear();
00328         
00329         // increase the number of elements
00330         ++elements;
00331         break;
00332         }
00333       
00334       // end of file reached
00335       case EOF:
00336         {
00337         goto out;
00338         }
00339         
00340       default:
00341         {
00342         tmp_vec.push_back(c);
00343         }
00344         
00345       }
00346       
00347    }
00348 
00349   out: //
00350   
00351   // rewind file
00352   rewind(fp);
00353 
00354   attr;
00355   labels.zeros(n_samples);
00356   samples.zeros(n_samples, attr);
00357   
00358   vector<int> distinct_labels;
00359 
00360   for(u32 i = 0; i < n_samples; i++) 
00361     {
00362     // read label
00363     fscanf(fp, "%d", &labels[i]);
00364 
00365     // if distinct_labels vector is empty insert its first element
00366     if(distinct_labels.empty() == true)
00367       {
00368       distinct_labels.push_back(labels[i]);
00369       n_classes++;
00370       }
00371     else
00372       {
00373       bool found = false;       
00374       
00375       // check wether current label has been stored in distinct_labels vector
00376       for(u32 j = 0; j < distinct_labels.size(); j++)
00377         {
00378         if(distinct_labels[j] == labels[i])
00379           {
00380           found = true;
00381           }
00382           
00383         }
00384 
00385       // curent label is seen for first time store it and increase classes number
00386       if(found == false)
00387         {
00388         distinct_labels.push_back(labels[i]);
00389         n_classes++;
00390         }
00391         
00392       }
00393     
00394     // read rest of line which contains the sample vector
00395     while(true)
00396       {
00397       int c;
00398       do 
00399         {
00400         c = getc(fp);
00401         if(c == '\n') 
00402           {
00403           // line end reached go to next line
00404           goto out2;
00405           }
00406         } 
00407       while(isspace(c));
00408 
00409       ungetc(c, fp);
00410 
00411       u32 cur_index;
00412       double cur_value;
00413       if(fscanf(fp, "%d:%lf", &cur_index, &cur_value) < 2 ) 
00414         {
00415         #ifdef NCURSES_OUTPUT
00416         endwin();
00417         #endif
00418         
00419         fprintf(stderr, "Wrong input format at line %d\n", i + 1);
00420         exit(1);
00421         }
00422 
00423       // Copy value to dataMat
00424       samples(i, cur_index - 1) = cur_value;
00425       
00426       }
00427 
00428     out2:;
00429     }
00430   
00431   fclose(fp);
00432   }
00433 
00434 
00435 
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerator Defines