ECOCPAK v0.9
|
00001 // Copyright (C) 2011 the authors listed below 00002 // http://ecocpak.sourceforge.net 00003 // 00004 // Authors: 00005 // - Dimitrios Bouzas (bouzas at ieee dot org) 00006 // - Nikolaos Arvanitopoulos (niarvani at ieee dot org) 00007 // - Anastasios Tefas (tefas at aiia dot csd dot auth dot gr) 00008 // 00009 // This file is part of the ECOC PAK C++ library. It is 00010 // provided without any warranty of fitness for any purpose. 00011 // 00012 // You can redistribute this file and/or modify it under 00013 // the terms of the GNU Lesser General Public License (LGPL) 00014 // as published by the Free Software Foundation, either 00015 // version 3 of the License or (at your option) any later 00016 // version. 00017 // (see http://www.opensource.org/licenses for more info) 00018 00019 00022 00023 00024 00040 void 00041 load_dataset_from_file 00042 ( 00043 const char* filename, 00044 mat& samples, 00045 icolvec& labels, 00046 u32& n_classes 00047 ) 00048 { 00049 // open file 00050 FILE *fp = fopen(filename, "r"); 00051 00052 // in case the file opening failed exit with error message 00053 if(fp == 0) 00054 { 00055 #ifdef NCURSES_OUTPUT 00056 endwin(); 00057 #endif 00058 00059 fprintf(stderr, "error: can't open input file %s\n\n", filename); 00060 00061 #ifdef ECOCPAK_PROGRAM 00062 exit_with_help(); 00063 #endif 00064 00065 exit(1); 00066 } 00067 00068 // temporary attributes 00069 u32 tmp_attr = 0; 00070 00071 // final attributes 00072 u32 attr = 0; 00073 00074 // number of samples in dataset 00075 u32 n_samples = 0; 00076 00077 // number of elements in dataset 00078 u32 elements = 0; 00079 00080 // initialize number of classes 00081 n_classes = 0; 00082 00083 // temporary vector for parsing the number of dataset's attributes 00084 vector<int> tmp_vec; 00085 00086 while(true) 00087 { 00088 int c = fgetc(fp); 00089 switch(c) 00090 { 00091 case '\n': 00092 { 00093 ++n_samples; 00094 00095 tmp_attr = 0; 00096 // fall through, 00097 // count the '-1' element 00098 break; 00099 } 00100 case ':': 00101 { 00102 int i = 1; 00103 int j = tmp_vec.size() - 1; 00104 tmp_attr = 0; 00105 00106 // until we find a space character 00107 while(!isspace(tmp_vec[j])) 00108 { 00109 // compute the number of the current attribute 00110 tmp_attr += ((tmp_vec[j] - 48) * i); 00111 i *= 10; 00112 j--; 00113 } 00114 00115 // update the number of attributes 00116 if(tmp_attr > attr) 00117 { 00118 attr = tmp_attr; 00119 } 00120 00121 // clear temporary vector 00122 tmp_vec.clear(); 00123 00124 // increase the number of elements 00125 ++elements; 00126 break; 00127 } 00128 00129 // end of file reached 00130 case EOF: 00131 { 00132 goto out; 00133 } 00134 00135 default: 00136 { 00137 tmp_vec.push_back(c); 00138 } 00139 00140 } 00141 00142 } 00143 00144 out: // 00145 00146 // rewind file 00147 rewind(fp); 00148 00149 attr; 00150 labels.zeros(n_samples); 00151 samples.zeros(n_samples, attr); 00152 00153 vector<int> distinct_labels; 00154 00155 for(u32 i = 0; i < n_samples; i++) 00156 { 00157 // read label 00158 fscanf(fp, "%d", &labels[i]); 00159 00160 // if distinct_labels vector is empty insert its first element 00161 if(distinct_labels.empty() == true) 00162 { 00163 distinct_labels.push_back(labels[i]); 00164 n_classes++; 00165 } 00166 else 00167 { 00168 bool found = false; 00169 00170 // check wether current label has been stored in distinct_labels vector 00171 for(u32 j = 0; j < distinct_labels.size(); j++) 00172 { 00173 if(distinct_labels[j] == labels[i]) 00174 { 00175 found = true; 00176 } 00177 00178 } 00179 00180 // curent label is seen for first time store it and increase classes number 00181 if(found == false) 00182 { 00183 distinct_labels.push_back(labels[i]); 00184 n_classes++; 00185 } 00186 00187 } 00188 00189 // read rest of line which contains the sample vector 00190 while(true) 00191 { 00192 int c; 00193 do 00194 { 00195 c = getc(fp); 00196 if(c == '\n') 00197 { 00198 // line end reached go to next line 00199 goto out2; 00200 } 00201 } 00202 while(isspace(c)); 00203 00204 ungetc(c, fp); 00205 00206 u32 cur_index; 00207 double cur_value; 00208 if(fscanf(fp, "%d:%lf", &cur_index, &cur_value) < 2 ) 00209 { 00210 #ifdef NCURSES_OUTPUT 00211 endwin(); 00212 #endif 00213 00214 fprintf(stderr, "Wrong input format at line %d\n", i + 1); 00215 exit(1); 00216 } 00217 00218 // Copy value to dataMat 00219 samples(i, cur_index - 1) = cur_value; 00220 00221 } 00222 00223 out2:; 00224 } 00225 00226 fclose(fp); 00227 } 00228 00229 00230 00244 void 00245 load_dataset_from_file 00246 ( 00247 const char* filename, 00248 mat& samples, 00249 icolvec& labels 00250 ) 00251 { 00252 u32 n_classes = 0; 00253 00254 // open file 00255 FILE *fp = fopen(filename, "r"); 00256 00257 // in case the file opening failed exit with error message 00258 if(fp == 0) 00259 { 00260 #ifdef NCURSES_OUTPUT 00261 endwin(); 00262 #endif 00263 00264 fprintf(stderr, "error: can't open input file %s\n\n", filename); 00265 00266 #ifdef ECOCPAK_PROGRAM 00267 exit_with_help(); 00268 #endif 00269 00270 exit(1); 00271 } 00272 00273 // temporary attributes 00274 u32 tmp_attr = 0; 00275 00276 // final attributes 00277 u32 attr = 0; 00278 00279 // number of samples in dataset 00280 u32 n_samples = 0; 00281 00282 // number of elements in dataset 00283 u32 elements = 0; 00284 00285 // initialize number of classes 00286 n_classes = 0; 00287 00288 // temporary vector for parsing the number of dataset's attributes 00289 vector<int> tmp_vec; 00290 00291 while(true) 00292 { 00293 int c = fgetc(fp); 00294 switch(c) 00295 { 00296 case '\n': 00297 { 00298 ++n_samples; 00299 00300 tmp_attr = 0; 00301 // fall through, 00302 // count the '-1' element 00303 break; 00304 } 00305 case ':': 00306 { 00307 int i = 1; 00308 int j = tmp_vec.size() - 1; 00309 tmp_attr = 0; 00310 00311 // until we find a space character 00312 while(!isspace(tmp_vec[j])) 00313 { 00314 // compute the number of the current attribute 00315 tmp_attr += ((tmp_vec[j] - 48) * i); 00316 i *= 10; 00317 j--; 00318 } 00319 00320 // update the number of attributes 00321 if(tmp_attr > attr) 00322 { 00323 attr = tmp_attr; 00324 } 00325 00326 // clear temporary vector 00327 tmp_vec.clear(); 00328 00329 // increase the number of elements 00330 ++elements; 00331 break; 00332 } 00333 00334 // end of file reached 00335 case EOF: 00336 { 00337 goto out; 00338 } 00339 00340 default: 00341 { 00342 tmp_vec.push_back(c); 00343 } 00344 00345 } 00346 00347 } 00348 00349 out: // 00350 00351 // rewind file 00352 rewind(fp); 00353 00354 attr; 00355 labels.zeros(n_samples); 00356 samples.zeros(n_samples, attr); 00357 00358 vector<int> distinct_labels; 00359 00360 for(u32 i = 0; i < n_samples; i++) 00361 { 00362 // read label 00363 fscanf(fp, "%d", &labels[i]); 00364 00365 // if distinct_labels vector is empty insert its first element 00366 if(distinct_labels.empty() == true) 00367 { 00368 distinct_labels.push_back(labels[i]); 00369 n_classes++; 00370 } 00371 else 00372 { 00373 bool found = false; 00374 00375 // check wether current label has been stored in distinct_labels vector 00376 for(u32 j = 0; j < distinct_labels.size(); j++) 00377 { 00378 if(distinct_labels[j] == labels[i]) 00379 { 00380 found = true; 00381 } 00382 00383 } 00384 00385 // curent label is seen for first time store it and increase classes number 00386 if(found == false) 00387 { 00388 distinct_labels.push_back(labels[i]); 00389 n_classes++; 00390 } 00391 00392 } 00393 00394 // read rest of line which contains the sample vector 00395 while(true) 00396 { 00397 int c; 00398 do 00399 { 00400 c = getc(fp); 00401 if(c == '\n') 00402 { 00403 // line end reached go to next line 00404 goto out2; 00405 } 00406 } 00407 while(isspace(c)); 00408 00409 ungetc(c, fp); 00410 00411 u32 cur_index; 00412 double cur_value; 00413 if(fscanf(fp, "%d:%lf", &cur_index, &cur_value) < 2 ) 00414 { 00415 #ifdef NCURSES_OUTPUT 00416 endwin(); 00417 #endif 00418 00419 fprintf(stderr, "Wrong input format at line %d\n", i + 1); 00420 exit(1); 00421 } 00422 00423 // Copy value to dataMat 00424 samples(i, cur_index - 1) = cur_value; 00425 00426 } 00427 00428 out2:; 00429 } 00430 00431 fclose(fp); 00432 } 00433 00434 00435