00001 #include "common.hpp"
00002 #include "common/log.h"
00003 #include "common/allocators.hpp"
00004 #include "io/input/transaction_reader/LBufferedTransactionReader.hpp"
00005 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00006
00007 #include "io/codec/coder/Coder.hpp"
00008 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00009
00010 #include "io/db_cache/RBTreeDBCache.hpp"
00011
00012 #include "util/StreamParser.hpp"
00013 #include "util/FrequentFilter.cpp"
00014 #include "util/Frequent2Filter.cpp"
00015
00016 #include "test/apriori/bodon/FattenableLeaf.hpp"
00017 #include "apriori/bodon/Trie.hpp"
00018 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00019
00020 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00021 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00022 #include "apriori/OneByOneSupportCounter.hpp"
00023 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00024 #include "apriori/bodon/trie/trie_manipulators/SimplePruner.hpp"
00025 #include "apriori/bodon/trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00026 #include "apriori/bodon/trie/trie_manipulators/InfreqRemover.hpp"
00027
00028 #include "apriori/Apriori.hpp"
00029
00030 #include <vector>
00031 #include <iostream>
00032 #include <string>
00033
00034
/// Help text describing the transaction file layout; filled in by init().
std::string file_format;

/**
 * Fills file_format with the description of the expected input file.
 *
 * Must run before usage(), which streams file_format to std::cerr.
 * Calling it again simply rebuilds the same text.
 */
void init()
{
   // Adjacent string literals are merged by the compiler, so the whole
   // description is assigned in one step.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00048 void usage()
00049 {
00050 log_err(0,"Usage: apriori-simple transactionfile min_supp outcomefile [options]");
00051 log_err(0," transactionfile\t file, that contains the tranasctions of items");
00052 log_err(0," outcomefile\t file to write the outcome");
00053 log_err(0," min_supp\t absolute support threshold");
00054
00055 std::cerr << file_format;
00056 log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00057 }
00058
00069 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00070 bool &isrel, double &relminsupp )
00071 {
00072 if ( argc < 5 )
00073 {
00074 log_err(0,"There are 4 mandatory arguments.");
00075 return 2;
00076 }
00077 std::string mins=argv[2];
00078 if (mins[mins.size()-1]=='%') {
00079 mins.erase(mins.size()-1);
00080 isrel=true;
00081 relminsupp=atof(mins.c_str());
00082 relminsupp/=100;
00083 log_info(0,"Using relative minimum support of %lg",relminsupp);
00084 return 0;
00085 }
00086 isrel=false;
00087 int min_supp_i;
00088 try
00089 {
00090 convert(argv[2], min_supp_i);
00091 if ( min_supp_i <= 0 )
00092 {
00093 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00094 return 3;
00095 }
00096 }
00097 catch(BadConversion e)
00098 {
00099 log_err(0,"min_supp conversion problem.");
00100 return 3;
00101 }
00102 min_supp = static_cast<counter_t>(min_supp_i);
00103 log_info(0,"min_supp is set to %d", min_supp);
00104 return 0;
00105 }
00106 template<class LEAF, class T_R, class DF_D> void
00107 helperFunction(
00108 T_R& tr_reader, DF_D& df_decoder, counter_t nr_of_transactions,
00109 std::vector< std::pair<counter_t, item_t> >& freq_items_with_counters,
00110 char* input_file, counter_t min_supp)
00111 {
00112 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00113 TRIE main_trie;
00114 typedef SortedTransactionReader< Coder<T_R, DF_D> > S_C_T_R;
00115 typedef Bodon::RBTreeDBCache<S_C_T_R, std::vector<item_t> > S_C;
00116 typename S_C::params_t par_c;
00117 par_c.file_name = input_file;
00118 par_c.mode=FileReprBase::READ;
00119 par_c.largest_item = tr_reader.getLargestItem();
00120 par_c.decoder = &df_decoder;
00121 par_c.freq_items_with_counters = &freq_items_with_counters;
00122 par_c.codemode = ASC;
00123 log_status(0,"Doing sorted codec.");
00124 S_C sorted_coder(&par_c);
00125
00126 std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00127 freq_pairs_with_counters;
00128 Frequent2Filter<S_C> fr_2_filter(
00129 &sorted_coder );
00130
00131
00132 log_status(0,"Finding frequent pairs.")
00133 fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00134
00135 const NEELevel NEE = NEE_Off;
00136 typedef NewWrapperAlloc<TRIE> TRIE_ALLOCATOR;
00137 TRIE_ALLOCATOR s_alloc;
00138 typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE> FII;
00139 FII fii(main_trie, df_decoder);
00140 typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00141 typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00142
00143 typedef Bodon::FrequentPairInserter<DF_D, TRIE, TRIE, TRIE_ALLOCATOR, NEE> FPI;
00144 typedef Bodon::trie::SimplePruner<DF_D, TRIE, NewWrapperAlloc<TRIE>, NEE> PRUNER;
00145 typedef Bodon::CandidateGeneratorPrune<PRUNER, DF_D, TRIE, TRIE_ALLOCATOR, NEE> CG;
00146 typedef Bodon::trie::InfreqRemover<DF_D, TRIE, TRIE_ALLOCATOR, NEE> IR;
00147 IR infrequent_remover(main_trie, df_decoder, s_alloc);
00148 typedef Apriori<S_C, DF_D, TRIE, TRIE_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00149 A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00150 log_status(0,"Finding frequent itemsets.")
00151 apriori.findFrequentItemsets(
00152 nr_of_transactions, *par_c.freq_counters,
00153 freq_pairs_with_counters, min_supp );
00154 }
00155
00156
00157 int main( int argc, char *argv[] )
00158 {
00159 init();
00160 counter_t min_supp;
00161 bool relative;
00162 double relminsupp;
00163
00164 {
00165 int return_val =
00166 process_arguments( argc, argv, min_supp, relative, relminsupp );
00167 if(return_val)
00168 return return_val;
00169 }
00170
00171 char* input_file = argv[1];
00172 char* output_file = argv[3];
00173
00174 try
00175 {
00176 typedef LBufferedTransactionReader< > T_R;
00177
00178 T_R::params_t par_i;
00179 par_i.file_name = input_file;
00180 par_i.mode=FileReprBase::READ;
00181 par_i.file_buffer_size = 16 * 1024;
00182 T_R tr_reader(&par_i);
00183 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00184 counter_t nr_of_transactions;
00185
00186 FrequentFilter<T_R>
00187 fr_filter(tr_reader);
00188 log_status(0,"Finding frequent items.");
00189 fr_filter.findFrequentItems( freq_items_with_counters,
00190 nr_of_transactions, min_supp);
00191
00192 if(!freq_items_with_counters.empty())
00193 {
00194 log_status(0,"Doing decoder.");
00195 typedef SimpleDFDecoder< > DF_D;
00196
00197 DF_D::params_t par_d;
00198 par_d.file_name = output_file;
00199 par_d.mode=FileReprBase::WRITE;
00200 DF_D df_decoder(&par_d);
00201
00202 if(strcmp(argv[4], "0") == 0 )
00203 {
00204 log_status(0,"No vector is added to the nodes")
00205 typedef Bodon::Leaf LEAF;
00206 helperFunction<LEAF, T_R, DF_D>(
00207 tr_reader, df_decoder, nr_of_transactions,
00208 freq_items_with_counters, input_file, min_supp );
00209 }
00210 else if(strcmp(argv[4], "1") ==0 )
00211 {
00212 log_status(0,"A vector of size 1 is added to each node")
00213 typedef Bodon::FattenableLeaf<1> LEAF;
00214 helperFunction<LEAF, T_R, DF_D>(
00215 tr_reader, df_decoder, nr_of_transactions,
00216 freq_items_with_counters, input_file, min_supp );
00217 }
00218 else if(strcmp(argv[4], "2") ==0 )
00219 {
00220 log_status(0,"A vector of size 2 is added to each node")
00221 typedef Bodon::FattenableLeaf<2> LEAF;
00222 helperFunction<LEAF, T_R, DF_D>(
00223 tr_reader, df_decoder, nr_of_transactions,
00224 freq_items_with_counters, input_file, min_supp );
00225 }
00226 else if(strcmp(argv[4], "4") ==0 )
00227 {
00228 log_status(0,"A vector of size 4 is added to each node")
00229 typedef Bodon::FattenableLeaf<4> LEAF;
00230 helperFunction<LEAF, T_R, DF_D>(
00231 tr_reader, df_decoder, nr_of_transactions,
00232 freq_items_with_counters, input_file, min_supp );
00233 }
00234 else if(strcmp(argv[4], "6") ==0 )
00235 {
00236 log_status(0,"A vector of size 6 is added to each node")
00237 typedef Bodon::FattenableLeaf<6> LEAF;
00238 helperFunction<LEAF, T_R, DF_D>(
00239 tr_reader, df_decoder, nr_of_transactions,
00240 freq_items_with_counters, input_file, min_supp );
00241 }
00242 else if(strcmp(argv[4], "8") ==0 )
00243 {
00244 log_status(0,"A vector of size 8 is added to each node")
00245 typedef Bodon::FattenableLeaf<8> LEAF;
00246 helperFunction<LEAF, T_R, DF_D>(
00247 tr_reader, df_decoder, nr_of_transactions,
00248 freq_items_with_counters, input_file, min_supp );
00249 }
00250 else if(strcmp(argv[4], "10") ==0 )
00251 {
00252 log_status(0,"A vector of size 10 is added to each node")
00253 typedef Bodon::FattenableLeaf<10> LEAF;
00254 helperFunction<LEAF, T_R, DF_D>(
00255 tr_reader, df_decoder, nr_of_transactions,
00256 freq_items_with_counters, input_file, min_supp );
00257 }
00258 else if(strcmp(argv[4], "11") ==0 )
00259 {
00260 log_status(0,"A vector of size 11 is added to each node")
00261 typedef Bodon::FattenableLeaf<11> LEAF;
00262 helperFunction<LEAF, T_R, DF_D>(
00263 tr_reader, df_decoder, nr_of_transactions,
00264 freq_items_with_counters, input_file, min_supp );
00265 }
00266 else if(strcmp(argv[4], "12") ==0 )
00267 {
00268 log_status(0,"A vector of size 12 is added to each node")
00269 typedef Bodon::FattenableLeaf<12> LEAF;
00270 helperFunction<LEAF, T_R, DF_D>(
00271 tr_reader, df_decoder, nr_of_transactions,
00272 freq_items_with_counters, input_file, min_supp );
00273 }
00274 else if(strcmp(argv[4], "13") ==0 )
00275 {
00276 log_status(0,"A vector of size 13 is added to each node")
00277 typedef Bodon::FattenableLeaf<13> LEAF;
00278 helperFunction<LEAF, T_R, DF_D>(
00279 tr_reader, df_decoder, nr_of_transactions,
00280 freq_items_with_counters, input_file, min_supp );
00281 }
00282 else if(strcmp(argv[4], "14") ==0 )
00283 {
00284 log_status(0,"A vector of size 14 is added to each node")
00285 typedef Bodon::FattenableLeaf<14> LEAF;
00286 helperFunction<LEAF, T_R, DF_D>(
00287 tr_reader, df_decoder, nr_of_transactions,
00288 freq_items_with_counters, input_file, min_supp );
00289 }
00290 else if(strcmp(argv[4], "15") ==0 )
00291 {
00292 log_status(0,"A vector of size 15 is added to each node")
00293 typedef Bodon::FattenableLeaf<15> LEAF;
00294 helperFunction<LEAF, T_R, DF_D>(
00295 tr_reader, df_decoder, nr_of_transactions,
00296 freq_items_with_counters, input_file, min_supp );
00297 }
00298 else if(strcmp(argv[4], "16") ==0 )
00299 {
00300 log_status(0,"A vector of size 16 is added to each node")
00301 typedef Bodon::FattenableLeaf<16> LEAF;
00302 helperFunction<LEAF, T_R, DF_D>(
00303 tr_reader, df_decoder, nr_of_transactions,
00304 freq_items_with_counters, input_file, min_supp );
00305 }
00306 else if(strcmp(argv[4], "17") ==0 )
00307 {
00308 log_status(0,"A vector of size 17 is added to each node")
00309 typedef Bodon::FattenableLeaf<17> LEAF;
00310 helperFunction<LEAF, T_R, DF_D>(
00311 tr_reader, df_decoder, nr_of_transactions,
00312 freq_items_with_counters, input_file, min_supp );
00313 }
00314 else if(strcmp(argv[4], "18") ==0 )
00315 {
00316 log_status(0,"A vector of size 18 is added to each node")
00317 typedef Bodon::FattenableLeaf<18> LEAF;
00318 helperFunction<LEAF, T_R, DF_D>(
00319 tr_reader, df_decoder, nr_of_transactions,
00320 freq_items_with_counters, input_file, min_supp );
00321 }
00322 else if(strcmp(argv[4], "19") ==0 )
00323 {
00324 log_status(0,"A vector of size 19 is added to each node")
00325 typedef Bodon::FattenableLeaf<19> LEAF;
00326 helperFunction<LEAF, T_R, DF_D>(
00327 tr_reader, df_decoder, nr_of_transactions,
00328 freq_items_with_counters, input_file, min_supp );
00329 }
00330 else if(strcmp(argv[4], "20") ==0 )
00331 {
00332 log_status(0,"A vector of size 20 is added to each node")
00333 typedef Bodon::FattenableLeaf<20> LEAF;
00334 helperFunction<LEAF, T_R, DF_D>(
00335 tr_reader, df_decoder, nr_of_transactions,
00336 freq_items_with_counters, input_file, min_supp );
00337 }
00338 else if(strcmp(argv[4], "21") ==0 )
00339 {
00340 log_status(0,"A vector of size 21 is added to each node")
00341 typedef Bodon::FattenableLeaf<21> LEAF;
00342 helperFunction<LEAF, T_R, DF_D>(
00343 tr_reader, df_decoder, nr_of_transactions,
00344 freq_items_with_counters, input_file, min_supp );
00345 }
00346 else if(strcmp(argv[4], "22") ==0 )
00347 {
00348 log_status(0,"A vector of size 22 is added to each node")
00349 typedef Bodon::FattenableLeaf<22> LEAF;
00350 helperFunction<LEAF, T_R, DF_D>(
00351 tr_reader, df_decoder, nr_of_transactions,
00352 freq_items_with_counters, input_file, min_supp );
00353 }
00354 else if(strcmp(argv[4], "23") ==0 )
00355 {
00356 log_status(0,"A vector of size 23 is added to each node")
00357 typedef Bodon::FattenableLeaf<23> LEAF;
00358 helperFunction<LEAF, T_R, DF_D>(
00359 tr_reader, df_decoder, nr_of_transactions,
00360 freq_items_with_counters, input_file, min_supp );
00361 }
00362 else if(strcmp(argv[4], "24") ==0 )
00363 {
00364 log_status(0,"A vector of size 24 is added to each node")
00365 typedef Bodon::FattenableLeaf<24> LEAF;
00366 helperFunction<LEAF, T_R, DF_D>(
00367 tr_reader, df_decoder, nr_of_transactions,
00368 freq_items_with_counters, input_file, min_supp );
00369 }
00370 else if(strcmp(argv[4], "26") ==0 )
00371 {
00372 log_status(0,"A vector of size 26 is added to each node")
00373 typedef Bodon::FattenableLeaf<26> LEAF;
00374 helperFunction<LEAF, T_R, DF_D>(
00375 tr_reader, df_decoder, nr_of_transactions,
00376 freq_items_with_counters, input_file, min_supp );
00377 }
00378 else if(strcmp(argv[4], "27") ==0 )
00379 {
00380 log_status(0,"A vector of size 27 is added to each node")
00381 typedef Bodon::FattenableLeaf<27> LEAF;
00382 helperFunction<LEAF, T_R, DF_D>(
00383 tr_reader, df_decoder, nr_of_transactions,
00384 freq_items_with_counters, input_file, min_supp );
00385 }
00386 else if(strcmp(argv[4], "28") ==0 )
00387 {
00388 log_status(0,"A vector of size 28 is added to each node")
00389 typedef Bodon::FattenableLeaf<28> LEAF;
00390 helperFunction<LEAF, T_R, DF_D>(
00391 tr_reader, df_decoder, nr_of_transactions,
00392 freq_items_with_counters, input_file, min_supp );
00393 }
00394 else if(strcmp(argv[4], "30") ==0 )
00395 {
00396 log_status(0,"A vector of size 30 is added to each node")
00397 typedef Bodon::FattenableLeaf<30> LEAF;
00398 helperFunction<LEAF, T_R, DF_D>(
00399 tr_reader, df_decoder, nr_of_transactions,
00400 freq_items_with_counters, input_file, min_supp );
00401 }
00402 else if(strcmp(argv[4], "31") ==0 )
00403 {
00404 log_status(0,"A vector of size 31 is added to each node")
00405 typedef Bodon::FattenableLeaf<31> LEAF;
00406 helperFunction<LEAF, T_R, DF_D>(
00407 tr_reader, df_decoder, nr_of_transactions,
00408 freq_items_with_counters, input_file, min_supp );
00409 }
00410 else if(strcmp(argv[4], "32") ==0 )
00411 {
00412 log_status(0,"A vector of size 32 is added to each node")
00413 typedef Bodon::FattenableLeaf<32> LEAF;
00414 helperFunction<LEAF, T_R, DF_D>(
00415 tr_reader, df_decoder, nr_of_transactions,
00416 freq_items_with_counters, input_file, min_supp );
00417 }
00418 else if(strcmp(argv[4], "34") ==0 )
00419 {
00420 log_status(0,"A vector of size 34 is added to each node")
00421 typedef Bodon::FattenableLeaf<34> LEAF;
00422 helperFunction<LEAF, T_R, DF_D>(
00423 tr_reader, df_decoder, nr_of_transactions,
00424 freq_items_with_counters, input_file, min_supp );
00425 }
00426 else if(strcmp(argv[4], "36") ==0 )
00427 {
00428 log_status(0,"A vector of size 36 is added to each node")
00429 typedef Bodon::FattenableLeaf<36> LEAF;
00430 helperFunction<LEAF, T_R, DF_D>(
00431 tr_reader, df_decoder, nr_of_transactions,
00432 freq_items_with_counters, input_file, min_supp );
00433 }
00434 else if(strcmp(argv[4], "40") ==0 )
00435 {
00436 log_status(0,"A vector of size 40 is added to each node")
00437 typedef Bodon::FattenableLeaf<40> LEAF;
00438 helperFunction<LEAF, T_R, DF_D>(
00439 tr_reader, df_decoder, nr_of_transactions,
00440 freq_items_with_counters, input_file, min_supp );
00441 }
00442 else if(strcmp(argv[4], "50") ==0 )
00443 {
00444 log_status(0,"A vector of size 50 is added to each node")
00445 typedef Bodon::FattenableLeaf<50> LEAF;
00446 helperFunction<LEAF, T_R, DF_D>(
00447 tr_reader, df_decoder, nr_of_transactions,
00448 freq_items_with_counters, input_file, min_supp );
00449 }
00450 }
00451 }
00452 catch (std::ios_base::failure e)
00453 {
00454 log_err(0,"Exiting the program due to IO exception");
00455 return 1;
00456 }
00457 }
00458
00459