00001 #include "common.hpp"
00002 #include "common/log.h"
00003 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00004 #include "io/input/transaction_reader/SortedTransactionReader.hpp" 
00005 
00006 
00007 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00008 
00009 #include "io/codec/decoder/df/DFDecoderWithEEManagement.hpp"
00010 
00011 #include "io/output/normal/SortOutput.hpp"
00012 
00013 
00014 #include "util/StreamParser.hpp"
00015 #include "util/FrequentFilter.cpp"
00016 
00017 #include "datastructures/maxvector.hpp"
00018 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00019 
00020 #include "datastructures/trie/edgelist/OffsetIndexVector.hpp"
00021 
00022 #include "apriori/bodon/Leaf.hpp"
00023 #include "apriori/bodon/Trie.hpp"
00024 #include "apriori/bodon/TrieNEE.hpp"
00025 
00026 #include "test/apriori/bodon/AprioriSelector.hpp"
00027 #include <vector>
00028 #include <iostream>
00029 #include <string>
00030 
00031 
/// Human-readable description of the expected input file layout.
/// Filled in by init() and printed by usage().
std::string file_format;

/**
 * Populates the global file_format help text.
 *
 * The text is assembled at compile time from adjacent string literals and
 * assigned in one step; calling init() repeatedly always yields the same
 * content.
 */
void init()
{
   static const char* const format_help =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";

   file_format = format_help;
}
00045 void usage()
00046 {
00047    log_err(0,"Usage: candGendTester candidate_option[-nee_option] transactionfile min_supp outcomefile [maxsize]");
00048    log_err(0," candidate_option\t    the candidate generation technique, ");
00049    log_err(0,"i.e: simpleprune, intersectprune or noprune");
00050    log_err(0,"nee_option\t   the way equisupport items are handled");
00051    log_err(0,"i.e: nee (for prefix equisupport), neefull (for full equisupport pruning)");
00052    log_err(0," transactionfile\t    file, that contains the tranasctions of items");
00053    log_err(0," outcomefile\t    file to write the outcome");
00054    log_err(0," min_supp\t    absolute support threshold");
00055    log_err(0,"Example:\t candGendTester simpleprune ../data/kosarak.dat 1900 output.txt");
00056    log_err(0," \t candGendTester intersectprune-neefull ../data/kosarak.dat 1900 output.txt");
00057 
00058    std::cerr << file_format;
00059    log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00060 }
00061 
00072 int process_arguments( int argc, char *argv[], counter_t& min_supp, 
00073                        bool &isrel, double &relminsupp, unsigned int& maxsize )
00074 {
00075    if ( argc < 5 )
00076    {
00077      log_err(0,"There are 4 mandatory arguments.");
00078      return 2;
00079    }
00080    std::string mins=argv[3];
00081    if (mins[mins.size()-1]=='%') {
00082      mins.erase(mins.size()-1);
00083      isrel=true;
00084      relminsupp=atof(mins.c_str());
00085      relminsupp/=100;
00086      log_status(0,"Using relative minimum support of %lg",relminsupp);
00087      return 0;
00088    }
00089    isrel=false;
00090    int min_supp_i;
00091    try
00092    {
00093       convert(argv[3], min_supp_i);
00094       if ( min_supp_i <= 0  )
00095       {
00096          log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00097          return 3;
00098       }
00099    }
00100    catch(BadConversion e)
00101    {
00102       log_err(0,"min_supp conversion problem.");
00103       return 3;
00104    }
00105    min_supp = static_cast<counter_t>(min_supp_i);
00106    log_status(0,"min_supp is set to %d", min_supp);
00107 
00108    if(argc == 6)
00109    {
00110       int maxsize_i;
00111       try
00112       {
00113          convert(argv[5], maxsize_i);
00114          if ( maxsize_i <= 0  )
00115          {
00116             log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00117             return 4;
00118          }
00119       }
00120       catch(BadConversion e)
00121       {
00122          log_err(0,"min_supp conversion problem.");
00123          return 4;
00124       }
00125       maxsize = static_cast<unsigned int>(maxsize_i);
00126       log_status(0,"maxsize is set to %d", maxsize);
00127    }
00128    else
00129       maxsize = largest_itemsetsize;
00130 
00131    return 0;
00132 }
00133 
00134 int main( int argc, char *argv[] )
00135 {
00136    init();
00137    counter_t min_supp;
00138    unsigned int maxsize;
00139    bool relative;
00140    double relminsupp;
00141       
00142    {
00143       int return_val = 
00144          process_arguments( argc, argv, min_supp, 
00145                             relative, relminsupp, maxsize );
00146       if(return_val)
00147          return return_val;
00148    }
00149 
00150    char* algorithm = argv[1];
00151    char* input_file = argv[2];
00152    char* output_file = argv[4];
00153 
00154    try
00155    {
00156       
00157       typedef brBufferedTransactionReader< > T_R;
00158       
00159       
00160 
00161       T_R::params_t par_i;
00162       par_i.file_name = input_file;
00163       par_i.mode=FileReprBase::READ;
00164       par_i.file_buffer_size = 16 * 1024;
00165       T_R tr_reader(&par_i);
00166       std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00167       counter_t nr_of_transactions;
00168       
00169       FrequentFilter<T_R>
00170          fr_filter(tr_reader);
00171       log_status(0,"Finding frequent items.");
00172       fr_filter.findFrequentItems( freq_items_with_counters,  
00173                                    nr_of_transactions, min_supp);
00174 
00175       if(!freq_items_with_counters.empty())
00176       {
00177          log_status(0,"Doing decoder.");
00178          typedef DFDecoderWithEEManagement< > DF_D;
00179 
00180 
00181 
00182          DF_D::params_t par_d;
00183          par_d.file_name = output_file;
00184          par_d.mode=FileReprBase::WRITE;
00185 
00186          DF_D df_decoder(&par_d);
00187 
00188          typedef Bodon::LeafWithoutConstructor LEAF_WC;  
00189          typedef Bodon::Leaf LEAF;       
00190          typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE_OEL;
00191 
00192          typedef Bodon::Trie< LEAF, Bodon::OffsetIndexVector< std::vector<void*> > > TRIE_OI;
00193 
00194          if(strstr(argv[1],"-nee"))
00195          {
00196 
00197             typedef Bodon::TrieNEE<TRIE_OEL> TRIENEE_OEL;
00198             typedef Bodon::TrieNEE<TRIE_OI> TRIENEE_OI;
00199 
00200             if(strstr(argv[1],"-neefull"))
00201             {
00202                log_status(0,"NEE_FULL is enabled");
00203                AprioriSelector<TRIENEE_OEL, TRIENEE_OI, LEAF_WC, T_R, DF_D, NEE_Full>( 
00204                   min_supp, algorithm, input_file, nr_of_transactions, 
00205                   freq_items_with_counters, tr_reader, df_decoder, maxsize);
00206             }
00207             else
00208             {
00209                log_status(0,"NEE_Level1 is enabled");
00210                AprioriSelector<TRIENEE_OEL, TRIENEE_OI, LEAF_WC, T_R, DF_D, NEE_Level1>( 
00211                   min_supp, algorithm, input_file, nr_of_transactions, 
00212                   freq_items_with_counters, tr_reader, df_decoder, maxsize);
00213             }
00214          }
00215          else
00216          {
00217             log_status(0,"NEE is disabled");
00218 
00219             AprioriSelector<TRIE_OEL, TRIE_OI, LEAF_WC, T_R, DF_D, NEE_Off>( 
00220                min_supp, algorithm, input_file, nr_of_transactions, 
00221                freq_items_with_counters, tr_reader, df_decoder, maxsize );
00222 
00223          }
00224       }
00225    }
00226    catch (std::ios_base::failure e)
00227    {
00228       log_err(0,"Exiting the program due to IO exception");
00229       return 1;
00230    }
00231 }
00232 
00233