00001 #include "common.hpp"
00002 #include "common/log.h"
00003 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00004 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00005
00006
00007 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00008
00009 #include "io/codec/decoder/df/DFDecoderWithEEManagement.hpp"
00010
00011 #include "io/output/normal/SortOutput.hpp"
00012
00013
00014 #include "util/StreamParser.hpp"
00015 #include "util/FrequentFilter.cpp"
00016
00017 #include "datastructures/maxvector.hpp"
00018 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00019
00020 #include "datastructures/trie/edgelist/OffsetIndexVector.hpp"
00021
00022 #include "apriori/bodon/Leaf.hpp"
00023 #include "apriori/bodon/Trie.hpp"
00024 #include "apriori/bodon/TrieNEE.hpp"
00025
00026 #include "test/apriori/bodon/AprioriSelector.hpp"
00027 #include <vector>
00028 #include <iostream>
00029 #include <string>
00030
00031
/// Description of the expected input file layout; filled in by init() and
/// appended to the help output by usage().
std::string file_format;

/**
 * (Re)builds the file_format help text.
 *
 * The text is assigned, not appended, so calling this more than once always
 * leaves file_format with the same content.
 */
void init()
{
   // Adjacent string literals are merged by the compiler into one constant.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00045 void usage()
00046 {
00047 log_err(0,"Usage: candGendTester candidate_option[-nee_option] transactionfile min_supp outcomefile [maxsize]");
00048 log_err(0," candidate_option\t the candidate generation technique, ");
00049 log_err(0,"i.e: simpleprune, intersectprune or noprune");
00050 log_err(0,"nee_option\t the way equisupport items are handled");
00051 log_err(0,"i.e: nee (for prefix equisupport), neefull (for full equisupport pruning)");
00052 log_err(0," transactionfile\t file, that contains the tranasctions of items");
00053 log_err(0," outcomefile\t file to write the outcome");
00054 log_err(0," min_supp\t absolute support threshold");
00055 log_err(0,"Example:\t candGendTester simpleprune ../data/kosarak.dat 1900 output.txt");
00056 log_err(0," \t candGendTester intersectprune-neefull ../data/kosarak.dat 1900 output.txt");
00057
00058 std::cerr << file_format;
00059 log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00060 }
00061
00072 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00073 bool &isrel, double &relminsupp, unsigned int& maxsize )
00074 {
00075 if ( argc < 5 )
00076 {
00077 log_err(0,"There are 4 mandatory arguments.");
00078 return 2;
00079 }
00080 std::string mins=argv[3];
00081 if (mins[mins.size()-1]=='%') {
00082 mins.erase(mins.size()-1);
00083 isrel=true;
00084 relminsupp=atof(mins.c_str());
00085 relminsupp/=100;
00086 log_status(0,"Using relative minimum support of %lg",relminsupp);
00087 return 0;
00088 }
00089 isrel=false;
00090 int min_supp_i;
00091 try
00092 {
00093 convert(argv[3], min_supp_i);
00094 if ( min_supp_i <= 0 )
00095 {
00096 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00097 return 3;
00098 }
00099 }
00100 catch(BadConversion e)
00101 {
00102 log_err(0,"min_supp conversion problem.");
00103 return 3;
00104 }
00105 min_supp = static_cast<counter_t>(min_supp_i);
00106 log_status(0,"min_supp is set to %d", min_supp);
00107
00108 if(argc == 6)
00109 {
00110 int maxsize_i;
00111 try
00112 {
00113 convert(argv[5], maxsize_i);
00114 if ( maxsize_i <= 0 )
00115 {
00116 log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00117 return 4;
00118 }
00119 }
00120 catch(BadConversion e)
00121 {
00122 log_err(0,"min_supp conversion problem.");
00123 return 4;
00124 }
00125 maxsize = static_cast<unsigned int>(maxsize_i);
00126 log_status(0,"maxsize is set to %d", maxsize);
00127 }
00128 else
00129 maxsize = largest_itemsetsize;
00130
00131 return 0;
00132 }
00133
00134 int main( int argc, char *argv[] )
00135 {
00136 init();
00137 counter_t min_supp;
00138 unsigned int maxsize;
00139 bool relative;
00140 double relminsupp;
00141
00142 {
00143 int return_val =
00144 process_arguments( argc, argv, min_supp,
00145 relative, relminsupp, maxsize );
00146 if(return_val)
00147 return return_val;
00148 }
00149
00150 char* algorithm = argv[1];
00151 char* input_file = argv[2];
00152 char* output_file = argv[4];
00153
00154 try
00155 {
00156
00157 typedef brBufferedTransactionReader< > T_R;
00158
00159
00160
00161 T_R::params_t par_i;
00162 par_i.file_name = input_file;
00163 par_i.mode=FileReprBase::READ;
00164 par_i.file_buffer_size = 16 * 1024;
00165 T_R tr_reader(&par_i);
00166 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00167 counter_t nr_of_transactions;
00168
00169 FrequentFilter<T_R>
00170 fr_filter(tr_reader);
00171 log_status(0,"Finding frequent items.");
00172 fr_filter.findFrequentItems( freq_items_with_counters,
00173 nr_of_transactions, min_supp);
00174
00175 if(!freq_items_with_counters.empty())
00176 {
00177 log_status(0,"Doing decoder.");
00178 typedef DFDecoderWithEEManagement< > DF_D;
00179
00180
00181
00182 DF_D::params_t par_d;
00183 par_d.file_name = output_file;
00184 par_d.mode=FileReprBase::WRITE;
00185
00186 DF_D df_decoder(&par_d);
00187
00188 typedef Bodon::LeafWithoutConstructor LEAF_WC;
00189 typedef Bodon::Leaf LEAF;
00190 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE_OEL;
00191
00192 typedef Bodon::Trie< LEAF, Bodon::OffsetIndexVector< std::vector<void*> > > TRIE_OI;
00193
00194 if(strstr(argv[1],"-nee"))
00195 {
00196
00197 typedef Bodon::TrieNEE<TRIE_OEL> TRIENEE_OEL;
00198 typedef Bodon::TrieNEE<TRIE_OI> TRIENEE_OI;
00199
00200 if(strstr(argv[1],"-neefull"))
00201 {
00202 log_status(0,"NEE_FULL is enabled");
00203 AprioriSelector<TRIENEE_OEL, TRIENEE_OI, LEAF_WC, T_R, DF_D, NEE_Full>(
00204 min_supp, algorithm, input_file, nr_of_transactions,
00205 freq_items_with_counters, tr_reader, df_decoder, maxsize);
00206 }
00207 else
00208 {
00209 log_status(0,"NEE_Level1 is enabled");
00210 AprioriSelector<TRIENEE_OEL, TRIENEE_OI, LEAF_WC, T_R, DF_D, NEE_Level1>(
00211 min_supp, algorithm, input_file, nr_of_transactions,
00212 freq_items_with_counters, tr_reader, df_decoder, maxsize);
00213 }
00214 }
00215 else
00216 {
00217 log_status(0,"NEE is disabled");
00218
00219 AprioriSelector<TRIE_OEL, TRIE_OI, LEAF_WC, T_R, DF_D, NEE_Off>(
00220 min_supp, algorithm, input_file, nr_of_transactions,
00221 freq_items_with_counters, tr_reader, df_decoder, maxsize );
00222
00223 }
00224 }
00225 }
00226 catch (std::ios_base::failure e)
00227 {
00228 log_err(0,"Exiting the program due to IO exception");
00229 return 1;
00230 }
00231 }
00232
00233