00001 #include "common.hpp"
00002 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00003 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00004
00005 #include "io/output/BufferedOutput.hpp"
00006 #include "io/codec/decoder/df/DFDecoderWithEEManagement.hpp"
00007
00008 #include "util/StreamParser.hpp"
00009 #include "util/FrequentFilter.cpp"
00010
00011 #include "datastructures/maxvector.hpp"
00012 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00013
00014 #include "datastructures/trie/edgelist/OffsetIndexVector.hpp"
00015
00016
00017 #include "apriori/bodon/Trie.hpp"
00018
00019 #include "test/apriori/bodon/trie_manipulators/RoutingSelector.hpp"
00020
00021 #include <vector>
00022 #include <iostream>
00023 #include <string>
00024
00025
/// Help text describing the transaction file layout. Empty until init() runs;
/// usage() prints it as part of the command line help.
std::string file_format;

/**
 * Fills file_format with the description of the input file layout.
 *
 * The text is assigned (not appended), so calling init() more than once
 * leaves file_format with the same contents.
 */
void init()
{
   // Adjacent string literals are joined by the compiler into one literal.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00039 void usage()
00040 {
00041 std::cerr << "\nUsage: routing routing_strat edge_repr transactionfile min_supp ";
00042 std::cerr << "outcomefile [options]\n";
00043 std::cerr << "\n routing_strat\t the routing strategy, i.e: ";
00044 std::cerr << "merge, merge2, merge3, lookup_edge,\n\t\t lookup_edge_prev_mem, ";
00045 std::cerr << "bitvector, indexvector, lookup_tr, hybrid, default\n";
00046 std::cerr << " edge_repr\t the representation of the edge, i.e: ";
00047 std::cerr << "ordered_list, \n\t\t offsetindex, hybrid";
00048 std::cerr << "\n transactionfile file, that contains the tranasctions of items";
00049 std::cerr << "\n min_supp\t absolute support threshold";
00050 std::cerr << "\n outcomefile\t file to write the outcome";
00051
00052 std::cerr << file_format;
00053 std::cerr << "\n\nHave a succesful mining ;-)"<<std::endl<<std::endl;
00054 }
00055
00066 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00067 bool &isrel, double &relminsupp )
00068 {
00069 if ( argc < 6 )
00070 {
00071 usage();
00072 log_err(0,"There are 5 mandatory arguments.");
00073 return 2;
00074 }
00075 std::string mins=argv[4];
00076 if (mins[mins.size()-1]=='%') {
00077 mins.erase(mins.size()-1);
00078 isrel=true;
00079 relminsupp=atof(mins.c_str());
00080 relminsupp/=100;
00081 log_info(0,"Using relative minimum support of %lg",relminsupp);
00082 return 0;
00083 }
00084 isrel=false;
00085 int min_supp_i;
00086 try
00087 {
00088 convert(argv[4], min_supp_i);
00089 if ( min_supp_i <= 0 )
00090 {
00091 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00092 return 3;
00093 }
00094 }
00095 catch(BadConversion e)
00096 {
00097 log_err(0,"min_supp conversion problem.");
00098 return 3;
00099 }
00100 min_supp = static_cast<counter_t>(min_supp_i);
00101 log_info(0,"min_supp is set to %d", min_supp);
00102 return 0;
00103 }
00104
00105 int main( int argc, char *argv[] )
00106 {
00107 init();
00108 counter_t min_supp;
00109 bool relative;
00110 double relminsupp;
00111
00112 {
00113 int return_val =
00114 process_arguments( argc, argv, min_supp, relative, relminsupp );
00115 if(return_val)
00116 return return_val;
00117 }
00118
00119 char* routing = argv[1];
00120 char* input_file = argv[3];
00121 char* output_file = argv[5];
00122
00123 try
00124 {
00125
00126 typedef brBufferedTransactionReader< > T_R;
00127
00128
00129
00130 T_R::params_t par_i;
00131 par_i.file_name = input_file;
00132 par_i.mode=FileReprBase::READ;
00133 par_i.file_buffer_size = 16 * 1024;
00134 T_R tr_reader(&par_i);
00135 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00136 counter_t nr_of_transactions;
00137
00138 FrequentFilter<T_R>
00139 fr_filter(tr_reader);
00140 log_status(0,"Finding frequent items.");
00141 fr_filter.findFrequentItems( freq_items_with_counters,
00142 nr_of_transactions, min_supp);
00143
00144 log_status(0,"Doing decoder.");
00145 typedef DFDecoderWithEEManagement< > DF_D;
00146
00147 DF_D::params_t par_d;
00148 par_d.file_name = output_file;
00149 par_d.mode=FileReprBase::WRITE;
00150 DF_D df_decoder(&par_d);
00151
00152 typedef Bodon::LeafWithoutConstructor LEAF_WC;
00153 typedef Bodon::Leaf LEAF;
00154 typedef bracz::singleualloc<LEAF_WC, 1024> LEAF_ALLOCATOR;
00155
00156 if(strcmp(argv[2],"ordered_list")==0)
00157 {
00158 log_info(0,"Ordered edgelist representation is selected");
00159 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00160 RoutingSelector<TRIE, LEAF_WC, LEAF_ALLOCATOR, T_R, DF_D>(
00161 min_supp, routing, input_file, nr_of_transactions,
00162 freq_items_with_counters, tr_reader, df_decoder );
00163 }
00164 else if(strcmp(argv[2],"offsetindex")==0)
00165 {
00166 log_info(0,"Offsetindex edgelist representation is selected");
00167 typedef Bodon::Trie< LEAF, Bodon::OffsetIndexVector< std::vector<void*> > > TRIE;
00168 RoutingSelectorOffset<TRIE, LEAF_WC, LEAF_ALLOCATOR, T_R, DF_D>(
00169 min_supp, routing, input_file, nr_of_transactions,
00170 freq_items_with_counters, tr_reader, df_decoder );
00171 }
00172 else if(strcmp(argv[2],"hybrid")==0)
00173 {
00174 log_info(0,"Hybrid edgelist representation is selected");
00175 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE_OEL;
00176 typedef Bodon::Trie< LEAF, Bodon::OffsetIndexVector< std::vector<void*> > > TRIE_OI;
00177 RoutingSelectorHybrid<TRIE_OEL, TRIE_OI, LEAF_WC, LEAF_ALLOCATOR, T_R, DF_D>(
00178 min_supp, routing, input_file, nr_of_transactions,
00179 freq_items_with_counters, tr_reader, df_decoder );
00180 }
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258 else
00259 {
00260 usage();
00261 log_err(0,"edge_repr should be either ordered_list, offsetindex!");
00262 return 1;
00263 }
00264 }
00265 catch (std::ios_base::failure e)
00266 {
00267 log_err(0,"Exiting the program due to IO exception");
00268 return 1;
00269 }
00270 }
00271
00272