00001
00028 #include "common.hpp"
00029 #include "common/log.h"
00030 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00031 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00032
00033
00034 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00035
00036 #include "io/codec/decoder/df/DFDecoderWithEEManagement.hpp"
00037
00038 #include "io/output/normal/SortOutput.hpp"
00039
00040
00041 #include "util/StreamParser.hpp"
00042 #include "util/FrequentFilter.cpp"
00043
00044 #include "datastructures/maxvector.hpp"
00045 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00046
00047 #include "datastructures/trie/edgelist/OffsetIndexVector.hpp"
00048
00049 #include "apriori/bodon/Leaf.hpp"
00050 #include "apriori/bodon/Trie.hpp"
00051 #include "apriori/bodon/TrieNEE.hpp"
00052 #include "io/input/transaction_reader/OrderReverser.hpp"
00053 #include "io/codec/coder/Coder.hpp"
00054 #include "io/db_cache/BuildTreeDBCache.hpp"
00055 #include "util/Frequent2Filter.cpp"
00056 #include "util/Frequent2FilterOnline.cpp"
00057 #include "apriori/OneByOneSupportCounter.hpp"
00058 #include "apriori/bodon/dynamic_trie/trie_manipulators/SupportCounter.hpp"
00059 #include "apriori/AprioriSelector.hpp"
00060 #include "fpgrowth/FPGrowthSelector.hpp"
00061 #include <vector>
00062 #include <iostream>
00063 #include <string>
00064
00065
/// Help text describing the transaction file format; filled in by init()
/// and printed as part of the usage message.
std::string file_format;

/**
 * Builds the file-format help text and stores it in file_format.
 *
 * The text is assigned (not appended), so calling init() more than once
 * leaves file_format with the same content.
 */
void init()
{
   // Adjacent literals are merged by the compiler into one string.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00079 void usage()
00080 {
00081 std::cerr<<"\nUsage: fim algorithm transactionfile min_supp outcomefile [maxsize]\n";
00082 std::cerr<<" algorithm\t the name of the algorithm, i.e: apriori, eclat or fp-growth\n";
00083 std::cerr<<" transactionfile\t file, that contains the tranasctions of items\n";
00084 std::cerr<<" outcomefile\t file to write the outcome\n";
00085 std::cerr<<" min_supp\t absolute support threshold\n";
00086 std::cerr<<" maxsize\t the upper limit of the size of the frequent sets\n";
00087
00088 std::cerr << file_format;
00089 std::cerr<<"\n\t\t\tHave a succesful mining ;-)\n\n";
00090 }
00091
00103 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00104 bool &isrel, double &relminsupp, unsigned int& maxsize )
00105 {
00106 if ( argc < 5 )
00107 {
00108 log_err(0,"There are 4 mandatory arguments!");
00109 usage();
00110 return 2;
00111 }
00112 std::string mins=argv[3];
00113 if (mins[mins.size()-1]=='%') {
00114 mins.erase(mins.size()-1);
00115 isrel=true;
00116 relminsupp=atof(mins.c_str());
00117 relminsupp/=100;
00118 log_info(0,"Using relative minimum support of %lg",relminsupp);
00119 return 0;
00120 }
00121 isrel=false; relminsupp=0;
00122 int min_supp_i;
00123 try
00124 {
00125 convert(argv[3], min_supp_i);
00126 if ( min_supp_i <= 0 )
00127 {
00128 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00129 return 3;
00130 }
00131 }
00132 catch(BadConversion e)
00133 {
00134 log_err(0,"min_supp conversion problem.");
00135 return 3;
00136 }
00137 min_supp = static_cast<counter_t>(min_supp_i);
00138 log_info(0,"min_supp is set to %d", min_supp);
00139 if(argc == 6)
00140 {
00141 int maxsize_i;
00142 try
00143 {
00144 convert(argv[5], maxsize_i);
00145 if ( maxsize_i <= 0 )
00146 {
00147 log_err(0,"%s cannot be converted to a positive integer.",argv[5]);
00148 return 4;
00149 }
00150 }
00151 catch(BadConversion e)
00152 {
00153 log_err(0,"max_size conversion problem.");
00154 return 4;
00155 }
00156 maxsize = static_cast<unsigned int>(maxsize_i);
00157 log_status(0,"maxsize is set to %d", maxsize);
00158 }
00159 else
00160 maxsize = largest_itemsetsize;
00161 return 0;
00162 }
00163
00164 int main( int argc, char *argv[] )
00165 {
00166 init();
00167 counter_t min_supp;
00168 unsigned int maxsize;
00169 bool relative;
00170 double relminsupp;
00171
00172 {
00173 int return_val =
00174 process_arguments( argc, argv, min_supp,
00175 relative, relminsupp, maxsize );
00176 if(return_val)
00177 return return_val;
00178 }
00179
00180 char* algorithm = argv[1];
00181 char* input_file = argv[2];
00182 char* output_file = argv[4];
00183
00184 try
00185 {
00186
00187 typedef brBufferedTransactionReader< > T_R;
00188
00189
00190
00191 T_R::params_t par_i;
00192 par_i.file_name = input_file;
00193 par_i.mode=FileReprBase::READ;
00194 par_i.file_buffer_size = 16 * 1024;
00195 T_R tr_reader(&par_i);
00196 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00197 counter_t nr_of_transactions;
00198
00199 FrequentFilter<T_R>
00200 fr_filter(tr_reader);
00201 log_status(0,"Finding frequent items.");
00202 fr_filter.findFrequentItems( freq_items_with_counters,
00203 nr_of_transactions, min_supp);
00204
00205 if(!freq_items_with_counters.empty())
00206 {
00207 log_status(0,"Doing decoder.");
00208 typedef DFDecoderWithEEManagement< > DF_D;
00209
00210
00211
00212 DF_D::params_t par_d;
00213 par_d.file_name = output_file;
00214 par_d.mode=FileReprBase::WRITE;
00215
00216 DF_D df_decoder(&par_d);
00217
00218 if(strncmp(argv[1],"apriori",7) == 0)
00219 {
00220 log_status(0,"APRIORI is selected");
00221 typedef Bodon::LeafWithoutConstructor LEAF_WC;
00222 typedef Bodon::Leaf LEAF;
00223 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE_OEL;
00224
00225 typedef Bodon::Trie< LEAF, Bodon::OffsetIndexVector< std::vector<void*> > > TRIE_OI;
00226
00227 typedef Bodon::TrieNEE<TRIE_OEL> TRIENEE_OEL;
00228 typedef Bodon::TrieNEE<TRIE_OI> TRIENEE_OI;
00229
00230 const NEELevel NEE = NEE_Full;
00231 log_status(0,"NEE_FULL is enabled");
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246 if(strstr(argv[1],"lowmem"))
00247 {
00248 log_status(0,"Low memory need option is on.");
00249 typedef SortedTransactionReader<Coder<T_R, DF_D> > S_C;
00250 typedef Bodon::dynamic_trie::SupportCounter<TRIE_OEL, TRIE_OI> SUPP_C_BASE;
00251 typedef OneByOneSupportCounter<TRIE_OEL, S_C, SUPP_C_BASE> SUPP_C;
00252 typedef Frequent2FilterOnline<S_C> F2F;
00253 AprioriSelector<TRIENEE_OEL, TRIENEE_OI, LEAF_WC, S_C, F2F, SUPP_C, T_R, DF_D, NEE>(
00254 min_supp, "apriori-intersect", input_file, nr_of_transactions,
00255 freq_items_with_counters, tr_reader, df_decoder, maxsize);
00256 }
00257 else
00258 {
00259 typedef SortedTransactionReader< Coder<T_R, DF_D>, false, false > S_C_T_R;
00260 const bool ENDONLY = true;
00261 typedef OrderReverser< bracz::BuildTreeDBCache<
00262 S_C_T_R, std::vector<item_t>, bracz::EndPatriciaBuildTree<ENDONLY>, ENDONLY > >S_C;
00263 typedef Bodon::dynamic_trie::SupportCounter<TRIE_OEL, TRIE_OI> SUPP_C_BASE;
00264 typedef OneByOneSupportCounter<TRIE_OEL, S_C, SUPP_C_BASE> SUPP_C;
00265 typedef Frequent2Filter<S_C> F2F;
00266 AprioriSelector<TRIENEE_OEL, TRIENEE_OI, LEAF_WC, S_C,
00267 F2F, SUPP_C, T_R, DF_D, NEE>(
00268 min_supp, algorithm, input_file, nr_of_transactions,
00269 freq_items_with_counters, tr_reader, df_decoder, maxsize);
00270 }
00271 }
00272 else if(strncmp(argv[1],"fp-growth",9) == 0)
00273 FPGrowthSelector<T_R, DF_D>(
00274 min_supp, relative, relminsupp, algorithm, input_file,
00275 nr_of_transactions, freq_items_with_counters, tr_reader, df_decoder);
00276 else
00277 {
00278 usage();
00279 log_err(0,"algorithm should be either apriori, fp-growth or eclat!");
00280 return 1;
00281 }
00282 }
00283 }
00284 catch (std::ios_base::failure e)
00285 {
00286 log_err(0,"Exiting the program due to IO exception");
00287 return 1;
00288 }
00289 }
00290
00291