00001 #include "common.hpp"
00002 #include "common/log.h"
00003 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00004 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00005
00006
00007 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00008
00009 #include "io/codec/decoder/df/DFDecoderWithEEManagement.hpp"
00010
00011 #include "io/output/normal/SortOutput.hpp"
00012
00013
00014 #include "util/StreamParser.hpp"
00015 #include "util/FrequentFilter.cpp"
00016
00017 #include "datastructures/maxvector.hpp"
00018 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00019
00020 #include "datastructures/trie/edgelist/OffsetIndexVector.hpp"
00021 #include "common/allocators.hpp"
00022
00023 #include "apriori/bodon/Leaf.hpp"
00024 #include "apriori/bodon/Trie.hpp"
00025 #include "apriori/bodon/TrieNEE.hpp"
00026 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00027 #include "io/input/transaction_reader/OrderReverser.hpp"
00028 #include "io/codec/coder/Coder.hpp"
00029 #include "util/Frequent2Filter.cpp"
00030 #include "util/Frequent2FilterOnline.cpp"
00031 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00032 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00033 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserterNoprune.hpp"
00034
00035 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/SimplePruner.hpp"
00036 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00037 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/InfreqRemover.hpp"
00038
00039 #include "apriori/OneByOneSupportCounter.hpp"
00040 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00041
00042 #include "apriori/Apriori.hpp"
00043
00044 #include <vector>
00045 #include <iostream>
00046 #include <string>
00047
00048
00049 std::string file_format;
00050
00051 void init()
00052 {
00053 file_format = "File format:";
00054 file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00055 file_format += "represents a transaction. \n";
00056 file_format += "A transaction is a set of items seperated by a nonnumeric ";
00057 file_format += "character.\nIt can be for example a white space, comma, ";
00058 file_format += "colon, etc.\n";
00059 file_format += "Items are nonnegative integers.\n";
00060 }
00062 void usage()
00063 {
00064 log_err(0,"Usage: deadend deadend-option transactionfile min_supp outcomefile [options]");
00065 log_err(0," deadend-option\t option of dead-end pruning, i.e: on, off");
00066 log_err(0," transactionfile\t file, that contains the tranasctions of items");
00067 log_err(0," outcomefile\t file to write the outcome");
00068 log_err(0," min_supp\t absolute support threshold");
00069
00070 std::cerr << file_format;
00071 log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00072 }
00073
00085 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00086 bool &isrel, double &relminsupp, unsigned int& maxsize )
00087 {
00088 if ( argc < 5 )
00089 {
00090 log_err(0,"There are 4 mandatory arguments.");
00091 return 2;
00092 }
00093 std::string mins=argv[3];
00094 if (mins[mins.size()-1]=='%') {
00095 mins.erase(mins.size()-1);
00096 isrel=true;
00097 relminsupp=atof(mins.c_str());
00098 relminsupp/=100;
00099 log_info(0,"Using relative minimum support of %lg",relminsupp);
00100 return 0;
00101 }
00102 isrel=false; relminsupp=0;
00103 int min_supp_i;
00104 try
00105 {
00106 convert(argv[3], min_supp_i);
00107 if ( min_supp_i <= 0 )
00108 {
00109 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00110 return 3;
00111 }
00112 }
00113 catch(BadConversion e)
00114 {
00115 log_err(0,"min_supp conversion problem.");
00116 return 3;
00117 }
00118 min_supp = static_cast<counter_t>(min_supp_i);
00119 log_info(0,"min_supp is set to %d", min_supp);
00120 if(argc == 6)
00121 {
00122 int maxsize_i;
00123 try
00124 {
00125 convert(argv[5], maxsize_i);
00126 if ( maxsize_i <= 0 )
00127 {
00128 log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00129 return 4;
00130 }
00131 }
00132 catch(BadConversion e)
00133 {
00134 log_err(0,"max_size conversion problem.");
00135 return 4;
00136 }
00137 maxsize = static_cast<unsigned int>(maxsize_i);
00138 log_status(0,"maxsize is set to %d", maxsize);
00139 }
00140 else
00141 maxsize = largest_itemsetsize;
00142 return 0;
00143 }
00144
00145 int main( int argc, char *argv[] )
00146 {
00147 init();
00148 counter_t min_supp;
00149 unsigned int maxsize;
00150 bool relative;
00151 double relminsupp;
00152
00153 {
00154 int return_val =
00155 process_arguments( argc, argv, min_supp,
00156 relative, relminsupp, maxsize );
00157 if(return_val)
00158 return return_val;
00159 }
00160
00161 char *deadend_option=argv[1];
00162 char* input_file = argv[2];
00163 char* output_file = argv[4];
00164
00165 try
00166 {
00167
00168 typedef brBufferedTransactionReader< > T_R;
00169
00170
00171
00172 T_R::params_t par_i;
00173 par_i.file_name = input_file;
00174 par_i.mode=FileReprBase::READ;
00175 par_i.file_buffer_size = 16 * 1024;
00176 T_R tr_reader(&par_i);
00177 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00178 counter_t nr_of_transactions;
00179
00180 FrequentFilter<T_R>
00181 fr_filter(tr_reader);
00182 log_status(0,"Finding frequent items.");
00183 fr_filter.findFrequentItems( freq_items_with_counters,
00184 nr_of_transactions, min_supp);
00185
00186 if(!freq_items_with_counters.empty())
00187 {
00188 log_status(0,"Doing decoder.");
00189 typedef DFDecoderWithEEManagement< > DF_D;
00190
00191
00192
00193 DF_D::params_t par_d;
00194 par_d.file_name = output_file;
00195 par_d.mode=FileReprBase::WRITE;
00196
00197 DF_D df_decoder(&par_d);
00198
00199 log_status(0,"APRIORI is selected");
00200 typedef Bodon::LeafWithoutConstructor LEAF_WC;
00201 typedef Bodon::Leaf LEAF;
00202 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00203
00204
00205
00206 log_status(0,"NEE_FULL is disabled");
00207 log_status(0,"Low memory need option is on.");
00208 typedef SortedTransactionReader<Coder<T_R, DF_D> > S_C;
00209 typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00210 typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00211 typedef Frequent2FilterOnline<S_C> F2F;
00212
00213 S_C::params_t par_c;
00214 par_c.file_name = input_file;
00215 par_c.mode=FileReprBase::READ;
00216 par_c.largest_item = tr_reader.getLargestItem();
00217 par_c.decoder = &df_decoder;
00218 par_c.freq_items_with_counters = &freq_items_with_counters;
00219 par_c.codemode = ASC;
00220
00221 log_status(0,"Doing sorted codec.");
00222 S_C sorted_coder(&par_c);
00223
00224 std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00225 freq_pairs_with_counters;
00226 F2F fr_2_filter(&sorted_coder );
00227 log_status(0,"Finding frequent pairs.")
00228 fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00229
00230 TRIE main_trie;
00231 typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE_Off> FII;
00232 FII fii(main_trie, df_decoder);
00233 typedef bracz::singleualloc<LEAF_WC, 64 * 1024> LEAF_ALLOCATOR;
00234 LEAF_ALLOCATOR s_alloc;
00235
00236
00237 log_status(0,"Simple pruning is selected.");
00238 typedef Bodon::FrequentPairInserter<DF_D, TRIE, LEAF_WC,
00239 LEAF_ALLOCATOR, NEE_Off> FPI;
00240 if(strstr(deadend_option, "on"))
00241 {
00242 log_status(0,"deadend option is on.");
00243 typedef Bodon::inhomogeneous_trie::SimplePruner<
00244 DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off> PRUNER;
00245 typedef Bodon::inhomogeneous_trie::CandidateGeneratorPrune<PRUNER, DF_D,
00246 TRIE, LEAF_ALLOCATOR, NEE_Off> CG;
00247 typedef Bodon::inhomogeneous_trie::InfreqRemover<
00248 DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off> IR;
00249 IR infrequent_remover(main_trie, df_decoder, s_alloc);
00250 typedef Apriori<S_C, DF_D, TRIE, LEAF_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00251 A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00252 log_status(0,"Finding frequent itemsets.");
00253 apriori.findFrequentItemsets(
00254 nr_of_transactions, *par_c.freq_counters,
00255 freq_pairs_with_counters, min_supp, maxsize );
00256 }
00257 else
00258 {
00259 log_status(0,"deadend option is off.");
00260 const bool deadend = false;
00261 typedef Bodon::inhomogeneous_trie::SimplePruner<
00262 DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off, deadend> PRUNER;
00263 typedef Bodon::inhomogeneous_trie::CandidateGeneratorPrune<PRUNER, DF_D,
00264 TRIE, LEAF_ALLOCATOR, NEE_Off, deadend> CG;
00265 typedef Bodon::inhomogeneous_trie::InfreqRemover<
00266 DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off, deadend> IR;
00267 IR infrequent_remover(main_trie, df_decoder, s_alloc);
00268 typedef Apriori<S_C, DF_D, TRIE, LEAF_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00269 A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00270 log_status(0,"Finding frequent itemsets.");
00271 apriori.findFrequentItemsets(
00272 nr_of_transactions, *par_c.freq_counters,
00273 freq_pairs_with_counters, min_supp, maxsize );
00274 }
00275
00276
00277 }
00278
00279 }
00280 catch (std::ios_base::failure e)
00281 {
00282 log_err(0,"Exiting the program due to IO exception");
00283 return 1;
00284 }
00285 }
00286
00287