00001
00008 #include "common.hpp"
00009 #include "common/log.h"
00010 #include "common/allocators.hpp"
00011 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00012 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00013
00014 #include "io/codec/coder/Coder.hpp"
00015 #include "io/output/assoc_rule_writer/SimpleAssocRuleWriter.hpp"
00016
00017 #include "io/input/transaction_reader/OrderReverser.hpp"
00018 #include "io/db_cache/BuildTreeDBCache.hpp"
00019
00020 #include "util/StreamParser.hpp"
00021 #include "util/FrequentFilter.cpp"
00022 #include "util/Frequent2Filter.cpp"
00023
00024 #include "apriori/bodon/Trie.hpp"
00025 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00026
00027 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00028 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00029 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00030 #include "apriori/OneByOneSupportCounter.hpp"
00031 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/IntersectProPruner.hpp"
00032 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00033 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/InfreqRemover_AssocRuleGen.hpp"
00034
00035 #include "apriori/Apriori.hpp"
00036
00037 #include <vector>
00038 #include <iostream>
00039 #include <string>
00040
00041
00042 std::string file_format;
00043
00044 void init()
00045 {
00046 file_format = "File format:";
00047 file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00048 file_format += "represents a transaction. \n";
00049 file_format += "A transaction is a set of items seperated by a nonnumeric ";
00050 file_format += "character.\nIt can be for example a white space, comma, ";
00051 file_format += "colon, etc.\n";
00052 file_format += "Items are nonnegative integers.\n";
00053 }
00055 void usage()
00056 {
00057 std::cerr<<"\nUsage: assoc_rule transactionfile min_supp min_conf min_lift outcomefile [options]\n";
00058 std::cerr<<"\n transactionfile file, that contains the tranasctions of items\n";
00059 std::cerr<<" min_supp\t absolute support threshold\n";
00060 std::cerr<<" min_conf\t confidence threshold\n";
00061 std::cerr<<" min_lift\t lift threshold\n";
00062 std::cerr<<" outcomefile\t file to write the frequent itemsets and association rules\n\n";
00063
00064 std::cerr<<"\t\t\tHave a succesful mining ;-)\n\n";
00065 }
00066
00077 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00078 bool &isrel, double &relminsupp )
00079 {
00080 if ( argc < 6 )
00081 {
00082 log_err(0,"There are 5 mandatory arguments.");
00083 usage();
00084 return 2;
00085 }
00086 std::string mins=argv[2];
00087 if (mins[mins.size()-1]=='%') {
00088 mins.erase(mins.size()-1);
00089 isrel=true;
00090 relminsupp=atof(mins.c_str());
00091 relminsupp/=100;
00092 log_info(0,"Using relative minimum support of %lg",relminsupp);
00093 return 0;
00094 }
00095 isrel=false;
00096 int min_supp_i;
00097 try
00098 {
00099 convert(argv[2], min_supp_i);
00100 if ( min_supp_i <= 0 )
00101 {
00102 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00103 return 3;
00104 }
00105 }
00106 catch(BadConversion e)
00107 {
00108 log_err(0,"min_supp conversion problem.");
00109 return 3;
00110 }
00111 min_supp = static_cast<counter_t>(min_supp_i);
00112 log_info(0,"min_supp is set to %d", min_supp);
00113 return 0;
00114 }
00115
00128 int main( int argc, char *argv[] )
00129 {
00130 init();
00131 counter_t min_supp;
00132 bool relative;
00133 double relminsupp;
00134
00135 {
00136 int return_val =
00137 process_arguments( argc, argv, min_supp, relative, relminsupp );
00138 if(return_val)
00139 return return_val;
00140 }
00141
00142 char* input_file = argv[1];
00143 char* output_file = argv[5];
00144 float min_conf = atof(argv[3]);
00145 float min_lift = atof(argv[4]);
00146
00147 try
00148 {
00149
00150 typedef brBufferedTransactionReader< > T_R;
00151
00152
00153
00154 T_R::params_t par_i;
00155 par_i.file_name = input_file;
00156 par_i.mode=FileReprBase::READ;
00157 par_i.file_buffer_size = 16 * 1024;
00158 T_R tr_reader(&par_i);
00159 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00160 counter_t nr_of_transactions;
00161
00162 FrequentFilter<T_R>
00163 fr_filter(tr_reader);
00164 log_status(0,"Finding frequent items.");
00165 fr_filter.findFrequentItems( freq_items_with_counters,
00166 nr_of_transactions, min_supp);
00167
00168 if(!freq_items_with_counters.empty())
00169 {
00170 log_status(0,"Doing decoder.");
00171 typedef SimpleAssocRuleWriter DF_D;
00172
00173 DF_D::params_t par_d;
00174 par_d.file_name = output_file;
00175 par_d.mode=FileReprBase::WRITE;
00176
00177 DF_D df_decoder(&par_d);
00178
00179 typedef Bodon::LeafWithoutConstructor LEAF_WC;
00180 typedef Bodon::Leaf LEAF;
00181 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00182 typedef SortedTransactionReader< Coder<T_R, DF_D>, false, false > S_C_T_R;
00183 const bool ENDONLY = true;
00184 typedef OrderReverser< bracz::BuildTreeDBCache<
00185 S_C_T_R, std::vector<item_t>, bracz::EndPatriciaBuildTree<ENDONLY>, ENDONLY > >S_C;
00186 S_C::params_t par_c;
00187 par_c.file_name = input_file;
00188 par_c.mode=FileReprBase::READ;
00189 par_c.largest_item = tr_reader.getLargestItem();
00190 par_c.decoder = &df_decoder;
00191 par_c.freq_items_with_counters = &freq_items_with_counters;
00192 par_c.codemode = ASC;
00193 log_status(0,"Doing sorted codec.");
00194 S_C sorted_coder(&par_c);
00195
00196 std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00197 freq_pairs_with_counters;
00198 Frequent2Filter<S_C> fr_2_filter(
00199 &sorted_coder );
00200
00201
00202 log_status(0,"Finding frequent pairs.")
00203 fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00204
00205 const NEELevel NEE = NEE_Off;
00206 const bool DEADENDPRUNE = false;
00207
00208 typedef bracz::singleualloc<LEAF_WC, 64 * 1024> LEAF_ALLOCATOR;
00209 typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE> FII;
00210 typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00211 typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00212
00213 typedef Bodon::FrequentPairInserter<DF_D, TRIE, TRIE, LEAF_ALLOCATOR, NEE> FPI;
00214 typedef Bodon::inhomogeneous_trie::IntersectProPruner<
00215 DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE, DEADENDPRUNE> PRUNER;
00216 typedef Bodon::inhomogeneous_trie::CandidateGeneratorPrune<PRUNER, DF_D, TRIE, LEAF_ALLOCATOR, NEE, DEADENDPRUNE> CG;
00217 typedef Bodon::inhomogeneous_trie::InfreqRemover_AssocRuleGen<DF_D, TRIE, LEAF, LEAF_ALLOCATOR, NEE, DEADENDPRUNE> IR;
00218
00219 typedef Apriori<S_C, DF_D, TRIE, LEAF_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00220 TRIE main_trie;
00221 LEAF_ALLOCATOR s_alloc;
00222 IR infrequent_remover(main_trie, df_decoder, s_alloc,
00223 min_conf, min_lift, nr_of_transactions);
00224
00225 FII fii(main_trie, df_decoder);
00226 A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00227 log_status(0,"Finding frequent itemsets.")
00228 apriori.findFrequentItemsets(
00229 nr_of_transactions, *par_c.freq_counters,
00230 freq_pairs_with_counters, min_supp );
00231 }
00232 }
00233 catch (std::ios_base::failure e)
00234 {
00235 log_err(0,"Exiting the program due to IO exception");
00236 return 1;
00237 }
00238 }
00239
00240