Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members | File Members

main-assoc-rule.cpp

Go to the documentation of this file.
00001 
00008 #include "common.hpp"
00009 #include "common/log.h"
00010 #include "common/allocators.hpp"
00011 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00012 #include "io/input/transaction_reader/SortedTransactionReader.hpp" 
00013 
00014 #include "io/codec/coder/Coder.hpp"
00015 #include "io/output/assoc_rule_writer/SimpleAssocRuleWriter.hpp"
00016 
00017 #include "io/input/transaction_reader/OrderReverser.hpp"
00018 #include "io/db_cache/BuildTreeDBCache.hpp"
00019 
00020 #include "util/StreamParser.hpp"
00021 #include "util/FrequentFilter.cpp"
00022 #include "util/Frequent2Filter.cpp"
00023 
00024 #include "apriori/bodon/Trie.hpp"
00025 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00026 
00027 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00028 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00029 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00030 #include "apriori/OneByOneSupportCounter.hpp"
00031 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/IntersectProPruner.hpp"
00032 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00033 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/InfreqRemover_AssocRuleGen.hpp"
00034 
00035 #include "apriori/Apriori.hpp"
00036 
00037 #include <vector>
00038 #include <iostream>
00039 #include <string>
00040 
00041 
00042 std::string file_format;
00043 
00044 void init()
00045 {
00046    file_format = "File format:";
00047    file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00048    file_format += "represents a transaction. \n";
00049    file_format += "A transaction is a set of items seperated by a nonnumeric ";
00050    file_format += "character.\nIt can be for example a white space, comma, ";
00051    file_format += "colon, etc.\n";
00052    file_format += "Items are nonnegative integers.\n";
00053 }
00055 void usage()
00056 {
00057    std::cerr<<"\nUsage: assoc_rule transactionfile min_supp min_conf min_lift outcomefile [options]\n";
00058    std::cerr<<"\n transactionfile    file, that contains the tranasctions of items\n";
00059    std::cerr<<" min_supp\t    absolute support threshold\n";
00060    std::cerr<<" min_conf\t    confidence threshold\n";
00061    std::cerr<<" min_lift\t    lift threshold\n";
00062    std::cerr<<" outcomefile\t    file to write the frequent itemsets and association rules\n\n";
00063 //   std::cerr << file_format;
00064    std::cerr<<"\t\t\tHave a succesful mining ;-)\n\n";
00065 }
00066 
00077 int process_arguments( int argc, char *argv[], counter_t& min_supp, 
00078                        bool &isrel, double &relminsupp )
00079 {
00080    if ( argc < 6 )
00081    {
00082      log_err(0,"There are 5 mandatory arguments.");
00083      usage();
00084      return 2;
00085    }
00086    std::string mins=argv[2];
00087    if (mins[mins.size()-1]=='%') {
00088      mins.erase(mins.size()-1);
00089      isrel=true;
00090      relminsupp=atof(mins.c_str());
00091      relminsupp/=100;
00092      log_info(0,"Using relative minimum support of %lg",relminsupp);
00093      return 0;
00094    }
00095    isrel=false;
00096    int min_supp_i;
00097    try
00098    {
00099       convert(argv[2], min_supp_i);
00100       if ( min_supp_i <= 0  )
00101       {
00102          log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00103          return 3;
00104       }
00105    }
00106    catch(BadConversion e)
00107    {
00108       log_err(0,"min_supp conversion problem.");
00109       return 3;
00110    }
00111    min_supp = static_cast<counter_t>(min_supp_i);
00112    log_info(0,"min_supp is set to %d", min_supp);
00113    return 0;
00114 }
00115 
00128 int main( int argc, char *argv[] )
00129 {
00130    init();
00131    counter_t min_supp;
00132    bool relative;
00133    double relminsupp;
00134    
00135    {
00136       int return_val = 
00137          process_arguments( argc, argv, min_supp, relative, relminsupp );
00138       if(return_val)
00139          return return_val;
00140    }
00141 
00142    char* input_file = argv[1];
00143    char* output_file = argv[5];
00144    float min_conf = atof(argv[3]);
00145    float min_lift = atof(argv[4]);
00146    
00147    try
00148    {
00149       // We assume that the transactions does not contain duplicates!!!
00150       typedef brBufferedTransactionReader< > T_R;
00151       // Otherwise uncomment this:
00152       // typedef SortedTransactionReader<LBufferedTransactionReader< >, true> T_R;
00153       
00154       T_R::params_t par_i;
00155       par_i.file_name = input_file;
00156       par_i.mode=FileReprBase::READ;
00157       par_i.file_buffer_size = 16 * 1024;
00158       T_R tr_reader(&par_i);
00159       std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00160       counter_t nr_of_transactions;
00161       // The first step of each algorithms is determining the frequent items.
00162       FrequentFilter<T_R>
00163          fr_filter(tr_reader);
00164       log_status(0,"Finding frequent items.");
00165       fr_filter.findFrequentItems( freq_items_with_counters,  
00166                                    nr_of_transactions, min_supp);
00167       
00168       if(!freq_items_with_counters.empty())
00169       {
00170          log_status(0,"Doing decoder.");
00171          typedef SimpleAssocRuleWriter DF_D;
00172 
00173          DF_D::params_t par_d;
00174          par_d.file_name = output_file;
00175          par_d.mode=FileReprBase::WRITE;
00176 //          par_d.numfreq = freq_items_with_counters.size(); // If StatOutput is used!!!
00177          DF_D df_decoder(&par_d);
00178          
00179          typedef Bodon::LeafWithoutConstructor LEAF_WC;  
00180          typedef Bodon::Leaf LEAF;       
00181          typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00182          typedef SortedTransactionReader< Coder<T_R, DF_D>, false, false > S_C_T_R;
00183          const bool ENDONLY = true;
00184          typedef OrderReverser< bracz::BuildTreeDBCache< 
00185             S_C_T_R, std::vector<item_t>, bracz::EndPatriciaBuildTree<ENDONLY>, ENDONLY > >S_C;
00186          S_C::params_t par_c;
00187          par_c.file_name = input_file;
00188          par_c.mode=FileReprBase::READ;
00189          par_c.largest_item = tr_reader.getLargestItem();
00190          par_c.decoder = &df_decoder;
00191          par_c.freq_items_with_counters = &freq_items_with_counters;
00192          par_c.codemode = ASC;
00193          log_status(0,"Doing sorted codec.");
00194          S_C sorted_coder(&par_c);
00195 
00196          std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00197             freq_pairs_with_counters;
00198          Frequent2Filter<S_C> fr_2_filter(
00199             &sorted_coder );
00200 //   Frequent2FilterOnline<S_C> fr_2_filter(
00201 //      &sorted_coder );
00202          log_status(0,"Finding frequent pairs.")
00203             fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00204 
00205          const NEELevel NEE =  NEE_Off;
00206          const bool DEADENDPRUNE = false;
00207 
00208          typedef bracz::singleualloc<LEAF_WC, 64 * 1024> LEAF_ALLOCATOR;
00209          typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE> FII;
00210          typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00211          typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00212 
00213          typedef Bodon::FrequentPairInserter<DF_D, TRIE, TRIE, LEAF_ALLOCATOR, NEE> FPI;
00214          typedef Bodon::inhomogeneous_trie::IntersectProPruner<
00215             DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE, DEADENDPRUNE> PRUNER;
00216          typedef Bodon::inhomogeneous_trie::CandidateGeneratorPrune<PRUNER, DF_D, TRIE, LEAF_ALLOCATOR, NEE, DEADENDPRUNE> CG;
00217          typedef Bodon::inhomogeneous_trie::InfreqRemover_AssocRuleGen<DF_D, TRIE, LEAF, LEAF_ALLOCATOR, NEE, DEADENDPRUNE> IR;
00218 
00219          typedef Apriori<S_C, DF_D, TRIE, LEAF_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00220          TRIE main_trie;
00221          LEAF_ALLOCATOR s_alloc;
00222          IR infrequent_remover(main_trie, df_decoder, s_alloc, 
00223                                min_conf, min_lift, nr_of_transactions);
00224 
00225          FII fii(main_trie, df_decoder);
00226          A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00227          log_status(0,"Finding frequent itemsets.")
00228             apriori.findFrequentItemsets( 
00229                nr_of_transactions, *par_c.freq_counters,
00230                freq_pairs_with_counters, min_supp );
00231       }
00232    }
00233    catch (std::ios_base::failure e)
00234    {
00235       log_err(0,"Exiting the program due to IO exception");
00236       return 1;
00237    }
00238 }
00239 
00240 

Generated on Sun Sep 17 17:50:39 2006 for FIM environment by  doxygen 1.4.4