00001
00008 #include "common.hpp"
00009 #include "common/log.h"
00010 #include "common/allocators.hpp"
00011 #include "io/input/transaction_reader/LBufferedTransactionReader.hpp"
00012 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00013
00014 #include "io/codec/coder/Coder.hpp"
00015 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00016
00017
00018
00019 #include "io/db_cache/RBTreeDBCache.hpp"
00020
00021 #include "util/StreamParser.hpp"
00022 #include "util/FrequentFilter.cpp"
00023 #include "util/Frequent2Filter.cpp"
00024
00025 #include "apriori/bodon/Trie.hpp"
00026 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00027
00028 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00029 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00030 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00031 #include "apriori/OneByOneSupportCounter.hpp"
00032 #include "apriori/bodon/trie/trie_manipulators/SimplePruner.hpp"
00033 #include "apriori/bodon/trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00034 #include "apriori/bodon/trie/trie_manipulators/InfreqRemover.hpp"
00035
00036 #include "apriori/Apriori.hpp"
00037
00038 #include <vector>
00039 #include <iostream>
00040 #include <string>
00041
00042
00043 std::string file_format;
00044
00045 void init()
00046 {
00047 file_format = "File format:";
00048 file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00049 file_format += "represents a transaction. \n";
00050 file_format += "A transaction is a set of items seperated by a nonnumeric ";
00051 file_format += "character.\nIt can be for example a white space, comma, ";
00052 file_format += "colon, etc.\n";
00053 file_format += "Items are nonnegative integers.\n";
00054 }
00056 void usage()
00057 {
00058 std::cerr<<"\nUsage: apriori-simple transactionfile min_supp outcomefile [maxsize]\n";
00059 std::cerr<<" transactionfile\t file, that contains the tranasctions of items\n";
00060 std::cerr<<" outcomefile\t file to write the outcome\n";
00061 std::cerr<<" min_supp\t absolute support threshold\n";
00062 std::cerr<<" maxsize\t the upper limit of the size of the frequent sets\n";
00063
00064 std::cerr << file_format;
00065 std::cerr<<"\n\t\t\tHave a succesful mining ;-)\n\n";
00066 }
00067
00078 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00079 bool &isrel, double &relminsupp, unsigned int& maxsize )
00080 {
00081 if ( argc < 4 )
00082 {
00083 log_err(0,"There are 3 mandatory arguments.");
00084 usage();
00085 return 2;
00086 }
00087 std::string mins=argv[2];
00088 if (mins[mins.size()-1]=='%') {
00089 mins.erase(mins.size()-1);
00090 isrel=true;
00091 relminsupp=atof(mins.c_str());
00092 relminsupp/=100;
00093 log_info(0,"Using relative minimum support of %lg",relminsupp);
00094 return 0;
00095 }
00096 isrel=false;
00097 int min_supp_i;
00098 try
00099 {
00100 convert(argv[2], min_supp_i);
00101 if ( min_supp_i <= 0 )
00102 {
00103 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00104 return 3;
00105 }
00106 }
00107 catch(BadConversion e)
00108 {
00109 log_err(0,"min_supp conversion problem.");
00110 return 3;
00111 }
00112 min_supp = static_cast<counter_t>(min_supp_i);
00113 log_info(0,"min_supp is set to %d", min_supp);
00114 if(argc == 5)
00115 {
00116 int maxsize_i;
00117 try
00118 {
00119 convert(argv[4], maxsize_i);
00120 if ( maxsize_i <= 0 )
00121 {
00122 log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00123 return 4;
00124 }
00125 }
00126 catch(BadConversion e)
00127 {
00128 log_err(0,"max_size conversion problem.");
00129 return 4;
00130 }
00131 maxsize = static_cast<unsigned int>(maxsize_i);
00132 log_status(0,"maxsize is set to %d", maxsize);
00133 }
00134 else
00135 maxsize = largest_itemsetsize;
00136 return 0;
00137 }
00138
00151 int main( int argc, char *argv[] )
00152 {
00153 init();
00154 counter_t min_supp;
00155 unsigned int maxsize;
00156 bool relative;
00157 double relminsupp;
00158
00159 {
00160 int return_val =
00161 process_arguments( argc, argv, min_supp, relative, relminsupp, maxsize );
00162 if(return_val)
00163 return return_val;
00164 }
00165
00166 char* input_file = argv[1];
00167 char* output_file = argv[3];
00168
00169 try
00170 {
00171
00172 typedef LBufferedTransactionReader< > T_R;
00173
00174
00175
00176 T_R::params_t par_i;
00177 par_i.file_name = input_file;
00178 par_i.mode=FileReprBase::READ;
00179 par_i.file_buffer_size = 16 * 1024;
00180 T_R tr_reader(&par_i);
00181 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00182 counter_t nr_of_transactions;
00183
00184 FrequentFilter<T_R>
00185 fr_filter(tr_reader);
00186 log_status(0,"Finding frequent items.");
00187 fr_filter.findFrequentItems( freq_items_with_counters,
00188 nr_of_transactions, min_supp);
00189
00190 if(!freq_items_with_counters.empty())
00191 {
00192 log_status(0,"Doing decoder.");
00193 typedef SimpleDFDecoder< > DF_D;
00194
00195
00196
00197 DF_D::params_t par_d;
00198 par_d.file_name = output_file;
00199 par_d.mode=FileReprBase::WRITE;
00200
00201 DF_D df_decoder(&par_d);
00202
00203 typedef Bodon::Leaf LEAF;
00204 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00205 typedef SortedTransactionReader< Coder<T_R, DF_D> > S_C_T_R;
00206 typedef Bodon::RBTreeDBCache<S_C_T_R, std::vector<item_t> > S_C;
00207 S_C::params_t par_c;
00208 par_c.file_name = input_file;
00209 par_c.mode=FileReprBase::READ;
00210 par_c.largest_item = tr_reader.getLargestItem();
00211 par_c.decoder = &df_decoder;
00212 par_c.freq_items_with_counters = &freq_items_with_counters;
00213 par_c.codemode = ASC;
00214 log_status(0,"Doing sorted codec.");
00215 S_C sorted_coder(&par_c);
00216
00217 std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00218 freq_pairs_with_counters;
00219 Frequent2Filter<S_C> fr_2_filter(
00220 &sorted_coder );
00221
00222
00223 log_status(0,"Finding frequent pairs.")
00224 fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00225
00226 const NEELevel NEE = NEE_Off;
00227 typedef NewWrapperAlloc<TRIE> TRIE_ALLOCATOR;
00228 typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE> FII;
00229 typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00230 typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00231
00232 TRIE main_trie;
00233 TRIE_ALLOCATOR s_alloc;
00234 typedef Bodon::FrequentPairInserter<DF_D, TRIE, TRIE, TRIE_ALLOCATOR, NEE> FPI;
00235 typedef Bodon::trie::SimplePruner<DF_D, TRIE, NewWrapperAlloc<TRIE>, NEE> PRUNER;
00236 typedef Bodon::CandidateGeneratorPrune<PRUNER, DF_D, TRIE, TRIE_ALLOCATOR, NEE> CG;
00237 typedef Bodon::trie::InfreqRemover<DF_D, TRIE, TRIE_ALLOCATOR, NEE> IR;
00238 IR infrequent_remover(main_trie, df_decoder, s_alloc);
00239 typedef Apriori<S_C, DF_D, TRIE, TRIE_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00240
00241 FII fii(main_trie, df_decoder);
00242 A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00243 log_status(0,"Finding frequent itemsets.")
00244 apriori.findFrequentItemsets(
00245 nr_of_transactions, *par_c.freq_counters,
00246 freq_pairs_with_counters, min_supp, maxsize );
00247 }
00248 }
00249 catch (std::ios_base::failure e)
00250 {
00251 log_err(0,"Exiting the program due to IO exception");
00252 return 1;
00253 }
00254 }
00255
00256