00001
00002
00003 #include "common.hpp"
00004 #include "common/log.h"
00005 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00006 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00007
00008
00009 #include "io/codec/decoder/normal/CacheNormalDecoder.hpp"
00010
00011 #include "io/output/normal/SortOutput.hpp"
00012
00013 #include "util/StreamParser.hpp"
00014 #include "util/FrequentFilter.cpp"
00015
00016 #include "datastructures/maxvector.hpp"
00017
00018 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00019 #include "io/input/transaction_reader/OrderReverser.hpp"
00020 #include "io/codec/coder/Coder.hpp"
00021 #include "io/db_cache/BuildTreeDBCache.hpp"
00022 #include "util/Frequent2Filter.cpp"
00023 #include "util/Frequent2FilterOnline.cpp"
00024 #include "apriori/bodon-vector/vector_typedef.hpp"
00025 #include "apriori/OneByOneSupportCounter.hpp"
00026 #include "apriori/bodon-vector/SupportCounter.hpp"
00027 #include "apriori/bodon-vector/CandidateGeneratorNoprune.hpp"
00028 #include "apriori/bodon-vector/CandidateGeneratorSimplePrune.hpp"
00029 #include "apriori/bodon-vector/InfrequentRemover.hpp"
00030 #include "apriori/bodon-vector/FrequentItemInserter.hpp"
00031 #include "apriori/bodon-vector/FrequentPairInserterNoprune.hpp"
00032
00033 #include "datastructures/PrefixArrayList.hpp"
00034 #include "apriori/bodon-prefixarray/SupportCounter.hpp"
00035 #include "apriori/bodon-prefixarray/CandidateGeneratorNoprune.hpp"
00036 #include "apriori/bodon-prefixarray/InfrequentRemover.hpp"
00037 #include "apriori/bodon-prefixarray/FrequentPairInserterNoprune.hpp"
00038
00039
00040
00041 #include "apriori/Apriori.hpp"
00042
00043 #include <vector>
00044 #include <iostream>
00045 #include <string>
00046
00047
00048 std::string file_format;
00049
00050 void init()
00051 {
00052 file_format = "File format:";
00053 file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00054 file_format += "represents a transaction. \n";
00055 file_format += "A transaction is a set of items seperated by a nonnumeric ";
00056 file_format += "character.\nIt can be for example a white space, comma, ";
00057 file_format += "colon, etc.\n";
00058 file_format += "Items are nonnegative integers.\n";
00059 }
00061 void usage()
00062 {
00063 log_err(0,"Usage: apriori-nontrie transactionfile min_supp outcomefile data_structure [maxsize]");
00064 log_err(0," transactionfile\t file, that contains the tranasctions of items");
00065 log_err(0," outcomefile\t file to write the outcome");
00066 log_err(0," min_supp\t absolute support threshold");
00067 log_err(0," data_structure\t data structure to store the candidates: i.e vector, prefixarray");
00068
00069 std::cerr << file_format;
00070 log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00071 }
00072
00084 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00085 bool &isrel, double &relminsupp, unsigned int& maxsize )
00086 {
00087 if( argc < 5 )
00088 {
00089 log_err(0,"There are 4 mandatory arguments.");
00090 return 2;
00091 }
00092
00093 std::string mins=argv[2];
00094 if (mins[mins.size()-1]=='%') {
00095 mins.erase(mins.size()-1);
00096 isrel=true;
00097 relminsupp=atof(mins.c_str());
00098 relminsupp/=100;
00099 log_status(0,"Using relative minimum support of %lg",relminsupp);
00100 return 0;
00101 }
00102 isrel=false;
00103 int min_supp_i;
00104 try
00105 {
00106 convert(argv[2], min_supp_i);
00107 if ( min_supp_i <= 0 )
00108 {
00109 log_err(0,"%s cannot be converted to a positive integer.",argv[2]);
00110 return 3;
00111 }
00112 }
00113 catch(BadConversion e)
00114 {
00115 log_err(0,"min_supp conversion problem.");
00116 return 3;
00117 }
00118 min_supp = static_cast<counter_t>(min_supp_i);
00119 log_status(0,"min_supp is set to %d", min_supp);
00120
00121 if(argc == 6)
00122 {
00123 int maxsize_i;
00124 try
00125 {
00126 convert(argv[5], maxsize_i);
00127 if ( maxsize_i <= 0 )
00128 {
00129 log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00130 return 4;
00131 }
00132 }
00133 catch(BadConversion e)
00134 {
00135 log_err(0,"min_supp conversion problem.");
00136 return 4;
00137 }
00138 maxsize = static_cast<unsigned int>(maxsize_i);
00139 log_status(0,"maxsize is set to %d", maxsize);
00140 }
00141 else
00142 maxsize = largest_itemsetsize;
00143 return 0;
00144 }
00145
00146 int main( int argc, char *argv[] )
00147 {
00148 init();
00149 counter_t min_supp;
00150 unsigned int maxsize;
00151 bool relative;
00152 double relminsupp;
00153
00154 {
00155 int return_val =
00156 process_arguments( argc, argv, min_supp,
00157 relative, relminsupp, maxsize );
00158 if(return_val)
00159 return return_val;
00160 }
00161
00162 char* input_file = argv[1];
00163 char* output_file = argv[3];
00164
00165 try
00166 {
00167
00168 typedef brBufferedTransactionReader< > T_R;
00169
00170
00171
00172 T_R::params_t par_i;
00173 par_i.file_name = input_file;
00174 par_i.mode=FileReprBase::READ;
00175 par_i.file_buffer_size = 16 * 1024;
00176 T_R tr_reader(&par_i);
00177 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00178 counter_t nr_of_transactions;
00179
00180 FrequentFilter<T_R>
00181 fr_filter(tr_reader);
00182 log_status(0,"Finding frequent items.");
00183 fr_filter.findFrequentItems( freq_items_with_counters,
00184 nr_of_transactions, min_supp);
00185
00186 if(!freq_items_with_counters.empty())
00187 {
00188 log_status(0,"Doing decoder.");
00189 typedef CacheNormalDecoder< > D;
00190
00191 D::params_t par_d;
00192 par_d.file_name = output_file;
00193 par_d.mode=FileReprBase::WRITE;
00194
00195 D decoder(&par_d);
00196
00197
00198 typedef SortedTransactionReader< Coder<T_R, D>, false, false > S_C_T_R;
00199 typedef OrderReverser< bracz::BuildTreeDBCache<
00200 S_C_T_R, std::vector<item_t>, bracz::EndPatriciaBuildTree<true> > >S_C;
00201
00202
00203 typedef Frequent2Filter<S_C> F2F;
00204
00205 S_C::params_t par_c;
00206 par_c.file_name = input_file;
00207 par_c.mode=FileReprBase::READ;
00208 par_c.largest_item = tr_reader.getLargestItem();
00209 par_c.decoder = &decoder;
00210 par_c.freq_items_with_counters = &freq_items_with_counters;
00211 par_c.codemode = ASC;
00212
00213 log_status(0,"Doing sorted codec.");
00214 S_C sorted_coder(&par_c);
00215 std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00216 freq_pairs_with_counters;
00217 F2F fr_2_filter(&sorted_coder );
00218 log_status(0,"Finding frequent pairs.")
00219 fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00220 typedef bool DUMMY;
00221 DUMMY dummy;
00222 if( strstr(argv[4],"vector") )
00223 {
00224 log_status(0,"Candidates are stored in a vector");
00225 typedef cand_vector_t DATA_STRUCTURE;
00226 DATA_STRUCTURE candidates;
00227 typedef Bodon::vector_based::FrequentItemInserter<DATA_STRUCTURE, D> FII;
00228 FII fii(candidates, decoder);
00229 typedef Bodon::vector_based::SupportCounter<DATA_STRUCTURE> SUPP_C_BASE;
00230 typedef OneByOneSupportCounter<DATA_STRUCTURE, S_C, SUPP_C_BASE> SUPP_C;
00231
00232 typedef Bodon::vector_based::FrequentPairInserterNoprune<D, DUMMY> FPI;
00233
00234 typedef Bodon::vector_based::CandidateGeneratorSimplePrune<D, DUMMY> CG;
00235 typedef Bodon::vector_based::InfrequentRemover<D, DUMMY> IR;
00236 IR infrequent_remover(candidates, decoder, dummy);
00237 typedef Apriori<S_C, D, DATA_STRUCTURE, DUMMY, FII,
00238 FPI, CG, IR, SUPP_C> A;
00239 A apriori(candidates, dummy, infrequent_remover, sorted_coder, decoder, fii);
00240 log_status(0,"Finding frequent itemsets.")
00241 apriori.findFrequentItemsets(
00242 nr_of_transactions, *par_c.freq_counters,
00243 freq_pairs_with_counters, min_supp, maxsize );
00244 }
00245 else
00246 {
00247 log_status(0,"Candidates are stored in a prefix-array.");
00248 typedef PrefixArrayList<> DATA_STRUCTURE;
00249 DATA_STRUCTURE candidates(freq_items_with_counters.size()-1);
00250 typedef Bodon::vector_based::FrequentItemInserter<DATA_STRUCTURE, D> FII;
00251 FII fii(candidates, decoder);
00252 typedef Bodon::prefix_array::SupportCounter<DATA_STRUCTURE> SUPP_C_BASE;
00253 typedef OneByOneSupportCounter<DATA_STRUCTURE, S_C, SUPP_C_BASE> SUPP_C;
00254 typedef Bodon::prefix_array::FrequentPairInserterNoprune<DATA_STRUCTURE, D, DUMMY> FPI;
00255 typedef Bodon::prefix_array::CandidateGeneratorNoprune<DATA_STRUCTURE, D, DUMMY> CG;
00256 typedef Bodon::prefix_array::InfrequentRemover<DATA_STRUCTURE, D, DUMMY> IR;
00257 IR infrequent_remover(candidates, decoder, dummy);
00258 typedef Apriori<S_C, D, DATA_STRUCTURE, DUMMY, FII,
00259 FPI, CG, IR, SUPP_C> A;
00260 A apriori(candidates, dummy, infrequent_remover, sorted_coder, decoder, fii);
00261 log_status(0,"Finding frequent itemsets.")
00262 apriori.findFrequentItemsets(
00263 nr_of_transactions, *par_c.freq_counters,
00264 freq_pairs_with_counters, min_supp, maxsize );
00265 }
00266 }
00267 }
00268 catch (std::ios_base::failure e)
00269 {
00270 log_err(0,"Exiting the program due to IO exception");
00271 return 1;
00272 }
00273 }
00274
00275