00001
00029 #include "common.hpp"
00030 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00031
00032 #include "io/codec/decoder/df/CacheDFDecoder.hpp"
00033
00034
00035 #include "util/StreamParser.hpp"
00036 #include "util/SeqFrequentFilter.cpp"
00037
00038 #include "datastructures/maxvector.hpp"
00039 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00040
00041
00042 #include "apriori/bodon/Trie.hpp"
00043
00044 #include "apriori/SeqAprioriSelector.hpp"
00045 #include <vector>
00046 #include <iostream>
00047 #include <string>
00048
00049
/// Help text describing the transaction file layout; filled in by init()
/// and printed by usage().
std::string file_format;

/**
 * Builds the file-format description and stores it in file_format.
 *
 * Any previous content of file_format is overwritten, so calling this
 * function more than once yields the same text.
 */
void init()
{
   // Adjacent string literals are joined by the compiler, so this single
   // assignment produces the same text as a chain of appends.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00063 void usage()
00064 {
00065 std::cerr << "\nUsage: fsm algorithm transactionfile min_supp ";
00066 std::cerr << "outcomefile [maxsize]\n";
00067 std::cerr << "\n algorithm\t the name of the algorithm, i.e:\n";
00068 std::cerr << "\t\t apriori, apriori-noprune, apriori-intersectprune";
00069 std::cerr << "\n transactionfile file, that contains the tranasctions of items";
00070 std::cerr << "\n outcomefile\t file to write the outcome";
00071 std::cerr << "\n min_supp\t absolute support threshold";
00072 std::cerr << "\n maxsize\t the upper limit of the size of the frequent sets\n";
00073
00074 std::cerr << file_format;
00075 std::cerr << "\n\nHave a succesful mining ;-)"<<std::endl<<std::endl;
00076 }
00077
00088 int process_arguments( int argc, char *argv[], counter_t& min_supp,
00089 bool &isrel, double &relminsupp, unsigned int& maxsize )
00090 {
00091 if ( argc < 5 )
00092 {
00093 log_err(0,"There are 4 mandatory arguments.");
00094 usage();
00095 return 2;
00096 }
00097 std::string mins=argv[3];
00098 if (mins[mins.size()-1]=='%') {
00099 mins.erase(mins.size()-1);
00100 isrel=true;
00101 relminsupp=atof(mins.c_str());
00102 relminsupp/=100;
00103 log_status(0,"Using relative minimum support of %lg",relminsupp);
00104 return 0;
00105 }
00106 isrel=false;
00107 int min_supp_i;
00108 try
00109 {
00110 convert(argv[3], min_supp_i);
00111 if ( min_supp_i <= 0 )
00112 {
00113 log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00114 return 3;
00115 }
00116 }
00117 catch(BadConversion e)
00118 {
00119 log_err(0,"min_supp conversion problem.");
00120 return 3;
00121 }
00122 min_supp = static_cast<counter_t>(min_supp_i);
00123 log_status(0,"min_supp is set to %d", min_supp);
00124 if(argc == 6)
00125 {
00126 int maxsize_i;
00127 try
00128 {
00129 convert(argv[5], maxsize_i);
00130 if ( maxsize_i <= 0 )
00131 {
00132 log_err(0,"%s cannot be converted to a positive integer.",argv[5]);
00133 return 4;
00134 }
00135 }
00136 catch(BadConversion e)
00137 {
00138 log_err(0,"max_size conversion problem.");
00139 return 4;
00140 }
00141 maxsize = static_cast<unsigned int>(maxsize_i);
00142 log_status(0,"maxsize is set to %d", maxsize);
00143 }
00144 else
00145 maxsize = largest_itemsetsize;
00146 return 0;
00147 }
00148
00149 int main( int argc, char *argv[] )
00150 {
00151 init();
00152 counter_t min_supp;
00153 unsigned int maxsize;
00154 bool relative;
00155 double relminsupp;
00156
00157 {
00158 int return_val =
00159 process_arguments( argc, argv, min_supp, relative, relminsupp, maxsize );
00160 if(return_val)
00161 return return_val;
00162 }
00163
00164 if( strncmp(argv[1],"apriori",7) == 0 )
00165 {
00166
00167 char* algorithm = argv[1];
00168 char* input_file = argv[2];
00169 char* output_file = argv[4];
00170
00171 try
00172 {
00173 typedef brBufferedTransactionReader< > T_R;
00174 T_R::params_t par_i;
00175 par_i.file_name = input_file;
00176 par_i.mode=FileReprBase::READ;
00177 par_i.file_buffer_size = 16 * 1024;
00178 T_R tr_reader(&par_i);
00179 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00180 counter_t nr_of_transactions;
00181
00182 SeqFrequentFilter<T_R>
00183 fr_filter(tr_reader);
00184 log_status(0,"Finding frequent items.");
00185 fr_filter.findFrequentItems( freq_items_with_counters,
00186 nr_of_transactions, min_supp);
00187
00188 log_status(0,"Doing decoder.");
00189 typedef CacheDFDecoder< > DF_D;
00190
00191
00192 DF_D::params_t par_d;
00193 par_d.file_name = output_file;
00194 par_d.mode=FileReprBase::WRITE;
00195
00196 DF_D df_decoder(&par_d);
00197 typedef Bodon::LeafWithoutConstructor LEAF_WC;
00198 typedef Bodon::Leaf LEAF;
00199 if(strncmp(argv[1],"apriori",7) == 0)
00200 {
00201 log_status(0,"APRIORI is selected");
00202
00203 typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE_BASE;
00204
00205
00206 SeqAprioriSelector<TRIE_BASE, LEAF_WC, T_R, DF_D>(
00207 min_supp, algorithm, input_file, nr_of_transactions,
00208 freq_items_with_counters, tr_reader, df_decoder, maxsize );
00209 }
00210 }
00211 catch (std::ios_base::failure e)
00212 {
00213 log_err(0,"Exiting the program due to IO exception");
00214 return 1;
00215 }
00216 }
00217 else
00218 {
00219 usage();
00220 log_err(0,"algorithm should be apriori");
00221 return 1;
00222 }
00223 }
00224
00225