00001 #include "common.hpp"
00002 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00003 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00004
00005 #include "io/codec/decoder/df/CacheDFDecoder.hpp"
00006 #include "io/output/StatOutput.hpp"
00007
00008 #include "util/StreamParser.hpp"
00009 #include "util/FrequentFilter.cpp"
00010
00011
00012 #include <vector>
00013 #include <iostream>
00014 #include <string>
00015
00016
/// Help text describing the expected transaction-file layout.
/// Empty until init() has been called; usage() prints it.
std::string file_format;

/**
 * Fills the global file_format with the input-format description.
 *
 * The text is assigned (not appended), so calling init() more than once
 * leaves file_format with the same content.
 */
void init()
{
   // Adjacent string literals are merged by the compiler into one literal.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00030 void usage()
00031 {
00032 std::cerr << "\nUsage: independence transactionfile min_supp ";
00033 std::cerr << "outcomefile [options]\n";
00034 std::cerr << "\n transactionfile file, that contains the transactions of items";
00035 std::cerr << "\n outcomefile\t file to write the outcome";
00036 std::cerr << "\n min_supp\t absolute support threshold";
00037
00038 std::cerr << file_format;
00039 std::cerr << "\n\nHave a succesful mining ;-)"<<std::endl<<std::endl;
00040 }
00041
00053 int process_arguments( int argc, char *argv[],
00054 counter_t& min_supp, bool &isrel, double &relminsupp )
00055 {
00056 if ( argc < 4 )
00057 {
00058 usage();
00059 log_err(0,"There are 3 mandatory arguments.");
00060 return 2;
00061 }
00062 std::string mins=argv[2];
00063 if (mins[mins.size()-1]=='%') {
00064 mins.erase(mins.size()-1);
00065 isrel=true;
00066 relminsupp=atof(mins.c_str());
00067 relminsupp/=100;
00068 log_info(0,"Using relative minimum support of %lg",relminsupp);
00069 return 0;
00070 }
00071 isrel=false;
00072
00073 int min_supp_i;
00074 try
00075 {
00076 convert(argv[2], min_supp_i);
00077 if ( min_supp_i <= 0 )
00078 {
00079 log_err(0,"%s cannot be converted to a positive integer.",argv[2]);
00080 return 3;
00081 }
00082 }
00083 catch(BadConversion e)
00084 {
00085 log_err(0,"min_supp conversion problem.");
00086 return 3;
00087 }
00088 min_supp = static_cast<counter_t>(min_supp_i);
00089 log_info(0,"min_supp is set to %d", min_supp);
00090 return 0;
00091 }
00092 void findFrequentSets(const std::vector<double>& frequencies,
00093 const double min_freq, const counter_t nr_of_transactions,
00094 double prefixfrequency,
00095 size_t index, StatOutput< CacheDFDecoder< >, true >& df_decoder )
00096 {
00097 double new_prefixfrequency;
00098 while(index < frequencies.size())
00099 {
00100 new_prefixfrequency = prefixfrequency * frequencies[index];
00101 if( new_prefixfrequency >= min_freq)
00102 {
00103 df_decoder.pushItemWithWrite(
00104 index, static_cast<counter_t>( new_prefixfrequency * nr_of_transactions));
00105 findFrequentSets( frequencies, min_freq, nr_of_transactions,
00106 new_prefixfrequency, ++index, df_decoder );
00107 df_decoder.popItem();
00108 }
00109 else
00110 break;
00111 }
00112 }
00113 int main( int argc, char *argv[] )
00114 {
00115 init();
00116 counter_t min_supp;
00117 bool relative;
00118 double relminsupp;
00119
00120 {
00121 int return_val =
00122 process_arguments( argc, argv, min_supp, relative, relminsupp );
00123 if(return_val)
00124 return return_val;
00125 }
00126
00127 char* input_file = argv[1];
00128 char* output_file = argv[3];
00129
00130 try
00131 {
00132
00133 typedef brBufferedTransactionReader< > T_R;
00134
00135
00136
00137 T_R::params_t par_i;
00138 par_i.file_name = input_file;
00139 par_i.mode=FileReprBase::READ;
00140 par_i.file_buffer_size = 16 * 1024;
00141 T_R tr_reader(&par_i);
00142 std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00143 counter_t nr_of_transactions;
00144
00145 FrequentFilter<T_R>
00146 fr_filter(tr_reader);
00147 log_status(0,"Finding frequent items.");
00148 fr_filter.findFrequentItems( freq_items_with_counters,
00149 nr_of_transactions, min_supp);
00150
00151 sort( freq_items_with_counters.begin(),
00152 freq_items_with_counters.end() );
00153 reverse( freq_items_with_counters.begin(),
00154 freq_items_with_counters.end() );
00155 std::vector<item_t> code_inverse;
00156 code_inverse.reserve(freq_items_with_counters.size());
00157 std::vector<double> frequencies;
00158 frequencies.reserve(freq_items_with_counters.size());
00159 std::vector< std::pair<counter_t, item_t> >::size_type index;
00160 for( index = 0; index < freq_items_with_counters.size(); ++index )
00161 {
00162 code_inverse.push_back( freq_items_with_counters[index].second );
00163 frequencies.push_back(
00164 static_cast<double>(freq_items_with_counters[index].first)/ nr_of_transactions);
00165 }
00166 log_status(0,"Doing decoder.");
00167 typedef StatOutput< CacheDFDecoder< >, true >DF_D;
00168
00169 DF_D::params_t par_d;
00170 par_d.file_name = output_file;
00171 par_d.mode=FileReprBase::WRITE;
00172 par_d.numfreq = freq_items_with_counters.size();
00173 DF_D df_decoder(&par_d);
00174 df_decoder.setCodeInverse(code_inverse);
00175
00176 if( min_supp<= nr_of_transactions)
00177 {
00178 df_decoder.write(nr_of_transactions);
00179 findFrequentSets( frequencies,
00180 static_cast<double>(min_supp)/nr_of_transactions,
00181 nr_of_transactions, 1.0, 0, df_decoder );
00182 }
00183 }
00184 catch (std::ios_base::failure e)
00185 {
00186 log_err(0,"Exiting the program due to IO exception");
00187 return 1;
00188 }
00189 return 0;
00190 }
00191
00192