Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members | File Members

independence.cpp

Go to the documentation of this file.
00001 #include "common.hpp"
00002 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00003 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00004 
00005 #include "io/codec/decoder/df/CacheDFDecoder.hpp"
00006 #include "io/output/StatOutput.hpp"
00007 
00008 #include "util/StreamParser.hpp"
00009 #include "util/FrequentFilter.cpp"
00010 
00011 
00012 #include <vector>
00013 #include <iostream>
00014 #include <string>
00015 
00016 
/// Help text describing the expected layout of the transaction file.
/// Empty until init() has been called; printed by usage().
std::string file_format;

/// Fills the global file_format with the description of the input file
/// layout. Calling it again simply rebuilds the same text.
void init()
{
   // Adjacent string literals are concatenated by the compiler, so the
   // whole help text is assigned in a single step.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plain text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items separated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00030 void usage()
00031 {
00032    std::cerr << "\nUsage: independence transactionfile min_supp ";
00033    std::cerr << "outcomefile [options]\n";
00034    std::cerr << "\n transactionfile  file, that contains the transactions of items";
00035    std::cerr << "\n outcomefile\t  file to write the outcome";
00036    std::cerr << "\n min_supp\t  absolute support threshold";
00037 
00038    std::cerr << file_format;
00039    std::cerr << "\n\nHave a succesful mining ;-)"<<std::endl<<std::endl;
00040 }
00041 
00053 int process_arguments( int argc, char *argv[],
00054                        counter_t& min_supp, bool &isrel, double &relminsupp )
00055 {
00056    if ( argc < 4 )
00057    {
00058       usage();
00059       log_err(0,"There are 3 mandatory arguments.");
00060       return 2;
00061    }
00062    std::string mins=argv[2];
00063    if (mins[mins.size()-1]=='%') {
00064       mins.erase(mins.size()-1);
00065       isrel=true;
00066       relminsupp=atof(mins.c_str());
00067       relminsupp/=100;
00068       log_info(0,"Using relative minimum support of %lg",relminsupp);
00069       return 0;
00070    }
00071    isrel=false;
00072 
00073    int min_supp_i;
00074    try
00075    {
00076       convert(argv[2], min_supp_i);
00077       if ( min_supp_i <= 0  )
00078       {
00079          log_err(0,"%s cannot be converted to a positive integer.",argv[2]);
00080          return 3;
00081       }
00082    }
00083    catch(BadConversion e)
00084    {
00085       log_err(0,"min_supp conversion problem.");
00086       return 3;
00087    }
00088    min_supp = static_cast<counter_t>(min_supp_i);
00089    log_info(0,"min_supp is set to %d", min_supp);
00090    return 0;
00091 }
00092 void findFrequentSets(const std::vector<double>& frequencies,
00093                       const double min_freq, const counter_t nr_of_transactions,
00094                       double prefixfrequency,
00095                       size_t index, StatOutput< CacheDFDecoder< >, true >& df_decoder )
00096 {
00097    double new_prefixfrequency;
00098    while(index < frequencies.size())
00099    {
00100       new_prefixfrequency = prefixfrequency * frequencies[index];
00101       if( new_prefixfrequency >= min_freq)
00102       {
00103          df_decoder.pushItemWithWrite(
00104             index, static_cast<counter_t>( new_prefixfrequency * nr_of_transactions));
00105          findFrequentSets( frequencies, min_freq, nr_of_transactions,
00106                            new_prefixfrequency, ++index, df_decoder );
00107          df_decoder.popItem();
00108       }
00109       else
00110          break;
00111    }
00112 }
00113 int main( int argc, char *argv[] )
00114 {
00115    init();
00116    counter_t min_supp;
00117    bool relative;
00118    double relminsupp;
00119       
00120    {
00121       int return_val = 
00122          process_arguments( argc, argv, min_supp, relative, relminsupp );
00123       if(return_val)
00124          return return_val;
00125    }
00126 
00127    char* input_file = argv[1];
00128    char* output_file = argv[3];
00129 
00130    try
00131    {
00132       // We assume that the transactions does not contain duplicates!!!
00133       typedef brBufferedTransactionReader< > T_R;
00134       // Otherwise uncmment this:
00135       // typedef SortedTransactionReader<brBufferedTransactionReader< >, true> T_R;
00136 
00137       T_R::params_t par_i;
00138       par_i.file_name = input_file;
00139       par_i.mode=FileReprBase::READ;
00140       par_i.file_buffer_size = 16 * 1024;
00141       T_R tr_reader(&par_i);
00142       std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00143       counter_t nr_of_transactions;
00144       // The first step of each algorithms is determining the frequent items.
00145       FrequentFilter<T_R>
00146          fr_filter(tr_reader);
00147       log_status(0,"Finding frequent items.");
00148       fr_filter.findFrequentItems( freq_items_with_counters,   
00149                                    nr_of_transactions, min_supp);
00150 
00151       sort( freq_items_with_counters.begin(),
00152             freq_items_with_counters.end() );
00153       reverse( freq_items_with_counters.begin(),
00154                freq_items_with_counters.end() );
00155       std::vector<item_t> code_inverse;
00156       code_inverse.reserve(freq_items_with_counters.size());
00157       std::vector<double> frequencies;
00158       frequencies.reserve(freq_items_with_counters.size());
00159       std::vector< std::pair<counter_t, item_t> >::size_type index;
00160       for( index = 0; index < freq_items_with_counters.size(); ++index )
00161       {
00162          code_inverse.push_back( freq_items_with_counters[index].second );
00163          frequencies.push_back( 
00164             static_cast<double>(freq_items_with_counters[index].first)/ nr_of_transactions);
00165       }
00166       log_status(0,"Doing decoder.");
00167       typedef StatOutput< CacheDFDecoder< >, true >DF_D;
00168 
00169       DF_D::params_t par_d;
00170       par_d.file_name = output_file;
00171       par_d.mode=FileReprBase::WRITE;
00172       par_d.numfreq = freq_items_with_counters.size(); // If StatOutput is used!!!
00173       DF_D df_decoder(&par_d);
00174       df_decoder.setCodeInverse(code_inverse);
00175 
00176       if( min_supp<= nr_of_transactions)
00177       {
00178          df_decoder.write(nr_of_transactions);
00179          findFrequentSets( frequencies,
00180                            static_cast<double>(min_supp)/nr_of_transactions,
00181                            nr_of_transactions, 1.0, 0, df_decoder );
00182       }
00183    }
00184    catch (std::ios_base::failure e)
00185    {
00186       log_err(0,"Exiting the program due to IO exception");
00187       return 1;
00188    }
00189    return 0;
00190 }
00191 
00192 

Generated on Sun Sep 17 17:50:38 2006 for FIM environment by  doxygen 1.4.4