cscenter · baranov1ch · Dec 3, 2014 · Dec 9, 2014 · Dec 9, 2014 · Dec 9, 2014
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -3,5 +3,5 @@ add_subdirectory(gtest)
 add_subdirectory(dict_builder)
 add_subdirectory(incremental_tester/)
 add_subdirectory(incremental_updater/)
-
+add_subdirectory(two_dicts_tester/)
 
diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
@@ -22,3 +22,25 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of solution we just sort all survived substring by their rating and pick them until we reach the limit on the dictionary size.  
 
 Time complexity is `O(sum_length_documents)` with relatively small constant.
+
+**Further improvements**
+
+We can make our algorithm online, that is, process documents one by one (or set by set) and be able to obtain an updated dictionary after each operation. We should also remember that the newest documents should have a bigger influence on the resulting dictionary than the old ones. This is done in the following way: after each update operation the score of every existing string is multiplied by some constant ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png). A smaller alpha corresponds to a higher sensitivity to the new documents. One can see that we can achieve almost the same effect by multiplying the newest document by ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126773/render.png).  
+
+Another modification reduces the automaton's size when it becomes too big. In that case we repeatedly delete the node without outgoing edges (a leaf of the automaton) that has the smallest score, until we reach the desired number of nodes.  
+
+The remaining part of our model is still the same.
+
+**Usage**  
+
+We use this tool via objects of the class `Dictionary`. One can pass the following parameters to the constructor: maximum size of the dictionary, minimum length of a string in the dictionary, stop symbol (say, `#`), maximum size of the automaton, coefficient ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png).  
+
+Also, there are useful methods:  
+`Dictionary::AddDocument` corresponds to the operation `whole_string += document`,  
+`Dictionary::AddDocumentViaStopSymbol` corresponds to the operation `whole_string += stop_symbol + document`,  
+`Dictionary::BuildDict` builds the dictionary from the current whole string,  
+`Dictionary::GetDict` returns dictionary obtained via the latest call of the previous method.  
+
+Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` will be empty!  
+
+We suggest calling `GetDict` only when one really needs the current dictionary.
diff --git a/src/two_dicts_tester/CMakeLists.txt b/src/two_dicts_tester/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Executable that compares two dictionaries (see Readme.md in this folder).
+add_executable(two_dicts_tester tester.cpp)
+# LINK_PUBLIC: the listed libraries are part of the target's link interface.
+target_link_libraries(two_dicts_tester LINK_PUBLIC
+  vcdenc
+  vcddec
+  dictgen
+)
diff --git a/src/two_dicts_tester/Readme.md b/src/two_dicts_tester/Readme.md
@@ -0,0 +1,8 @@
+This is a program that calculates the advantage of using the new dictionary against the old one.  
+
+**Usage**  
+`./two_dicts_tester  -file_name_old %file_name_of_the_old_dictionary's_file% -file_name_new %file_name_of_the_new_dictionary's_file% -folder_documents %name_of_the_folder_with_documents_to_test% -result_file_name %file_name_of_the_resulted_file_with_one_number%`  
+
+The result is the fraction `score_of_new_dictionary / score_of_old_dictionary`.  
+
+Smaller score is better since it is `size_encoded_file / size_file`.
diff --git a/src/two_dicts_tester/tester.cpp b/src/two_dicts_tester/tester.cpp
@@ -0,0 +1,155 @@
+#include <iostream>
+#include <cstring>
+#include <stdexcept>
+#include <fstream>
+#include <google/vcencoder.h>
+#include <google/vcdecoder.h>
+#include <dirent.h>
+#include <sys/stat.h>
+
+using std::cout;
+using std::endl;
+using std::string;
+// Names of the command-line flags recognized by main().
+const char * const kOldDictFileNameParam = "-file_name_old";
+const char * const kNewDictFileNameParam = "-file_name_new";
+const char * const kDocumentsFolderParam = "-folder_documents";
+const char * const kResultFileNameParam = "-result_file_name";
+
+// Returns delta_size / content_size for the file at `path` VCDIFF-encoded
+// against `dict`, or 0.0 for an empty file. Throws std::invalid_argument if the
+// file cannot be opened and std::runtime_error if the encoder reports failure.
+double CalcScore(const string& path, const string& dict) {
+  std::ifstream input(path.c_str(), std::ios::in | std::ios::binary);
+  if (!input.is_open()) {
+    throw std::invalid_argument("cannot open file \'" + path + "\'");
+  }
+  // Byte-wise reads keep the document intact; token-wise `>>` extraction
+  // would silently drop all whitespace and shrink the measured size.
+  string content;
+  for (char c = 0; input.get(c); ) content += c;
+  if (content.empty()) {
+    return 0.0;
+  }
+
+  open_vcdiff::VCDiffEncoder encoder(dict.data(), dict.size());
+  string delta;
+  if (!encoder.Encode(content.data(), content.size(), &delta)) {
+    throw std::runtime_error("failed to encode file \'" + path + "\'");
+  }
+  return static_cast<double>(delta.size()) / content.size();
+}
+
+// Returns every byte of the file `dict_file_name` (whitespace included);
+// throws std::invalid_argument if the file cannot be opened.
+string ReadDict(const char * dict_file_name) {
+  std::ifstream input(dict_file_name, std::ios::in | std::ios::binary);
+  if (!input.is_open()) {
+    throw std::invalid_argument("cannot read from file \'" + string(dict_file_name) + "\'");
+  }
+  string dict;
+  for (char c = 0; input.get(c); ) dict += c;
+  return dict;
+}
+
+// Returns the mean per-document score (see the single-file CalcScore) over all
+// regular files directly inside `documents_folder`, or 0.0 if there are none.
+// Throws std::invalid_argument if the dictionary or the folder cannot be opened.
+double CalcScore(const char * dict_file_name, const char * documents_folder) {
+  const string dict = ReadDict(dict_file_name);
+  DIR *dirp = opendir(documents_folder);
+  if (!dirp) {
+    throw std::invalid_argument("failed to open directory");
+  }
+  // Closes the directory on every exit path, including when scoring throws.
+  struct DirCloser {
+    DIR *dir;
+    ~DirCloser() { closedir(dir); }
+  } closer = { dirp };
+
+  double score = 0.0;
+  size_t cnt_docs = 0;
+  while (struct dirent *dp = readdir(dirp)) {
+    struct stat st;
+    const string path = string(documents_folder) + "/" + dp->d_name;
+    // Entries that cannot be stat'ed or are not regular files are skipped.
+    if (0 == stat(path.c_str(), &st) && S_ISREG(st.st_mode)) {
+      score += CalcScore(path, dict);
+      ++cnt_docs;
+    }
+  }
+  return cnt_docs ? score / cnt_docs : 0.0;
+}
+
+// Compares two dictionaries: parses the four required flags, scores each
+// dictionary on the documents folder and writes `new_score / old_score`
+// (smaller is better) to the result file.
+// Returns 0 on success and 1 on any usage, scoring or I/O error.
+int main(int argc, char ** argv) {
+  // One row per flag, in the order in which missing flags are reported.
+  enum { kOldDict, kNewDict, kDocuments, kResult, kNumFlags };
+  struct Flag {
+    const char * name;
+    const char * missing_value_msg;  // flag given without a value
+    const char * required_msg;       // flag absent; name and quote appended
+    char * value;
+  };
+  Flag flags[kNumFlags] = {
+    { kOldDictFileNameParam, "cannot find old dictionary's file name",
+      "old dictionary's file name must be specified with \'", nullptr },
+    { kNewDictFileNameParam, "cannot find new dictionary's file name",
+      "new dictionary's file name must be specified with \'", nullptr },
+    { kDocumentsFolderParam, "cannot find documents' folder name",
+      "documents' folder must be specified with \'", nullptr },
+    { kResultFileNameParam, "cannot find result's file name",
+      "result's file name must be specified with \'", nullptr },
+  };
+
+  // Arguments come in `flag value` pairs; a repeated flag keeps the last value.
+  for (int i = 1; i < argc; i += 2) {
+    Flag * flag = nullptr;
+    for (int j = 0; j < kNumFlags && !flag; ++j) {
+      if (!strcmp(argv[i], flags[j].name)) {
+        flag = &flags[j];
+      }
+    }
+    if (!flag) {
+      cout << "unrecognized parameter \'" << argv[i] << "\'" << endl;
+      return 1;
+    }
+    if (i + 1 >= argc) {
+      cout << flag->missing_value_msg << endl;
+      return 1;
+    }
+    flag->value = argv[i + 1];
+  }
+  for (int j = 0; j < kNumFlags; ++j) {
+    if (!flags[j].value) {
+      cout << flags[j].required_msg << flags[j].name << "\'" << endl;
+      return 1;
+    }
+  }
+
+  try {
+    const double new_score = CalcScore(flags[kNewDict].value, flags[kDocuments].value);
+    const double old_score = CalcScore(flags[kOldDict].value, flags[kDocuments].value);
+    // A zero old score would make the ratio inf/nan.
+    if (old_score == 0.0) {
+      cout << "old dictionary's score is zero, the ratio is undefined" << endl;
+      return 1;
+    }
+    // Opened only after scoring succeeded, so a failed run does not leave an
+    // empty result file behind.
+    std::ofstream output(flags[kResult].value);
+    output << new_score / old_score << endl;
+    if (!output) {
+      cout << "cannot write to file \'" << flags[kResult].value << "\'" << endl;
+      return 1;
+    }
+  } catch (const std::exception& ex) {
+    cout << "an exception was thrown " << ex.what() << endl;
+    return 1;
+  }
+  return 0;
+}
+