Two dicts tester was added, Readme updated #31

Open · wants to merge 15 commits into master
5 changes: 0 additions & 5 deletions .gitignore
@@ -1,15 +1,10 @@
<<<<<<< HEAD
*.html
*~
src/third_party/open-vcdiff/
src/gtest


=======
.idea/
.svn/
*~
.DS_Store
src/third_party/open-vcdiff/src/config.h
src/third_party/open-vcdiff/src/stamp-h1
>>>>>>> upstream/master
4 changes: 0 additions & 4 deletions CMakeLists.txt
@@ -2,11 +2,7 @@ cmake_minimum_required(VERSION 2.8.11)
project(SInGe)
#set(CMAKE_VERBOSE_MAKEFILE ON)

<<<<<<< HEAD
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra")
=======
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1y -Wall -Wextra")
>>>>>>> upstream/master
set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)

10 changes: 1 addition & 9 deletions README.md
@@ -2,12 +2,4 @@

[![Build Status](https://travis-ci.org/cscenter/SInGe.svg?branch=master)](https://travis-ci.org/cscenter/SInGe)

## How to build first time:
1) go to src/third_party/open-vcdiff
2) ./autogen.sh
3) ./configure


After that use Cmake as usual

SDCH Dictionary Incremental Geenrator
SDCH Dictionary Incremental Generator
7 changes: 2 additions & 5 deletions src/CMakeLists.txt
@@ -1,10 +1,7 @@
add_subdirectory(third_party/)
add_subdirectory(gtest)
add_subdirectory(dict_builder)
add_subdirectory(incremental_updater/)
<<<<<<< HEAD
add_subdirectory(incremental_tester/)
=======
>>>>>>> upstream/master

add_subdirectory(incremental_updater/)
add_subdirectory(dict_encoder/)

11 changes: 0 additions & 11 deletions src/dict_builder/CMakeLists.txt
@@ -1,6 +1,3 @@
<<<<<<< HEAD
add_library( dictgen
=======
find_package(Protobuf REQUIRED)

set (PROTO_SOURCES
@@ -10,15 +7,12 @@ set (PROTO_SOURCES
PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS ${PROTO_SOURCES})

add_library(dictgen
>>>>>>> upstream/master
dictionary.cpp
dictionary.hpp
node.cpp
node.hpp
suffix_automaton.cpp
suffix_automaton.hpp
<<<<<<< HEAD
=======
${PROTO_SRCS}
${PROTO_HDRS}
)
@@ -30,7 +24,6 @@ target_include_directories (dictgen PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}
${PROTOBUF_INCLUDE_DIRS}
${CMAKE_CURRENT_BINARY_DIR}
>>>>>>> upstream/master
)

add_executable(pzip
@@ -42,16 +35,12 @@ add_executable(dict_builder_tests
node_test.cpp
dictionary_test.cpp
suffix_automaton_test.cpp
<<<<<<< HEAD
=======
serialization_tests.cpp
>>>>>>> upstream/master
)

target_link_libraries(dict_builder_tests
gtest_main
dictgen
)
target_include_directories (dictgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

add_test(NAME dict_builder_tests COMMAND dict_builder_tests)
4 changes: 2 additions & 2 deletions src/dict_builder/README.md
@@ -8,11 +8,11 @@ Our goal is to find some set of substrings that maximize the
![equation](http://latex.codecogs.com/png.latex?%5Csum_%7Bs%20%5Cin%20Dict%7D%20%5Cfrac%7BDocsOccursIn%28s%29%20%5Ccdot%20%28len%28s%29%20-%203%29%7D%7Blen%28s%29%7D)

under some constraints, namely
1. If we took a substring, we are not allowed to take a substring of this substring
1. If we take a substring, we are not allowed to take a substring of this substring
2. ![equation](http://latex.codecogs.com/png.latex?DocsOccursIn%28s%29%20%3E%201%2C%20len%28s%29%20%3E%20threshold).
3. Sum over taken substrings' lengths is not greater than some constant.

To find such strings we use a data structure named 'suffix automaton' (a more detailed explanation is here http://www.cs.nyu.edu/~mohri/pub/nfac.pdf, in Russian: http://e-maxx.ru/algo/suffix_automata).
To find such strings we use a data structure called 'suffix automaton' (a more detailed explanation is here http://www.cs.nyu.edu/~mohri/pub/nfac.pdf, in Russian: http://e-maxx.ru/algo/suffix_automata).
In short, a suffix automaton is the smallest automaton accepting all suffixes of a string. It is also a directed acyclic graph with characters on the edges. It can be proved that such an automaton contains O(len(s)) nodes.

Let's build the suffix automaton for the string ![equation](http://latex.codecogs.com/png.latex?s_1%20%5C%23%20s_2%20%5C%23%20...%20%5C%23%20s_k) - the concatenated documents separated by '#' (or any other character that does not occur in the documents). Every string (without '#') accepted by the automaton is a substring of some document (the converse is also true). Now we want to compute 'DocsOccursIn' for each node in our automaton. To do that, build another automaton for each document (say, the i-th) and traverse the common automaton (the first) and the current automaton (the automaton for the i-th doc, the second) 'simultaneously'. In more detail: our function (say, `dfs`) takes two parameters - `node1` and `node2` from the first and the second automaton respectively - and increases the `node1.DocsOccursIn` counter by 1. After that it calls `dfs` on all pairs of nodes `to1` and `to2` reached by edges from `node1` and `node2` labeled with the same character. In other words, the following must be satisfied: ![equation](http://latex.codecogs.com/png.latex?%28node_1%2C%20to_1%29%20%5Cin%20E%2C%20%28node2%2C%20to_2%29%20%5Cin%20E%2C%20MarkOnEdge%28node_1%2C%20to_1%29%20%3D%20MarkOnEdge%28node_2%2C%20to_2%29).
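
To make the traversal above concrete, here is a minimal, self-contained sketch of the counting idea. The `SketchNode`/`SketchAutomaton` types are illustrative stand-ins (plain `std::map` edges and integer node ids), not the project's `SuffixAutomaton` API; the actual implementation in `dictionary.cpp` below instead re-walks each document through the common automaton and marks visited nodes with a per-document hash (see `ResetLastDocument`).

```cpp
#include <cstddef>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Illustrative stand-ins for the automaton types; not the project's real API.
struct SketchNode {
  std::map<char, std::size_t> edges;  // outgoing transitions: character -> node id
  std::size_t docs_occurs_in = 0;     // the DocsOccursIn counter described above
};
typedef std::vector<SketchNode> SketchAutomaton;  // node 0 is taken to be the root

// Walk the common automaton ("all") and the automaton of one document ("doc")
// in lockstep, following only edges labeled with the same character. Every
// state of the common automaton reachable this way accepts a substring of the
// document, so its DocsOccursIn counter is increased once for this document.
void Dfs(SketchAutomaton& all, const SketchAutomaton& doc,
         std::size_t node1, std::size_t node2,
         std::set<std::pair<std::size_t, std::size_t> >& visited_pairs,
         std::set<std::size_t>& counted_this_doc) {
  if (!visited_pairs.insert(std::make_pair(node1, node2)).second) {
    return;  // this pair of states was already explored
  }
  if (counted_this_doc.insert(node1).second) {
    ++all[node1].docs_occurs_in;  // count each common state at most once per document
  }
  for (const auto& edge : all[node1].edges) {
    auto it = doc[node2].edges.find(edge.first);
    if (it != doc[node2].edges.end()) {  // same character on both edges
      Dfs(all, doc, edge.second, it->second, visited_pairs, counted_this_doc);
    }
  }
}
```

For each document i this would be called as `Dfs(all, doc_i, 0, 0, visited, counted)` with fresh `visited`/`counted` sets; the two sets merely keep the sketch from revisiting state pairs and from counting the same state twice for one document.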
137 changes: 53 additions & 84 deletions src/dict_builder/dictionary.cpp
@@ -1,36 +1,24 @@
<<<<<<< HEAD
=======
#include <map>
>>>>>>> upstream/master
#include <fstream>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <algorithm>
#include <fstream>
#include <iostream>
<<<<<<< HEAD
=======
#include <queue>
#include <cassert>
>>>>>>> upstream/master
#include <map>
#include <queue>

#include "dictionary.hpp"
#include "suffix_automaton.hpp"

using std::vector;
using std::string;
using std::pair;
using std::make_pair;
<<<<<<< HEAD
using std::endl;
using std::cout;
=======
using std::cerr;
using std::endl;
using std::cout;
using std::queue;
using std::endl;
using std::make_pair;
using std::map;
>>>>>>> upstream/master
using std::pair;
using std::queue;
using std::string;
using std::vector;

namespace {
const double kEps = 1e-10;
@@ -44,33 +32,54 @@ namespace {
}
};

<<<<<<< HEAD
Dictionary::Dictionary() : kMaxDict(1 << 20), kMinLen(20), kMinDocsOccursIn(2) {}

Dictionary::Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef) : kMaxDict(kMaxDict), kMinLen(kMinLen), kMinDocsOccursIn(2), automaton_all_(SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) {
}
=======
const size_t Dictionary::kMaxDict = 1 << 16;
const size_t Dictionary::kMinLen = 3;
const size_t Dictionary::kMinDocsOccursIn = 2;
Dictionary::Dictionary() : kMaxDict(1 << 18), kMinLen(20), kMinDocsOccursIn(1), consider_as_one_string_(true) {}

Dictionary::Dictionary(size_t kMaxDict
, size_t kMinLen
, char kStopSymbol
, size_t kMaxAutomatonSize
, double kAutomatonCoef
, bool consider_as_one_string)
: kMaxDict(kMaxDict)
, kMinLen(kMinLen)
, kMinDocsOccursIn(consider_as_one_string ? 1 : 2)
, automaton_all_(SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef))
, consider_as_one_string_(consider_as_one_string)
{
}

Dictionary::Dictionary() {}
>>>>>>> upstream/master
Dictionary::Dictionary(size_t kMaxDict
, size_t kMinLen
, SuffixAutomaton& automaton
, bool consider_as_one_string)
: kMaxDict(kMaxDict)
, kMinLen(kMinLen)
, kMinDocsOccursIn(consider_as_one_string ? 1 : 2)
, automaton_all_(automaton)
, consider_as_one_string_(consider_as_one_string)
{
}

Dictionary::~Dictionary() {}

void Dictionary::AddDocument(string& doc) {
last_document_ += doc;
automaton_all_.AddString(doc.data(), doc.size());
last_document_ += doc;
if (consider_as_one_string_) {
ResetLastDocument();
}
}

void Dictionary::AddDocument(const char* doc, size_t length) {
last_document_ += string(doc, length);
automaton_all_.AddString(doc, length);
last_document_ += doc;
if (consider_as_one_string_) {
ResetLastDocument();
}
}

void Dictionary::AddDocumentViaStopSymbol(string& doc) {
if (automaton_all_.Empty()) {
if (automaton_all_.Empty() || consider_as_one_string_) {
AddDocument(doc);
return;
}
@@ -82,7 +91,7 @@ void Dictionary::AddDocumentViaStopSymbol(string& doc) {
}

void Dictionary::AddDocumentViaStopSymbol(const char* doc, size_t length) {
if (automaton_all_.Empty()) {
if (automaton_all_.Empty() || consider_as_one_string_) {
AddDocument(doc, length);
return;
}
@@ -101,45 +110,20 @@ void Dictionary::BuildDict() {
ResetLastDocument();
dict_.clear();

<<<<<<< HEAD
=======
cout << "automaton size = " << automaton_all_.AmountAliveNodes() << endl;
/*
for (size_t id : automaton_all_) {
cout << "occurs " << GetNode(id)->docs_occurs_in << " " << GetNode(id)->len_within_document << endl;
}
*/
cout << "building dictionary..." << endl;

>>>>>>> upstream/master
vector<size_t> substrings;
CollectGoodSubstrings(&substrings);

sort(substrings.begin(), substrings.end(), [&] (int id1, int id2) { return DoubleLess(automaton_all_.GetScore(id2), automaton_all_.GetScore(id1)); });

<<<<<<< HEAD
=======
cout << "good substrings have been collected and sorted" << endl;

>>>>>>> upstream/master
size_t length_dict = 0;
for (size_t i = 0; i < substrings.size() && length_dict + kMinLen <= kMaxDict; ++i) {
auto* node = GetNode(substrings[i]);
if (length_dict + node->len_within_document > kMaxDict) {
continue;
}
<<<<<<< HEAD
length_dict += node->len_within_document;
dict_.push_back(substrings[i]);
}
=======
// printf("occurs = %d, len = %d\n", node->docs_occurs_in, node->len_within_document);
length_dict += node->len_within_document;
dict_.push_back(substrings[i]);
}

cout << "dict's length = " << length_dict << endl;
>>>>>>> upstream/master
}

vector<pair<string, size_t> > Dictionary::GetDictSubstringsList() {
@@ -161,6 +145,10 @@ string Dictionary::GetDict() {
return dict_str;
}

SuffixAutomaton& Dictionary::GetAutomaton() {
return automaton_all_;
}

void Dictionary::OutputDictTo(string path) {
std::ofstream file(path);
file << GetDict();
@@ -171,11 +159,6 @@ void Dictionary::ResetLastDocument() {
return;
}

<<<<<<< HEAD
// cout << "calculate occurences for document with length " << last_document_.size() << endl;
=======
cout << "calculate occurences for document with length " << last_document_.size() << endl;
>>>>>>> upstream/master
size_t cur_hash = (rand() << 16) ^ rand();
size_t id = automaton_all_.root();
size_t pos = 0;
@@ -186,7 +169,9 @@ void Dictionary::ResetLastDocument() {
size_t cur_id = id;
while (cur_id && GetNode(cur_id)->last_hash != cur_hash) {
GetNode(cur_id)->last_hash = cur_hash;
automaton_all_.AddOccurence(cur_id);
if (!consider_as_one_string_ || automaton_all_.GetNode(cur_id)->docs_occurs_in == 0) {
automaton_all_.AddOccurence(cur_id);
}
cur_id = GetNode(cur_id)->link;
}
}
@@ -200,20 +185,8 @@ void Dictionary::CollectGoodSubstrings(vector <size_t>* substrings) {
vector<double> max_score_substring(nodes, -1e20);
vector<double> max_score_upstring(nodes, -1e20);
vector<char> can_to_dict(nodes, true);
<<<<<<< HEAD
vector<size_t> order = automaton_all_.GetNodesInOrder();

=======
vector<size_t> order;
order.reserve(nodes - 1);

for (size_t id : automaton_all_) {
order.push_back(id);
}

sort(order.begin(), order.end(), [&] (size_t id1, size_t id2) { return GetNode(id1)->len_actual < GetNode(id2)->len_actual; } );

>>>>>>> upstream/master
// calc max_score_substring
for (size_t id : order) {
double max_score = -1e20;
@@ -291,7 +264,3 @@ void Dictionary::CollectGoodSubstrings(vector <size_t>* substrings) {
bool Dictionary::CanAffordSubstringFrom(Node* node) const {
return node->len_within_document >= kMinLen && node->docs_occurs_in >= kMinDocsOccursIn;
}
<<<<<<< HEAD

=======
>>>>>>> upstream/master
10 changes: 8 additions & 2 deletions src/dict_builder/dictionary.hpp
@@ -11,14 +11,16 @@

class Dictionary {
public:
// const in the past
// were const in the past
size_t kMaxDict;
size_t kMinLen;
size_t kMinDocsOccursIn;

Dictionary();

Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef);
Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef, bool consider_as_one_string);

Dictionary(size_t kMaxDict, size_t kMinLen, SuffixAutomaton& automaton, bool consider_as_one_string);

~Dictionary();

@@ -36,8 +38,11 @@ class Dictionary {

std::string GetDict();

SuffixAutomaton& GetAutomaton();

void OutputDictTo(std::string path);

// the last document has already been added to the automaton; here we update occurrences corresponding to substrings of the last document
void ResetLastDocument();

private:
@@ -64,6 +69,7 @@ class Dictionary {
std::string last_document_;
SuffixAutomaton automaton_all_;
std::vector<int> dict_;
bool consider_as_one_string_;
};

#endif // DICTIONARY_HPP_
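
For reviewers, a usage sketch pieced together from the constructor and method signatures visible in `dictionary.hpp` and `dictionary.cpp` in this diff. The numeric values and the output path are made up for illustration only.

```cpp
#include <string>

#include "dictionary.hpp"

int main() {
  // Argument order follows the six-parameter constructor added in this PR;
  // the concrete values here are illustrative, not recommendations.
  Dictionary dict(/*kMaxDict=*/1 << 18,
                  /*kMinLen=*/20,
                  /*kStopSymbol=*/'#',
                  /*kMaxAutomatonSize=*/1 << 20,
                  /*kAutomatonCoef=*/0.5,
                  /*consider_as_one_string=*/false);

  std::string doc1 = "first document contents";
  std::string doc2 = "second document contents";
  dict.AddDocument(doc1);               // takes std::string& (non-const) per the diff
  dict.AddDocumentViaStopSymbol(doc2);  // separates documents with kStopSymbol

  dict.BuildDict();               // selects substrings under the kMaxDict budget
  dict.OutputDictTo("dict.out");  // hypothetical output path
  return 0;
}
```

Note that with `consider_as_one_string = true` the diff shows `kMinDocsOccursIn` dropping to 1 and every `AddDocument*` call immediately invoking `ResetLastDocument()`, so all input is effectively treated as a single document.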