From 4d01e593bedec7bd0c1bda0155da6bfd4992c96b Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Wed, 3 Dec 2014 13:43:08 +0300
Subject: [PATCH 01/10] Update README.md

---
 src/dict_builder/README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index c76ecf4..c5b592b 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -1,5 +1,5 @@
 **Introduction**
-This is a tool that builds a "dictionary of common strings". More precisely, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
+This is a tool that builds a "dictionary of common strings". To be more precise, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
 After that, one can make a "delta" file that encodes our file, exploiting the fact that the decoder of our file knows the dictionary. That is why the dictionary should be similar (in some sense) to all given files.
 
 **How does it work?**
@@ -8,7 +8,7 @@ Our goal is to find some set of substrings that maximize the
 ![equation](http://latex.codecogs.com/png.latex?%5Csum_%7Bs%20%5Cin%20Dict%7D%20%5Cfrac%7BDocsOccursIn%28s%29%20%5Ccdot%20%28len%28s%29%20-%203%29%7D%7Blen%28s%29%7D)
 under some constraints, namely
-1. If we took a substring, we are not allowed to take a substring of this substring
+1. If we take a substring, we are not allowed to take a substring of this substring
 2. ![equation](http://latex.codecogs.com/png.latex?DocsOccursIn%28s%29%20%3E%201%2C%20len%28s%29%20%3E%20threshold).
 3. The sum of the taken substrings' lengths is not greater than some constant.
@@ -22,3 +22,7 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of the solution we just sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.
 
 Time complexity is `O(sum_length_documents)` with a relatively small constant.
+
+**Further improvements**
+
+We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant `0 <= alpha < 1`. A smaller `alpha` corresponds to a higher sensitivity to new documents.
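+
+This decay can be implemented lazily: instead of touching every stored score after each update, keep one global coefficient and rescale new contributions. The sketch below only illustrates that idea in isolation; it is not the exact per-node bookkeeping of the real `SuffixAutomaton`, and the struct name is hypothetical.
+
+```cpp
+#include <cstddef>
+#include <vector>
+
+// Lazy alpha-decay: after each update every effective score shrinks by
+// alpha, but we only adjust one global coefficient and rescale on the fly.
+struct DecayedScores {
+  double alpha;               // 0 <= alpha < 1, decay per update operation
+  double current_coef = 1.0;  // global scale applied to all raw scores
+  std::vector<double> raw;    // unscaled per-string scores
+
+  // One more occurrence of string i adds exactly 1 at the present scale.
+  void AddOccurrence(std::size_t i) { raw[i] += 1.0 / current_coef; }
+
+  // Called once per update: every effective score is multiplied by alpha.
+  void FinishUpdate() { current_coef *= alpha; }
+
+  // Effective (decayed) score of string i.
+  double Score(std::size_t i) const { return raw[i] * current_coef; }
+};
+```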
From 553f283ca790df0f45c65e3bb3da44d4d7b65f07 Mon Sep 17 00:00:00 2001 From: PavelSavchenkov Date: Tue, 9 Dec 2014 14:53:43 +0300 Subject: [PATCH 02/10] update --- .gitignore | 5 - CMakeLists.txt | 4 - README.md | 8 - src/CMakeLists.txt | 5 +- src/dict_builder/CMakeLists.txt | 11 - src/dict_builder/README.md | 8 +- src/dict_builder/dictionary.cpp | 108 +++----- src/dict_builder/dictionary.hpp | 4 + src/dict_builder/dictionary_test.cpp | 4 - src/dict_builder/node.cpp | 18 +- src/dict_builder/node.hpp | 20 +- src/dict_builder/proto/automaton.proto | 4 +- src/dict_builder/serialization_tests.cpp | 3 + src/dict_builder/suffix_automaton.cpp | 236 +++++------------- src/dict_builder/suffix_automaton.hpp | 53 +--- src/incremental_updater/CMakeLists.txt | 1 + .../incremental_generator.cpp | 166 +++++++++++- src/third_party/open-vcdiff/CMakeLists.txt | 4 +- 18 files changed, 287 insertions(+), 375 deletions(-) diff --git a/.gitignore b/.gitignore index 28b298c..0c444da 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,10 @@ -<<<<<<< HEAD *.html *~ src/third_party/open-vcdiff/ src/gtest - - -======= .idea/ .svn/ *~ .DS_Store src/third_party/open-vcdiff/src/config.h src/third_party/open-vcdiff/src/stamp-h1 ->>>>>>> upstream/master diff --git a/CMakeLists.txt b/CMakeLists.txt index cf0e2b3..883fbdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,11 +2,7 @@ cmake_minimum_required(VERSION 2.8.11) project(SInGe) #set(CMAKE_VERBOSE_MAKEFILE ON) -<<<<<<< HEAD set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra") -======= -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1y -Wall -Wextra") ->>>>>>> upstream/master set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) diff --git a/README.md b/README.md index 982e4aa..04800c9 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,4 @@ [![Build Status](https://travis-ci.org/cscenter/SInGe.svg?branch=master)](https://travis-ci.org/cscenter/SInGe) -## How to build first time: -1) go to src/third_party/open-vcdiff -2) ./autogen.sh -3) ./configure - - -After that use Cmake as usual - SDCH Dictionary Incremental Geenrator diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87f9621..244b01e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,10 +1,7 @@ add_subdirectory(third_party/) add_subdirectory(gtest) add_subdirectory(dict_builder) -add_subdirectory(incremental_updater/) -<<<<<<< HEAD add_subdirectory(incremental_tester/) -======= ->>>>>>> upstream/master +add_subdirectory(incremental_updater/) diff --git a/src/dict_builder/CMakeLists.txt b/src/dict_builder/CMakeLists.txt index 464f8c0..6de2a05 100644 --- a/src/dict_builder/CMakeLists.txt +++ b/src/dict_builder/CMakeLists.txt @@ -1,6 +1,3 @@ -<<<<<<< HEAD -add_library( dictgen -======= find_package(Protobuf REQUIRED) set (PROTO_SOURCES @@ -10,15 +7,12 @@ set (PROTO_SOURCES PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS ${PROTO_SOURCES}) add_library(dictgen ->>>>>>> upstream/master dictionary.cpp dictionary.hpp node.cpp node.hpp suffix_automaton.cpp suffix_automaton.hpp -<<<<<<< HEAD -======= ${PROTO_SRCS} ${PROTO_HDRS} ) @@ -30,7 +24,6 @@ target_include_directories (dictgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${PROTOBUF_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR} ->>>>>>> upstream/master ) add_executable(pzip @@ -42,16 +35,12 @@ add_executable(dict_builder_tests node_test.cpp dictionary_test.cpp suffix_automaton_test.cpp -<<<<<<< HEAD -======= serialization_tests.cpp ->>>>>>> upstream/master ) 
target_link_libraries(dict_builder_tests
   gtest_main
   dictgen
 )
-target_include_directories (dictgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
 add_test(NAME dict_builder_tests COMMAND dict_builder_tests)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index c5b592b..c76ecf4 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -1,5 +1,5 @@
 **Introduction**
-This is a tool that builds a "dictionary of common strings". To be more precise, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
+This is a tool that builds a "dictionary of common strings". More precisely, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
 After that, one can make a "delta" file that encodes our file, exploiting the fact that the decoder of our file knows the dictionary. That is why the dictionary should be similar (in some sense) to all given files.
 
 **How does it work?**
@@ -8,7 +8,7 @@ Our goal is to find some set of substrings that maximize the
 ![equation](http://latex.codecogs.com/png.latex?%5Csum_%7Bs%20%5Cin%20Dict%7D%20%5Cfrac%7BDocsOccursIn%28s%29%20%5Ccdot%20%28len%28s%29%20-%203%29%7D%7Blen%28s%29%7D)
 under some constraints, namely
-1. If we take a substring, we are not allowed to take a substring of this substring
+1. If we took a substring, we are not allowed to take a substring of this substring
 2. ![equation](http://latex.codecogs.com/png.latex?DocsOccursIn%28s%29%20%3E%201%2C%20len%28s%29%20%3E%20threshold).
 3. The sum of the taken substrings' lengths is not greater than some constant.
@@ -22,7 +22,3 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of the solution we just sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.
 
 Time complexity is `O(sum_length_documents)` with a relatively small constant.
-
-**Further improvements**
-
-We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant `0 <= alpha < 1`. A smaller `alpha` corresponds to a higher sensitivity to new documents.
diff --git a/src/dict_builder/dictionary.cpp b/src/dict_builder/dictionary.cpp index 0fa99c0..5e1dabe 100644 --- a/src/dict_builder/dictionary.cpp +++ b/src/dict_builder/dictionary.cpp @@ -1,36 +1,24 @@ -<<<<<<< HEAD -======= -#include ->>>>>>> upstream/master -#include +#include +#include #include #include -#include +#include #include -<<<<<<< HEAD -======= -#include -#include ->>>>>>> upstream/master #include +#include #include "dictionary.hpp" #include "suffix_automaton.hpp" -using std::vector; -using std::string; -using std::pair; -using std::make_pair; -<<<<<<< HEAD -using std::endl; -using std::cout; -======= using std::cerr; -using std::endl; using std::cout; -using std::queue; +using std::endl; +using std::make_pair; using std::map; ->>>>>>> upstream/master +using std::pair; +using std::queue; +using std::string; +using std::vector; namespace { const double kEps = 1e-10; @@ -44,18 +32,28 @@ namespace { } }; -<<<<<<< HEAD Dictionary::Dictionary() : kMaxDict(1 << 20), kMinLen(20), kMinDocsOccursIn(2) {} -Dictionary::Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef) : kMaxDict(kMaxDict), kMinLen(kMinLen), kMinDocsOccursIn(2), automaton_all_(SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) { +Dictionary::Dictionary(size_t kMaxDict + , size_t kMinLen + , char kStopSymbol + , size_t kMaxAutomatonSize + , double kAutomatonCoef) + : kMaxDict(kMaxDict) + , kMinLen(kMinLen) + , kMinDocsOccursIn(2) + , automaton_all_( + SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) { } -======= -const size_t Dictionary::kMaxDict = 1 << 16; -const size_t Dictionary::kMinLen = 3; -const size_t Dictionary::kMinDocsOccursIn = 2; -Dictionary::Dictionary() {} ->>>>>>> upstream/master +Dictionary::Dictionary(size_t kMaxDict + , size_t kMinLen + , SuffixAutomaton& automaton) + : kMaxDict(kMaxDict) + , kMinLen(kMinLen) + , kMinDocsOccursIn(2) + , automaton_all_(automaton) { +} Dictionary::~Dictionary() {} @@ -101,45 +99,20 @@ void Dictionary::BuildDict() { ResetLastDocument(); dict_.clear(); -<<<<<<< HEAD -======= - cout << "automaton size = " << automaton_all_.AmountAliveNodes() << endl; -/* - for (size_t id : automaton_all_) { - cout << "occurs " << GetNode(id)->docs_occurs_in << " " << GetNode(id)->len_within_document << endl; - } -*/ - cout << "building dictionary..." 
<< endl; - ->>>>>>> upstream/master vector substrings; CollectGoodSubstrings(&substrings); sort(substrings.begin(), substrings.end(), [&] (int id1, int id2) { return DoubleLess(automaton_all_.GetScore(id2), automaton_all_.GetScore(id1)); }); -<<<<<<< HEAD -======= - cout << "good substrings have been collected and sorted" << endl; - ->>>>>>> upstream/master size_t length_dict = 0; for (size_t i = 0; i < substrings.size() && length_dict + kMinLen <= kMaxDict; ++i) { auto* node = GetNode(substrings[i]); if (length_dict + node->len_within_document > kMaxDict) { continue; } -<<<<<<< HEAD length_dict += node->len_within_document; dict_.push_back(substrings[i]); } -======= -// printf("occurs = %d, len = %d\n", node->docs_occurs_in, node->len_within_document); - length_dict += node->len_within_document; - dict_.push_back(substrings[i]); - } - - cout << "dict's length = " << length_dict << endl; ->>>>>>> upstream/master } vector > Dictionary::GetDictSubstringsList() { @@ -161,6 +134,10 @@ string Dictionary::GetDict() { return dict_str; } +SuffixAutomaton& Dictionary::GetAutomaton() { + return automaton_all_; +} + void Dictionary::OutputDictTo(string path) { std::ofstream file(path); file << GetDict(); @@ -171,11 +148,8 @@ void Dictionary::ResetLastDocument() { return; } -<<<<<<< HEAD // cout << "calculate occurences for document with length " << last_document_.size() << endl; -======= - cout << "calculate occurences for document with length " << last_document_.size() << endl; ->>>>>>> upstream/master + size_t cur_hash = (rand() << 16) ^ rand(); size_t id = automaton_all_.root(); size_t pos = 0; @@ -200,20 +174,8 @@ void Dictionary::CollectGoodSubstrings(vector * substrings) { vector max_score_substring(nodes, -1e20); vector max_score_upstring(nodes, -1e20); vector can_to_dict(nodes, true); -<<<<<<< HEAD vector order = automaton_all_.GetNodesInOrder(); -======= - vector order; - order.reserve(nodes - 1); - - for (size_t id : automaton_all_) { - order.push_back(id); - } - - sort(order.begin(), order.end(), [&] (size_t id1, size_t id2) { return GetNode(id1)->len_actual < GetNode(id2)->len_actual; } ); - ->>>>>>> upstream/master // calc max_score_substring for (size_t id : order) { double max_score = -1e20; @@ -291,7 +253,3 @@ void Dictionary::CollectGoodSubstrings(vector * substrings) { bool Dictionary::CanAffordSubstringFrom(Node* node) const { return node->len_within_document >= kMinLen && node->docs_occurs_in >= kMinDocsOccursIn; } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master diff --git a/src/dict_builder/dictionary.hpp b/src/dict_builder/dictionary.hpp index dc10d43..087f649 100644 --- a/src/dict_builder/dictionary.hpp +++ b/src/dict_builder/dictionary.hpp @@ -20,6 +20,8 @@ class Dictionary { Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef); + Dictionary(size_t kMaxDict, size_t kMinLen, SuffixAutomaton& automaton); + ~Dictionary(); void AddDocument(std::string& doc); @@ -36,6 +38,8 @@ class Dictionary { std::string GetDict(); + SuffixAutomaton& GetAutomaton(); + void OutputDictTo(std::string path); void ResetLastDocument(); diff --git a/src/dict_builder/dictionary_test.cpp b/src/dict_builder/dictionary_test.cpp index fd3058a..a78feaa 100644 --- a/src/dict_builder/dictionary_test.cpp +++ b/src/dict_builder/dictionary_test.cpp @@ -25,11 +25,7 @@ TEST(DictionaryTest, MainDictionaryTest) { std::string s2 = "qwecabarty"; std::string s3 = "caba_cabaqwe"; -<<<<<<< HEAD Dictionary dict(100, 3, '#', 1000, 1.0); -======= - Dictionary dict; 
->>>>>>> upstream/master dict.AddDocumentViaStopSymbol(s1); dict.AddDocumentViaStopSymbol(s2); dict.AddDocumentViaStopSymbol(s3); diff --git a/src/dict_builder/node.cpp b/src/dict_builder/node.cpp index f64f045..484c51d 100644 --- a/src/dict_builder/node.cpp +++ b/src/dict_builder/node.cpp @@ -74,14 +74,9 @@ bool Node::AddEdge(char ch, size_t to) { } bool Node::AddRevEdge(char ch, size_t from) { -<<<<<<< HEAD for (auto& it : rev_edges_) { if (it.second == from) { it.first = ch; -======= - for (auto it : rev_edges_) { - if (it == make_pair(ch, from)) { ->>>>>>> upstream/master return false; } } @@ -103,11 +98,11 @@ void Node::SortEdges() { std::sort(edges_.begin(), edges_.end()); } -size_t Node::InDegree() { +size_t Node::InDegree() const { return rev_edges_.size(); } -size_t Node::OutDegree() { +size_t Node::OutDegree() const { return edges_.size(); } @@ -132,11 +127,7 @@ bool Node::DeleteRevEdge(size_t from) { if (rev_edges_[i].second == from) { pos = i; break; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } if (pos < rev_edges_.size()) { rev_edges_.erase(rev_edges_.begin() + pos); @@ -159,11 +150,9 @@ bool Node::DeleteRevLink(size_t from) { } return false; } -<<<<<<< HEAD -======= std::unique_ptr Node::GetProtoNode() const { - auto proto_node = std::make_unique(); + auto proto_node = std::unique_ptr(new ProtoNode()); auto *proto_repeated_ptrs_edges = proto_node->mutable_edges(); proto_repeated_ptrs_edges->Reserve(edges_.size()); for (const auto &edge : edges_) { @@ -224,4 +213,3 @@ Node::Node(const ProtoNode& proto_node) : Node() { rev_links_.emplace_back(rev_link); } } ->>>>>>> upstream/master diff --git a/src/dict_builder/node.hpp b/src/dict_builder/node.hpp index bae5ebf..b84c920 100644 --- a/src/dict_builder/node.hpp +++ b/src/dict_builder/node.hpp @@ -11,11 +11,9 @@ #include #include #include // size_t -<<<<<<< HEAD -======= -#include //uinique_ptr -#include ->>>>>>> upstream/master +#include // unique_ptr + +#include "automaton.pb.h" class Node { public: @@ -47,9 +45,9 @@ class Node { void SortEdges(); - size_t InDegree(); + size_t InDegree() const; - size_t OutDegree(); + size_t OutDegree() const; bool DeleteEdge(size_t to); @@ -57,13 +55,10 @@ class Node { bool DeleteRevLink(size_t from); -<<<<<<< HEAD -======= std::unique_ptr GetProtoNode() const ; - explicit Node(const ProtoNode & proto_node); + explicit Node(const ProtoNode& proto_node); ->>>>>>> upstream/master size_t link; size_t len_actual; size_t len_within_document; @@ -75,11 +70,8 @@ class Node { std::vector > edges_; std::vector > rev_edges_; std::vector rev_links_; -<<<<<<< HEAD -======= friend class SerializationTest; ->>>>>>> upstream/master }; #endif // NODE_HPP_ diff --git a/src/dict_builder/proto/automaton.proto b/src/dict_builder/proto/automaton.proto index beb3818..74a3393 100644 --- a/src/dict_builder/proto/automaton.proto +++ b/src/dict_builder/proto/automaton.proto @@ -23,5 +23,7 @@ message ProtoAutomaton { required int64 len_up_to_stop_symbol = 3; required double current_coef = 4; repeated bool is_free_node = 5; - + required uint64 max_size = 6; + required int32 stop_symbol = 7; + required double coef = 8; } \ No newline at end of file diff --git a/src/dict_builder/serialization_tests.cpp b/src/dict_builder/serialization_tests.cpp index 996461c..2c99c38 100644 --- a/src/dict_builder/serialization_tests.cpp +++ b/src/dict_builder/serialization_tests.cpp @@ -41,6 +41,9 @@ class SerializationTest : public testing::Test { EXPECT_EQ(first_automaton.current_coef, second_automaton.current_coef) << msg; 
EXPECT_EQ(first_automaton.amount_alive_nodes_, second_automaton.amount_alive_nodes_) << msg; EXPECT_EQ(first_automaton.nodes_to_delete_, second_automaton.nodes_to_delete_) << msg; + EXPECT_EQ(first_automaton.kMaxSize, second_automaton.kMaxSize) << msg; + EXPECT_EQ(first_automaton.kCoef, second_automaton.kCoef) << msg; + EXPECT_EQ(first_automaton.kStopSymbol, second_automaton.kStopSymbol) << msg; } SuffixAutomaton SerializeAndDeserialize(const SuffixAutomaton& automaton) { diff --git a/src/dict_builder/suffix_automaton.cpp b/src/dict_builder/suffix_automaton.cpp index eb61978..63bfe08 100644 --- a/src/dict_builder/suffix_automaton.cpp +++ b/src/dict_builder/suffix_automaton.cpp @@ -6,14 +6,11 @@ */ #include +#include #include -#include -<<<<<<< HEAD -#include -======= #include -#include ->>>>>>> upstream/master +#include +#include #include "suffix_automaton.hpp" @@ -23,21 +20,25 @@ using std::max; using std::string; using std::make_pair; -<<<<<<< HEAD -SuffixAutomaton::SuffixAutomaton() : kMaxSize(1 << 18), kStopSymbol('#'), kCoef(0.9), len_up_to_stop_symbol_(1), amount_alive_nodes_(0), current_coef(1.0) { +SuffixAutomaton::SuffixAutomaton() + : kMaxSize(1 << 18), + kStopSymbol('#'), + kCoef(0.9), + len_up_to_stop_symbol_(1), + amount_alive_nodes_(0), + current_coef(1.0) { NewNode(); // ~ nullptr last_node_ = NewNode(); AddToNodesToDelete(last_node_); } -SuffixAutomaton::SuffixAutomaton(char kStopSymbol, size_t kMaxSize, double kCoef) : kMaxSize(kMaxSize), kStopSymbol(kStopSymbol), kCoef(kCoef), len_up_to_stop_symbol_(1), amount_alive_nodes_(0), current_coef(1.0) { -======= -const char SuffixAutomaton::kStopSymbol = '#'; -const size_t SuffixAutomaton::kMaxSize = 1 << 13; -const double SuffixAutomaton::kCoef = 0.95; - -SuffixAutomaton::SuffixAutomaton() : len_up_to_stop_symbol_(1), amount_alive_nodes_(0), current_coef(1.0) { ->>>>>>> upstream/master +SuffixAutomaton::SuffixAutomaton(char kStopSymbol, size_t kMaxSize, double kCoef) + : kMaxSize(kMaxSize), + kStopSymbol(kStopSymbol), + kCoef(kCoef), + len_up_to_stop_symbol_(1), + amount_alive_nodes_(0), + current_coef(1.0) { NewNode(); // ~ nullptr last_node_ = NewNode(); AddToNodesToDelete(last_node_); @@ -48,11 +49,7 @@ SuffixAutomaton::~SuffixAutomaton() {} SuffixAutomaton::iterator SuffixAutomaton::begin() { return SuffixAutomaton::iterator(1, is_free_node_); } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master SuffixAutomaton::iterator SuffixAutomaton::end() { return SuffixAutomaton::iterator(AmountNodes(), is_free_node_); } @@ -78,24 +75,18 @@ double SuffixAutomaton::GetScore(size_t id) { } bool SuffixAutomaton::AddOccurence(size_t id) { - if (!GetNode(id)) { + if (!GetNode(id)) return false; - } -<<<<<<< HEAD - if (GetNode(id)->OutDegree() == 0) { + + if (GetNode(id)->OutDegree() == 0) EraseFromNodesToDelete(id); - } + ++GetNode(id)->docs_occurs_in; GetNode(id)->score_occurs_only += 1.0 / current_coef; - if (GetNode(id)->OutDegree() == 0) { + + if (GetNode(id)->OutDegree() == 0) AddToNodesToDelete(id); - } -======= - EraseFromNodesToDelete(id); - ++GetNode(id)->docs_occurs_in; - GetNode(id)->score_occurs_only += 1.0 / current_coef; - AddToNodesToDelete(id); ->>>>>>> upstream/master + return true; } @@ -103,11 +94,7 @@ void SuffixAutomaton::AddString(const char* const str, size_t length) { for (size_t i = 0; i < length; ++i) { AddCharacter(str[i]); ++len_up_to_stop_symbol_; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } void SuffixAutomaton::AddStringViaStopSymbol(const char* const str, size_t length) { @@ -120,11 +107,7 @@ void 
SuffixAutomaton::AddStringViaStopSymbol(const char* const str, size_t lengt for (size_t i = 0; i < length; ++i) { AddCharacter(str[i]); ++len_up_to_stop_symbol_; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } size_t SuffixAutomaton::root() const { @@ -135,24 +118,6 @@ bool SuffixAutomaton::Empty() const { return last_node_ == root(); } -<<<<<<< HEAD -======= -void SuffixAutomaton::Output() { - for (size_t id : *this) { - std::cout << id << ": " << GetLongestString(id) << std::endl; - } - - Output(root(), ""); -} - -void SuffixAutomaton::Output(size_t v, std::string s) { - printf("%s v = %zu, occurs = %zu, len = %zu, score = %.5f\n", s.c_str(), v, GetNode(v)->docs_occurs_in, GetNode(v)->len_within_document, GetScore(v)); - for (auto it = GetNode(v)->edges_begin(); it != GetNode(v)->edges_end(); ++it) { - Output(it->second, s + it->first); - } -} - ->>>>>>> upstream/master double SuffixAutomaton::GetCurrentCoef() { return current_coef; } @@ -171,11 +136,7 @@ string SuffixAutomaton::GetLongestString(size_t id) { max_len = cur_len; max_id = cur_id; max_ch = cur_ch; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } str += max_ch; id = max_id; @@ -185,23 +146,15 @@ string SuffixAutomaton::GetLongestString(size_t id) { } bool SuffixAutomaton::ReduceSize() { -<<<<<<< HEAD if (AmountAliveNodes() > 2 * kMaxSize) { - while (AmountAliveNodes() > kMaxSize) { + while (AmountAliveNodes() > kMaxSize) DeleteNode(nodes_to_delete_.begin()->second); -======= - if (AmountAliveNodes() > 2 * kMaxSize) { - while (AmountAliveNodes() > kMaxSize) { - auto min_node = *nodes_to_delete_.begin(); - DeleteNode(min_node.second); ->>>>>>> upstream/master - } + return true; } return false; } -<<<<<<< HEAD vector SuffixAutomaton::GetNodesInOrder() { std::queue q_nodes; vector was_in_q(AmountNodes(), false); @@ -226,8 +179,6 @@ vector SuffixAutomaton::GetNodesInOrder() { return order; } -======= ->>>>>>> upstream/master size_t SuffixAutomaton::NewNode() { if (nodes_pool_.empty()) { is_free_node_.push_back(false); @@ -251,23 +202,19 @@ size_t SuffixAutomaton::NewNode() { } bool SuffixAutomaton::AddLink(size_t from, size_t to) { - if (!GetNode(from) || !GetNode(to)) { + if (!GetNode(from) || !GetNode(to)) return false; - } -<<<<<<< HEAD + size_t old_to = GetNode(from)->link; - if (GetNode(old_to)) { + if (GetNode(old_to)) GetNode(old_to)->DeleteRevLink(from); - } -======= ->>>>>>> upstream/master + GetNode(from)->link = to; GetNode(to)->AddRevLink(from); return true; } bool SuffixAutomaton::AddEdge(size_t from, size_t to, char ch) { -<<<<<<< HEAD GetNode(from)->AddEdge(ch, to); GetNode(to)->AddRevEdge(ch, from); if (GetNode(from)->OutDegree() == 1) { @@ -289,7 +236,7 @@ bool SuffixAutomaton::DeleteNode(size_t id) { if (id == last_node_) { size_t new_last_node_ = 0; for (auto it = GetNode(id)->rev_edges_begin(); it != GetNode(id)->rev_edges_end(); ++it) { - if (!new_last_node_ || GetNode(it->second)->len_actual > GetNode(new_last_node_)->len_actual) { + if (!new_last_node_ || GetNode(it->second)->len_actual > GetNode(new_last_node_)->len_actual) { new_last_node_ = it->second; } } @@ -306,52 +253,20 @@ bool SuffixAutomaton::DeleteNode(size_t id) { // delete incoming edges for (auto it = GetNode(id)->rev_edges_begin(); it != GetNode(id)->rev_edges_end(); ++it) { GetNode(it->second)->DeleteEdge(id); - if (GetNode(it->second)->OutDegree() == 0) { + if (GetNode(it->second)->OutDegree() == 0) { AddToNodesToDelete(it->second); } -======= - if (!GetNode(from) || !GetNode(to)) { - return false; - } - 
GetNode(from)->AddEdge(ch, to); - GetNode(to)->AddRevEdge(ch, from); - return true; -} - -bool SuffixAutomaton::DeleteNode(size_t id) { - if (!GetNode(id)) { - return false; - } - -// std::cout << "delete " << id << std::endl; - - // redirect incoming links - for (auto it = GetNode(id)->rev_links_begin(); it != GetNode(id)->rev_links_end(); ++it) { - size_t from = *it; - GetNode(from)->link = GetNode(id)->link; - } - - // delete incoming edges - for (auto it = GetNode(id)->rev_edges_begin(); it != GetNode(id)->rev_edges_end(); ++it) { - size_t from = it->second; - GetNode(from)->DeleteEdge(id); ->>>>>>> upstream/master } // delete outcoming links (actually, exactly one link) GetNode(GetNode(id)->link)->DeleteRevLink(id); -<<<<<<< HEAD vector to_delete; // delete outgoing edges -======= - // delete outcoming edges ->>>>>>> upstream/master for (auto it = GetNode(id)->edges_begin(); it != GetNode(id)->edges_end(); ++it) { size_t to = it->second; GetNode(to)->DeleteRevEdge(id); - if (GetNode(to)->InDegree() == 0) { -<<<<<<< HEAD + if (GetNode(to)->InDegree() == 0) { to_delete.push_back(to); } } @@ -365,17 +280,6 @@ bool SuffixAutomaton::DeleteNode(size_t id) { DeleteNode(to); } return true; -======= - DeleteNode(to); - } - } - - --amount_alive_nodes_; - is_free_node_[id] = true; - free_nodes_.push_back(id); - EraseFromNodesToDelete(id); - return true; ->>>>>>> upstream/master } void SuffixAutomaton::AddCharacter(char ch) { @@ -396,11 +300,7 @@ void SuffixAutomaton::AddCharacter(char ch) { AddLink(new_node, root()); return; } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master size_t next = GetNode(prev)->NextNodeThrough(ch); if (GetNode(next)->len_actual == GetNode(prev)->len_actual + 1) { AddLink(new_node, next); @@ -413,7 +313,6 @@ void SuffixAutomaton::AddCharacter(char ch) { GetNode(middle)->score_occurs_only = GetNode(next)->score_occurs_only; GetNode(middle)->docs_occurs_in = GetNode(next)->docs_occurs_in; AddLink(middle, GetNode(next)->link); -<<<<<<< HEAD AddToNodesToDelete(middle); for (auto it = GetNode(next)->edges_begin(); it != GetNode(next)->edges_end(); ++it) { AddEdge(middle, it->second, it->first); @@ -421,14 +320,6 @@ void SuffixAutomaton::AddCharacter(char ch) { for (; prev && GetNode(prev)->NextNodeThrough(ch) == next; prev = GetNode(prev)->link) { DeleteEdge(prev, next); -======= - for (auto it = GetNode(next)->edges_begin(); it != GetNode(next)->edges_end(); ++it) { - AddEdge(middle, it->second, it->first); - } - AddToNodesToDelete(middle); - - for (; prev && GetNode(prev)->NextNodeThrough(ch) == next; prev = GetNode(prev)->link) { ->>>>>>> upstream/master AddEdge(prev, middle, ch); } AddLink(new_node, middle); @@ -436,20 +327,14 @@ void SuffixAutomaton::AddCharacter(char ch) { } void SuffixAutomaton::AddToNodesToDelete(size_t id) { -<<<<<<< HEAD assert(GetNode(id)->OutDegree() == 0); - nodes_to_delete_.insert(make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); -} - -void SuffixAutomaton::EraseFromNodesToDelete(size_t id) { - nodes_to_delete_.erase(make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); -======= - nodes_to_delete_.insert(make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); + nodes_to_delete_.insert( + make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); } void SuffixAutomaton::EraseFromNodesToDelete(size_t id) { - nodes_to_delete_.erase(make_pair(make_pair(GetNode(id)->score_occurs_only, 
GetNode(id)->len_within_document), id)); ->>>>>>> upstream/master + nodes_to_delete_.erase( + make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); } SuffixAutomaton::iterator::iterator(size_t id, vector& is_free_node) : id_(id), is_free_node_(is_free_node) {} @@ -476,22 +361,23 @@ bool SuffixAutomaton::iterator::operator ==(const iterator& other) { bool SuffixAutomaton::iterator::operator !=(const iterator& other) { return id_ != other.id_; } -<<<<<<< HEAD -======= std::unique_ptr SuffixAutomaton::GetProtoAutomaton() const { assert(is_free_node_.size() == nodes_pool_.size()); - auto proto_automaton = std::make_unique(); + auto proto_automaton = std::unique_ptr(new ProtoAutomaton()); proto_automaton->set_last_node(last_node_); proto_automaton->set_len_up_to_stop_symbol(len_up_to_stop_symbol_); proto_automaton->set_current_coef(current_coef); + proto_automaton->set_max_size(kMaxSize); + proto_automaton->set_coef(kCoef); + proto_automaton->set_stop_symbol(kStopSymbol); auto* proto_nodes_pool = proto_automaton->mutable_nodes_pool(); proto_nodes_pool->Reserve(nodes_pool_.size()); for (auto& node : nodes_pool_) { //ownership transfer proto_nodes_pool->AddAllocated(node.GetProtoNode().release()); } - assert(proto_nodes_pool->size() == (int) nodes_pool_.size()); + assert(proto_nodes_pool->size() == static_cast(nodes_pool_.size())); auto* proto_is_free_node = proto_automaton->mutable_is_free_node(); proto_is_free_node->Reserve(is_free_node_.size()); for (bool is_free : is_free_node_) { @@ -501,16 +387,23 @@ std::unique_ptr SuffixAutomaton::GetProtoAutomaton() const { return proto_automaton; } -SuffixAutomaton::SuffixAutomaton(const ProtoAutomaton& proto_automaton) { +SuffixAutomaton::SuffixAutomaton(const ProtoAutomaton& proto_automaton) + : amount_alive_nodes_(0) { last_node_ = proto_automaton.last_node(); len_up_to_stop_symbol_ = proto_automaton.len_up_to_stop_symbol(); current_coef = proto_automaton.current_coef(); + kMaxSize = proto_automaton.max_size(); + kCoef = proto_automaton.coef(); + kStopSymbol = proto_automaton.stop_symbol(); + const auto& proto_is_free_node = proto_automaton.is_free_node(); - is_free_node_.resize(proto_is_free_node.size()); + is_free_node_.resize(proto_automaton.is_free_node_size()); const auto& proto_nodes_pool = proto_automaton.nodes_pool(); - nodes_pool_.reserve(proto_nodes_pool.size()); - assert(proto_is_free_node.size() == proto_nodes_pool.size()); + nodes_pool_.reserve(proto_automaton.nodes_pool_size()); + assert(proto_automaton.is_free_node_size() == proto_automaton.nodes_pool_size()); + nodes_pool_.emplace_back(proto_nodes_pool.Get(0)); // zero node + for (size_t i_node = 1; i_node < is_free_node_.size(); ++i_node) { const auto& proto_node = proto_nodes_pool.Get(i_node); nodes_pool_.emplace_back(proto_node); @@ -519,19 +412,20 @@ SuffixAutomaton::SuffixAutomaton(const ProtoAutomaton& proto_automaton) { is_free_node_[i_node] = true; free_nodes_.push_back(i_node); } else { - nodes_to_delete_.insert( - make_pair( - make_pair( - current_node.score_occurs_only, - current_node.len_within_document - ), - i_node - ) - ); + ++amount_alive_nodes_; + if (current_node.OutDegree() == 0) { + nodes_to_delete_.insert( + make_pair( + make_pair( + current_node.score_occurs_only, + current_node.len_within_document + ), + i_node + ) + ); + } } - amount_alive_nodes_ = nodes_to_delete_.size(); // 1 is for zero node assert(amount_alive_nodes_ + free_nodes_.size() + 1 == nodes_pool_.size()); } } ->>>>>>> upstream/master diff --git 
a/src/dict_builder/suffix_automaton.hpp b/src/dict_builder/suffix_automaton.hpp index 8bd7cca..91dc503 100644 --- a/src/dict_builder/suffix_automaton.hpp +++ b/src/dict_builder/suffix_automaton.hpp @@ -8,18 +8,15 @@ #ifndef SUFFIX_AUTOMATON_HPP_ #define SUFFIX_AUTOMATON_HPP_ +#include +#include #include //size_t +#include //unique_ptr #include #include #include -#include -<<<<<<< HEAD -#include -======= -#include //unique_ptr -#include ->>>>>>> upstream/master +#include "automaton.pb.h" #include "node.hpp" struct compare_nodes { @@ -39,36 +36,23 @@ struct compare_nodes { return len2 < len1; } -<<<<<<< HEAD return id1 > id2; -======= - return id1 < id2; ->>>>>>> upstream/master } }; class SuffixAutomaton { public: -<<<<<<< HEAD // const in the past size_t kMaxSize; char kStopSymbol; double kCoef; -======= - static const size_t kMaxSize; - static const char kStopSymbol; - static const double kCoef; ->>>>>>> upstream/master class iterator; SuffixAutomaton(); -<<<<<<< HEAD SuffixAutomaton(char kStopSymbol, size_t kMaxSize, double kCoef); -======= ->>>>>>> upstream/master ~SuffixAutomaton(); iterator begin(); @@ -88,19 +72,11 @@ class SuffixAutomaton { } inline const Node* GetNode(size_t id) const { -<<<<<<< HEAD return id && !is_free_node_[id] && id < nodes_pool_.size() ? &nodes_pool_[id] : nullptr; } inline Node* GetNode(size_t id) { return id && !is_free_node_[id] && id < nodes_pool_.size() ? &nodes_pool_[id] : nullptr; -======= - return id ? &nodes_pool_[id] : nullptr; - } - - inline Node* GetNode(size_t id) { - return id ? &nodes_pool_[id] : nullptr; ->>>>>>> upstream/master } double GetScore(size_t id); @@ -115,26 +91,17 @@ class SuffixAutomaton { bool Empty() const; -<<<<<<< HEAD -======= - void Output(); - - void Output(size_t v, std::string s); - ->>>>>>> upstream/master double GetCurrentCoef(); std::string GetLongestString(size_t id); bool ReduceSize(); -<<<<<<< HEAD std::vector GetNodesInOrder(); -======= + std::unique_ptr GetProtoAutomaton() const; explicit SuffixAutomaton(const ProtoAutomaton& proto_automaton); ->>>>>>> upstream/master private: size_t NewNode(); @@ -143,11 +110,8 @@ class SuffixAutomaton { bool AddEdge(size_t from, size_t to, char ch); -<<<<<<< HEAD bool DeleteEdge(size_t from, size_t to); -======= ->>>>>>> upstream/master bool DeleteNode(size_t id); void AddCharacter(char ch); @@ -164,17 +128,11 @@ class SuffixAutomaton { size_t amount_alive_nodes_; double current_coef; std::set, size_t>, compare_nodes> nodes_to_delete_; -<<<<<<< HEAD -}; - -======= friend class SerializationTest; }; - ->>>>>>> upstream/master class SuffixAutomaton::iterator { public: iterator(size_t id, std::vector& is_free_node); @@ -195,4 +153,3 @@ class SuffixAutomaton::iterator { }; #endif // SUFFIX_AUTOMATON_HPP_ - diff --git a/src/incremental_updater/CMakeLists.txt b/src/incremental_updater/CMakeLists.txt index 86e586c..6d2d2e7 100644 --- a/src/incremental_updater/CMakeLists.txt +++ b/src/incremental_updater/CMakeLists.txt @@ -5,4 +5,5 @@ target_link_libraries(incremental_generator LINK_PUBLIC vcdenc vcddec dictgen + gflags ) \ No newline at end of file diff --git a/src/incremental_updater/incremental_generator.cpp b/src/incremental_updater/incremental_generator.cpp index a5b1006..d1536ca 100644 --- a/src/incremental_updater/incremental_generator.cpp +++ b/src/incremental_updater/incremental_generator.cpp @@ -1,19 +1,169 @@ +#include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include -#include using std::string; -using 
open_vcdiff::Statistics;
+using std::vector;
 using std::cout;
 using std::endl;
+using std::cerr;
 
+DEFINE_string(automaton_proto, "automaton.pb", "path to the serialized automaton");
+DEFINE_string(new_documents, "",
+              "comma-separated list of documents to update the dictionary");
+DEFINE_int64(max_size, 307200, "max allowable number of nodes in the automaton");
+DEFINE_string(output_path, "dictionary",
+              "name of the file where the new dictionary will be recorded");
+
+#define PROTOBUF_AUTOMATON_CONST 8867856
 
-int main() {
-  //code coming soon...
+void split(vector<string>&, const string&, const string&);
+bool serializeAutomatonToDisk(
+    const SuffixAutomaton& automaton,
+    const string& automatonPath);
+bool deserializeAutomatonFromDisk(
+    SuffixAutomaton& automaton,
+    const string& automatonPath);
 
-  std::cout << "not implemented yet :(" << std::endl;
+int main(int argc, char** argv) {
+  long t1 = clock();
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  vector<string> docs;
+  split(docs, FLAGS_new_documents, ",");
+  if (docs.size() < 2) {
+    cerr << "Empty new_documents list. For help: '"
+         << argv[0] << " --help'" << endl;
+    return 1;
+  }
+  SuffixAutomaton automaton = SuffixAutomaton();
+  if (deserializeAutomatonFromDisk(automaton, FLAGS_automaton_proto)) {
+    cout << "Automaton loaded." << endl;
+  } else {
+    cerr << "Failed to open: " << FLAGS_automaton_proto << endl;
+  }
+  Dictionary dictionary = Dictionary(FLAGS_max_size, 3, automaton);
+  cout << "AmountNodes: " << automaton.AmountNodes() << endl;
+  long n_total_added = 0;
+  for (auto it = docs.begin(); it != docs.end(); ++it) {
+    std::ifstream doc(*it);
+    if (doc.is_open()) {
+      std::ostringstream ss;
+      ss << doc.rdbuf();
+      string str(ss.str());
+      cout << *it << " loaded " << str.length() << " symbols." << endl;
+      n_total_added += str.length();
+      dictionary.AddDocumentViaStopSymbol(str);
+      cout << *it << " is added to automaton." << endl;
+      // Remove the added file
+      int ret_code = remove((*it).c_str());
+      if (ret_code == 0) {
+        cout << *it << " deleted." << endl;
+      } else {
+        cerr << "Error during deletion of " << *it << endl;
+        return 1;
+      }
+    }
+  }
+  cout << "total added: " << n_total_added << " symbols." << endl;
+  dictionary.BuildDict();
+  dictionary.OutputDictTo(FLAGS_output_path);
+  cout << "Dictionary saved." << endl;
+  if (dictionary.GetAutomaton().AmountNodes() > 2) {
+    cout << "AmountNodes: " << dictionary.GetAutomaton().AmountNodes() << endl;
+    if (serializeAutomatonToDisk(dictionary.GetAutomaton(), FLAGS_automaton_proto)) {
+      cout << "Automaton saved." << endl;
+    } else {
+      cerr << "Error while saving automaton." << endl;
+      return 1;
+    }
+  } else {
+    cerr << "Error: empty automaton on exit." << endl;
+    return 1;
+  }
+  long t2 = clock();
+  cout << "time: " << t2 - t1 << endl;
+  return 0;
+}
 
-  return 0;
+void split(std::vector<string> & result,
+           const std::string & str,
+           const std::string & delimiter) {
+  size_t start = 0, end = 0;
+  while (end != string::npos) {
+    end = str.find(delimiter, start);
+    result.push_back(str.substr(start,
+        (end == string::npos) ? string::npos : end - start));
+    start = ((end > (string::npos - delimiter.size()))
+        ? string::npos : end + delimiter.size());
+  }
+}
+
+bool serializeAutomatonToDisk(
+    const SuffixAutomaton& automaton,
+    const string& automatonPath) {
+  int fd = open(automatonPath.c_str(), O_WRONLY | O_CREAT | O_TRUNC,
+                S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  if (fd < 0) {
+    cerr << "Can't create \'" << automatonPath << "\'." << endl;
+    return false;
+  }
+  google::protobuf::io::ZeroCopyOutputStream* raw_output =
+      new google::protobuf::io::FileOutputStream(fd);
+  google::protobuf::io::CodedOutputStream* coded_output =
+      new google::protobuf::io::CodedOutputStream(raw_output);
+
+  auto proto_automaton_ptr = automaton.GetProtoAutomaton();
+  std::ostringstream output_stream;
+  proto_automaton_ptr->SerializeToOstream(&output_stream);
+  string output_str(output_stream.str());
+  coded_output->WriteLittleEndian32(PROTOBUF_AUTOMATON_CONST);
+  coded_output->WriteVarint32(output_str.length());
+  coded_output->WriteRaw(output_str.c_str(), output_str.length());
+
+  delete coded_output;
+  delete raw_output;
+  close(fd);
+  return true;
+}
+
+bool deserializeAutomatonFromDisk(
+    SuffixAutomaton& automaton,
+    const string& automatonPath) {
+  int fd = open(automatonPath.c_str(), O_RDONLY);
+  if (fd < 0) {
+    cerr << "Can't open " << automatonPath << " fd = " << fd << endl;
+    return false;
+  }
+  google::protobuf::io::ZeroCopyInputStream* raw_input =
+      new google::protobuf::io::FileInputStream(fd);
+  google::protobuf::io::CodedInputStream* coded_input =
+      new google::protobuf::io::CodedInputStream(raw_input);
+
+  uint32_t magic_number = 0;
+  coded_input->ReadLittleEndian32(&magic_number);
+  if (magic_number != PROTOBUF_AUTOMATON_CONST) {
+    cerr << "File \'" << automatonPath << "\' not in expected format." << endl;
+    delete coded_input;
+    delete raw_input;
+    close(fd);
+    return false;
+  }
+
+  uint32_t sizeToRead = 0;
+  coded_input->ReadVarint32(&sizeToRead);
+  char* buffer = new char[sizeToRead];
+  coded_input->ReadRaw(buffer, sizeToRead);
+  ProtoAutomaton proto_automaton;
+  proto_automaton.ParseFromArray((void*)buffer, sizeToRead);
+  automaton = SuffixAutomaton(proto_automaton);
+  delete[] buffer;
+  delete coded_input;
+  delete raw_input;
+  close(fd);
+  return true;
+}
\ No newline at end of file
diff --git a/src/third_party/open-vcdiff/CMakeLists.txt b/src/third_party/open-vcdiff/CMakeLists.txt
index 27e1365..e1193da 100644
--- a/src/third_party/open-vcdiff/CMakeLists.txt
+++ b/src/third_party/open-vcdiff/CMakeLists.txt
@@ -59,7 +59,6 @@ target_link_libraries(vcdenc
 include_directories(
   ${CMAKE_CURRENT_SOURCE_DIR}/src/
   ${CMAKE_CURRENT_SOURCE_DIR}/src/zlib
-  ${CMAKE_CURRENT_SOURCE_DIR}/gflags/src
 )
 set(VCDIFF_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
 target_include_directories (vcdcom PUBLIC ${VCDIFF_INCLUDE_DIR})
@@ -76,6 +75,9 @@ add_library(gflags
   gflags/src/gflags_completions.cc
   gflags/src/gflags_reporting.cc
 )
+
+target_include_directories (gflags PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/gflags/src)
+
 add_executable(vcdiff
   src/vcdiff_main.cc
 )

From 5c7b8b629704615f0a78118eb523e66dbec0e959 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 15:09:46 +0300
Subject: [PATCH 03/10] Update README.md

---
 src/dict_builder/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index c76ecf4..8fec1ef 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -22,3 +22,9 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of the solution we just sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.
 
 Time complexity is `O(sum_length_documents)` with a relatively small constant.
+
+**Further improvements**
+
+We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png). A smaller alpha corresponds to a higher sensitivity to new documents. One can see that almost the same effect can be achieved by multiplying the newest document's contribution by ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126773/render.png).
+
+Another
From a102aeac0f16cc99d190c3adc092940c31a2b124 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 21:16:33 +0300
Subject: [PATCH 04/10] Update README.md

---
 src/dict_builder/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index 8fec1ef..4a0a40e 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -27,4 +27,6 @@
 We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png). A smaller alpha corresponds to a higher sensitivity to new documents. One can see that almost the same effect can be achieved by multiplying the newest document's contribution by ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126773/render.png).
 
-Another
+Another modification is related to reducing the automaton's size when it becomes too big. In that case we repeatedly delete the node without outgoing edges that has the smallest score (a leaf of the automaton) until we reach the desired number of nodes.
+
+The remaining part of our model is still the same.

From 2ec45853bfa390ee67a07fc2f8bb1f3362ad194b Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 21:56:12 +0300
Subject: [PATCH 05/10] Update README.md

---
 src/dict_builder/README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index 4a0a40e..fe6fd24 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -30,3 +30,17 @@
 Another modification is related to reducing the automaton's size when it becomes too big. In that case we repeatedly delete the node without outgoing edges that has the smallest score (a leaf of the automaton) until we reach the desired number of nodes.
 
 The remaining part of our model is still the same.
+
+**Usage**
+
+We use this tool via objects of the class `Dictionary`. One can pass the following parameters to the constructor: the maximum size of the dictionary, the minimum length of a string in the dictionary, the stop symbol (say, `#`), the maximum size of the automaton, and the coefficient ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png).
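+
+A minimal construction sketch (the parameter values below are illustrative only, not recommended defaults):
+
+```cpp
+#include "dictionary.hpp"
+
+// 64 KiB dictionary, strings of at least 20 characters, '#' as the stop
+// symbol, automaton capped at 2^18 nodes, decay coefficient alpha = 0.9.
+Dictionary dict(1 << 16, 20, '#', 1 << 18, 0.9);
+```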
+
+Also, there are useful methods:
+`Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
+`Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
+`Dictionary::BuildDict` builds the dictionary from the current whole string, 
+`Dictionary::GetDict` returns the dictionary obtained via the lastest call of the previous method.
+
+Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!
+
+We suggest calling `GetDict` only when one really needs the current dictionary.

From b545695f81200b789ab03b26d3c7c5404de88164 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 21:56:51 +0300
Subject: [PATCH 06/10] Update README.md

---
 src/dict_builder/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index fe6fd24..71a551b 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -38,7 +38,7 @@ Also, there are useful methods:
 `Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
 `Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
-`Dictionary::BuildDict` builds the dictionary from the current whole string, 
+`Dictionary::BuildDict` builds the dictionary from the current whole string,
 `Dictionary::GetDict` returns the dictionary obtained via the lastest call of the previous method.
 
 Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!

From ea76166f53b714a17b27e317c7a8d8533b93c769 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 22:02:59 +0300
Subject: [PATCH 07/10] Update README.md

---
 src/dict_builder/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index 71a551b..cc3448c 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -39,7 +39,7 @@ Also, there are useful methods:
 `Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
 `Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
 `Dictionary::BuildDict` builds the dictionary from the current whole string,
-`Dictionary::GetDict` returns the dictionary obtained via the lastest call of the previous method.
+`Dictionary::GetDict` returns the dictionary obtained via the lateest call of the previous method.
 
 Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!

From 8d9ec12ec6ee8a3c051e1357fe46edfd1004acdc Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 22:03:28 +0300
Subject: [PATCH 08/10] Update README.md

---
 src/dict_builder/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index cc3448c..75c8141 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -39,7 +39,7 @@ Also, there are useful methods:
 `Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
 `Dictionary::BuildDict` builds the dictionary from the current whole string,
-`Dictionary::GetDict` returns the dictionary obtained via the lateest call of the previous method.
+`Dictionary::GetDict` returns the dictionary obtained via the latest call of the previous method.
 
 Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!

From 207005bd46715e4fa1a88160ce520b6f83b15532 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Wed, 10 Dec 2014 02:25:18 +0400
Subject: [PATCH 09/10] added two dicts tester

---
 src/CMakeLists.txt                  |   2 +-
 src/two_dicts_tester/CMakeLists.txt |   8 ++
 src/two_dicts_tester/tester.cpp     | 155 ++++++++++++++++++++++++++++
 3 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 src/two_dicts_tester/CMakeLists.txt
 create mode 100644 src/two_dicts_tester/tester.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 244b01e..cafc9d1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,5 +3,5 @@ add_subdirectory(gtest)
 add_subdirectory(dict_builder)
 add_subdirectory(incremental_tester/)
 add_subdirectory(incremental_updater/)
-
+add_subdirectory(two_dicts_tester/)

diff --git a/src/two_dicts_tester/CMakeLists.txt b/src/two_dicts_tester/CMakeLists.txt
new file mode 100644
index 0000000..e1ff617
--- /dev/null
+++ b/src/two_dicts_tester/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(two_dicts_tester
+  tester.cpp
+)
+target_link_libraries(two_dicts_tester LINK_PUBLIC
+  vcdenc
+  vcddec
+  dictgen
+)

diff --git a/src/two_dicts_tester/tester.cpp b/src/two_dicts_tester/tester.cpp
new file mode 100644
index 0000000..8e431b8
--- /dev/null
+++ b/src/two_dicts_tester/tester.cpp
@@ -0,0 +1,155 @@
+#include <sys/stat.h>
+#include <dirent.h>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <google/vcencoder.h>
+
+using std::cout;
+using std::endl;
+using std::string;
+
+const char * kOldDictFileNameParam = "-file_name_old";
+const char * kNewDictFileNameParam = "-file_name_new";
+const char * kDocumentsFolderParam = "-folder_documents";
+const char * kResultFileNameParam = "-result_file_name";
+
+double CalcScore(const string& path, const string& dict) {
+  std::ifstream input(path.c_str());
+
+  if (!input.is_open()) {
+    throw std::invalid_argument("cannot open file \'" + path + "\'");
+  }
+
+  string content;
+  string buf;
+  while (input >> buf) {
+    content += buf;
+  }
+
+  if (content.empty()) {
+    return 0.0;
+  }
+
+  open_vcdiff::VCDiffEncoder encoder(dict.data(), dict.size());
+//  encoder.SetFormatFlags(open_vcdiff::VCD_FORMAT_INTERLEAVED);
+  std::string delta;
+  encoder.Encode(content.data(), content.size(), &delta);
+  return (double) delta.size() / content.size();
+}
+
+string ReadDict(const char * dict_file_name) {
+  std::ifstream input(dict_file_name);
+
+  if (!input.is_open()) {
+    throw std::invalid_argument("cannot read from file \'" + string(dict_file_name) + "\'");
+  }
+
+  string dict;
+  input >> dict;
+  return dict;
+}
+
+double CalcScore(const char * dict_file_name, const char * documents_folder) {
+  double score = 0.0;
+  size_t cnt_docs = 0;
+
+  string dict = ReadDict(dict_file_name);
+
+  DIR *dirp;
+  if (!(dirp = opendir(documents_folder))) {
+    throw std::invalid_argument("failed to open directory");
+  }
+
+  struct dirent *dp;
+  while ((dp = readdir(dirp)) != NULL) {
+    struct stat st;
+    std::string path = string(documents_folder) + "/" + dp->d_name;
+    if (0 == stat(path.c_str(), &st)) {
+      if (S_ISREG(st.st_mode)) {
+        score += CalcScore(path, dict);
+        ++cnt_docs;
+      }
+    }
+  }
+
+  if (!cnt_docs) {
+    return 0.0;
+  }
+  return score / cnt_docs;
+}
+
+int main(int argc, char ** argv) {
+  char * old_dict_file_name = nullptr;
+  char * new_dict_file_name = nullptr;
+  char * documents_folder = nullptr;
+  char * result_file_name = nullptr;
+
+  for (int i = 1; i < argc; ) {
+    if (!strcmp(argv[i], kOldDictFileNameParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find old dictionary's file name" << endl;
+        return 1;
+      }
+      old_dict_file_name = argv[i + 1];
+      i += 2;
+    } else if (!strcmp(argv[i], kNewDictFileNameParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find new dictionary's file name" << endl;
+        return 1;
+      }
+      new_dict_file_name = argv[i + 1];
+      i += 2;
+    } else if (!strcmp(argv[i], kDocumentsFolderParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find documents' folder name" << endl;
+        return 1;
+      }
+      documents_folder = argv[i + 1];
+      i += 2;
+    } else if (!strcmp(argv[i], kResultFileNameParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find result's file name" << endl;
+        return 1;
+      }
+      result_file_name = argv[i + 1];
+      i += 2;
+    } else {
+      cout << "unrecognized parameter \'" << string(argv[i]) << "\'" << endl;
+      return 1;
+    }
+  }
+
+  if (!old_dict_file_name) {
+    cout << "old dictionary's file name must be specified with \'" << string(kOldDictFileNameParam) << "\'" << endl;
+    return 1;
+  }
+
+  if (!new_dict_file_name) {
+    cout << "new dictionary's file name must be specified with \'" << string(kNewDictFileNameParam) << "\'" << endl;
+    return 1;
+  }
+
+  if (!documents_folder) {
+    cout << "documents' folder must be specified with \'" << string(kDocumentsFolderParam) << "\'" << endl;
+    return 1;
+  }
+
+  if (!result_file_name) {
+    cout << "result's file name must be specified with \'" << string(kResultFileNameParam) << "\'" << endl;
+    return 1;
+  }
+
+  std::ofstream output(result_file_name);
+  try {
+    // smaller result is better
+    output << CalcScore(new_dict_file_name, documents_folder) / CalcScore(old_dict_file_name, documents_folder) << endl;
+  } catch (const std::exception& ex) {
+    cout << "an exception was thrown: " << ex.what() << endl;
+    return 1;
+  }
+  return 0;
+}

From 693b1604428605a10f744f321cd78c6585df1a69 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Wed, 10 Dec 2014 01:38:35 +0300
Subject: [PATCH 10/10] Create Readme.md

---
 src/two_dicts_tester/Readme.md | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 src/two_dicts_tester/Readme.md

diff --git a/src/two_dicts_tester/Readme.md b/src/two_dicts_tester/Readme.md
new file mode 100644
index 0000000..31343e1
--- /dev/null
+++ b/src/two_dicts_tester/Readme.md
@@ -0,0 +1,8 @@
+This is a program that calculates the advantage of using the new dictionary against the old one.
+
+**Usage**
+`./two_dicts_tester -file_name_old %file_name_of_the_old_dictionary's_file% -file_name_new %file_name_of_the_new_dictionary's_file% -folder_documents %name_of_the_folder_with_documents_to_test% -result_file_name %file_name_of_the_resulted_file_with_one_number%`
+
+The result is the fraction `score_of_new_dictionary / score_of_old_dictionary`.
+
+A smaller score is better, since a dictionary's score is the average `size_encoded_file / size_file` over the test documents.
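+
+For example (the file and folder names here are purely illustrative):
+
+`./two_dicts_tester -file_name_old dict_old -file_name_new dict_new -folder_documents ./docs -result_file_name result.txt`
+
+If `result.txt` then contains `0.8`, delta files encoded with the new dictionary are on average 20% smaller, relative to the original documents, than those encoded with the old one.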