From 4d01e593bedec7bd0c1bda0155da6bfd4992c96b Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Wed, 3 Dec 2014 13:43:08 +0300
Subject: [PATCH 01/10] Update README.md

---
 src/dict_builder/README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index c76ecf4..c5b592b 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -1,5 +1,5 @@
 **Introduction**
-This is a tool that builds a "dictionary of common strings". More precisely, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
+This is a tool that builds a "dictionary of common strings". To be more precise, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
 After that, one can make a "delta" file that encodes our file, exploiting the fact that the decoder of our file knows the dictionary. That is why the dictionary should be similar (in some sense) to all given files.
 
 **How does it work?**
@@ -8,7 +8,7 @@ Our goal is to find some set of substrings that maximize the
 ![equation](http://latex.codecogs.com/png.latex?%5Csum_%7Bs%20%5Cin%20Dict%7D%20%5Cfrac%7BDocsOccursIn%28s%29%20%5Ccdot%20%28len%28s%29%20-%203%29%7D%7Blen%28s%29%7D)
 under some constraints, namely
-1. If we took a substring, we are not allowed to take a substring of this substring
+1. If we take a substring, we are not allowed to take a substring of this substring
 2. ![equation](http://latex.codecogs.com/png.latex?DocsOccursIn%28s%29%20%3E%201%2C%20len%28s%29%20%3E%20threshold).
 3. The sum of the taken substrings' lengths is not greater than some constant.
@@ -22,3 +22,7 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of the solution we just sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.
 
 Time complexity is `O(sum_length_documents)` with a relatively small constant.
+
+**Further improvements**
+
+We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant `0 <= alpha < 1`. A smaller `alpha` corresponds to a higher sensitivity to new documents.
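+
+This decay can be implemented lazily: instead of touching every stored score after each update, keep one global coefficient and rescale new contributions. The sketch below only illustrates that idea in isolation; it is not the exact per-node bookkeeping of the real `SuffixAutomaton`, and the struct name is hypothetical.
+
+```cpp
+#include <cstddef>
+#include <vector>
+
+// Lazy alpha-decay: after each update every effective score shrinks by
+// alpha, but we only adjust one global coefficient and rescale on the fly.
+struct DecayedScores {
+  double alpha;               // 0 <= alpha < 1, decay per update operation
+  double current_coef = 1.0;  // global scale applied to all raw scores
+  std::vector<double> raw;    // unscaled per-string scores
+
+  // One more occurrence of string i adds exactly 1 at the present scale.
+  void AddOccurrence(std::size_t i) { raw[i] += 1.0 / current_coef; }
+
+  // Called once per update: every effective score is multiplied by alpha.
+  void FinishUpdate() { current_coef *= alpha; }
+
+  // Effective (decayed) score of string i.
+  double Score(std::size_t i) const { return raw[i] * current_coef; }
+};
+```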
From 553f283ca790df0f45c65e3bb3da44d4d7b65f07 Mon Sep 17 00:00:00 2001 From: PavelSavchenkov Date: Tue, 9 Dec 2014 14:53:43 +0300 Subject: [PATCH 02/10] update --- .gitignore | 5 - CMakeLists.txt | 4 - README.md | 8 - src/CMakeLists.txt | 5 +- src/dict_builder/CMakeLists.txt | 11 - src/dict_builder/README.md | 8 +- src/dict_builder/dictionary.cpp | 108 +++----- src/dict_builder/dictionary.hpp | 4 + src/dict_builder/dictionary_test.cpp | 4 - src/dict_builder/node.cpp | 18 +- src/dict_builder/node.hpp | 20 +- src/dict_builder/proto/automaton.proto | 4 +- src/dict_builder/serialization_tests.cpp | 3 + src/dict_builder/suffix_automaton.cpp | 236 +++++------------- src/dict_builder/suffix_automaton.hpp | 53 +--- src/incremental_updater/CMakeLists.txt | 1 + .../incremental_generator.cpp | 166 +++++++++++- src/third_party/open-vcdiff/CMakeLists.txt | 4 +- 18 files changed, 287 insertions(+), 375 deletions(-) diff --git a/.gitignore b/.gitignore index 28b298c..0c444da 100644 --- a/.gitignore +++ b/.gitignore @@ -1,15 +1,10 @@ -<<<<<<< HEAD *.html *~ src/third_party/open-vcdiff/ src/gtest - - -======= .idea/ .svn/ *~ .DS_Store src/third_party/open-vcdiff/src/config.h src/third_party/open-vcdiff/src/stamp-h1 ->>>>>>> upstream/master diff --git a/CMakeLists.txt b/CMakeLists.txt index cf0e2b3..883fbdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,11 +2,7 @@ cmake_minimum_required(VERSION 2.8.11) project(SInGe) #set(CMAKE_VERBOSE_MAKEFILE ON) -<<<<<<< HEAD set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra") -======= -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1y -Wall -Wextra") ->>>>>>> upstream/master set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) diff --git a/README.md b/README.md index 982e4aa..04800c9 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,4 @@ [![Build Status](https://travis-ci.org/cscenter/SInGe.svg?branch=master)](https://travis-ci.org/cscenter/SInGe) -## How to build first time: -1) go to src/third_party/open-vcdiff -2) ./autogen.sh -3) ./configure - - -After that use Cmake as usual - SDCH Dictionary Incremental Geenrator diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87f9621..244b01e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,10 +1,7 @@ add_subdirectory(third_party/) add_subdirectory(gtest) add_subdirectory(dict_builder) -add_subdirectory(incremental_updater/) -<<<<<<< HEAD add_subdirectory(incremental_tester/) -======= ->>>>>>> upstream/master +add_subdirectory(incremental_updater/) diff --git a/src/dict_builder/CMakeLists.txt b/src/dict_builder/CMakeLists.txt index 464f8c0..6de2a05 100644 --- a/src/dict_builder/CMakeLists.txt +++ b/src/dict_builder/CMakeLists.txt @@ -1,6 +1,3 @@ -<<<<<<< HEAD -add_library( dictgen -======= find_package(Protobuf REQUIRED) set (PROTO_SOURCES @@ -10,15 +7,12 @@ set (PROTO_SOURCES PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS ${PROTO_SOURCES}) add_library(dictgen ->>>>>>> upstream/master dictionary.cpp dictionary.hpp node.cpp node.hpp suffix_automaton.cpp suffix_automaton.hpp -<<<<<<< HEAD -======= ${PROTO_SRCS} ${PROTO_HDRS} ) @@ -30,7 +24,6 @@ target_include_directories (dictgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${PROTOBUF_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR} ->>>>>>> upstream/master ) add_executable(pzip @@ -42,16 +35,12 @@ add_executable(dict_builder_tests node_test.cpp dictionary_test.cpp suffix_automaton_test.cpp -<<<<<<< HEAD -======= serialization_tests.cpp ->>>>>>> upstream/master ) 
target_link_libraries(dict_builder_tests
   gtest_main
   dictgen
 )
-target_include_directories (dictgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
 add_test(NAME dict_builder_tests COMMAND dict_builder_tests)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index c5b592b..c76ecf4 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -1,5 +1,5 @@
 **Introduction**
-This is a tool that builds a "dictionary of common strings". To be more precise, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
+This is a tool that builds a "dictionary of common strings". More precisely, it picks some set of substrings from the given text files and then writes them one by one to a dictionary. So the dictionary is just one huge string.
 After that, one can make a "delta" file that encodes our file, exploiting the fact that the decoder of our file knows the dictionary. That is why the dictionary should be similar (in some sense) to all given files.
 
 **How does it work?**
@@ -8,7 +8,7 @@ Our goal is to find some set of substrings that maximize the
 ![equation](http://latex.codecogs.com/png.latex?%5Csum_%7Bs%20%5Cin%20Dict%7D%20%5Cfrac%7BDocsOccursIn%28s%29%20%5Ccdot%20%28len%28s%29%20-%203%29%7D%7Blen%28s%29%7D)
 under some constraints, namely
-1. If we take a substring, we are not allowed to take a substring of this substring
+1. If we took a substring, we are not allowed to take a substring of this substring
 2. ![equation](http://latex.codecogs.com/png.latex?DocsOccursIn%28s%29%20%3E%201%2C%20len%28s%29%20%3E%20threshold).
 3. The sum of the taken substrings' lengths is not greater than some constant.
@@ -22,7 +22,3 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of the solution we just sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.
 
 Time complexity is `O(sum_length_documents)` with a relatively small constant.
-
-**Further improvements**
-
-We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant `0 <= alpha < 1`. A smaller `alpha` corresponds to a higher sensitivity to new documents.
diff --git a/src/dict_builder/dictionary.cpp b/src/dict_builder/dictionary.cpp index 0fa99c0..5e1dabe 100644 --- a/src/dict_builder/dictionary.cpp +++ b/src/dict_builder/dictionary.cpp @@ -1,36 +1,24 @@ -<<<<<<< HEAD -======= -#include ->>>>>>> upstream/master -#include +#include +#include #include #include -#include +#include #include -<<<<<<< HEAD -======= -#include -#include ->>>>>>> upstream/master #include +#include #include "dictionary.hpp" #include "suffix_automaton.hpp" -using std::vector; -using std::string; -using std::pair; -using std::make_pair; -<<<<<<< HEAD -using std::endl; -using std::cout; -======= using std::cerr; -using std::endl; using std::cout; -using std::queue; +using std::endl; +using std::make_pair; using std::map; ->>>>>>> upstream/master +using std::pair; +using std::queue; +using std::string; +using std::vector; namespace { const double kEps = 1e-10; @@ -44,18 +32,28 @@ namespace { } }; -<<<<<<< HEAD Dictionary::Dictionary() : kMaxDict(1 << 20), kMinLen(20), kMinDocsOccursIn(2) {} -Dictionary::Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef) : kMaxDict(kMaxDict), kMinLen(kMinLen), kMinDocsOccursIn(2), automaton_all_(SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) { +Dictionary::Dictionary(size_t kMaxDict + , size_t kMinLen + , char kStopSymbol + , size_t kMaxAutomatonSize + , double kAutomatonCoef) + : kMaxDict(kMaxDict) + , kMinLen(kMinLen) + , kMinDocsOccursIn(2) + , automaton_all_( + SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) { } -======= -const size_t Dictionary::kMaxDict = 1 << 16; -const size_t Dictionary::kMinLen = 3; -const size_t Dictionary::kMinDocsOccursIn = 2; -Dictionary::Dictionary() {} ->>>>>>> upstream/master +Dictionary::Dictionary(size_t kMaxDict + , size_t kMinLen + , SuffixAutomaton& automaton) + : kMaxDict(kMaxDict) + , kMinLen(kMinLen) + , kMinDocsOccursIn(2) + , automaton_all_(automaton) { +} Dictionary::~Dictionary() {} @@ -101,45 +99,20 @@ void Dictionary::BuildDict() { ResetLastDocument(); dict_.clear(); -<<<<<<< HEAD -======= - cout << "automaton size = " << automaton_all_.AmountAliveNodes() << endl; -/* - for (size_t id : automaton_all_) { - cout << "occurs " << GetNode(id)->docs_occurs_in << " " << GetNode(id)->len_within_document << endl; - } -*/ - cout << "building dictionary..." 
<< endl; - ->>>>>>> upstream/master vector substrings; CollectGoodSubstrings(&substrings); sort(substrings.begin(), substrings.end(), [&] (int id1, int id2) { return DoubleLess(automaton_all_.GetScore(id2), automaton_all_.GetScore(id1)); }); -<<<<<<< HEAD -======= - cout << "good substrings have been collected and sorted" << endl; - ->>>>>>> upstream/master size_t length_dict = 0; for (size_t i = 0; i < substrings.size() && length_dict + kMinLen <= kMaxDict; ++i) { auto* node = GetNode(substrings[i]); if (length_dict + node->len_within_document > kMaxDict) { continue; } -<<<<<<< HEAD length_dict += node->len_within_document; dict_.push_back(substrings[i]); } -======= -// printf("occurs = %d, len = %d\n", node->docs_occurs_in, node->len_within_document); - length_dict += node->len_within_document; - dict_.push_back(substrings[i]); - } - - cout << "dict's length = " << length_dict << endl; ->>>>>>> upstream/master } vector > Dictionary::GetDictSubstringsList() { @@ -161,6 +134,10 @@ string Dictionary::GetDict() { return dict_str; } +SuffixAutomaton& Dictionary::GetAutomaton() { + return automaton_all_; +} + void Dictionary::OutputDictTo(string path) { std::ofstream file(path); file << GetDict(); @@ -171,11 +148,8 @@ void Dictionary::ResetLastDocument() { return; } -<<<<<<< HEAD // cout << "calculate occurences for document with length " << last_document_.size() << endl; -======= - cout << "calculate occurences for document with length " << last_document_.size() << endl; ->>>>>>> upstream/master + size_t cur_hash = (rand() << 16) ^ rand(); size_t id = automaton_all_.root(); size_t pos = 0; @@ -200,20 +174,8 @@ void Dictionary::CollectGoodSubstrings(vector * substrings) { vector max_score_substring(nodes, -1e20); vector max_score_upstring(nodes, -1e20); vector can_to_dict(nodes, true); -<<<<<<< HEAD vector order = automaton_all_.GetNodesInOrder(); -======= - vector order; - order.reserve(nodes - 1); - - for (size_t id : automaton_all_) { - order.push_back(id); - } - - sort(order.begin(), order.end(), [&] (size_t id1, size_t id2) { return GetNode(id1)->len_actual < GetNode(id2)->len_actual; } ); - ->>>>>>> upstream/master // calc max_score_substring for (size_t id : order) { double max_score = -1e20; @@ -291,7 +253,3 @@ void Dictionary::CollectGoodSubstrings(vector * substrings) { bool Dictionary::CanAffordSubstringFrom(Node* node) const { return node->len_within_document >= kMinLen && node->docs_occurs_in >= kMinDocsOccursIn; } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master diff --git a/src/dict_builder/dictionary.hpp b/src/dict_builder/dictionary.hpp index dc10d43..087f649 100644 --- a/src/dict_builder/dictionary.hpp +++ b/src/dict_builder/dictionary.hpp @@ -20,6 +20,8 @@ class Dictionary { Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef); + Dictionary(size_t kMaxDict, size_t kMinLen, SuffixAutomaton& automaton); + ~Dictionary(); void AddDocument(std::string& doc); @@ -36,6 +38,8 @@ class Dictionary { std::string GetDict(); + SuffixAutomaton& GetAutomaton(); + void OutputDictTo(std::string path); void ResetLastDocument(); diff --git a/src/dict_builder/dictionary_test.cpp b/src/dict_builder/dictionary_test.cpp index fd3058a..a78feaa 100644 --- a/src/dict_builder/dictionary_test.cpp +++ b/src/dict_builder/dictionary_test.cpp @@ -25,11 +25,7 @@ TEST(DictionaryTest, MainDictionaryTest) { std::string s2 = "qwecabarty"; std::string s3 = "caba_cabaqwe"; -<<<<<<< HEAD Dictionary dict(100, 3, '#', 1000, 1.0); -======= - Dictionary dict; 
->>>>>>> upstream/master dict.AddDocumentViaStopSymbol(s1); dict.AddDocumentViaStopSymbol(s2); dict.AddDocumentViaStopSymbol(s3); diff --git a/src/dict_builder/node.cpp b/src/dict_builder/node.cpp index f64f045..484c51d 100644 --- a/src/dict_builder/node.cpp +++ b/src/dict_builder/node.cpp @@ -74,14 +74,9 @@ bool Node::AddEdge(char ch, size_t to) { } bool Node::AddRevEdge(char ch, size_t from) { -<<<<<<< HEAD for (auto& it : rev_edges_) { if (it.second == from) { it.first = ch; -======= - for (auto it : rev_edges_) { - if (it == make_pair(ch, from)) { ->>>>>>> upstream/master return false; } } @@ -103,11 +98,11 @@ void Node::SortEdges() { std::sort(edges_.begin(), edges_.end()); } -size_t Node::InDegree() { +size_t Node::InDegree() const { return rev_edges_.size(); } -size_t Node::OutDegree() { +size_t Node::OutDegree() const { return edges_.size(); } @@ -132,11 +127,7 @@ bool Node::DeleteRevEdge(size_t from) { if (rev_edges_[i].second == from) { pos = i; break; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } if (pos < rev_edges_.size()) { rev_edges_.erase(rev_edges_.begin() + pos); @@ -159,11 +150,9 @@ bool Node::DeleteRevLink(size_t from) { } return false; } -<<<<<<< HEAD -======= std::unique_ptr Node::GetProtoNode() const { - auto proto_node = std::make_unique(); + auto proto_node = std::unique_ptr(new ProtoNode()); auto *proto_repeated_ptrs_edges = proto_node->mutable_edges(); proto_repeated_ptrs_edges->Reserve(edges_.size()); for (const auto &edge : edges_) { @@ -224,4 +213,3 @@ Node::Node(const ProtoNode& proto_node) : Node() { rev_links_.emplace_back(rev_link); } } ->>>>>>> upstream/master diff --git a/src/dict_builder/node.hpp b/src/dict_builder/node.hpp index bae5ebf..b84c920 100644 --- a/src/dict_builder/node.hpp +++ b/src/dict_builder/node.hpp @@ -11,11 +11,9 @@ #include #include #include // size_t -<<<<<<< HEAD -======= -#include //uinique_ptr -#include ->>>>>>> upstream/master +#include // unique_ptr + +#include "automaton.pb.h" class Node { public: @@ -47,9 +45,9 @@ class Node { void SortEdges(); - size_t InDegree(); + size_t InDegree() const; - size_t OutDegree(); + size_t OutDegree() const; bool DeleteEdge(size_t to); @@ -57,13 +55,10 @@ class Node { bool DeleteRevLink(size_t from); -<<<<<<< HEAD -======= std::unique_ptr GetProtoNode() const ; - explicit Node(const ProtoNode & proto_node); + explicit Node(const ProtoNode& proto_node); ->>>>>>> upstream/master size_t link; size_t len_actual; size_t len_within_document; @@ -75,11 +70,8 @@ class Node { std::vector > edges_; std::vector > rev_edges_; std::vector rev_links_; -<<<<<<< HEAD -======= friend class SerializationTest; ->>>>>>> upstream/master }; #endif // NODE_HPP_ diff --git a/src/dict_builder/proto/automaton.proto b/src/dict_builder/proto/automaton.proto index beb3818..74a3393 100644 --- a/src/dict_builder/proto/automaton.proto +++ b/src/dict_builder/proto/automaton.proto @@ -23,5 +23,7 @@ message ProtoAutomaton { required int64 len_up_to_stop_symbol = 3; required double current_coef = 4; repeated bool is_free_node = 5; - + required uint64 max_size = 6; + required int32 stop_symbol = 7; + required double coef = 8; } \ No newline at end of file diff --git a/src/dict_builder/serialization_tests.cpp b/src/dict_builder/serialization_tests.cpp index 996461c..2c99c38 100644 --- a/src/dict_builder/serialization_tests.cpp +++ b/src/dict_builder/serialization_tests.cpp @@ -41,6 +41,9 @@ class SerializationTest : public testing::Test { EXPECT_EQ(first_automaton.current_coef, second_automaton.current_coef) << msg; 
EXPECT_EQ(first_automaton.amount_alive_nodes_, second_automaton.amount_alive_nodes_) << msg; EXPECT_EQ(first_automaton.nodes_to_delete_, second_automaton.nodes_to_delete_) << msg; + EXPECT_EQ(first_automaton.kMaxSize, second_automaton.kMaxSize) << msg; + EXPECT_EQ(first_automaton.kCoef, second_automaton.kCoef) << msg; + EXPECT_EQ(first_automaton.kStopSymbol, second_automaton.kStopSymbol) << msg; } SuffixAutomaton SerializeAndDeserialize(const SuffixAutomaton& automaton) { diff --git a/src/dict_builder/suffix_automaton.cpp b/src/dict_builder/suffix_automaton.cpp index eb61978..63bfe08 100644 --- a/src/dict_builder/suffix_automaton.cpp +++ b/src/dict_builder/suffix_automaton.cpp @@ -6,14 +6,11 @@ */ #include +#include #include -#include -<<<<<<< HEAD -#include -======= #include -#include ->>>>>>> upstream/master +#include +#include #include "suffix_automaton.hpp" @@ -23,21 +20,25 @@ using std::max; using std::string; using std::make_pair; -<<<<<<< HEAD -SuffixAutomaton::SuffixAutomaton() : kMaxSize(1 << 18), kStopSymbol('#'), kCoef(0.9), len_up_to_stop_symbol_(1), amount_alive_nodes_(0), current_coef(1.0) { +SuffixAutomaton::SuffixAutomaton() + : kMaxSize(1 << 18), + kStopSymbol('#'), + kCoef(0.9), + len_up_to_stop_symbol_(1), + amount_alive_nodes_(0), + current_coef(1.0) { NewNode(); // ~ nullptr last_node_ = NewNode(); AddToNodesToDelete(last_node_); } -SuffixAutomaton::SuffixAutomaton(char kStopSymbol, size_t kMaxSize, double kCoef) : kMaxSize(kMaxSize), kStopSymbol(kStopSymbol), kCoef(kCoef), len_up_to_stop_symbol_(1), amount_alive_nodes_(0), current_coef(1.0) { -======= -const char SuffixAutomaton::kStopSymbol = '#'; -const size_t SuffixAutomaton::kMaxSize = 1 << 13; -const double SuffixAutomaton::kCoef = 0.95; - -SuffixAutomaton::SuffixAutomaton() : len_up_to_stop_symbol_(1), amount_alive_nodes_(0), current_coef(1.0) { ->>>>>>> upstream/master +SuffixAutomaton::SuffixAutomaton(char kStopSymbol, size_t kMaxSize, double kCoef) + : kMaxSize(kMaxSize), + kStopSymbol(kStopSymbol), + kCoef(kCoef), + len_up_to_stop_symbol_(1), + amount_alive_nodes_(0), + current_coef(1.0) { NewNode(); // ~ nullptr last_node_ = NewNode(); AddToNodesToDelete(last_node_); @@ -48,11 +49,7 @@ SuffixAutomaton::~SuffixAutomaton() {} SuffixAutomaton::iterator SuffixAutomaton::begin() { return SuffixAutomaton::iterator(1, is_free_node_); } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master SuffixAutomaton::iterator SuffixAutomaton::end() { return SuffixAutomaton::iterator(AmountNodes(), is_free_node_); } @@ -78,24 +75,18 @@ double SuffixAutomaton::GetScore(size_t id) { } bool SuffixAutomaton::AddOccurence(size_t id) { - if (!GetNode(id)) { + if (!GetNode(id)) return false; - } -<<<<<<< HEAD - if (GetNode(id)->OutDegree() == 0) { + + if (GetNode(id)->OutDegree() == 0) EraseFromNodesToDelete(id); - } + ++GetNode(id)->docs_occurs_in; GetNode(id)->score_occurs_only += 1.0 / current_coef; - if (GetNode(id)->OutDegree() == 0) { + + if (GetNode(id)->OutDegree() == 0) AddToNodesToDelete(id); - } -======= - EraseFromNodesToDelete(id); - ++GetNode(id)->docs_occurs_in; - GetNode(id)->score_occurs_only += 1.0 / current_coef; - AddToNodesToDelete(id); ->>>>>>> upstream/master + return true; } @@ -103,11 +94,7 @@ void SuffixAutomaton::AddString(const char* const str, size_t length) { for (size_t i = 0; i < length; ++i) { AddCharacter(str[i]); ++len_up_to_stop_symbol_; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } void SuffixAutomaton::AddStringViaStopSymbol(const char* const str, size_t length) { @@ -120,11 +107,7 @@ void 
SuffixAutomaton::AddStringViaStopSymbol(const char* const str, size_t lengt for (size_t i = 0; i < length; ++i) { AddCharacter(str[i]); ++len_up_to_stop_symbol_; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } size_t SuffixAutomaton::root() const { @@ -135,24 +118,6 @@ bool SuffixAutomaton::Empty() const { return last_node_ == root(); } -<<<<<<< HEAD -======= -void SuffixAutomaton::Output() { - for (size_t id : *this) { - std::cout << id << ": " << GetLongestString(id) << std::endl; - } - - Output(root(), ""); -} - -void SuffixAutomaton::Output(size_t v, std::string s) { - printf("%s v = %zu, occurs = %zu, len = %zu, score = %.5f\n", s.c_str(), v, GetNode(v)->docs_occurs_in, GetNode(v)->len_within_document, GetScore(v)); - for (auto it = GetNode(v)->edges_begin(); it != GetNode(v)->edges_end(); ++it) { - Output(it->second, s + it->first); - } -} - ->>>>>>> upstream/master double SuffixAutomaton::GetCurrentCoef() { return current_coef; } @@ -171,11 +136,7 @@ string SuffixAutomaton::GetLongestString(size_t id) { max_len = cur_len; max_id = cur_id; max_ch = cur_ch; -<<<<<<< HEAD - } -======= } ->>>>>>> upstream/master } str += max_ch; id = max_id; @@ -185,23 +146,15 @@ string SuffixAutomaton::GetLongestString(size_t id) { } bool SuffixAutomaton::ReduceSize() { -<<<<<<< HEAD if (AmountAliveNodes() > 2 * kMaxSize) { - while (AmountAliveNodes() > kMaxSize) { + while (AmountAliveNodes() > kMaxSize) DeleteNode(nodes_to_delete_.begin()->second); -======= - if (AmountAliveNodes() > 2 * kMaxSize) { - while (AmountAliveNodes() > kMaxSize) { - auto min_node = *nodes_to_delete_.begin(); - DeleteNode(min_node.second); ->>>>>>> upstream/master - } + return true; } return false; } -<<<<<<< HEAD vector SuffixAutomaton::GetNodesInOrder() { std::queue q_nodes; vector was_in_q(AmountNodes(), false); @@ -226,8 +179,6 @@ vector SuffixAutomaton::GetNodesInOrder() { return order; } -======= ->>>>>>> upstream/master size_t SuffixAutomaton::NewNode() { if (nodes_pool_.empty()) { is_free_node_.push_back(false); @@ -251,23 +202,19 @@ size_t SuffixAutomaton::NewNode() { } bool SuffixAutomaton::AddLink(size_t from, size_t to) { - if (!GetNode(from) || !GetNode(to)) { + if (!GetNode(from) || !GetNode(to)) return false; - } -<<<<<<< HEAD + size_t old_to = GetNode(from)->link; - if (GetNode(old_to)) { + if (GetNode(old_to)) GetNode(old_to)->DeleteRevLink(from); - } -======= ->>>>>>> upstream/master + GetNode(from)->link = to; GetNode(to)->AddRevLink(from); return true; } bool SuffixAutomaton::AddEdge(size_t from, size_t to, char ch) { -<<<<<<< HEAD GetNode(from)->AddEdge(ch, to); GetNode(to)->AddRevEdge(ch, from); if (GetNode(from)->OutDegree() == 1) { @@ -289,7 +236,7 @@ bool SuffixAutomaton::DeleteNode(size_t id) { if (id == last_node_) { size_t new_last_node_ = 0; for (auto it = GetNode(id)->rev_edges_begin(); it != GetNode(id)->rev_edges_end(); ++it) { - if (!new_last_node_ || GetNode(it->second)->len_actual > GetNode(new_last_node_)->len_actual) { + if (!new_last_node_ || GetNode(it->second)->len_actual > GetNode(new_last_node_)->len_actual) { new_last_node_ = it->second; } } @@ -306,52 +253,20 @@ bool SuffixAutomaton::DeleteNode(size_t id) { // delete incoming edges for (auto it = GetNode(id)->rev_edges_begin(); it != GetNode(id)->rev_edges_end(); ++it) { GetNode(it->second)->DeleteEdge(id); - if (GetNode(it->second)->OutDegree() == 0) { + if (GetNode(it->second)->OutDegree() == 0) { AddToNodesToDelete(it->second); } -======= - if (!GetNode(from) || !GetNode(to)) { - return false; - } - 
GetNode(from)->AddEdge(ch, to); - GetNode(to)->AddRevEdge(ch, from); - return true; -} - -bool SuffixAutomaton::DeleteNode(size_t id) { - if (!GetNode(id)) { - return false; - } - -// std::cout << "delete " << id << std::endl; - - // redirect incoming links - for (auto it = GetNode(id)->rev_links_begin(); it != GetNode(id)->rev_links_end(); ++it) { - size_t from = *it; - GetNode(from)->link = GetNode(id)->link; - } - - // delete incoming edges - for (auto it = GetNode(id)->rev_edges_begin(); it != GetNode(id)->rev_edges_end(); ++it) { - size_t from = it->second; - GetNode(from)->DeleteEdge(id); ->>>>>>> upstream/master } // delete outcoming links (actually, exactly one link) GetNode(GetNode(id)->link)->DeleteRevLink(id); -<<<<<<< HEAD vector to_delete; // delete outgoing edges -======= - // delete outcoming edges ->>>>>>> upstream/master for (auto it = GetNode(id)->edges_begin(); it != GetNode(id)->edges_end(); ++it) { size_t to = it->second; GetNode(to)->DeleteRevEdge(id); - if (GetNode(to)->InDegree() == 0) { -<<<<<<< HEAD + if (GetNode(to)->InDegree() == 0) { to_delete.push_back(to); } } @@ -365,17 +280,6 @@ bool SuffixAutomaton::DeleteNode(size_t id) { DeleteNode(to); } return true; -======= - DeleteNode(to); - } - } - - --amount_alive_nodes_; - is_free_node_[id] = true; - free_nodes_.push_back(id); - EraseFromNodesToDelete(id); - return true; ->>>>>>> upstream/master } void SuffixAutomaton::AddCharacter(char ch) { @@ -396,11 +300,7 @@ void SuffixAutomaton::AddCharacter(char ch) { AddLink(new_node, root()); return; } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master size_t next = GetNode(prev)->NextNodeThrough(ch); if (GetNode(next)->len_actual == GetNode(prev)->len_actual + 1) { AddLink(new_node, next); @@ -413,7 +313,6 @@ void SuffixAutomaton::AddCharacter(char ch) { GetNode(middle)->score_occurs_only = GetNode(next)->score_occurs_only; GetNode(middle)->docs_occurs_in = GetNode(next)->docs_occurs_in; AddLink(middle, GetNode(next)->link); -<<<<<<< HEAD AddToNodesToDelete(middle); for (auto it = GetNode(next)->edges_begin(); it != GetNode(next)->edges_end(); ++it) { AddEdge(middle, it->second, it->first); @@ -421,14 +320,6 @@ void SuffixAutomaton::AddCharacter(char ch) { for (; prev && GetNode(prev)->NextNodeThrough(ch) == next; prev = GetNode(prev)->link) { DeleteEdge(prev, next); -======= - for (auto it = GetNode(next)->edges_begin(); it != GetNode(next)->edges_end(); ++it) { - AddEdge(middle, it->second, it->first); - } - AddToNodesToDelete(middle); - - for (; prev && GetNode(prev)->NextNodeThrough(ch) == next; prev = GetNode(prev)->link) { ->>>>>>> upstream/master AddEdge(prev, middle, ch); } AddLink(new_node, middle); @@ -436,20 +327,14 @@ void SuffixAutomaton::AddCharacter(char ch) { } void SuffixAutomaton::AddToNodesToDelete(size_t id) { -<<<<<<< HEAD assert(GetNode(id)->OutDegree() == 0); - nodes_to_delete_.insert(make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); -} - -void SuffixAutomaton::EraseFromNodesToDelete(size_t id) { - nodes_to_delete_.erase(make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); -======= - nodes_to_delete_.insert(make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); + nodes_to_delete_.insert( + make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); } void SuffixAutomaton::EraseFromNodesToDelete(size_t id) { - nodes_to_delete_.erase(make_pair(make_pair(GetNode(id)->score_occurs_only, 
GetNode(id)->len_within_document), id)); ->>>>>>> upstream/master + nodes_to_delete_.erase( + make_pair(make_pair(GetNode(id)->score_occurs_only, GetNode(id)->len_within_document), id)); } SuffixAutomaton::iterator::iterator(size_t id, vector& is_free_node) : id_(id), is_free_node_(is_free_node) {} @@ -476,22 +361,23 @@ bool SuffixAutomaton::iterator::operator ==(const iterator& other) { bool SuffixAutomaton::iterator::operator !=(const iterator& other) { return id_ != other.id_; } -<<<<<<< HEAD -======= std::unique_ptr SuffixAutomaton::GetProtoAutomaton() const { assert(is_free_node_.size() == nodes_pool_.size()); - auto proto_automaton = std::make_unique(); + auto proto_automaton = std::unique_ptr(new ProtoAutomaton()); proto_automaton->set_last_node(last_node_); proto_automaton->set_len_up_to_stop_symbol(len_up_to_stop_symbol_); proto_automaton->set_current_coef(current_coef); + proto_automaton->set_max_size(kMaxSize); + proto_automaton->set_coef(kCoef); + proto_automaton->set_stop_symbol(kStopSymbol); auto* proto_nodes_pool = proto_automaton->mutable_nodes_pool(); proto_nodes_pool->Reserve(nodes_pool_.size()); for (auto& node : nodes_pool_) { //ownership transfer proto_nodes_pool->AddAllocated(node.GetProtoNode().release()); } - assert(proto_nodes_pool->size() == (int) nodes_pool_.size()); + assert(proto_nodes_pool->size() == static_cast(nodes_pool_.size())); auto* proto_is_free_node = proto_automaton->mutable_is_free_node(); proto_is_free_node->Reserve(is_free_node_.size()); for (bool is_free : is_free_node_) { @@ -501,16 +387,23 @@ std::unique_ptr SuffixAutomaton::GetProtoAutomaton() const { return proto_automaton; } -SuffixAutomaton::SuffixAutomaton(const ProtoAutomaton& proto_automaton) { +SuffixAutomaton::SuffixAutomaton(const ProtoAutomaton& proto_automaton) + : amount_alive_nodes_(0) { last_node_ = proto_automaton.last_node(); len_up_to_stop_symbol_ = proto_automaton.len_up_to_stop_symbol(); current_coef = proto_automaton.current_coef(); + kMaxSize = proto_automaton.max_size(); + kCoef = proto_automaton.coef(); + kStopSymbol = proto_automaton.stop_symbol(); + const auto& proto_is_free_node = proto_automaton.is_free_node(); - is_free_node_.resize(proto_is_free_node.size()); + is_free_node_.resize(proto_automaton.is_free_node_size()); const auto& proto_nodes_pool = proto_automaton.nodes_pool(); - nodes_pool_.reserve(proto_nodes_pool.size()); - assert(proto_is_free_node.size() == proto_nodes_pool.size()); + nodes_pool_.reserve(proto_automaton.nodes_pool_size()); + assert(proto_automaton.is_free_node_size() == proto_automaton.nodes_pool_size()); + nodes_pool_.emplace_back(proto_nodes_pool.Get(0)); // zero node + for (size_t i_node = 1; i_node < is_free_node_.size(); ++i_node) { const auto& proto_node = proto_nodes_pool.Get(i_node); nodes_pool_.emplace_back(proto_node); @@ -519,19 +412,20 @@ SuffixAutomaton::SuffixAutomaton(const ProtoAutomaton& proto_automaton) { is_free_node_[i_node] = true; free_nodes_.push_back(i_node); } else { - nodes_to_delete_.insert( - make_pair( - make_pair( - current_node.score_occurs_only, - current_node.len_within_document - ), - i_node - ) - ); + ++amount_alive_nodes_; + if (current_node.OutDegree() == 0) { + nodes_to_delete_.insert( + make_pair( + make_pair( + current_node.score_occurs_only, + current_node.len_within_document + ), + i_node + ) + ); + } } - amount_alive_nodes_ = nodes_to_delete_.size(); // 1 is for zero node assert(amount_alive_nodes_ + free_nodes_.size() + 1 == nodes_pool_.size()); } } ->>>>>>> upstream/master diff --git 
a/src/dict_builder/suffix_automaton.hpp b/src/dict_builder/suffix_automaton.hpp index 8bd7cca..91dc503 100644 --- a/src/dict_builder/suffix_automaton.hpp +++ b/src/dict_builder/suffix_automaton.hpp @@ -8,18 +8,15 @@ #ifndef SUFFIX_AUTOMATON_HPP_ #define SUFFIX_AUTOMATON_HPP_ +#include +#include #include //size_t +#include //unique_ptr #include #include #include -#include -<<<<<<< HEAD -#include -======= -#include //unique_ptr -#include ->>>>>>> upstream/master +#include "automaton.pb.h" #include "node.hpp" struct compare_nodes { @@ -39,36 +36,23 @@ struct compare_nodes { return len2 < len1; } -<<<<<<< HEAD return id1 > id2; -======= - return id1 < id2; ->>>>>>> upstream/master } }; class SuffixAutomaton { public: -<<<<<<< HEAD // const in the past size_t kMaxSize; char kStopSymbol; double kCoef; -======= - static const size_t kMaxSize; - static const char kStopSymbol; - static const double kCoef; ->>>>>>> upstream/master class iterator; SuffixAutomaton(); -<<<<<<< HEAD SuffixAutomaton(char kStopSymbol, size_t kMaxSize, double kCoef); -======= ->>>>>>> upstream/master ~SuffixAutomaton(); iterator begin(); @@ -88,19 +72,11 @@ class SuffixAutomaton { } inline const Node* GetNode(size_t id) const { -<<<<<<< HEAD return id && !is_free_node_[id] && id < nodes_pool_.size() ? &nodes_pool_[id] : nullptr; } inline Node* GetNode(size_t id) { return id && !is_free_node_[id] && id < nodes_pool_.size() ? &nodes_pool_[id] : nullptr; -======= - return id ? &nodes_pool_[id] : nullptr; - } - - inline Node* GetNode(size_t id) { - return id ? &nodes_pool_[id] : nullptr; ->>>>>>> upstream/master } double GetScore(size_t id); @@ -115,26 +91,17 @@ class SuffixAutomaton { bool Empty() const; -<<<<<<< HEAD -======= - void Output(); - - void Output(size_t v, std::string s); - ->>>>>>> upstream/master double GetCurrentCoef(); std::string GetLongestString(size_t id); bool ReduceSize(); -<<<<<<< HEAD std::vector GetNodesInOrder(); -======= + std::unique_ptr GetProtoAutomaton() const; explicit SuffixAutomaton(const ProtoAutomaton& proto_automaton); ->>>>>>> upstream/master private: size_t NewNode(); @@ -143,11 +110,8 @@ class SuffixAutomaton { bool AddEdge(size_t from, size_t to, char ch); -<<<<<<< HEAD bool DeleteEdge(size_t from, size_t to); -======= ->>>>>>> upstream/master bool DeleteNode(size_t id); void AddCharacter(char ch); @@ -164,17 +128,11 @@ class SuffixAutomaton { size_t amount_alive_nodes_; double current_coef; std::set, size_t>, compare_nodes> nodes_to_delete_; -<<<<<<< HEAD -}; - -======= friend class SerializationTest; }; - ->>>>>>> upstream/master class SuffixAutomaton::iterator { public: iterator(size_t id, std::vector& is_free_node); @@ -195,4 +153,3 @@ class SuffixAutomaton::iterator { }; #endif // SUFFIX_AUTOMATON_HPP_ - diff --git a/src/incremental_updater/CMakeLists.txt b/src/incremental_updater/CMakeLists.txt index 86e586c..6d2d2e7 100644 --- a/src/incremental_updater/CMakeLists.txt +++ b/src/incremental_updater/CMakeLists.txt @@ -5,4 +5,5 @@ target_link_libraries(incremental_generator LINK_PUBLIC vcdenc vcddec dictgen + gflags ) \ No newline at end of file diff --git a/src/incremental_updater/incremental_generator.cpp b/src/incremental_updater/incremental_generator.cpp index a5b1006..d1536ca 100644 --- a/src/incremental_updater/incremental_generator.cpp +++ b/src/incremental_updater/incremental_generator.cpp @@ -1,19 +1,169 @@ +#include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include -#include using std::string; -using 
open_vcdiff::Statistics;
+using std::vector;
 using std::cout;
 using std::endl;
+using std::cerr;
 
+DEFINE_string(automaton_proto, "automaton.pb", "path to the serialized automaton");
+DEFINE_string(new_documents, "",
+              "comma-separated list of documents to update the dictionary");
+DEFINE_int64(max_size, 307200, "max allowable number of nodes in the automaton");
+DEFINE_string(output_path, "dictionary",
+              "name of the file where the new dictionary will be recorded");
+
+#define PROTOBUF_AUTOMATON_CONST 8867856
 
-int main() {
-  //code coming soon...
+void split(vector<string>&, const string&, const string&);
+bool serializeAutomatonToDisk(
+    const SuffixAutomaton& automaton,
+    const string& automatonPath);
+bool deserializeAutomatonFromDisk(
+    SuffixAutomaton& automaton,
+    const string& automatonPath);
 
-  std::cout << "not implemented yet :(" << std::endl;
+int main(int argc, char** argv) {
+  long t1 = clock();
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  vector<string> docs;
+  split(docs, FLAGS_new_documents, ",");
+  if (docs.size() < 2) {
+    cerr << "Empty new_documents list. For help: '"
+         << argv[0] << " --help'" << endl;
+    return 1;
+  }
+  SuffixAutomaton automaton = SuffixAutomaton();
+  if (deserializeAutomatonFromDisk(automaton, FLAGS_automaton_proto)) {
+    cout << "Automaton loaded." << endl;
+  } else {
+    cerr << "Failed to open: " << FLAGS_automaton_proto << endl;
+  }
+  Dictionary dictionary = Dictionary(FLAGS_max_size, 3, automaton);
+  cout << "AmountNodes: " << automaton.AmountNodes() << endl;
+  long n_total_added = 0;
+  for (auto it = docs.begin(); it != docs.end(); ++it) {
+    std::ifstream doc(*it);
+    if (doc.is_open()) {
+      std::ostringstream ss;
+      ss << doc.rdbuf();
+      string str(ss.str());
+      cout << *it << " loaded " << str.length() << " symbols." << endl;
+      n_total_added += str.length();
+      dictionary.AddDocumentViaStopSymbol(str);
+      cout << *it << " is added to automaton." << endl;
+      // Remove the added file
+      int ret_code = remove((*it).c_str());
+      if (ret_code == 0) {
+        cout << *it << " deleted." << endl;
+      } else {
+        cerr << "Error during deletion of " << *it << endl;
+        return 1;
+      }
+    }
+  }
+  cout << "total added: " << n_total_added << " symbols." << endl;
+  dictionary.BuildDict();
+  dictionary.OutputDictTo(FLAGS_output_path);
+  cout << "Dictionary saved." << endl;
+  if (dictionary.GetAutomaton().AmountNodes() > 2) {
+    cout << "AmountNodes: " << dictionary.GetAutomaton().AmountNodes() << endl;
+    if (serializeAutomatonToDisk(dictionary.GetAutomaton(), FLAGS_automaton_proto)) {
+      cout << "Automaton saved." << endl;
+    } else {
+      cerr << "Error while saving automaton." << endl;
+      return 1;
+    }
+  } else {
+    cerr << "Error: empty automaton on exit." << endl;
+    return 1;
+  }
+  long t2 = clock();
+  cout << "time: " << t2 - t1 << endl;
+  return 0;
+}
 
-  return 0;
+void split(std::vector<string> & result,
+           const std::string & str,
+           const std::string & delimiter) {
+  size_t start = 0, end = 0;
+  while (end != string::npos) {
+    end = str.find(delimiter, start);
+    result.push_back(str.substr(start,
+        (end == string::npos) ? string::npos : end - start));
+    start = ((end > (string::npos - delimiter.size()))
+        ? string::npos : end + delimiter.size());
+  }
+}
+
+bool serializeAutomatonToDisk(
+    const SuffixAutomaton& automaton,
+    const string& automatonPath) {
+  int fd = open(automatonPath.c_str(), O_WRONLY | O_CREAT | O_TRUNC,
+                S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  if (fd < 0) {
+    cerr << "Can't create \'" << automatonPath << "\'." << endl;
+    return false;
+  }
+  google::protobuf::io::ZeroCopyOutputStream* raw_output =
+      new google::protobuf::io::FileOutputStream(fd);
+  google::protobuf::io::CodedOutputStream* coded_output =
+      new google::protobuf::io::CodedOutputStream(raw_output);
+
+  auto proto_automaton_ptr = automaton.GetProtoAutomaton();
+  std::ostringstream output_stream;
+  proto_automaton_ptr->SerializeToOstream(&output_stream);
+  string output_str(output_stream.str());
+  coded_output->WriteLittleEndian32(PROTOBUF_AUTOMATON_CONST);
+  coded_output->WriteVarint32(output_str.length());
+  coded_output->WriteRaw(output_str.c_str(), output_str.length());
+
+  delete coded_output;
+  delete raw_output;
+  close(fd);
+  return true;
+}
+
+bool deserializeAutomatonFromDisk(
+    SuffixAutomaton& automaton,
+    const string& automatonPath) {
+  int fd = open(automatonPath.c_str(), O_RDONLY);
+  if (fd < 0) {
+    cerr << "Can't open " << automatonPath << " fd = " << fd << endl;
+    return false;
+  }
+  google::protobuf::io::ZeroCopyInputStream* raw_input =
+      new google::protobuf::io::FileInputStream(fd);
+  google::protobuf::io::CodedInputStream* coded_input =
+      new google::protobuf::io::CodedInputStream(raw_input);
+
+  uint32_t magic_number = 0;
+  coded_input->ReadLittleEndian32(&magic_number);
+  if (magic_number != PROTOBUF_AUTOMATON_CONST) {
+    cerr << "File \'" << automatonPath << "\' not in expected format." << endl;
+    delete coded_input;
+    delete raw_input;
+    close(fd);
+    return false;
+  }
+
+  uint32_t sizeToRead = 0;
+  coded_input->ReadVarint32(&sizeToRead);
+  char* buffer = new char[sizeToRead];
+  coded_input->ReadRaw(buffer, sizeToRead);
+  ProtoAutomaton proto_automaton;
+  proto_automaton.ParseFromArray((void*)buffer, sizeToRead);
+  automaton = SuffixAutomaton(proto_automaton);
+  delete[] buffer;
+  delete coded_input;
+  delete raw_input;
+  close(fd);
+  return true;
+}
\ No newline at end of file
diff --git a/src/third_party/open-vcdiff/CMakeLists.txt b/src/third_party/open-vcdiff/CMakeLists.txt
index 27e1365..e1193da 100644
--- a/src/third_party/open-vcdiff/CMakeLists.txt
+++ b/src/third_party/open-vcdiff/CMakeLists.txt
@@ -59,7 +59,6 @@ target_link_libraries(vcdenc
 include_directories(
   ${CMAKE_CURRENT_SOURCE_DIR}/src/
   ${CMAKE_CURRENT_SOURCE_DIR}/src/zlib
-  ${CMAKE_CURRENT_SOURCE_DIR}/gflags/src
 )
 set(VCDIFF_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
 target_include_directories (vcdcom PUBLIC ${VCDIFF_INCLUDE_DIR})
@@ -76,6 +75,9 @@ add_library(gflags
   gflags/src/gflags_completions.cc
   gflags/src/gflags_reporting.cc
 )
+
+target_include_directories (gflags PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/gflags/src)
+
 add_executable(vcdiff
   src/vcdiff_main.cc
 )

From 5c7b8b629704615f0a78118eb523e66dbec0e959 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 15:09:46 +0300
Subject: [PATCH 03/10] Update README.md

---
 src/dict_builder/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index c76ecf4..8fec1ef 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -22,3 +22,9 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
 In the last part of the solution we just sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.
 
 Time complexity is `O(sum_length_documents)` with a relatively small constant.
+
+**Further improvements**
+
+We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png). A smaller alpha corresponds to a higher sensitivity to new documents. One can see that almost the same effect can be achieved by multiplying the newest document's contribution by ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126773/render.png).
+
+Another
From a102aeac0f16cc99d190c3adc092940c31a2b124 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 21:16:33 +0300
Subject: [PATCH 04/10] Update README.md

---
 src/dict_builder/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index 8fec1ef..4a0a40e 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -27,4 +27,6 @@
 We can make our algorithm online, that is, process documents one by one (or batch by batch) and be able to obtain an updated dictionary after each operation. We should also remember that newer documents should have a bigger influence on the resulting dictionary than older ones. This is done in the following way: after each update operation, the score of every existing string is multiplied by some constant ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png). A smaller alpha corresponds to a higher sensitivity to new documents. One can see that almost the same effect can be achieved by multiplying the newest document's contribution by ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126773/render.png).
 
-Another
+Another modification is related to reducing the automaton's size when it becomes too big. In that case we repeatedly delete the node without outgoing edges that has the smallest score (a leaf of the automaton) until we reach the desired number of nodes.
+
+The remaining part of our model is still the same.

From 2ec45853bfa390ee67a07fc2f8bb1f3362ad194b Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 21:56:12 +0300
Subject: [PATCH 05/10] Update README.md

---
 src/dict_builder/README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index 4a0a40e..fe6fd24 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -30,3 +30,17 @@
 Another modification is related to reducing the automaton's size when it becomes too big. In that case we repeatedly delete the node without outgoing edges that has the smallest score (a leaf of the automaton) until we reach the desired number of nodes.
 
 The remaining part of our model is still the same.
+
+**Usage**
+
+We use this tool via objects of the class `Dictionary`. One can pass the following parameters to the constructor: the maximum size of the dictionary, the minimum length of a string in the dictionary, the stop symbol (say, `#`), the maximum size of the automaton, and the coefficient ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png).
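+
+A minimal construction sketch (the parameter values below are illustrative only, not recommended defaults):
+
+```cpp
+#include "dictionary.hpp"
+
+// 64 KiB dictionary, strings of at least 20 characters, '#' as the stop
+// symbol, automaton capped at 2^18 nodes, decay coefficient alpha = 0.9.
+Dictionary dict(1 << 16, 20, '#', 1 << 18, 0.9);
+```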
+
+Also, there are useful methods:
+`Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
+`Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
+`Dictionary::BuildDict` builds the dictionary from the current whole string, 
+`Dictionary::GetDict` returns the dictionary obtained via the lastest call of the previous method.
+
+Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!
+
+We suggest calling `GetDict` only when one really needs the current dictionary.

From b545695f81200b789ab03b26d3c7c5404de88164 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 21:56:51 +0300
Subject: [PATCH 06/10] Update README.md

---
 src/dict_builder/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index fe6fd24..71a551b 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -38,7 +38,7 @@ Also, there are useful methods:
 `Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
 `Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
-`Dictionary::BuildDict` builds the dictionary from the current whole string, 
+`Dictionary::BuildDict` builds the dictionary from the current whole string,
 `Dictionary::GetDict` returns the dictionary obtained via the lastest call of the previous method.
 
 Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!

From ea76166f53b714a17b27e317c7a8d8533b93c769 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 22:02:59 +0300
Subject: [PATCH 07/10] Update README.md

---
 src/dict_builder/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index 71a551b..cc3448c 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -39,7 +39,7 @@ Also, there are useful methods:
 `Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
 `Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
 `Dictionary::BuildDict` builds the dictionary from the current whole string,
-`Dictionary::GetDict` returns the dictionary obtained via the lastest call of the previous method.
+`Dictionary::GetDict` returns the dictionary obtained via the lateest call of the previous method.
 
 Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!

From 8d9ec12ec6ee8a3c051e1357fe46edfd1004acdc Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Tue, 9 Dec 2014 22:03:28 +0300
Subject: [PATCH 08/10] Update README.md

---
 src/dict_builder/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dict_builder/README.md b/src/dict_builder/README.md
index cc3448c..75c8141 100644
--- a/src/dict_builder/README.md
+++ b/src/dict_builder/README.md
@@ -39,7 +39,7 @@ Also, there are useful methods:
 `Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
 `Dictionary::BuildDict` builds the dictionary from the current whole string,
-`Dictionary::GetDict` returns the dictionary obtained via the lateest call of the previous method.
+`Dictionary::GetDict` returns the dictionary obtained via the latest call of the previous method.
 
 Note that if one does not call `BuildDict` and adds tons of documents, the dictionary from `GetDict` would be empty!

From 207005bd46715e4fa1a88160ce520b6f83b15532 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Wed, 10 Dec 2014 02:25:18 +0400
Subject: [PATCH 09/10] added two dicts tester

---
 src/CMakeLists.txt                  |   2 +-
 src/two_dicts_tester/CMakeLists.txt |   8 ++
 src/two_dicts_tester/tester.cpp     | 155 ++++++++++++++++++++++++++++
 3 files changed, 164 insertions(+), 1 deletion(-)
 create mode 100644 src/two_dicts_tester/CMakeLists.txt
 create mode 100644 src/two_dicts_tester/tester.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 244b01e..cafc9d1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,5 +3,5 @@ add_subdirectory(gtest)
 add_subdirectory(dict_builder)
 add_subdirectory(incremental_tester/)
 add_subdirectory(incremental_updater/)
-
+add_subdirectory(two_dicts_tester/)

diff --git a/src/two_dicts_tester/CMakeLists.txt b/src/two_dicts_tester/CMakeLists.txt
new file mode 100644
index 0000000..e1ff617
--- /dev/null
+++ b/src/two_dicts_tester/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(two_dicts_tester
+  tester.cpp
+)
+target_link_libraries(two_dicts_tester LINK_PUBLIC
+  vcdenc
+  vcddec
+  dictgen
+)

diff --git a/src/two_dicts_tester/tester.cpp b/src/two_dicts_tester/tester.cpp
new file mode 100644
index 0000000..8e431b8
--- /dev/null
+++ b/src/two_dicts_tester/tester.cpp
@@ -0,0 +1,155 @@
+#include <sys/stat.h>
+#include <dirent.h>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <google/vcencoder.h>
+
+using std::cout;
+using std::endl;
+using std::string;
+
+const char * kOldDictFileNameParam = "-file_name_old";
+const char * kNewDictFileNameParam = "-file_name_new";
+const char * kDocumentsFolderParam = "-folder_documents";
+const char * kResultFileNameParam = "-result_file_name";
+
+double CalcScore(const string& path, const string& dict) {
+  std::ifstream input(path.c_str());
+
+  if (!input.is_open()) {
+    throw std::invalid_argument("cannot open file \'" + path + "\'");
+  }
+
+  string content;
+  string buf;
+  while (input >> buf) {
+    content += buf;
+  }
+
+  if (content.empty()) {
+    return 0.0;
+  }
+
+  open_vcdiff::VCDiffEncoder encoder(dict.data(), dict.size());
+//  encoder.SetFormatFlags(open_vcdiff::VCD_FORMAT_INTERLEAVED);
+  std::string delta;
+  encoder.Encode(content.data(), content.size(), &delta);
+  return (double) delta.size() / content.size();
+}
+
+string ReadDict(const char * dict_file_name) {
+  std::ifstream input(dict_file_name);
+
+  if (!input.is_open()) {
+    throw std::invalid_argument("cannot read from file \'" + string(dict_file_name) + "\'");
+  }
+
+  string dict;
+  input >> dict;
+  return dict;
+}
+
+double CalcScore(const char * dict_file_name, const char * documents_folder) {
+  double score = 0.0;
+  size_t cnt_docs = 0;
+
+  string dict = ReadDict(dict_file_name);
+
+  DIR *dirp;
+  if (!(dirp = opendir(documents_folder))) {
+    throw std::invalid_argument("failed to open directory");
+  }
+
+  struct dirent *dp;
+  while ((dp = readdir(dirp)) != NULL) {
+    struct stat st;
+    std::string path = string(documents_folder) + "/" + dp->d_name;
+    if (0 == stat(path.c_str(), &st)) {
+      if (S_ISREG(st.st_mode)) {
+        score += CalcScore(path, dict);
+        ++cnt_docs;
+      }
+    }
+  }
+
+  if (!cnt_docs) {
+    return 0.0;
+  }
+  return score / cnt_docs;
+}
+
+int main(int argc, char ** argv) {
+  char * old_dict_file_name = nullptr;
+  char * new_dict_file_name = nullptr;
+  char * documents_folder = nullptr;
+  char * result_file_name = nullptr;
+
+  for (int i = 1; i < argc; ) {
+    if (!strcmp(argv[i], kOldDictFileNameParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find old dictionary's file name" << endl;
+        return 1;
+      }
+      old_dict_file_name = argv[i + 1];
+      i += 2;
+    } else if (!strcmp(argv[i], kNewDictFileNameParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find new dictionary's file name" << endl;
+        return 1;
+      }
+      new_dict_file_name = argv[i + 1];
+      i += 2;
+    } else if (!strcmp(argv[i], kDocumentsFolderParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find documents' folder name" << endl;
+        return 1;
+      }
+      documents_folder = argv[i + 1];
+      i += 2;
+    } else if (!strcmp(argv[i], kResultFileNameParam)) {
+      if (i + 1 >= argc) {
+        cout << "cannot find result's file name" << endl;
+        return 1;
+      }
+      result_file_name = argv[i + 1];
+      i += 2;
+    } else {
+      cout << "unrecognized parameter \'" << string(argv[i]) << "\'" << endl;
+      return 1;
+    }
+  }
+
+  if (!old_dict_file_name) {
+    cout << "old dictionary's file name must be specified with \'" << string(kOldDictFileNameParam) << "\'" << endl;
+    return 1;
+  }
+
+  if (!new_dict_file_name) {
+    cout << "new dictionary's file name must be specified with \'" << string(kNewDictFileNameParam) << "\'" << endl;
+    return 1;
+  }
+
+  if (!documents_folder) {
+    cout << "documents' folder must be specified with \'" << string(kDocumentsFolderParam) << "\'" << endl;
+    return 1;
+  }
+
+  if (!result_file_name) {
+    cout << "result's file name must be specified with \'" << string(kResultFileNameParam) << "\'" << endl;
+    return 1;
+  }
+
+  std::ofstream output(result_file_name);
+  try {
+    // smaller result is better
+    output << CalcScore(new_dict_file_name, documents_folder) / CalcScore(old_dict_file_name, documents_folder) << endl;
+  } catch (const std::exception& ex) {
+    cout << "an exception was thrown: " << ex.what() << endl;
+    return 1;
+  }
+  return 0;
+}

From 693b1604428605a10f744f321cd78c6585df1a69 Mon Sep 17 00:00:00 2001
From: PavelSavchenkov
Date: Wed, 10 Dec 2014 01:38:35 +0300
Subject: [PATCH 10/10] Create Readme.md

---
 src/two_dicts_tester/Readme.md | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 src/two_dicts_tester/Readme.md

diff --git a/src/two_dicts_tester/Readme.md b/src/two_dicts_tester/Readme.md
new file mode 100644
index 0000000..31343e1
--- /dev/null
+++ b/src/two_dicts_tester/Readme.md
@@ -0,0 +1,8 @@
+This is a program that calculates the advantage of using the new dictionary against the old one.
+
+**Usage**
+`./two_dicts_tester -file_name_old %file_name_of_the_old_dictionary's_file% -file_name_new %file_name_of_the_new_dictionary's_file% -folder_documents %name_of_the_folder_with_documents_to_test% -result_file_name %file_name_of_the_resulted_file_with_one_number%`
+
+The result is the fraction `score_of_new_dictionary / score_of_old_dictionary`.
+
+A smaller score is better, since a dictionary's score is the average `size_encoded_file / size_file` over the test documents.
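+
+For example (the file and folder names here are purely illustrative):
+
+`./two_dicts_tester -file_name_old dict_old -file_name_new dict_new -folder_documents ./docs -result_file_name result.txt`
+
+If `result.txt` then contains `0.8`, delta files encoded with the new dictionary are on average 20% smaller, relative to the original documents, than those encoded with the old one.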