From a9429ae0fbd617d005de18121b1992d7aba0f7f9 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Mon, 21 Oct 2019 12:00:18 +0200 Subject: [PATCH 001/102] Fix typo --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b1a0946..e6cc74c 100644 --- a/README.md +++ b/README.md @@ -105,12 +105,12 @@ Input data format The input file should list all completions in *lexicographical* order. -For example, see the the file `test_data/trec05_efficiency_queries/trec05_efficiency_queries.completions`. +For example, see the the file `test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`. The first column represent the ID of the completion; the other columns contain the tokens separated by white spaces. -(The IDs for the file `trec05_efficiency_queries.completions` are +(The IDs for the file `trec_05_efficiency_queries.completions` are fake, i.e., they do not take into account any particular assignment.) @@ -119,49 +119,49 @@ preparing the datasets for indexing: 1. The command - $ extract_dict.py trec05_efficiency_queries/trec05_efficiency_queries.completions + $ extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions extract the dictionary from a file listing all completions in textual form. 2. The command - $ python map_dataset.py trec05_efficiency_queries/trec05_efficiency_queries.completions + $ python map_dataset.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions maps strings to integer ids. 3. The command - $ python build_stats.py trec05_efficiency_queries/trec05_efficiency_queries.completions.mapped + $ python build_stats.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions.mapped calulcates the dataset statistics. 4. 
The command - $ python build_inverted_and_forward.py trec05_efficiency_queries/trec05_efficiency_queries.completions + $ python build_inverted_and_forward.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions builds the inverted and forward files. If you run the scripts in the reported order, you will get: -- `trec05_efficiency_queries.completions.dict`: lists all the distinct +- `trec_05_efficiency_queries.completions.dict`: lists all the distinct tokens in the completions sorted in lexicographical order. -- `trec05_efficiency_queries.completions.mapped`: lists all completions +- `trec_05_efficiency_queries.completions.mapped`: lists all completions whose tokens have been mapped to integer ids as assigned by a lexicographically-sorted string dictionary (that should be built from the -tokens listed in `trec05_efficiency_queries.completions.dict`). +tokens listed in `trec_05_efficiency_queries.completions.dict`). Each completion terminates with the id `0`. -- `trec05_efficiency_queries.completions.mapped.stats` contains some +- `trec_05_efficiency_queries.completions.mapped.stats` contains some statistics about the datasets, needed to build the data structures more efficiently. - `trec05_efficiency_queries.completions.inverted` is the inverted file. -- `trec05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec05_efficiency_queries.completions.mapped` but sorted in docID order. +- `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. Benchmarks ---------- @@ -174,4 +174,4 @@ Live demo ---------- Start the web server with the program `./web_server ` and access the demo at -`localhost:`. \ No newline at end of file +`localhost:`. 
From 5f97e36d6a196bbd0c9dd61a5e6a201f9a009612 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Mon, 21 Oct 2019 12:50:55 +0200 Subject: [PATCH 002/102] Add python command --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e6cc74c..4f3c123 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ preparing the datasets for indexing: 1. The command - $ extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + $ python extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions extract the dictionary from a file listing all completions in textual form. From 6a772e06d308b578b76bb0c6fce9653ebe4217b7 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 21 Oct 2019 13:42:34 +0200 Subject: [PATCH 003/102] script updated --- test_data/build_inverted_and_forward.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index c627699..743b491 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -21,6 +21,7 @@ with open(input_filename + ".mapped.stats") as f: num_terms = int(f.readline()) print num_terms + f.readline() # skip line containing max num. of query terms num_docs = int(f.readline()) print num_docs From b44ca7eeadd4a80f49a2f9a2b7f1a38385519b2f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 21 Oct 2019 14:44:03 +0200 Subject: [PATCH 004/102] more to README --- README.md | 42 +++++++++++++++++-- .../collect_results_by_varying_percentage.py | 3 +- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4f3c123..fb803eb 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Query autocompletion in C++. 1. [Description](#descr) 2. [Compiling the code](#compiling) 3. [Input data format](#input) +4. [Building an index](#building) 4. [Benchmarks](#benchmarks) 5. 
[Live demo](#demo) @@ -91,7 +92,7 @@ Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs. For the best of performance, we recommend compiling with: - $ `cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On` + $ cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On For a testing environment, use the following instead: @@ -163,12 +164,47 @@ the data structures more efficiently. - `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. +Building an index +----------- + +After compiling the code, run the program `./build` to build an index. You can specify the type of the index and the name of the file +where the index will be written. + +For example, with + + $ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin + +we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`. + +Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`. + + Benchmarks ---------- -Run `benchmark/benchmark_prefix_topk` and `benchmark/benchmark_conjunctive_topk`. +To run the top-k benchmarks in the `/benchmark` directory, +we first need some query logs. + +You can use -See the directory `results` for the results on the AOL and MSN query log. + python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + +to partition the input completions by number of query terms. 
+ +Then the command + + ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 + +will execute 1000 top-10 queries with 3 terms, from which only 25% +of the prefix of the last token is retained. +(For no locality, it is suggested to shuffle the queries at random, for example using `gshuf` on Mac.) + +We automated the collection of results with the script `script/collected_results_by_varying_percentage.py`. +From within the `/build` directory, run + + $ python ../script/collect_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 + +You can also specify the option "--breakdown" to record timings breakdowns. Live demo ---------- diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index cc1b9a0..f520405 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -5,12 +5,11 @@ dataset_name = sys.argv[3] k = sys.argv[4] num_queries = sys.argv[5] -collect_breakdowns = int(sys.argv[6]) # 0 or 1 output_filename = dataset_name + "." 
+ type breakdown = "" -if collect_breakdowns != 0: +if len(sys.argv) > 6 and sys.argv[6] == "--breakdown": breakdown = "--breakdown" output_filename += ".breakdown" From fcc8165d41c4a702cfa9aa4d5cceda1f47a23306 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 21 Oct 2019 23:28:40 +0200 Subject: [PATCH 005/102] benchmark locate_prefix --- README.md | 10 +- benchmark/CMakeLists.txt | 3 +- benchmark/benchmark_locate_prefix.cpp | 98 +++++++++++++++++++ include/completion_trie.hpp | 8 +- ...te_prefix_results_by_varying_percentage.py | 15 +++ ...ect_topk_results_by_varying_percentage.py} | 0 6 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 benchmark/benchmark_locate_prefix.cpp create mode 100644 script/collect_locate_prefix_results_by_varying_percentage.py rename script/{collect_results_by_varying_percentage.py => collect_topk_results_by_varying_percentage.py} (100%) diff --git a/README.md b/README.md index fb803eb..6c97ea8 100644 --- a/README.md +++ b/README.md @@ -187,24 +187,24 @@ we first need some query logs. You can use - python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + $ python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions to partition the input completions by number of query terms. Then the command - ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 + $ ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 will execute 1000 top-10 queries with 3 terms, from which only 25% of the prefix of the last token is retained. (For no locality, it is suggested to shuffle the queries at random, for example using `gshuf` on Mac.) -We automated the collection of results with the script `script/collected_results_by_varying_percentage.py`. 
+We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run - $ python ../script/collect_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 + $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 -You can also specify the option "--breakdown" to record timings breakdowns. +You can also specify the option `--breakdown` to record timings breakdowns. Live demo ---------- diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index cf8359f..d7f9433 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -2,4 +2,5 @@ add_executable(benchmark_topk benchmark_topk.cpp) add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) -add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) \ No newline at end of file +add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) +add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) \ No newline at end of file diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp new file mode 100644 index 0000000..6e9a1ab --- /dev/null +++ b/benchmark/benchmark_locate_prefix.cpp @@ -0,0 +1,98 @@ +#include + +#include "types.hpp" +#include "statistics.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; + +template +void benchmark_locate_prefix(parameters const& params, + fc_dictionary_type const& dict, + uint32_t max_num_queries, float keep, + essentials::json_lines& result) { + Index index; + { + typename Index::builder builder(params); + builder.build(index); + } + + typedef std::pair query_type; + std::vector strings; + std::vector queries; + 
uint32_t num_queries = 0; + + { + num_queries = load_queries(strings, max_num_queries, keep, std::cin); + for (auto const& string : strings) { + completion_type prefix; + byte_range suffix; + parse(dict, string, prefix, suffix); + range suffix_lex_range = dict.locate_prefix(suffix); + queries.emplace_back(prefix, suffix_lex_range); + } + } + + auto musec_per_query = [&](double time) { + return time / (runs * num_queries); + }; + + essentials::timer_type timer; + timer.start(); + for (uint32_t run = 0; run != runs; ++run) { + for (auto& query : queries) { + auto r = index.locate_prefix(query.first, query.second); + essentials::do_not_optimize_away(r.end - r.begin); + } + } + timer.stop(); + result.add("musec_per_query", + std::to_string(musec_per_query(timer.elapsed()))); +} + +int main(int argc, char** argv) { + int mandatory = 5; + if (argc < mandatory + 1) { + std::cout << argv[0] + << " " + " < queries" + << std::endl; + std::cout << " is a float in [0,1] and specifies how much " + "we keep of the last token in a query " + << std::endl; + return 1; + } + + std::string type(argv[1]); + parameters params; + params.collection_basename = argv[2]; + params.load(); + + std::string num_terms_per_query(argv[3]); + uint32_t max_num_queries = std::atoi(argv[4]); + float keep = std::atof(argv[5]); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + essentials::json_lines result; + result.new_line(); + result.add("num_terms_per_query", num_terms_per_query); + result.add("percentage", std::to_string(keep)); + + if (type == "trie") { + benchmark_locate_prefix( + params, dict, max_num_queries, keep, result); + } else if (type == "fc") { + benchmark_locate_prefix( + params, dict, max_num_queries, keep, result); + } else { + return 1; + } + + result.print(); + return 0; +} \ No newline at end of file diff --git a/include/completion_trie.hpp b/include/completion_trie.hpp index 8ae9036..3d52ee5 100644 --- 
a/include/completion_trie.hpp +++ b/include/completion_trie.hpp @@ -166,16 +166,16 @@ struct completion_trie { completion_trie() {} // If the last token of the query is not completely specified, - // then we search for its lexicographic range among the children of c. + // then we search for its lexicographic range among the children of prefix. // Return [a,b) - range locate_prefix(completion_type const& c, + range locate_prefix(completion_type const& prefix, range suffix_lex_range) const { range r{global::not_found, global::not_found}; range pointer{0, m_nodes.front().size()}; uint32_t i = 0; - for (; i < c.size(); ++i) { - uint64_t pos = m_nodes[i].find(pointer, c[i]); + for (; i < prefix.size(); ++i) { + uint64_t pos = m_nodes[i].find(pointer, prefix[i]); if (pos == global::not_found) return global::invalid_range; pointer = m_pointers[i][pos]; } diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/script/collect_locate_prefix_results_by_varying_percentage.py new file mode 100644 index 0000000..889fa94 --- /dev/null +++ b/script/collect_locate_prefix_results_by_varying_percentage.py @@ -0,0 +1,15 @@ +import sys, os + +type = sys.argv[1] # 'trie' or 'fc' +collection_basename = sys.argv[2] +dataset_name = sys.argv[3] +num_queries = sys.argv[4] + +output_filename = dataset_name + "." 
+ type + ".locate_prefix.timings.json" + +percentages = ["0.0", "0.25", "0.50", "0.75"] + +for perc in percentages: + for terms in range(2,8): # (1,8) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_topk_results_by_varying_percentage.py similarity index 100% rename from script/collect_results_by_varying_percentage.py rename to script/collect_topk_results_by_varying_percentage.py From 3ce021ff7c35ffa1a338ff2a8207336bfc7a0a7e Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 10:29:18 +0200 Subject: [PATCH 006/102] script for benchmarking locate_prefix --- ...ate_prefix_results_by_varying_percentage.py | 2 +- script/collect_results.py | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 script/collect_results.py diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/script/collect_locate_prefix_results_by_varying_percentage.py index 889fa94..e9142d9 100644 --- a/script/collect_locate_prefix_results_by_varying_percentage.py +++ b/script/collect_locate_prefix_results_by_varying_percentage.py @@ -10,6 +10,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(2,8): # (1,8) + for terms in range(1,8): os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + 
".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) diff --git a/script/collect_results.py b/script/collect_results.py deleted file mode 100644 index 9d0dd22..0000000 --- a/script/collect_results.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys, os - -type = sys.argv[1] -exe = sys.argv[2] # prefix_top, conjunctive_topk, topk -dataset_name = sys.argv[3] -k = sys.argv[4] -num_queries = sys.argv[5] -collect_breakdowns = int(sys.argv[6]) # 0 or 1 - -breakdown = "" -if collect_breakdowns != 0: - breakdown = "--breakdown" - -output_filename = dataset_name + "." + exe + ".timings.json" - -for i in range(1, 8): - os.system("../build/benchmark_" + exe + " " + type + " " + k + " ../build/" + dataset_name + ".bin " + str(i) + " " + str(num_queries) + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(i) + ".shuffled 2>> " + output_filename) -os.system("../build/benchmark_" + exe + " " + type + " " + k + " ../build/" + dataset_name + ".bin 8+ " + str(num_queries) + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) From ebe971c09cb5c7026d3187292eda31ee4b4dc016 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 12:06:44 +0200 Subject: [PATCH 007/102] typo in README --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6c97ea8..d222323 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ For a testing environment, use the following instead: $ cd debug_build $ cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On $ make - + Input data format ----------------- @@ -119,28 +119,28 @@ The scripts in the directory `test_data` help in preparing the datasets for indexing: 1. The command - + $ python extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - + extract the dictionary from a file listing all completions in textual form. 2. The command $ python map_dataset.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - + maps strings to integer ids. 3. The command $ python build_stats.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions.mapped - + calulcates the dataset statistics. 4. The command $ python build_inverted_and_forward.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - + builds the inverted and forward files. If you run the scripts in the reported order, you will get: @@ -165,7 +165,7 @@ the data structures more efficiently. - `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. Building an index ------------ +----------- After compiling the code, run the program `./build` to build an index. You can specify the type of the index and the name of the file where the index will be written. @@ -173,7 +173,7 @@ where the index will be written. For example, with $ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin - + we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`. Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`. @@ -202,8 +202,8 @@ of the prefix of the last token is retained. 
We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run - $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 - + $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type1.bin trec_05_efficiency_queries 10 5000 + You can also specify the option `--breakdown` to record timings breakdowns. Live demo From 3e072c6ee8bdfb5062d5b2ad7b009629dead9416 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Tue, 22 Oct 2019 16:45:21 +0200 Subject: [PATCH 008/102] Add driver for scripts --- test_data/preprocess.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100755 test_data/preprocess.sh diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh new file mode 100755 index 0000000..ab4dbeb --- /dev/null +++ b/test_data/preprocess.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +collections=`find . 
| grep "\\.completions$"` + +for collection in $collections; do + echo $collection + python extract_dict.py $collection + python map_dataset.py $collection + python build_stats.py $collection.mapped + python build_inverted_and_forward.py $collection +done From 74e6a3c8cc2b1e1861cf95331a193bfb124ec527 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Tue, 22 Oct 2019 18:20:33 +0200 Subject: [PATCH 009/102] Removing utf-8 encoding --- test_data/build_inverted_and_forward.py | 2 +- test_data/build_stats.py | 3 ++- test_data/extract_dict.py | 4 ++-- test_data/map_dataset.py | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 743b491..c47ea17 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -36,7 +36,7 @@ discard = False for i in range(1, len(x)): try: - term = x[i].encode('utf-8') + term = x[i] try: term_id = tokens[term] if term_id not in mapped: diff --git a/test_data/build_stats.py b/test_data/build_stats.py index f9923f0..5fdfdb7 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -35,4 +35,5 @@ output_file.write(str(len(nodes_per_level)) + "\n") for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") -output_file.close() \ No newline at end of file +output_file.close() + diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index 875f85b..0672351 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -21,5 +21,5 @@ dict_file = open(input_filename + ".dict", 'w') for key in sorted(tokens): - dict_file.write(key.encode('utf-8') + "\n") -dict_file.close() \ No newline at end of file + dict_file.write(key + "\n") +dict_file.close() diff --git a/test_data/map_dataset.py b/test_data/map_dataset.py index 86e6357..beb7155 100644 --- a/test_data/map_dataset.py +++ b/test_data/map_dataset.py @@ -24,7 +24,7 @@ 
string_len = 0; mapped = [x[0]] for i in range(1, len(x)): # x[0] stores the docID - t = x[i].encode('utf-8') + t = x[i] try: id = tokens[t] mapped.append(id) @@ -48,4 +48,4 @@ stats_file.write(str(len(tokens)) + "\n") stats_file.write(str(max_string_len) + "\n") -stats_file.close() \ No newline at end of file +stats_file.close() From 2c0b0debb906a108504060f9d7a79126a4191755 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 19:29:16 +0200 Subject: [PATCH 010/102] automated testing with doctest --- .gitmodules | 3 + CMakeLists.txt | 9 +- external/CMakeLists.txt | 5 +- external/doctest | 1 + include/integer_fc_dictionary.hpp | 32 +++- test/CMakeLists.txt | 9 +- test/test_common.hpp | 54 ++++++ test/test_completion_trie.cpp | 109 +++--------- test/test_fc_dictionary.cpp | 209 +++++++----------------- test/test_integer_fc_dictionary.cpp | 177 +++++--------------- test/test_locate_prefix.cpp | 131 +++++---------- test_data/build_inverted_and_forward.py | 2 +- test_data/extract_dict.py | 2 +- test_data/map_dataset.py | 2 +- 14 files changed, 273 insertions(+), 472 deletions(-) create mode 160000 external/doctest create mode 100644 test/test_common.hpp diff --git a/.gitmodules b/.gitmodules index 72f21cd..60c5af2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "external/mongoose"] path = external/mongoose url = https://github.com/cesanta/mongoose.git +[submodule "external/doctest"] + path = external/doctest + url = https://github.com/onqtam/doctest.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c90e49..1b2fa97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,4 +50,11 @@ include_directories(${AUTOCOMPLETE_SOURCE_DIR}/include) add_subdirectory(external) add_subdirectory(src) add_subdirectory(benchmark) -add_subdirectory(test) \ No newline at end of file + +enable_testing() +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp 
test/test_integer_fc_dictionary.cpp) +foreach(TEST_SRC ${TEST_SOURCES}) + get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension + add_executable(${TEST_SRC_NAME} ${TEST_SRC}) + add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME}) +endforeach(TEST_SRC) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index d4722aa..5d0ee92 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -1 +1,4 @@ -include_directories(essentials/include) \ No newline at end of file +include_directories(essentials/include) + +set(DOCTEST_INCLUDE_DIR ${AUTOCOMPLETE_SOURCE_DIR}/external/doctest) +include_directories(${DOCTEST_INCLUDE_DIR}) \ No newline at end of file diff --git a/external/doctest b/external/doctest new file mode 160000 index 0000000..7ac22cc --- /dev/null +++ b/external/doctest @@ -0,0 +1 @@ +Subproject commit 7ac22cc2190eb090ff66509015fb2d995bce957e diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index 218cacf..e0b228b 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -166,8 +166,9 @@ struct integer_fc_dictionary { prefix.push_back(global::invalid_term_id); } - locate_bucket(completion_to_uint32_range(prefix), h_end, bucket_id_end, - bucket_id_begin // hint + locate_right_bucket(completion_to_uint32_range(prefix), h_end, + bucket_id_end, + bucket_id_begin // hint ); uint32_t p_end = bucket_id_end * (BucketSize + 1); p_end += right_locate(completion_to_uint32_range(prefix), h_end, @@ -276,6 +277,33 @@ struct integer_fc_dictionary { return false; } + void locate_right_bucket(uint32_range t, uint32_range& h, + id_type& bucket_id, + int lower_bound_hint = 0) const { + int lo = lower_bound_hint, hi = buckets() - 1, mi = 0, cmp = 0; + size_t n = t.end - t.begin; + while (lo <= hi) { + mi = (lo + hi) / 2; + h = header(mi); + cmp = uint32_range_compare(h, t, n); + if (cmp > 0) { + hi = mi - 1; + } else if (cmp < 0) { + lo = mi + 1; + } else { + bucket_id = mi; + 
return; + } + } + + if (cmp < 0) { + bucket_id = mi; + } else { + bucket_id = mi - 1; + h = header(bucket_id); + } + } + #define INT_FC_DICT_LOCATE_INIT \ static uint32_t decoded[2 * constants::MAX_NUM_TERMS_PER_QUERY]; \ memcpy(decoded, h.begin, (h.end - h.begin) * sizeof(uint32_t)); \ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0687354..4d62c01 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,10 +1,11 @@ -add_executable(test_completion_trie test_completion_trie.cpp) -add_executable(test_fc_dictionary test_fc_dictionary.cpp) -add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp) +# add_executable(test_completion_trie test_completion_trie.cpp) +# add_executable(test_fc_dictionary test_fc_dictionary.cpp) +# add_executable(test_locate_prefix test_locate_prefix.cpp) +# add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp) + add_executable(test_cartesian_tree test_cartesian_tree.cpp) add_executable(test_inverted_index test_inverted_index.cpp) add_executable(test_forward_index test_forward_index.cpp) add_executable(test_unsorted_list test_unsorted_list.cpp) add_executable(test_autocomplete test_autocomplete.cpp) -add_executable(test_locate_prefix test_locate_prefix.cpp) add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_common.hpp b/test/test_common.hpp new file mode 100644 index 0000000..0bc701a --- /dev/null +++ b/test/test_common.hpp @@ -0,0 +1,54 @@ +#pragma once + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include "../external/doctest/doctest/doctest.h" + +#include + +#include "types.hpp" +#include "../benchmark/benchmark_common.hpp" + +namespace autocomplete { +namespace testing { + +static std::string test_filename( + "../test_data/trec_05_efficiency_queries/" + "trec_05_efficiency_queries.completions"); + +static std::string tmp_filename("tmp.bin"); + +id_type locate(std::vector const& terms, std::string const& 
t) { + return std::distance(terms.begin(), + std::lower_bound(terms.begin(), terms.end(), t)) + + 1; +} + +range locate_prefix(std::vector const& strings, + std::string const& p) { + auto comp_l = [](std::string const& l, std::string const& r) { + if (l.size() < r.size()) { + return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; + } + return strcmp(l.c_str(), r.c_str()) < 0; + }; + + auto comp_r = [](std::string const& l, std::string const& r) { + if (l.size() < r.size()) { + return strncmp(l.c_str(), r.c_str(), l.size()) < 0; + } + return strcmp(l.c_str(), r.c_str()) < 0; + }; + + range r; + r.begin = std::distance( + strings.begin(), + std::lower_bound(strings.begin(), strings.end(), p, comp_l)); + r.end = std::distance( + strings.begin(), + std::upper_bound(strings.begin(), strings.end(), p, comp_r)); + + return r; +} + +} // namespace testing +} // namespace autocomplete \ No newline at end of file diff --git a/test/test_completion_trie.cpp b/test/test_completion_trie.cpp index 1aba989..c5155e1 100644 --- a/test/test_completion_trie.cpp +++ b/test/test_completion_trie.cpp @@ -1,106 +1,37 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" +#include "test_common.hpp" using namespace autocomplete; -struct completion_comparator { - bool operator()(completion_type const& lhs, - completion_type const& rhs) const { - size_t l = 0; // |lcp(lhs,rhs)| - while (l < lhs.size() - 1 and l < rhs.size() - 1 and lhs[l] == rhs[l]) { - ++l; - } - return lhs[l] < rhs[l]; - } -}; - -range locate_prefix(std::vector const& completions, - completion_type const& c) { - completion_comparator comp; - auto b = std::lower_bound(completions.begin(), completions.end(), c, comp); - uint64_t begin = std::distance(completions.begin(), b); - auto e = std::upper_bound(completions.begin() + begin, completions.end(), c, - comp); - uint64_t end = std::distance(completions.begin(), e); - return {begin, end}; -} - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < 
mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } +typedef ef_completion_trie completion_trie_type; +TEST_CASE("test completion_trie::is_member()") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); - // typedef uint64_completion_trie completion_trie_type; - typedef ef_completion_trie completion_trie_type; - { completion_trie_type::builder builder(params); completion_trie_type ct; builder.build(ct); - ct.print_stats(); - - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(ct, output_filename); - essentials::logger("DONE"); - } + REQUIRE(ct.size() == params.num_completions); + essentials::save(ct, output_filename); } { - if (output_filename) { - completion_trie_type ct; - essentials::logger("loading data structure from disk..."); - essentials::load(ct, output_filename); - essentials::logger("DONE"); - // essentials::print_size(ct); - std::cout << "using " << ct.bytes() << " bytes" << std::endl; - - std::vector completions; - completions.reserve(params.num_completions); - std::ifstream input(params.collection_basename + ".mapped", - std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not found"); - } - - completion_iterator it(params, input); - while (input) { - auto& record = *it; - completions.push_back(std::move(record.completion)); - ++it; - } - input.close(); - - // check all completions - essentials::logger("testing is_member()..."); - for (auto const& c : completions) { - if (!ct.is_member(c)) { - print_completion(c); - std::cout << " not found!" 
<< std::endl; - return 1; - } - } - essentials::logger("DONE..."); + completion_trie_type ct; + essentials::load(ct, output_filename); + REQUIRE(ct.size() == params.num_completions); + std::ifstream input(params.collection_basename + ".mapped", + std::ios_base::in); + INFO("testing is_member()"); + completion_iterator it(params, input); + while (input) { + auto& record = *it; + REQUIRE(ct.is_member(record.completion)); + ++it; } + input.close(); + std::remove(output_filename); } - - return 0; } diff --git a/test/test_fc_dictionary.cpp b/test/test_fc_dictionary.cpp index 3f79d1e..50d12b0 100644 --- a/test/test_fc_dictionary.cpp +++ b/test/test_fc_dictionary.cpp @@ -1,175 +1,86 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -id_type locate(std::vector const& terms, std::string const& t) { - return std::distance(terms.begin(), - std::lower_bound(terms.begin(), terms.end(), t)) + - 1; -} - -range locate_prefix(std::vector const& terms, - std::string const& p) { - auto comp_l = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - auto comp_r = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) < 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - range r; - r.begin = std::distance( - terms.begin(), std::lower_bound(terms.begin(), terms.end(), p, comp_l)); - r.end = - std::distance(terms.begin(), - std::upper_bound(terms.begin(), terms.end(), p, comp_r)) - - 1; - - return r; -} - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } 
- +TEST_CASE("test fc_dictionary") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); { - // build, print and write fc_dictionary_type::builder builder(params); fc_dictionary_type dict; builder.build(dict); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - if (output_filename) { - // essentials::print_size(dict); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename); - essentials::logger("DONE"); - } + essentials::save(dict, output_filename); } { - if (output_filename) { - fc_dictionary_type dict; - essentials::logger("loading data structure from disk..."); - essentials::load(dict, output_filename); - essentials::logger("DONE"); - // essentials::print_size(dict); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - // test locate() and extract for all strings - std::vector terms; - terms.reserve(params.num_terms); - std::ifstream input((params.collection_basename + ".dict").c_str(), - std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not found"); - } - std::string term; - term.reserve(256 + 1); + fc_dictionary_type dict; + essentials::load(dict, output_filename); + + // test locate() and extract for all strings + std::vector terms; + terms.reserve(params.num_terms); + std::ifstream input((params.collection_basename + ".dict").c_str(), + std::ios_base::in); + if (!input.good()) { + throw std::runtime_error("File not found"); + } + std::string term; + term.reserve(256 + 1); + input >> term; + while (input) { + terms.push_back(std::move(term)); input >> term; - while (input) { - terms.push_back(std::move(term)); - input >> term; - } - input.close(); - - std::cout << "terms.size() " << terms.size() << std::endl; - - std::vector decoded(2 * - constants::MAX_NUM_CHARS_PER_QUERY); - - for (auto const& t : terms) { - 
id_type expected = locate(terms, t); - id_type got = dict.locate(string_to_byte_range(t)); + } + input.close(); - std::cout << "locating term '" << t << "'" << std::endl; - if (got != expected) { - std::cout << "Error: expected id " << expected << "," - << " but got id " << got << std::endl; - return 1; - } + std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); - std::cout << "extracting term '" << t << "'" << std::endl; - uint8_t string_len = dict.extract(got, decoded.data()); + for (auto const& t : terms) { + id_type expected = testing::locate(terms, t); + id_type got = dict.locate(string_to_byte_range(t)); - if (string_len != t.size()) { - std::cout << "Error: expected size " << t.size() << "," - << " but got size " << string_len << std::endl; - return 1; - } + REQUIRE_MESSAGE(got == expected, "expected id " << expected + << ", but got id " + << got); - auto s = reinterpret_cast(decoded.data()); - for (uint8_t i = 0; i != string_len; ++i) { - if (t[i] != s[i]) { - std::cout << "Error: expected char " << t[i] - << " but got " << s[i] << std::endl; - return 1; - } - } + uint8_t string_len = dict.extract(got, decoded.data()); + REQUIRE_MESSAGE(string_len == t.size(), + "expected size " << t.size() << ", but got size " + << string_len); - std::cout << "lexicographic id of '" << t << "' is " << got - << std::endl; + auto s = reinterpret_cast(decoded.data()); + for (uint8_t i = 0; i != string_len; ++i) { + REQUIRE_MESSAGE(t[i] == s[i], "expected char " << t[i] + << " but got " + << s[i]); } + } - // test locate_prefix() for all strings - std::string prefix; - prefix.reserve(256 + 1); - for (auto const& t : terms) { - uint32_t n = t.size(); - for (uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) { - prefix.clear(); - for (uint32_t i = 0; i != prefix_len; ++i) { - prefix.push_back(t[i]); - } - - std::cout << "locating prefix '" << prefix << "'" - << std::endl; - range expected = locate_prefix(terms, prefix); - range got = - 
dict.locate_prefix(string_to_byte_range(prefix)); - - if ((got.begin != expected.begin) or - (got.end != expected.end)) { - std::cout << "Error for prefix '" << prefix - << "' : expected [" << expected.begin << "," - << expected.end << "] but got [" << got.begin - << "," << got.end << "]" << std::endl; - return 1; - } - - std::cout << "prefix range of '" << prefix << "' is [" - << got.begin << "," << got.end << "]" - << std::endl; + // test locate_prefix() for all strings + std::string prefix; + prefix.reserve(256 + 1); + for (auto const& t : terms) { + uint32_t n = t.size(); + for (uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) { + prefix.clear(); + for (uint32_t i = 0; i != prefix_len; ++i) { + prefix.push_back(t[i]); } + + range expected = testing::locate_prefix(terms, prefix); + range got = dict.locate_prefix(string_to_byte_range(prefix)); + REQUIRE_MESSAGE((got.begin == expected.begin and + got.end == expected.end - 1), + "Error for prefix '" + << prefix << "' : expected [" + << expected.begin << "," << expected.end - 1 + << "] but got [" << got.begin << "," + << got.end << "]"); } } + std::remove(output_filename); } - - return 0; } diff --git a/test/test_integer_fc_dictionary.cpp b/test/test_integer_fc_dictionary.cpp index 4f78052..b67879d 100644 --- a/test/test_integer_fc_dictionary.cpp +++ b/test/test_integer_fc_dictionary.cpp @@ -1,155 +1,62 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } - +TEST_CASE("test integer_fc_dictionary") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + 
params.collection_basename = testing::test_filename.c_str(); params.load(); { - // build, print and write integer_fc_dictionary_type::builder builder(params); integer_fc_dictionary_type dict; builder.build(dict); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - if (output_filename) { - // essentials::print_size(dict); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename); - essentials::logger("DONE"); - } + essentials::save(dict, output_filename); } { - if (output_filename) { - integer_fc_dictionary_type dict; - essentials::logger("loading data structure from disk..."); - essentials::load(dict, output_filename); - essentials::logger("DONE"); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - { - essentials::logger("testing extract() and locate()..."); - std::ifstream input( - (params.collection_basename + ".mapped").c_str(), - std::ios_base::in); - completion_iterator it(params, input); - - completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY); - for (id_type id = 0; id != params.num_completions; ++id, ++it) { - auto const& expected = (*it).completion; - assert(expected.size() > 0); - uint8_t size = dict.extract(id, decoded); - if (expected.size() - 1 != size) { - std::cout << "Error in decoding the " << id - << "-th string: expected size " - << expected.size() - 1 << "," - << " but got size " << int(size) << std::endl; - return 1; - } - - for (uint8_t i = 0; i != size; ++i) { - if (decoded[i] != expected[i]) { - std::cout - << "Error in decoding the " << id - << "-th string: expected " << expected[i] << "," - << " but got " << decoded[i] << " at position " - << int(i) << std::endl; - return 1; - } - } - - id_type got_id = - dict.locate({decoded.data(), decoded.data() + size}); - if (got_id != id) { - std::cout << "Error in locating the " << id - << "-th string: expected id " << id << "," - << " but got id " << got_id << std::endl; - return 1; - } + 
integer_fc_dictionary_type dict; + essentials::load(dict, output_filename); + + { + std::ifstream input( + (params.collection_basename + ".mapped").c_str(), + std::ios_base::in); + completion_iterator it(params, input); + + completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY); + for (id_type id = 0; id != params.num_completions; ++id, ++it) { + auto const& expected = (*it).completion; + REQUIRE(expected.size() > 0); + uint8_t size = dict.extract(id, decoded); + + REQUIRE_MESSAGE(expected.size() - 1 == size, + "Error in decoding the " + << id << "-th string: expected size " + << expected.size() - 1 << "," + << " but got size " << int(size)); + + for (uint8_t i = 0; i != size; ++i) { + REQUIRE_MESSAGE(decoded[i] == expected[i], + "Error in decoding the " + << id << "-th string: expected " + << expected[i] << "," + << " but got " << decoded[i] + << " at position " << int(i)); } - input.close(); - essentials::logger("it's all good"); + id_type got_id = + dict.locate({decoded.data(), decoded.data() + size}); + REQUIRE_MESSAGE(got_id == id, "Error in locating the " + << id + << "-th string: expected id " + << id << "," + << " but got id " << got_id); } - // { - // uint64_completion_trie::builder builder(params); - // uint64_completion_trie ct; - // builder.build(ct); - // std::cout << "using " << ct.bytes() << " bytes" << std::endl; - - // essentials::logger("testing locate_prefix()..."); - - // std::ifstream input( - // (params.collection_basename + ".mapped").c_str(), - // std::ios_base::in); - // completion_iterator it(params, input); - - // uint32_t num_checks = - // std::min(params.num_completions, 30000); - - // completion_type prefix; - // for (uint32_t i = 0; i != num_checks; ++i, ++it) { - // auto const& expected = (*it).completion; - // assert(expected.size() > 0); - - // for (uint32_t prefix_len = 1; - // prefix_len <= expected.size() - 1; ++prefix_len) { - // prefix.clear(); - // for (uint32_t i = 0; i != prefix_len; ++i) { - // 
prefix.push_back(expected[i]); - // } - - // range expected = ct.locate_prefix(prefix); - // range got = dict.locate_prefix( - // completion_to_uint32_range(prefix)); - - // if ((got.begin != expected.begin) or - // (got.end != expected.end - 1)) { - // std::cout << "Error for prefix "; - // print_completion(prefix); - // std::cout << ": expected [" << expected.begin << - // "," - // << expected.end - 1 << "] but got [" - // << got.begin << "," << got.end << "]" - // << std::endl; - // return 1; - // } - - // // std::cout << "prefix range of "; - // // print_completion(prefix); - // // std::cout << " is [" << got.begin << "," << - // got.end - // // << "]" << std::endl; - // } - // } - - // input.close(); - // essentials::logger("it's all good"); - // } + input.close(); } + std::remove(output_filename); } - - return 0; } diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index fd3dcb4..8938965 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -1,81 +1,35 @@ -#include - -#include "types.hpp" -#include "../benchmark/benchmark_common.hpp" +#include "test_common.hpp" using namespace autocomplete; -range locate_prefix(std::vector const& strings, - std::string const& p) { - auto comp_l = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - auto comp_r = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) < 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - range r; - r.begin = std::distance( - strings.begin(), - std::lower_bound(strings.begin(), strings.end(), p, comp_l)); - r.end = std::distance( - strings.begin(), - std::upper_bound(strings.begin(), strings.end(), p, comp_r)); - - return r; -} +typedef ef_completion_trie completion_trie_type; template -int test_locate_prefix(Dictionary const& dict, Index 
const& index, - std::vector const& queries, - std::vector const& strings) { +void test_locate_prefix(Dictionary const& dict, Index const& index, + std::vector const& queries, + std::vector const& strings) { for (auto const& query : queries) { - std::string query_copy = query; - range expected = locate_prefix(strings, query); - - // std::cout << "query: '" << query << "'" << std::endl; + range expected = testing::locate_prefix(strings, query); completion_type prefix; byte_range suffix; - parse(dict, query_copy, prefix, suffix); - - // print_completion(prefix); - // std::cout << std::endl; - // print(suffix); - // std::cout << std::endl; + parse(dict, query, prefix, suffix); range suffix_lex_range = dict.locate_prefix(suffix); suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range got = index.locate_prefix(prefix, suffix_lex_range); - if ((got.begin != expected.begin) or (got.end != expected.end)) { - std::cout << "Error for query '" << query << "': "; - std::cout << "expected [" << expected.begin << "," << expected.end - << ") but got [" << got.begin << "," << got.end << ")" - << std::endl; - return 1; - } + REQUIRE_MESSAGE( + (got.begin == expected.begin and got.end == expected.end), + "Error for query '" << query << "': expected [" << expected.begin + << "," << expected.end << ") but got [" + << got.begin << "," << got.end << ")"); } - - return 0; } -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " " << std::endl; - return 1; - } - +TEST_CASE("test locate_prefix()") { parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); fc_dictionary_type dict; @@ -102,40 +56,41 @@ int main(int argc, char** argv) { " strings"); } - uint32_t max_num_queries = std::atoi(argv[2]); + constexpr uint32_t max_num_queries = 5000; std::vector queries; - essentials::logger("loading queries..."); - uint32_t num_queries = - 
load_queries(queries, max_num_queries, true, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; + static std::vector query_terms = {1, 2, 3, 4, 5, 6, 7}; - { - // typedef uint64_completion_trie completion_trie_type; - typedef ef_completion_trie completion_trie_type; + completion_trie_type ct_index; + integer_fc_dictionary_type fc_index; - completion_trie_type index; - { - completion_trie_type::builder builder(params); - builder.build(index); - } - essentials::logger("testing locate_prefix() for completion_trie..."); - int ret = test_locate_prefix(dict, index, queries, strings); - if (ret) return 1; - essentials::logger("it's all good"); + { + completion_trie_type::builder builder(params); + builder.build(ct_index); + REQUIRE(ct_index.size() == params.num_completions); } { - integer_fc_dictionary_type index; - { - integer_fc_dictionary_type::builder builder(params); - builder.build(index); - } - essentials::logger( - "testing locate_prefix() for integer_fc_dictionary..."); - int ret = test_locate_prefix(dict, index, queries, strings); - if (ret) return 1; - essentials::logger("it's all good"); + integer_fc_dictionary_type::builder builder(params); + builder.build(fc_index); + REQUIRE(fc_index.size() == params.num_completions); } - return 0; + for (auto perc : percentages) { + for (auto num_terms : query_terms) { + std::cout << "percentage " << perc * 100.0 << "%, num_terms " + << num_terms << std::endl; + { + queries.clear(); + std::ifstream querylog((params.collection_basename + + ".length=" + std::to_string(num_terms)) + .c_str()); + load_queries(queries, max_num_queries, perc, querylog); + querylog.close(); + } + + test_locate_prefix(dict, ct_index, queries, strings); + test_locate_prefix(dict, fc_index, queries, strings); + } + } } diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 743b491..c47ea17 100644 --- 
a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -36,7 +36,7 @@ discard = False for i in range(1, len(x)): try: - term = x[i].encode('utf-8') + term = x[i] try: term_id = tokens[term] if term_id not in mapped: diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index 875f85b..e3c05b5 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -21,5 +21,5 @@ dict_file = open(input_filename + ".dict", 'w') for key in sorted(tokens): - dict_file.write(key.encode('utf-8') + "\n") + dict_file.write(key + "\n") dict_file.close() \ No newline at end of file diff --git a/test_data/map_dataset.py b/test_data/map_dataset.py index 86e6357..1a8fd13 100644 --- a/test_data/map_dataset.py +++ b/test_data/map_dataset.py @@ -24,7 +24,7 @@ string_len = 0; mapped = [x[0]] for i in range(1, len(x)): # x[0] stores the docID - t = x[i].encode('utf-8') + t = x[i] try: id = tokens[t] mapped.append(id) From 5afbdf530d9c1ce767c777ad84cf875299ce8896 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 20:33:12 +0200 Subject: [PATCH 011/102] automated testing with doctest --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 7 --- test/test_cartesian_tree.cpp | 83 -------------------------- test/test_unsorted_list.cpp | 110 ++++++++++------------------------- 4 files changed, 32 insertions(+), 170 deletions(-) delete mode 100644 test/test_cartesian_tree.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2fa97..5b89fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp) +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp) 
foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} ${TEST_SRC}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4d62c01..c220919 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,11 +1,4 @@ -# add_executable(test_completion_trie test_completion_trie.cpp) -# add_executable(test_fc_dictionary test_fc_dictionary.cpp) -# add_executable(test_locate_prefix test_locate_prefix.cpp) -# add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp) - -add_executable(test_cartesian_tree test_cartesian_tree.cpp) add_executable(test_inverted_index test_inverted_index.cpp) add_executable(test_forward_index test_forward_index.cpp) -add_executable(test_unsorted_list test_unsorted_list.cpp) add_executable(test_autocomplete test_autocomplete.cpp) add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_cartesian_tree.cpp b/test/test_cartesian_tree.cpp deleted file mode 100644 index 0c4fd38..0000000 --- a/test/test_cartesian_tree.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include -#include - -#include "types.hpp" - -using namespace autocomplete; - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } - - parameters params; - params.collection_basename = argv[1]; - params.load(); - - { - // build and write - - // std::vector doc_ids = {23, 2, 4, 0, 88, 23, 2, 4, 55, 3, - // 7, 6, 90, 34, 2, 3, 1, 12, 23}; - - std::vector doc_ids; - doc_ids.reserve(params.num_completions); - std::ifstream input(params.collection_basename + ".mapped", - std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not 
found"); - } - completion_iterator it(params, input); - while (input) { - auto const& record = *it; - doc_ids.push_back(record.doc_id); - ++it; - } - input.close(); - - cartesian_tree rmq; - rmq.build(doc_ids, std::less()); - assert(rmq.size() == doc_ids.size()); - std::cout << "using " << rmq.bytes() << " bytes" << std::endl; - - if (output_filename) { - // essentials::print_size(rmq); - essentials::logger("saving data structure to disk..."); - essentials::save(rmq, output_filename); - essentials::logger("DONE"); - } - } - - { - // load and print - if (output_filename) { - cartesian_tree rmq; - essentials::logger("loading data structure from disk..."); - essentials::load(rmq, output_filename); - essentials::logger("DONE"); - - std::cout << "using " << rmq.bytes() << " bytes" << std::endl; - - for (size_t i = 0; i != rmq.size(); ++i) { - for (size_t j = i; j != rmq.size(); ++j) { - std::cout << "rmq[" << i << "," << j - << "] = " << rmq.rmq(i, j) << std::endl; - } - } - } - } - - return 0; -} diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 9b9b000..44abc5e 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -1,15 +1,7 @@ -#include -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -static const uint32_t max_k = 15; -static const uint32_t k = 10; -static_assert(k <= max_k, "k must be less than max allowed"); -static const uint32_t num_queries = 10000; - std::vector naive_topk(std::vector const& input, range r, uint32_t k) { uint32_t range_len = r.end - r.begin; @@ -41,37 +33,22 @@ std::vector gen_random_queries(uint32_t num_queries, return queries; } -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; 
- } - } - +TEST_CASE("test unsorted_list") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); + static const uint32_t k = 10; + static_assert(k <= constants::MAX_K, "k must be less than max allowed"); + static const uint32_t num_queries = 5000; + std::vector doc_ids; { - // build and write doc_ids.reserve(params.num_completions); std::ifstream input(params.collection_basename + ".mapped", std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not found"); - } completion_iterator it(params, input); while (input) { auto const& record = *it; @@ -85,64 +62,39 @@ int main(int argc, char** argv) { std::vector tmp = doc_ids; std::sort(tmp.begin(), tmp.end()); for (id_type id = 0; id != doc_ids.size(); ++id) { - if (tmp[id] != id) { - std::cout << "Error: id " << id << " not found" - << std::endl; - return 1; - } + REQUIRE_MESSAGE(tmp[id] == id, + "Error: id " << id << " not found"); } } succinct_rmq list; list.build(doc_ids); - assert(list.size() == doc_ids.size()); - std::cout << "using " << list.bytes() << " bytes" << std::endl; + REQUIRE(list.size() == doc_ids.size()); - if (output_filename) { - // essentials::print_size(list); - essentials::logger("saving data structure to disk..."); - essentials::save(list, output_filename); - essentials::logger("DONE"); - } + essentials::save(list, output_filename); } { - if (output_filename) { - succinct_rmq list; - essentials::logger("loading data structure from disk..."); - essentials::load(list, output_filename); - essentials::logger("DONE"); - - std::cout << "using " << list.bytes() << " bytes" << std::endl; - - std::vector topk(max_k); - auto queries = gen_random_queries(num_queries, doc_ids.size()); - std::cout << "testing top-" << k << " " << num_queries - << " random queries..." 
<< std::endl; - - for (auto q : queries) { - auto expected = naive_topk(doc_ids, q, k); - uint32_t num_elements = list.topk(q, k, topk); - - if (expected.size() != num_elements) { - std::cout << "Error: expected " << expected.size() - << " topk elements but got " << num_elements - << std::endl; - return 1; - } - - for (uint32_t i = 0; i != num_elements; ++i) { - if (topk[i] != expected[i]) { - std::cout << "Error: expected " << expected[i] - << " but got " << topk[i] << std::endl; - return 1; - } - } + succinct_rmq list; + essentials::load(list, output_filename); + + std::vector topk(constants::MAX_K); + auto queries = gen_random_queries(num_queries, doc_ids.size()); + + for (auto q : queries) { + auto expected = naive_topk(doc_ids, q, k); + uint32_t results = list.topk(q, k, topk); + REQUIRE_MESSAGE(expected.size() == results, + "Error: expected " << expected.size() + << " topk elements but got " + << results); + for (uint32_t i = 0; i != results; ++i) { + REQUIRE_MESSAGE(topk[i] == expected[i], + "Error: expected " << expected[i] << " but got " + << topk[i]); } - - std::cout << "it's all good" << std::endl; } - } - return 0; + std::remove(output_filename); + } } From f0eee6dfc70f52f5405b9dc324f37f91a918e3f1 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 20:34:23 +0200 Subject: [PATCH 012/102] empty todo --- TODO.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/TODO.md b/TODO.md index 082ced9..e69de29 100644 --- a/TODO.md +++ b/TODO.md @@ -1,2 +0,0 @@ - -- Study the effect of compression. From 48fab02dfa10f3136159b2fa7163a82bf12bf8f4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 20:46:54 +0200 Subject: [PATCH 013/102] README updated --- README.md | 52 +++++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index d222323..a117f62 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,10 @@ Query autocompletion in C++. 1. 
[Description](#descr) 2. [Compiling the code](#compiling) 3. [Input data format](#input) -4. [Building an index](#building) -4. [Benchmarks](#benchmarks) -5. [Live demo](#demo) +4. [Running the unit tests](#testing) +5. [Building an index](#building) +6. [Benchmarks](#benchmarks) +7. [Live demo](#demo) Description ----------- @@ -115,35 +116,14 @@ tokens separated by white spaces. fake, i.e., they do not take into account any particular assignment.) -The scripts in the directory `test_data` help in -preparing the datasets for indexing: +The script `preprocess.sh` in the directory `test_data` helps +in preparing the data for indexing. +Thus, from within the directory `test_data`, it is sufficient +to do: + + $ bash preprocess.sh -1. The command - - $ python extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - - extract the dictionary -from a file listing all completions in textual form. - -2. The command - - $ python map_dataset.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - - maps strings to integer ids. - -3. The command - - $ python build_stats.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions.mapped - - calulcates the dataset statistics. - -4. The command - - $ python build_inverted_and_forward.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - - builds the inverted and forward files. - -If you run the scripts in the reported order, you will get: +If you run the script, you will get: - `trec_05_efficiency_queries.completions.dict`: lists all the distinct tokens in the completions sorted in lexicographical @@ -164,6 +144,16 @@ the data structures more efficiently. - `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. 
+Running the unit tests +----------- + +The unit tests are written using [doctest](https://github.com/onqtam/doctest). + +After compilation and preparation of the data for indexing (see Section [Input data format](#input)), it is advised +to run the unit tests with: + + $ make test + Building an index ----------- From 2f70613697170ee08192f5073ae0a4490063e616 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 12:08:07 +0200 Subject: [PATCH 014/102] better testing --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 1 - test/test_inverted_index.cpp | 187 +++++++++++++++++------------------ test/test_unsorted_list.cpp | 90 +++++++++++++++-- 4 files changed, 173 insertions(+), 107 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b89fe7..3fa9125 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp) +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp) foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} ${TEST_SRC}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c220919..a78df87 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,3 @@ -add_executable(test_inverted_index test_inverted_index.cpp) add_executable(test_forward_index test_forward_index.cpp) add_executable(test_autocomplete test_autocomplete.cpp) add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_inverted_index.cpp 
b/test/test_inverted_index.cpp index 81f913e..aefdaae 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -1,127 +1,124 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; +typedef ef_inverted_index inverted_index_type; +typedef std::vector term_ids; + +std::vector gen_random_queries(uint32_t num_queries, + uint32_t max_num_terms, + uint32_t max_range_len) { + assert(max_num_terms > 1); + std::vector queries; + queries.reserve(num_queries); + essentials::uniform_int_rng random_num_terms(2, max_num_terms); + essentials::uniform_int_rng random_term_id(1, max_range_len); + + for (uint32_t i = 0; i != num_queries; ++i) { + term_ids q; + uint32_t num_terms = random_num_terms.gen(); + q.reserve(num_terms); + uint32_t num_distinct_terms = 0; + while (true) { + q.clear(); + for (uint32_t i = 0; i != num_terms; ++i) { + auto t = random_term_id.gen(); + assert(t >= 1 and t <= max_range_len); + q.push_back(t); + } + std::sort(q.begin(), q.end()); + auto end = std::unique(q.begin(), q.end()); + num_distinct_terms = std::distance(q.begin(), end); + if (num_distinct_terms >= 2) break; } + q.resize(num_distinct_terms); + queries.push_back(q); } + return queries; +} + +TEST_CASE("test inverted_index::intersection_iterator") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); - typedef ef_inverted_index inverted_index_type; - { - // build, print and write inverted_index_type::builder builder(params); inverted_index_type index; 
builder.build(index); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); - essentials::logger("DONE"); - } + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); } { - if (output_filename) { - inverted_index_type index; - essentials::logger("loading data structure from disk..."); - essentials::load(index, output_filename); - essentials::logger("DONE"); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - - std::vector intersection(index.num_docs()); // at most - std::vector term_ids; - term_ids.reserve(2); - - // id_type i = 293; - // id_type j = 294; - // id_type i = 899; - // id_type j = 822; - id_type i = 2401599 - 1; - id_type j = 1752198 - 1; - term_ids.push_back(i + 1); - term_ids.push_back(j + 1); - // uint64_t size = index.intersect(term_ids, intersection); + inverted_index_type index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + + static const uint32_t num_queries = 1000000; + static const uint32_t max_num_terms = 5; + auto queries = + gen_random_queries(num_queries, max_num_terms, index.num_terms()); + + std::vector first(index.num_docs()); + std::vector second(index.num_docs()); + std::vector intersection(index.num_docs()); + + for (auto const& q : queries) { + uint32_t first_size = 0; + uint32_t second_size = 0; + assert(q.size() >= 2); { - std::cout << "intersection between " << i << " and " << j - << " is: "; - uint32_t i = 0; - auto intersec_it = 
index.intersection_iterator(term_ids); - while (intersec_it.has_next()) { - id_type doc_id = *intersec_it; - std::cout << doc_id << " "; - ++i; - ++intersec_it; + auto it = index.iterator(q[0] - 1); + first_size = it.size(); + for (uint32_t i = 0; i != first_size; ++i) { + first[i] = it.access(i); } - std::cout << std::endl; } - std::vector a; { - auto it = index.iterator(i); - a.resize(it.size()); - for (uint32_t i = 0; i != a.size(); ++i) { - a[i] = it.access(i); + auto it = index.iterator(q[1] - 1); + second_size = it.size(); + for (uint32_t i = 0; i != second_size; ++i) { + second[i] = it.access(i); } } - std::vector b; - { - auto it = index.iterator(j); - b.resize(it.size()); - for (uint32_t i = 0; i != b.size(); ++i) { - b[i] = it.access(i); + auto end = std::set_intersection( + first.begin(), first.begin() + first_size, second.begin(), + second.begin() + second_size, intersection.begin()); + first_size = std::distance(intersection.begin(), end); + first.swap(intersection); + + for (uint32_t i = 2; i != q.size(); ++i) { + auto it = index.iterator(q[i] - 1); + second_size = it.size(); + for (uint32_t i = 0; i != second_size; ++i) { + second[i] = it.access(i); } + end = std::set_intersection( + first.begin(), first.begin() + first_size, second.begin(), + second.begin() + second_size, intersection.begin()); + first_size = std::distance(intersection.begin(), end); + first.swap(intersection); } - auto it = std::set_intersection(a.begin(), a.end(), b.begin(), - b.end(), intersection.begin()); - intersection.resize(it - intersection.begin()); - std::cout << "intersection between " << i << " and " << j - << " is: "; - for (auto x : intersection) { - std::cout << x << " "; + auto it = index.intersection_iterator(q); + uint32_t n = 0; + for (; it.has_next(); ++n, ++it) { + auto doc_id = *it; + REQUIRE_MESSAGE( + doc_id == first[n], + "expected doc_id " << first[n] << " but got " << doc_id); } - std::cout << std::endl; - - // for (uint32_t i = 1; i != index.num_terms() 
+ 1; ++i) { - // for (uint32_t j = i; j != index.num_terms() + 1; ++j) { - // term_ids.clear(); - // term_ids.push_back(i); - // term_ids.push_back(j); - // uint64_t size = index.intersect(term_ids, intersection); - // std::cout << "size of intersection between " << i << " - // and " - // << j << " is " << size << std::endl; - // } - // } + REQUIRE_MESSAGE(n == first_size, "expected " << first_size + << " results, but got " + << n); } } - - return 0; } diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 44abc5e..8e791bb 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -2,16 +2,21 @@ using namespace autocomplete; -std::vector naive_topk(std::vector const& input, range r, - uint32_t k) { +uint32_t naive_topk(std::vector const& input, range r, uint32_t k, + std::vector& topk, bool unique = false) { uint32_t range_len = r.end - r.begin; - std::vector topk(range_len); for (uint32_t i = 0; i != range_len; ++i) { topk[i] = input[r.begin + i]; } std::sort(topk.begin(), topk.begin() + range_len); - topk.resize(std::min(k, range_len)); - return topk; + uint32_t results = 0; + if (unique) { + auto end = std::unique(topk.begin(), topk.begin() + range_len); + results = std::min(k, std::distance(topk.begin(), end)); + } else { + results = std::min(k, range_len); + } + return results; } std::vector gen_random_queries(uint32_t num_queries, @@ -33,7 +38,7 @@ std::vector gen_random_queries(uint32_t num_queries, return queries; } -TEST_CASE("test unsorted_list") { +TEST_CASE("test unsorted_list on doc_ids") { char const* output_filename = testing::tmp_filename.c_str(); parameters params; params.collection_basename = testing::test_filename.c_str(); @@ -70,7 +75,6 @@ TEST_CASE("test unsorted_list") { succinct_rmq list; list.build(doc_ids); REQUIRE(list.size() == doc_ids.size()); - essentials::save(list, output_filename); } @@ -80,12 +84,13 @@ TEST_CASE("test unsorted_list") { std::vector topk(constants::MAX_K); auto queries = 
gen_random_queries(num_queries, doc_ids.size()); + std::vector expected(params.num_completions); for (auto q : queries) { - auto expected = naive_topk(doc_ids, q, k); + uint32_t expected_results = naive_topk(doc_ids, q, k, expected); uint32_t results = list.topk(q, k, topk); - REQUIRE_MESSAGE(expected.size() == results, - "Error: expected " << expected.size() + REQUIRE_MESSAGE(expected_results == results, + "Error: expected " << expected_results << " topk elements but got " << results); for (uint32_t i = 0; i != results; ++i) { @@ -98,3 +103,68 @@ TEST_CASE("test unsorted_list") { std::remove(output_filename); } } + +TEST_CASE("test unsorted_list on minimal doc_ids") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + static const uint32_t k = 10; + static_assert(k <= constants::MAX_K, "k must be less than max allowed"); + static const uint32_t num_queries = 5000; + + std::vector doc_ids; + + { + doc_ids.reserve(params.num_terms); + std::ifstream input((params.collection_basename + ".inverted").c_str(), + std::ios_base::in); + id_type first; + for (uint64_t i = 0; i != params.num_terms; ++i) { + uint32_t n = 0; + input >> n; + input >> first; + doc_ids.push_back(first); + for (uint64_t k = 1; k != n; ++k) { + id_type x; + input >> x; + (void)x; // discard + } + } + input.close(); + REQUIRE(doc_ids.size() == params.num_terms); + + succinct_rmq list; + list.build(doc_ids); + REQUIRE(list.size() == doc_ids.size()); + essentials::save(list, output_filename); + } + + { + succinct_rmq list; + essentials::load(list, output_filename); + + std::vector topk(constants::MAX_K); + auto queries = gen_random_queries(num_queries, doc_ids.size()); + constexpr bool unique = true; + std::vector expected(params.num_terms); + + for (auto q : queries) { + uint32_t expected_results = + naive_topk(doc_ids, q, k, expected, unique); + uint32_t results = list.topk(q, k, topk, 
unique); + REQUIRE_MESSAGE(expected_results == results, + "Error: expected " << expected_results + << " topk elements but got " + << results); + for (uint32_t i = 0; i != results; ++i) { + REQUIRE_MESSAGE(topk[i] == expected[i], + "Error: expected " << expected[i] << " but got " + << topk[i]); + } + } + + std::remove(output_filename); + } +} \ No newline at end of file From 1562354f18307edb27f9190da9ccf93064b42393 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 13:59:57 +0200 Subject: [PATCH 015/102] more testing --- CMakeLists.txt | 2 +- include/delta_forward_index.hpp | 149 --------------------- include/forward_index.hpp | 201 ---------------------------- include/types.hpp | 13 +- test/CMakeLists.txt | 1 - test/test_compact_forward_index.cpp | 47 +++++++ test/test_forward_index.cpp | 58 -------- test/test_inverted_index.cpp | 45 +++++++ 8 files changed, 98 insertions(+), 418 deletions(-) delete mode 100644 include/delta_forward_index.hpp delete mode 100644 include/forward_index.hpp create mode 100644 test/test_compact_forward_index.cpp delete mode 100644 test/test_forward_index.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fa9125..bc8c298 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp) +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp test/test_compact_forward_index.cpp) foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} ${TEST_SRC}) diff 
--git a/include/delta_forward_index.hpp b/include/delta_forward_index.hpp deleted file mode 100644 index 6a302ab..0000000 --- a/include/delta_forward_index.hpp +++ /dev/null @@ -1,149 +0,0 @@ -#pragma once - -#include "parameters.hpp" -#include "bit_vector.hpp" -#include "ef/ef_sequence.hpp" - -namespace autocomplete { - -struct delta_forward_index { - struct builder { - builder() {} - - builder(parameters const& params) - : m_num_integers(0) - , m_num_terms(params.num_terms) { - essentials::logger("building forward_index..."); - uint64_t num_completions = params.num_completions; - std::ifstream input( - (params.collection_basename + ".forward").c_str(), - std::ios_base::in); - m_pointers.push_back(0); - for (uint64_t i = 0; i != num_completions; ++i) { - uint32_t n = 0; - input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); - write_gamma_nonzero(m_data, n); - m_num_integers += n; - for (uint64_t k = 0; k != n; ++k) { - id_type x; - input >> x; - write_delta(m_data, x); - } - m_pointers.push_back(m_data.size()); - } - m_pointers.pop_back(); - input.close(); - essentials::logger("DONE"); - } - - void swap(delta_forward_index::builder& other) { - std::swap(other.m_num_integers, m_num_integers); - std::swap(other.m_num_terms, m_num_terms); - other.m_pointers.swap(m_pointers); - other.m_data.swap(m_data); - } - - void build(delta_forward_index& fi) { - fi.m_num_integers = m_num_integers; - fi.m_num_terms = m_num_terms; - fi.m_pointers.build(m_pointers); - fi.m_data.build(&m_data); - builder().swap(*this); - } - - private: - uint64_t m_num_integers; - uint64_t m_num_terms; - std::vector m_pointers; - bit_vector_builder m_data; - }; - - delta_forward_index() {} - - struct forward_list_iterator_type { - forward_list_iterator_type(bits_iterator const& it, - uint64_t n) - : m_it(it) - , m_n(n) - , m_i(0) {} - - uint64_t size() const { - return m_n; - } - - void operator++() { - m_i += 1; - } - - id_type operator*() { - return read_delta(m_it); - } - - 
bool intersects(const range r) { - for (uint64_t i = 0; i != size(); ++i) { - auto val = operator*(); - if (r.contains(val)) return true; - } - return false; - } - - private: - bits_iterator m_it; - uint64_t m_n; - uint64_t m_i; - }; - - forward_list_iterator_type iterator(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - return {it, n}; - } - - bool intersects(const id_type doc_id, const range r) { - return iterator(doc_id).intersects(r); - } - - uint64_t num_integers() const { - return m_num_integers; - } - - uint64_t num_terms() const { - return m_num_terms; - } - - uint64_t num_docs() const { - return m_pointers.size(); - } - - size_t data_bytes() const { - return m_data.bytes(); - } - - size_t pointer_bytes() const { - return m_pointers.bytes(); - } - - size_t bytes() const { - return essentials::pod_bytes(m_num_integers) + - essentials::pod_bytes(m_num_terms) + m_pointers.bytes() + - m_data.bytes(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_terms); - visitor.visit(m_pointers); - visitor.visit(m_data); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_terms; - ef::ef_sequence m_pointers; - bit_vector m_data; -}; - -} // namespace autocomplete \ No newline at end of file diff --git a/include/forward_index.hpp b/include/forward_index.hpp deleted file mode 100644 index 51c7c63..0000000 --- a/include/forward_index.hpp +++ /dev/null @@ -1,201 +0,0 @@ -#pragma once - -#include "parameters.hpp" -#include "integer_codes.hpp" -#include "building_util.hpp" -#include "ef/ef_sequence.hpp" - -namespace autocomplete { - -template -struct forward_index { - typedef ListType forward_list_type; - typedef typename forward_list_type::iterator forward_list_iterator_type; - typedef uncompressed_list permutation_list_type; - typedef - typename permutation_list_type::iterator permutation_list_iterator_type; - - struct 
builder { - builder() {} - - builder(parameters const& params) - : m_num_integers(0) - , m_num_terms(params.num_terms) { - essentials::logger("building forward_index..."); - - uint64_t num_completions = params.num_completions; - - std::ifstream input( - (params.collection_basename + ".forward").c_str(), - std::ios_base::in); - - std::vector list; - std::vector sorted_permutation; - std::vector permutation; - - m_pointers.push_back(0); - - for (uint64_t i = 0; i != num_completions; ++i) { - list.clear(); - sorted_permutation.clear(); - permutation.clear(); - - uint32_t n = 0; - input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); - m_num_integers += n; - list.reserve(n); - sorted_permutation.reserve(n); - - for (uint64_t k = 0; k != n; ++k) { - id_type x; - input >> x; - list.push_back(x); - sorted_permutation.push_back(k); - } - - write_gamma_nonzero(m_bvb, n); - if (ListType::is_byte_aligned) util::push_pad(m_bvb); - - std::sort( - sorted_permutation.begin(), sorted_permutation.end(), - [&](id_type l, id_type r) { return list[l] < list[r]; }); - - permutation.resize(n); - for (uint32_t i = 0; i != n; ++i) { - permutation[sorted_permutation[i]] = i; - } - - std::sort(list.begin(), list.end()); - forward_list_type::build(m_bvb, list.begin(), m_num_terms + 1, - n); - util::push_pad(m_bvb); - m_pointers.push_back(m_bvb.size()); - - permutation_list_type::build(m_bvb, permutation.begin(), n + 1, - n); - m_pointers.push_back(m_bvb.size()); - } - - m_pointers.pop_back(); - input.close(); - essentials::logger("DONE"); - } - - void swap(forward_index::builder& other) { - std::swap(other.m_num_integers, m_num_integers); - std::swap(other.m_num_terms, m_num_terms); - other.m_pointers.swap(m_pointers); - other.m_bvb.swap(m_bvb); - } - - void build(forward_index& fi) { - fi.m_num_integers = m_num_integers; - fi.m_num_terms = m_num_terms; - fi.m_pointers.build(m_pointers); - fi.m_data.build(&m_bvb); - builder().swap(*this); - } - - private: - uint64_t 
m_num_integers; - uint64_t m_num_terms; - std::vector m_pointers; - bit_vector_builder m_bvb; - }; - - forward_index() {} - - bool intersects(id_type doc_id, range r) { - return get(doc_id).intersects(r); - } - - struct permuting_iterator_type { - permuting_iterator_type(forward_list_iterator_type const& sorted, - permutation_list_iterator_type const& permuted) - : m_i(0) - , m_sorted(sorted) - , m_permuted(permuted) { - assert(sorted.size() == permuted.size()); - } - - uint32_t size() const { - return m_sorted.size(); - } - - id_type operator*() { - return m_sorted.access(m_permuted.access(m_i)); - } - - void operator++() { - ++m_i; - } - - private: - uint32_t m_i; - forward_list_iterator_type m_sorted; - permutation_list_iterator_type m_permuted; - }; - - permuting_iterator_type iterator(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id * 2); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); - forward_list_iterator_type it_sorted(m_data, it.position(), - m_num_terms + 1, n); - offset = m_pointers.access(doc_id * 2 + 1); - permutation_list_iterator_type it_permutation(m_data, offset, n + 1, n); - return permuting_iterator_type(it_sorted, it_permutation); - } - - uint64_t num_integers() const { - return m_num_integers; - } - - uint64_t num_terms() const { - return m_num_terms; - } - - uint64_t num_docs() const { - return m_pointers.size(); - } - - size_t data_bytes() const { - return m_data.bytes(); - } - - size_t pointer_bytes() const { - return m_pointers.bytes(); - } - - size_t bytes() const { - return essentials::pod_bytes(m_num_integers) + - essentials::pod_bytes(m_num_terms) + m_pointers.bytes() + - m_data.bytes(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_terms); - visitor.visit(m_pointers); - visitor.visit(m_data); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_terms; - ef::ef_sequence 
m_pointers; - bit_vector m_data; - - forward_list_iterator_type get(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id * 2); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); - return {m_data, it.position(), m_num_terms + 1, n}; - } -}; - -} // namespace autocomplete \ No newline at end of file diff --git a/include/types.hpp b/include/types.hpp index 1083cfc..6481276 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -3,13 +3,12 @@ #include "completion_trie.hpp" #include "fc_dictionary.hpp" #include "integer_fc_dictionary.hpp" -#include "uint_vec.hpp" #include "unsorted_list.hpp" -#include "uncompressed_list.hpp" -#include "forward_index.hpp" +// #include "uint_vec.hpp" +// #include "uncompressed_list.hpp" + #include "compact_forward_index.hpp" -#include "delta_forward_index.hpp" #include "inverted_index.hpp" #include "blocked_inverted_index.hpp" @@ -40,14 +39,11 @@ typedef fc_dictionary<> fc_dictionary_type; typedef integer_fc_dictionary<> integer_fc_dictionary_type; typedef unsorted_list succinct_rmq; -typedef uncompressed_list uncompressed_list32_t; +// typedef uncompressed_list uncompressed_list32_t; // typedef inverted_index uncompressed_inverted_index; typedef inverted_index ef_inverted_index; -// typedef forward_index uncompressed_forward_index; -// typedef forward_index ef_forward_index; - // typedef blocked_inverted_index // uncompressed_blocked_inverted_index; typedef blocked_inverted_index ef_blocked_inverted_index; @@ -77,4 +73,5 @@ typedef autocomplete3 ef_autocomplete_type4; + } // namespace autocomplete \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a78df87..bc5f04f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,3 +1,2 @@ -add_executable(test_forward_index test_forward_index.cpp) add_executable(test_autocomplete test_autocomplete.cpp) add_executable(test_blocked_inverted_index 
test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_compact_forward_index.cpp b/test/test_compact_forward_index.cpp new file mode 100644 index 0000000..aa09403 --- /dev/null +++ b/test/test_compact_forward_index.cpp @@ -0,0 +1,47 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +TEST_CASE("test compact_forward_index::iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + compact_forward_index::builder builder(params); + compact_forward_index index; + builder.build(index); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + compact_forward_index index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + + std::ifstream input((params.collection_basename + ".forward").c_str(), + std::ios_base::in); + for (uint64_t i = 0; i != index.num_terms(); ++i) { + auto it = index.iterator(i); + uint32_t n = 0; + input >> n; + REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size() + << " instead of " + << n); + for (uint64_t k = 0; k != n; ++k, ++it) { + id_type expected; + input >> expected; + auto got = *it; + REQUIRE_MESSAGE(got == expected, + "got " << got << " but expected " << expected); + } + } + input.close(); + + std::remove(output_filename); + } +}; diff --git a/test/test_forward_index.cpp b/test/test_forward_index.cpp deleted file mode 100644 index 576215d..0000000 --- a/test/test_forward_index.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include - -#include "types.hpp" - -using namespace autocomplete; - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* 
output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } - - parameters params; - params.collection_basename = argv[1]; - params.load(); - - typedef compact_forward_index forward_index_type; - - { - forward_index_type::builder builder(params); - forward_index_type index; - builder.build(index); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); - essentials::logger("DONE"); - } - } - - { - if (output_filename) { - forward_index_type index; - essentials::logger("loading data structure from disk..."); - essentials::load(index, output_filename); - essentials::logger("DONE"); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - } - } - - return 0; -} diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp index aefdaae..ec93363 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -38,6 +38,50 @@ std::vector gen_random_queries(uint32_t num_queries, return queries; } +TEST_CASE("test inverted_index::iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + inverted_index_type::builder builder(params); + inverted_index_type index; + builder.build(index); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + inverted_index_type index; + essentials::load(index, output_filename); + 
REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + + std::ifstream input((params.collection_basename + ".inverted").c_str(), + std::ios_base::in); + for (uint64_t i = 0; i != index.num_terms(); ++i) { + auto it = index.iterator(i); + uint32_t n = 0; + input >> n; + REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size() + << " instead of " + << n); + for (uint64_t k = 0; k != n; ++k, ++it) { + id_type expected; + input >> expected; + auto got = *it; + REQUIRE_MESSAGE(got == expected, + "got " << got << " but expected " << expected); + } + } + input.close(); + + std::remove(output_filename); + } +}; + TEST_CASE("test inverted_index::intersection_iterator") { char const* output_filename = testing::tmp_filename.c_str(); parameters params; @@ -120,5 +164,6 @@ TEST_CASE("test inverted_index::intersection_iterator") { << " results, but got " << n); } + std::remove(output_filename); } } From 90aa2e7f1bd2817c6ad7395e6808744eb7ebcc67 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 14:11:21 +0200 Subject: [PATCH 016/102] install.sh script --- README.md | 22 ++++++++++++++++------ install.sh | 11 +++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 install.sh diff --git a/README.md b/README.md index a117f62..209aafb 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,13 @@ Query autocompletion in C++. ##### Table of contents 1. [Description](#descr) -2. [Compiling the code](#compiling) -3. [Input data format](#input) -4. [Running the unit tests](#testing) -5. [Building an index](#building) -6. [Benchmarks](#benchmarks) -7. [Live demo](#demo) +2. [Installation](#install) +3. [Compiling the code](#compiling) +4. [Input data format](#input) +5. [Running the unit tests](#testing) +6. [Building an index](#building) +7. [Benchmarks](#benchmarks) +8. 
[Live demo](#demo) Description ----------- @@ -66,6 +67,15 @@ A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r The final string extraction step is identical to that of the prefix search. +Installation +------------------ + +Just run + + $ ./install.sh + +from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. + Compiling the code ------------------ diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..9e8da9e --- /dev/null +++ b/install.sh @@ -0,0 +1,11 @@ +git submodule init +git submodule update +mkdir -p build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On +make +cd ../test_data +./preprocess.sh +cd ../build +make test +cd .. From 164df361295e49bea91a428030a3b3dd8280e1e6 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 15:22:38 +0200 Subject: [PATCH 017/102] more testing --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 2 - test/test_autocomplete.cpp | 147 +++++++++++---------------- test/test_blocked_inverted_index.cpp | 81 +++++++++------ test/test_common.hpp | 35 +++++++ test/test_inverted_index.cpp | 38 +------ 6 files changed, 149 insertions(+), 156 deletions(-) delete mode 100644 test/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index bc8c298..181c024 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp test/test_compact_forward_index.cpp) +file(GLOB TEST_SOURCES test/test_*.cpp) foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} 
${TEST_SRC}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt deleted file mode 100644 index bc5f04f..0000000 --- a/test/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(test_autocomplete test_autocomplete.cpp) -add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_autocomplete.cpp b/test/test_autocomplete.cpp index d4fcefa..964a451 100644 --- a/test/test_autocomplete.cpp +++ b/test/test_autocomplete.cpp @@ -1,110 +1,81 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } +typedef ef_autocomplete_type1 index_type; +TEST_CASE("test autocomplete topk functions") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); - // typedef uncompressed_autocomplete_type index_type; - // typedef ef_autocomplete_type index_type; - typedef ef_autocomplete_type2 index_type; - { index_type index(params); - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); - essentials::logger("DONE"); - } + essentials::save(index, output_filename); } { - if (output_filename) { - index_type index; - essentials::logger("loading data structure from disk..."); - essentials::load(index, output_filename); - essentials::logger("DONE"); - index.print_stats(); - - { - essentials::logger("testing prefix_topk()..."); - uint32_t k = 7; - std::vector queries = { - "a", "10", "african", - "air", "commercial", 
"internet", - "paris", "somerset", "the", - "the new", "the perfect", "the starting line", - "yu gi oh", "for sale", "dave mat", - "florence", "florida be", "for s", - "for sa", "for sal", "for sale", - "ford a", "ford au", "ford m", - "ford mu", "for", "fo", - "f", "matt", "fl", - "florir", "fly", "the starting l", - "floridaaa"}; - - for (auto& query : queries) { - auto it = index.prefix_topk(query, k); - std::cout << "top-" << it.size() << " completions for '" - << query << "':\n"; - for (uint32_t i = 0; i != it.size(); ++i, ++it) { - auto completion = *it; - std::cout << "(" << completion.score << ", '"; - print(completion.string); - std::cout << "')" << std::endl; - } + index_type index; + essentials::load(index, output_filename); + + { + essentials::logger("testing prefix_topk()..."); + uint32_t k = 7; + std::vector queries = { + "a", "10", "african", + "air", "commercial", "internet", + "paris", "somerset", "the", + "the new", "the perfect", "the starting line", + "yu gi oh", "for sale", "dave mat", + "florence", "florida be", "for s", + "for sa", "for sal", "for sale", + "ford a", "ford au", "ford m", + "ford mu", "for", "fo", + "f", "matt", "fl", + "florir", "fly", "the starting l", + "floridaaa"}; + + for (auto& query : queries) { + auto it = index.prefix_topk(query, k); + std::cout << "top-" << it.size() << " completions for '" + << query << "':\n"; + for (uint32_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << "(" << completion.score << ", '"; + print(completion.string); + std::cout << "')" << std::endl; } - - essentials::logger("DONE"); } - { - essentials::logger("testing conjunctive_topk()..."); - uint32_t k = 7; - std::vector queries = { - "dave mat", "florence", "florida be", "for s", - "for sa", "for sal", "for sale", "ford a", - "ford au", "ford m", "ford mu", "for", - "fo", "f", "matt", "fl", - "flor", "fly", "the starting l"}; + essentials::logger("DONE"); + } - for (auto& query : queries) { - auto it = 
index.conjunctive_topk(query, k); - std::cout << "top-" << it.size() << " completions for '" - << query << "':\n"; - for (uint32_t i = 0; i != it.size(); ++i, ++it) { - auto completion = *it; - std::cout << "(" << completion.score << ", '"; - print(completion.string); - std::cout << "')" << std::endl; - } + { + essentials::logger("testing conjunctive_topk()..."); + uint32_t k = 7; + std::vector queries = { + "dave mat", "florence", "florida be", "for s", + "for sa", "for sal", "for sale", "ford a", + "ford au", "ford m", "ford mu", "for", + "fo", "f", "matt", "fl", + "flor", "fly", "the starting l"}; + + for (auto& query : queries) { + auto it = index.conjunctive_topk(query, k); + std::cout << "top-" << it.size() << " completions for '" + << query << "':\n"; + for (uint32_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << "(" << completion.score << ", '"; + print(completion.string); + std::cout << "')" << std::endl; } - - essentials::logger("DONE"); } + + essentials::logger("DONE"); } } - return 0; + std::remove(output_filename); } diff --git a/test/test_blocked_inverted_index.cpp b/test/test_blocked_inverted_index.cpp index 94fc274..80a9bc1 100644 --- a/test/test_blocked_inverted_index.cpp +++ b/test/test_blocked_inverted_index.cpp @@ -1,40 +1,63 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } +typedef ef_blocked_inverted_index blocked_inverted_index_type; +typedef ef_inverted_index inverted_index_type; +TEST_CASE("test blocked_inverted_index::intersection_iterator") { parameters params; - params.collection_basename = argv[1]; + params.collection_basename = 
testing::test_filename.c_str(); params.load(); - const float c = 0.01; + + inverted_index_type ii; { - // build, print and write - ef_blocked_inverted_index::builder builder(params, c); - ef_blocked_inverted_index bii; - builder.build(bii); - std::cout << "using " << bii.bytes() << " bytes" << std::endl; - std::cout << "num docs " << bii.num_docs() << std::endl; - std::cout << "num terms " << bii.num_terms() << std::endl; + inverted_index_type::builder ii_builder(params); + ii_builder.build(ii); + REQUIRE(ii.num_docs() == params.num_completions); + REQUIRE(ii.num_terms() == params.num_terms); } - return 0; + { + static const uint32_t num_queries = 10000; + static const uint32_t max_num_terms = 3; + auto queries = testing::gen_random_queries(num_queries, max_num_terms, + params.num_terms); + + static const std::vector C = {0.0125, 0.025, 0.05, 0.1}; + blocked_inverted_index_type blocked_ii; + uint64_t total; + + for (auto c : C) { + total = 0; + { + blocked_inverted_index_type::builder blocked_ii_builder(params, + c); + blocked_ii_builder.build(blocked_ii); + } + + REQUIRE(blocked_ii.num_docs() == params.num_completions); + REQUIRE(blocked_ii.num_terms() == params.num_terms); + + for (auto& q : queries) { + auto ii_it = ii.intersection_iterator(q); + auto blocked_ii_it = + blocked_ii.intersection_iterator(q, {0, 0}); + + uint32_t n = 0; + for (; ii_it.has_next(); ++n, ++ii_it, ++blocked_ii_it) { + auto got = *blocked_ii_it; + auto expected = *ii_it; + REQUIRE_MESSAGE(got == expected, "expected doc_id " + << expected + << " but got " << got); + } + if (n) total += n; + REQUIRE(blocked_ii_it.has_next() == false); + } + + std::cout << total << std::endl; + } + } } diff --git a/test/test_common.hpp b/test/test_common.hpp index 0bc701a..580a07e 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -50,5 +50,40 @@ range locate_prefix(std::vector const& strings, return r; } +typedef std::vector term_ids; + +std::vector gen_random_queries(uint32_t num_queries, + 
uint32_t max_num_terms, + uint32_t max_range_len) { + assert(max_num_terms > 1); + std::vector queries; + queries.reserve(num_queries); + essentials::uniform_int_rng random_num_terms(2, max_num_terms); + essentials::uniform_int_rng random_term_id(1, max_range_len); + + for (uint32_t i = 0; i != num_queries; ++i) { + term_ids q; + uint32_t num_terms = random_num_terms.gen(); + q.reserve(num_terms); + uint32_t num_distinct_terms = 0; + while (true) { + q.clear(); + for (uint32_t i = 0; i != num_terms; ++i) { + auto t = random_term_id.gen(); + assert(t >= 1 and t <= max_range_len); + q.push_back(t); + } + std::sort(q.begin(), q.end()); + auto end = std::unique(q.begin(), q.end()); + num_distinct_terms = std::distance(q.begin(), end); + if (num_distinct_terms >= 2) break; + } + q.resize(num_distinct_terms); + queries.push_back(q); + } + + return queries; +} + } // namespace testing } // namespace autocomplete \ No newline at end of file diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp index ec93363..b96b708 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -3,40 +3,6 @@ using namespace autocomplete; typedef ef_inverted_index inverted_index_type; -typedef std::vector term_ids; - -std::vector gen_random_queries(uint32_t num_queries, - uint32_t max_num_terms, - uint32_t max_range_len) { - assert(max_num_terms > 1); - std::vector queries; - queries.reserve(num_queries); - essentials::uniform_int_rng random_num_terms(2, max_num_terms); - essentials::uniform_int_rng random_term_id(1, max_range_len); - - for (uint32_t i = 0; i != num_queries; ++i) { - term_ids q; - uint32_t num_terms = random_num_terms.gen(); - q.reserve(num_terms); - uint32_t num_distinct_terms = 0; - while (true) { - q.clear(); - for (uint32_t i = 0; i != num_terms; ++i) { - auto t = random_term_id.gen(); - assert(t >= 1 and t <= max_range_len); - q.push_back(t); - } - std::sort(q.begin(), q.end()); - auto end = std::unique(q.begin(), q.end()); - 
num_distinct_terms = std::distance(q.begin(), end); - if (num_distinct_terms >= 2) break; - } - q.resize(num_distinct_terms); - queries.push_back(q); - } - - return queries; -} TEST_CASE("test inverted_index::iterator") { char const* output_filename = testing::tmp_filename.c_str(); @@ -105,8 +71,8 @@ TEST_CASE("test inverted_index::intersection_iterator") { static const uint32_t num_queries = 1000000; static const uint32_t max_num_terms = 5; - auto queries = - gen_random_queries(num_queries, max_num_terms, index.num_terms()); + auto queries = testing::gen_random_queries(num_queries, max_num_terms, + index.num_terms()); std::vector first(index.num_docs()); std::vector second(index.num_docs()); From a6941ef198fdec754211cb0216c1ef5e681385a0 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 15:42:13 +0200 Subject: [PATCH 018/102] example.sh --- README.md | 12 +++++++++--- example.sh | 3 +++ src/web_server.cpp | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 example.sh diff --git a/README.md b/README.md index 209aafb..60911a4 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Query autocompletion in C++. ##### Table of contents 1. [Description](#descr) -2. [Installation](#install) +2. [Installation and quick start](#install) 3. [Compiling the code](#compiling) 4. [Input data format](#input) 5. [Running the unit tests](#testing) @@ -67,15 +67,21 @@ A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r The final string extraction step is identical to that of the prefix search. -Installation +Installation and quick start ------------------ Just run - $ ./install.sh + $ bash ./install.sh from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. +For having a minimal running example, just run + + $ bash ./example.sh + +and then access the service [here](http://127.0.0.1:8000). 
+ Compiling the code ------------------ diff --git a/example.sh b/example.sh new file mode 100644 index 0000000..4ac00bf --- /dev/null +++ b/example.sh @@ -0,0 +1,3 @@ +cd build +./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin +./web_server 8000 trec_05.ef_type1.bin \ No newline at end of file diff --git a/src/web_server.cpp b/src/web_server.cpp index 94a259b..7a0a61c 100644 --- a/src/web_server.cpp +++ b/src/web_server.cpp @@ -26,7 +26,7 @@ std::string escape_json(std::string const& s) { using namespace autocomplete; -typedef ef_autocomplete_type3 topk_index_type; +typedef ef_autocomplete_type1 topk_index_type; static std::string s_http_port("8000"); static struct mg_serve_http_opts s_http_server_opts; From 496960930651c5fae42267b46892d722ce41e0f4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 10:18:06 +0200 Subject: [PATCH 019/102] minor --- CMakeLists.txt | 2 +- include/autocomplete3.hpp | 2 +- include/blocked_inverted_index.hpp | 2 +- include/compact_vector.hpp | 2 +- include/completion_trie.hpp | 5 +++-- include/ef/ef_sequence.hpp | 6 +++--- include/fc_dictionary.hpp | 5 ++++- include/inverted_index.hpp | 4 ++-- include/parameters.hpp | 7 +++++-- include/uint_vec.hpp | 10 +++++----- include/util_types.hpp | 5 +++++ test/test_common.hpp | 4 +--- 12 files changed, 32 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 181c024..2908d2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ endif () if(UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 550aac5..ab0abb1 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ 
-321,7 +321,7 @@ struct autocomplete3 { template uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { - assert(!r.is_invalid()); + assert(r.is_valid()); auto& topk_scores = m_pool.scores(); min_priority_queue_type q; diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 79319fe..dfd452d 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -263,7 +263,7 @@ struct blocked_inverted_index { : m_i(0) , m_num_docs(ii->num_docs()) , m_suffix(r) { - assert(!r.is_invalid()); + assert(r.is_valid()); if (!term_ids.empty()) { m_iterators.reserve(term_ids.size()); // at most diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index f0cd1bd..eb3f9b0 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -277,7 +277,7 @@ struct compact_vector { } uint64_t find(const range r, uint64_t id) { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); return util::find(*this, id, r.begin, r.end - 1); } diff --git a/include/completion_trie.hpp b/include/completion_trie.hpp index 3d52ee5..2bc68ea 100644 --- a/include/completion_trie.hpp +++ b/include/completion_trie.hpp @@ -170,7 +170,7 @@ struct completion_trie { // Return [a,b) range locate_prefix(completion_type const& prefix, range suffix_lex_range) const { - range r{global::not_found, global::not_found}; + range r = global::invalid_range; range pointer{0, m_nodes.front().size()}; uint32_t i = 0; @@ -195,10 +195,11 @@ struct completion_trie { r.end += size; } - assert(r.end > r.begin); + assert(r.is_valid()); return r; } + // NOTE: not used bool is_member(completion_type const& c) const { assert(c.size() > 0); range pointer{0, m_nodes.front().size()}; diff --git a/include/ef/ef_sequence.hpp b/include/ef/ef_sequence.hpp index 10970d6..0d1f436 100644 --- a/include/ef/ef_sequence.hpp +++ b/include/ef/ef_sequence.hpp @@ -142,14 +142,14 @@ struct ef_sequence { } uint64_t find(const 
range r, uint64_t id) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); uint64_t prev_upper = previous_range_upperbound(r); return util::find(*this, id + prev_upper, r.begin, r.end - 1); } range find(const range r, const range lex) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); @@ -251,7 +251,7 @@ struct ef_sequence { } uint64_t previous_range_upperbound(const range r) const { - assert(!r.is_invalid()); + assert(r.is_valid()); return r.begin ? access(r.begin - 1) : 0; } }; diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index 271f970..1b5aa9b 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -307,10 +307,13 @@ struct fc_dictionary { // NOTE 1: excluding null terminators, allow us to use memcpy here // because we know exactly how many bytes to copy: this is much faster - // than looping until we hit '\0'. NOTE 2: always copying a fixed amount + // than looping until we hit '\0'. + + // NOTE 2: always copying a fixed amount // of bytes (constants::MAX_NUM_CHARS_PER_QUERY) is much faster than // copying an exact amount, e.g., suffix_len (althoung it could be // less), so do not do: memcpy(out+ l, in, suffix_len). 
+ memcpy(out + l, in, constants::MAX_NUM_CHARS_PER_QUERY); return l + suffix_len; diff --git a/include/inverted_index.hpp b/include/inverted_index.hpp index 7c84bd7..cd4ad29 100644 --- a/include/inverted_index.hpp +++ b/include/inverted_index.hpp @@ -41,7 +41,7 @@ struct inverted_index { } m_minimal_doc_ids.push_back(list.front()); write_gamma_nonzero(m_bvb, n); - if (ListType::is_byte_aligned) util::push_pad(m_bvb); + if constexpr (ListType::is_byte_aligned) util::push_pad(m_bvb); ListType::build(m_bvb, list.begin(), m_num_docs, list.size()); m_pointers.push_back(m_bvb.size()); } @@ -86,7 +86,7 @@ struct inverted_index { uint64_t offset = m_pointers.access(term_id); bits_iterator it(m_data, offset); uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); + if constexpr (ListType::is_byte_aligned) util::eat_pad(it); return {m_data, it.position(), m_num_docs, n}; } diff --git a/include/parameters.hpp b/include/parameters.hpp index db44d71..9d03783 100644 --- a/include/parameters.hpp +++ b/include/parameters.hpp @@ -41,8 +41,11 @@ struct parameters { } nodes_per_level.resize(num_levels, 0); - for (uint32_t i = 0; i != num_levels; ++i) { - input >> nodes_per_level[i]; + uint32_t i = 0; + for (; i != num_levels and input; ++i) input >> nodes_per_level[i]; + if (i != num_levels) { + throw std::runtime_error( + "File with statistics may be truncated or malformed"); } } diff --git a/include/uint_vec.hpp b/include/uint_vec.hpp index 86d60c4..adeaa8c 100644 --- a/include/uint_vec.hpp +++ b/include/uint_vec.hpp @@ -74,14 +74,14 @@ struct uint_vec { } uint64_t find(const range r, UintType id) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); - UintType prev_upper = previous_range_upperbound(r); + auto prev_upper = previous_range_upperbound(r); return util::find(*this, id + prev_upper, r.begin, r.end - 1); } range find(const range r, const range lex) const { - assert(!r.is_invalid()); + assert(r.is_valid()); 
assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); @@ -131,9 +131,9 @@ struct uint_vec { std::vector m_data; UintType previous_range_upperbound(const range r) const { - assert(!r.is_invalid()); + assert(r.is_valid()); return r.begin ? access(r.begin - 1) : 0; } -}; // namespace autocomplete +}; } // namespace autocomplete \ No newline at end of file diff --git a/include/util_types.hpp b/include/util_types.hpp index 7405378..e056bb6 100644 --- a/include/util_types.hpp +++ b/include/util_types.hpp @@ -36,6 +36,7 @@ struct range { uint64_t begin; uint64_t end; bool is_invalid() const; + bool is_valid() const; bool contains(uint64_t val) const; }; @@ -48,6 +49,10 @@ bool range::is_invalid() const { end == global::invalid_range.end or begin > end; } +bool range::is_valid() const { + return !is_invalid(); +} + bool range::contains(uint64_t val) const { if (val >= begin and val <= end) return true; return false; diff --git a/test/test_common.hpp b/test/test_common.hpp index 580a07e..24f4540 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -69,9 +69,7 @@ std::vector gen_random_queries(uint32_t num_queries, while (true) { q.clear(); for (uint32_t i = 0; i != num_terms; ++i) { - auto t = random_term_id.gen(); - assert(t >= 1 and t <= max_range_len); - q.push_back(t); + q.push_back(random_term_id.gen()); } std::sort(q.begin(), q.end()); auto end = std::unique(q.begin(), q.end()); From a642eefce34bc8e4a11a125e5da5a01646fb676b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 11:09:35 +0200 Subject: [PATCH 020/102] benchmark fc_dictionary::locate_prefix --- benchmark/benchmark_fc_dictionary.cpp | 26 +++++++++++++++++++++++++- include/fc_dictionary.hpp | 3 ++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index f566edd..d8a53e5 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ 
-8,7 +8,7 @@ using namespace autocomplete; template void perf_test(Dictionary const& dict, std::vector const& queries) { - std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + static std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; for (uint32_t i = 0; i != runs; ++i) { @@ -43,6 +43,30 @@ void perf_test(Dictionary const& dict, std::cout << "extract: " << (timer.average() * 1000.0) / ids.size() << " [ns/string]" << std::endl; + + static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; + // static std::vector percentages = {0.1, 0.2, 0.3, 0.4, 0.5, + // 0.6, 0.7, 0.8, 0.9, 1.0}; + for (auto p : percentages) { + timer.reset(); + for (uint32_t i = 0; i != runs; ++i) { + timer.start(); + for (auto const& query : queries) { + size_t size = query.size(); + size_t n = size * p; + if (n == 0) n += 1; // at least one char + uint8_t const* addr = + reinterpret_cast(query.data()); + range r = dict.locate_prefix({addr, addr + n}); + essentials::do_not_optimize_away(r.end - r.begin); + } + timer.stop(); + } + + std::cout << "locate_prefix-" << p * 100.0 + << "%: " << (timer.average() * 1000.0) / queries.size() + << " [ns/string]" << std::endl; + } } #define exe(BUCKET_SIZE) \ diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index 1b5aa9b..bde263e 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -223,7 +223,8 @@ struct fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + assert(cmp > 0); + bucket_id = hi; h = header(bucket_id); } From 8753a0a9e89c639382693396c993bc2a018d0456 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 11:17:39 +0200 Subject: [PATCH 021/102] updated fc_dictionary results --- benchmark/benchmark_fc_dictionary.cpp | 2 +- results/fc_dictionary.md | 95 ++++++++++++++++++--------- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/benchmark/benchmark_fc_dictionary.cpp 
b/benchmark/benchmark_fc_dictionary.cpp index d8a53e5..1d94c8e 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -63,7 +63,7 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "locate_prefix-" << p * 100.0 + std::cout << "\tlocate_prefix-" << p * 100.0 << "%: " << (timer.average() * 1000.0) / queries.size() << " [ns/string]" << std::endl; } diff --git a/results/fc_dictionary.md b/results/fc_dictionary.md index 39e64b7..37ff080 100644 --- a/results/fc_dictionary.md +++ b/results/fc_dictionary.md @@ -1,40 +1,75 @@ #### Results on the AOL querylog. pibiri@rubino:~/autocomplete/build$ ./benchmark_fc_dictionary ../test_data/aol/aol.completions 1000000 < ../test_data/aol/aol.completions.dict_queries.1M.shuffled - 2019-10-14 14:54:24: loading queries... - 2019-10-14 14:54:24: loaded 1000000 queries - 2019-10-14 14:54:24: building fc_dictionary with bucket size 4... - 2019-10-14 14:54:25: DONE + 2019-10-24 11:11:49: loading queries... + 2019-10-24 11:11:49: loaded 1000000 queries + 2019-10-24 11:11:49: building fc_dictionary with bucket size 4... + 2019-10-24 11:11:50: DONE using 42938890 bytes - locate: 559.666 [ns/string] - extract: 165.846 [ns/string] - 2019-10-14 14:54:32: building fc_dictionary with bucket size 8... - 2019-10-14 14:54:33: DONE + locate: 557.091 [ns/string] + extract: 168.772 [ns/string] + locate_prefix-0%: 213.453 [ns/string] + locate_prefix-25%: 794.612 [ns/string] + locate_prefix-50%: 1064.44 [ns/string] + locate_prefix-75%: 912.04 [ns/string] + locate_prefix-100%: 702.745 [ns/string] + 2019-10-24 11:12:12: building fc_dictionary with bucket size 8... + 2019-10-24 11:12:12: DONE using 38111527 bytes - locate: 515.359 [ns/string] - extract: 151.121 [ns/string] - 2019-10-14 14:54:40: building fc_dictionary with bucket size 16... 
- 2019-10-14 14:54:40: DONE + locate: 511.503 [ns/string] + extract: 152.331 [ns/string] + locate_prefix-0%: 223.374 [ns/string] + locate_prefix-25%: 686.093 [ns/string] + locate_prefix-50%: 873.161 [ns/string] + locate_prefix-75%: 758.029 [ns/string] + locate_prefix-100%: 638.576 [ns/string] + 2019-10-24 11:12:32: building fc_dictionary with bucket size 16... + 2019-10-24 11:12:32: DONE using 35270205 bytes - locate: 474.319 [ns/string] - extract: 138.07 [ns/string] - 2019-10-14 14:54:47: building fc_dictionary with bucket size 32... - 2019-10-14 14:54:47: DONE + locate: 478.592 [ns/string] + extract: 139.109 [ns/string] + locate_prefix-0%: 228.416 [ns/string] + locate_prefix-25%: 662.483 [ns/string] + locate_prefix-50%: 769.227 [ns/string] + locate_prefix-75%: 685.358 [ns/string] + locate_prefix-100%: 615.757 [ns/string] + 2019-10-24 11:12:51: building fc_dictionary with bucket size 32... + 2019-10-24 11:12:51: DONE using 33722303 bytes - locate: 490 [ns/string] - extract: 150.671 [ns/string] - 2019-10-14 14:54:54: building fc_dictionary with bucket size 64... - 2019-10-14 14:54:54: DONE + locate: 484.72 [ns/string] + extract: 150.21 [ns/string] + locate_prefix-0%: 273.595 [ns/string] + locate_prefix-25%: 717.559 [ns/string] + locate_prefix-50%: 790.342 [ns/string] + locate_prefix-75%: 728.409 [ns/string] + locate_prefix-100%: 681.921 [ns/string] + 2019-10-24 11:13:11: building fc_dictionary with bucket size 64... + 2019-10-24 11:13:11: DONE using 32910194 bytes - locate: 585.408 [ns/string] - extract: 197.131 [ns/string] - 2019-10-14 14:55:03: building fc_dictionary with bucket size 128... - 2019-10-14 14:55:03: DONE + locate: 585.835 [ns/string] + extract: 194.183 [ns/string] + locate_prefix-0%: 667.159 [ns/string] + locate_prefix-25%: 962.096 [ns/string] + locate_prefix-50%: 1056.04 [ns/string] + locate_prefix-75%: 1014.63 [ns/string] + locate_prefix-100%: 978.718 [ns/string] + 2019-10-24 11:13:39: building fc_dictionary with bucket size 128... 
+ 2019-10-24 11:13:39: DONE using 32496375 bytes - locate: 812.441 [ns/string] - extract: 293.022 [ns/string] - 2019-10-14 14:55:15: building fc_dictionary with bucket size 256... - 2019-10-14 14:55:15: DONE + locate: 810.282 [ns/string] + extract: 286.967 [ns/string] + locate_prefix-0%: 574.352 [ns/string] + locate_prefix-25%: 1248.92 [ns/string] + locate_prefix-50%: 1435.28 [ns/string] + locate_prefix-75%: 1419.18 [ns/string] + locate_prefix-100%: 1398.48 [ns/string] + 2019-10-24 11:14:16: building fc_dictionary with bucket size 256... + 2019-10-24 11:14:16: DONE using 32286042 bytes - locate: 1283.83 [ns/string] - extract: 485.985 [ns/string] \ No newline at end of file + locate: 1281.09 [ns/string] + extract: 470.922 [ns/string] + locate_prefix-0%: 1065.07 [ns/string] + locate_prefix-25%: 2099.35 [ns/string] + locate_prefix-50%: 2387.39 [ns/string] + locate_prefix-75%: 2407.04 [ns/string] + locate_prefix-100%: 2403.04 [ns/string] \ No newline at end of file From f45fa9d7f04e66f58fa350e22996ac9d7dd39367 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 11:43:14 +0200 Subject: [PATCH 022/102] updated partition_queries script --- README.md | 12 +++++++----- test_data/partition_queries_by_length.py | 25 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 60911a4..31c1649 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,10 @@ Installation and quick start Just run $ bash ./install.sh - + from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. -For having a minimal running example, just run +After that, for having a minimal running example, just run $ bash ./example.sh @@ -136,7 +136,7 @@ The script `preprocess.sh` in the directory `test_data` helps in preparing the data for indexing. 
Thus, from within the directory `test_data`, it is sufficient to do: - + $ bash preprocess.sh If you run the script, you will get: @@ -195,7 +195,10 @@ You can use $ python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions -to partition the input completions by number of query terms. +to partition the input completions by number of query terms. Each partition +of queries is shuffled at random to avoid locality of access. +(By default, 8 shards will be created: the ones having [1,7] query terms and +the one collecting all completions with >= 8 query terms). Then the command @@ -203,7 +206,6 @@ Then the command will execute 1000 top-10 queries with 3 terms, from which only 25% of the prefix of the last token is retained. -(For no locality, it is suggested to shuffle the queries at random, for example using `gshuf` on Mac.) We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index f9cb561..7f14b42 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -1,11 +1,15 @@ import sys import numpy as np +import random input_filename = sys.argv[1] num_shards = 7 -files = [open(input_filename + ".length=" + str(i), "w") for i in range(1,num_shards + 1)] -all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+", "w") +files = [open(input_filename + ".length=" + str(i) + ".shuffled", "w") for i in range(1,num_shards + 1)] +all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+.shuffled", "w") + +strings = [[] for i in range(0, num_shards)] +all_others_strings = [] lines = 0 with open(input_filename, 'r') as f: @@ -14,14 +18,23 @@ l = len(x) - 1 if l > num_shards: - all_others.write(line) + all_others_strings.append(line) else: - files[l - 1].write(line) + 
strings[l - 1].append(line) lines += 1 if lines % 1000000 == 0: print("processed " + str(lines) + " lines") -for f in files: - f.close() + +for i in range(0, num_shards): + random.shuffle(strings[i]) + for s in strings[i]: + files[i].write(s) + files[i].close() + +random.shuffle(all_others_strings) +for s in all_others_strings: + all_others.write(s) all_others.close() + From b54375d2090025ff017ed2c3a3bc4d4629ae2886 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 12:15:09 +0200 Subject: [PATCH 023/102] minor --- include/autocomplete.hpp | 4 ++-- include/autocomplete2.hpp | 4 ++-- include/autocomplete3.hpp | 4 ++-- include/autocomplete4.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index 9f01ed0..a9a281c 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -102,7 +102,7 @@ struct autocomplete { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -148,7 +148,7 @@ struct autocomplete { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 3003c02..c1c3e76 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -124,7 +124,7 @@ struct autocomplete2 { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -171,7 +171,7 @@ struct autocomplete2 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); 
uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index ab0abb1..db7353f 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -123,7 +123,7 @@ struct autocomplete3 { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -163,7 +163,7 @@ struct autocomplete3 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index 8b3d882..88018f7 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -108,7 +108,7 @@ struct autocomplete4 { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -138,7 +138,7 @@ struct autocomplete4 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); From 6d9bdae1be07e525d6539882110a2e7272a84fce Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 14:34:17 +0200 Subject: [PATCH 024/102] deduplication of query terms --- include/autocomplete.hpp | 45 +++++++-------------- include/autocomplete2.hpp | 43 +++++++------------- include/autocomplete3.hpp | 64 +++++++++++++----------------- include/autocomplete4.hpp | 2 + 
include/autocomplete_common.hpp | 6 +++ include/blocked_inverted_index.hpp | 1 - src/CMakeLists.txt | 1 + src/check_topk.cpp | 64 ++++++++++++++++++++++++++++++ 8 files changed, 130 insertions(+), 96 deletions(-) create mode 100644 src/check_topk.cpp diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index a9a281c..47b4472 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -73,13 +73,7 @@ struct autocomplete { true // must return unique results ); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } return extract_strings(num_completions); @@ -114,13 +108,7 @@ struct autocomplete { true // must return unique results ); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } @@ -163,13 +151,7 @@ struct autocomplete { true // must return unique results ); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } timers[2].stop(); @@ -243,21 +225,13 @@ struct autocomplete { // step 2 timers[2].start(); if (num_terms 
== 1) { // special case - suffix_lex_range.end += 1; num_completions = m_unsorted_minimal_docs_list.topk( suffix_lex_range, k, m_pool.scores(), true // must return unique results ); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } timers[2].stop(); @@ -303,6 +277,17 @@ struct autocomplete { assert(m_pool.size() == 0); } + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + uint32_t const k) { + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix, k); + } + template uint32_t conjunctive_topk(Iterator& it, const range r, uint32_t const k) { auto& topk_scores = m_pool.scores(); diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index c1c3e76..ece6d2e 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -96,13 +96,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } return extract_strings(num_completions); @@ -137,13 +131,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // 
we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } else { extract_completions(num_completions); @@ -186,13 +174,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } else { extract_completions(num_completions); @@ -275,13 +257,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } timers[2].stop(); @@ -345,6 +321,17 @@ struct autocomplete2 { } } + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + uint32_t const k) { + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix, k); + } + template uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { auto& topk_scores = 
m_pool.scores(); diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index db7353f..44c1bf4 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -89,20 +89,15 @@ struct autocomplete3 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + assert(num_terms > 0); uint32_t num_completions = 0; range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); extract_completions(num_completions); return extract_strings(num_completions); } @@ -128,16 +123,8 @@ struct autocomplete3 { } if (num_completions < k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); } extract_completions(num_completions); @@ -170,16 +157,8 @@ struct autocomplete3 { timers[2].start(); if (num_completions < k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } 
else if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); } timers[2].stop(); @@ -238,7 +217,8 @@ struct autocomplete3 { init(); completion_type prefix; byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); + uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + assert(num_terms > 0); timers[0].stop(); uint32_t num_completions = 0; @@ -251,13 +231,8 @@ struct autocomplete3 { // step 2 timers[2].start(); - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); timers[2].stop(); // step 3 @@ -319,6 +294,21 @@ struct autocomplete3 { } } + uint32_t conjunctive_topk(uint32_t num_terms, completion_type& prefix, + const range suffix_lex_range, const uint32_t k) { + if (num_terms == 1) { // we've got nothing to intersect + iterator it(0, m_inverted_index.num_docs()); + return conjunctive_topk(it, suffix_lex_range, k); + } + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix_lex_range, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix_lex_range, k); + } + template uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { assert(r.is_valid()); diff --git 
a/include/autocomplete4.hpp b/include/autocomplete4.hpp index 88018f7..d0f3304 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -283,6 +283,7 @@ struct autocomplete4 { uint32_t conjunctive_topk(completion_type& prefix, const range suffix, const uint32_t k) { auto& topk_scores = m_pool.scores(); + deduplicate(prefix); auto it = m_inverted_index.intersection_iterator(prefix, suffix); uint32_t results = 0; for (; it.has_next(); ++it) { @@ -319,4 +320,5 @@ struct autocomplete4 { return m_pool.begin(); } }; + } // namespace autocomplete \ No newline at end of file diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index c04f8b6..362a706 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -19,4 +19,10 @@ uint32_t parse(Dictionary const& dict, std::string const& query, return num_terms; } +void deduplicate(completion_type& c) { + std::sort(c.begin(), c.end()); + auto end = std::unique(c.begin(), c.end()); + c.resize(std::distance(c.begin(), end)); +} + } // namespace autocomplete \ No newline at end of file diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index dfd452d..0d3d4ed 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -267,7 +267,6 @@ struct blocked_inverted_index { if (!term_ids.empty()) { m_iterators.reserve(term_ids.size()); // at most - std::sort(term_ids.begin(), term_ids.end()); uint32_t current_block_id = ii->block_id(term_ids.front()); uint32_t i = 0; uint32_t prev_i = 0; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b000b1..a9e4661 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,3 +2,4 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) +add_executable(check_topk check_topk.cpp) \ No newline at end of file 
diff --git a/src/check_topk.cpp b/src/check_topk.cpp new file mode 100644 index 0000000..cb466a1 --- /dev/null +++ b/src/check_topk.cpp @@ -0,0 +1,64 @@ +#include + +#include "types.hpp" +#include "../benchmark/benchmark_common.hpp" + +using namespace autocomplete; + +template +void check_topk(char const* binary_filename1, char const* binary_filename2, + uint32_t k, uint32_t max_num_queries, float keep) { + Index index1; + ef_autocomplete_type1 index2; + essentials::load(index1, binary_filename1); + essentials::load(index2, binary_filename2); + std::vector queries; + load_queries(queries, max_num_queries, keep, std::cin); + for (auto const& query : queries) { + size_t n1 = index1.topk(query, k).size(); + size_t n2 = index2.topk(query, k).size(); + if (n1 != n2) { + std::cout << query << std::endl; + } + } +} + +int main(int argc, char** argv) { + int mandatory = 6; + if (argc < mandatory + 1) { + std::cout << argv[0] + << " " + " " + " < queries" + << std::endl; + std::cout << " is a float in [0,1] and specifies how much " + "we keep of the last token in a query " + << std::endl; + return 1; + } + + std::string type(argv[1]); + uint32_t k = std::atoi(argv[2]); + char const* binary_filename1 = argv[3]; + char const* binary_filename2 = argv[4]; + uint32_t max_num_queries = std::atoi(argv[5]); + float keep = std::atof(argv[6]); + + if (type == "ef_type1") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type2") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type3") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type4") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else { + return 1; + } + + return 0; +} \ No newline at end of file From 9212c98b9f3892c194250363575f6e6dfcbfd12c Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 14:47:38 
+0200 Subject: [PATCH 025/102] assert --- include/blocked_inverted_index.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 0d3d4ed..ec6c4b6 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -266,6 +266,10 @@ struct blocked_inverted_index { assert(r.is_valid()); if (!term_ids.empty()) { + assert(std::is_sorted(term_ids.begin(), term_ids.end())); + assert(std::unique(term_ids.begin(), term_ids.end()) == + term_ids.end()); + m_iterators.reserve(term_ids.size()); // at most uint32_t current_block_id = ii->block_id(term_ids.front()); uint32_t i = 0; From 7194cba247f13638facd0ca9c4407e926ad2af8b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 21:36:12 +0200 Subject: [PATCH 026/102] minor --- benchmark/benchmark_common.hpp | 22 ++++------ include/blocked_inverted_index.hpp | 65 ++++++++++++++---------------- 2 files changed, 39 insertions(+), 48 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 0fdae98..0fbcc26 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -11,21 +11,15 @@ size_t load_queries(std::vector& queries, uint32_t max_num_queries, queries.reserve(max_num_queries); for (uint32_t i = 0; i != max_num_queries; ++i) { if (!std::getline(is, line)) break; - auto query = line.substr(line.find(' ') + 1, line.size()); - int32_t size = query.size() - 1; - while (size >= 0 and query[size] != ' ') --size; - auto last_token = query.substr(size + 1, query.size() - size); - uint32_t num_chars = - last_token.size() - std::ceil(last_token.size() * percentage); - char first = last_token.front(); - for (uint32_t i = 0; i != num_chars; ++i) last_token.pop_back(); - - // retain at least one char - if (last_token.empty()) last_token.push_back(first); - assert(last_token.size() > 0); - - queries.push_back(query.substr(0, size + 1) + last_token); + 
assert(query.size() > 0); + size_t size = query.size() - 1; + while (size > 0 and query[size] != ' ') --size; + size_t last_token_size = query.size() - size; + size_t end = size + std::ceil(last_token_size * percentage) + 1 + + 1; // retain at least one char + for (size = query.size(); size > end; --size) query.pop_back(); + queries.push_back(query); } return queries.size(); } diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index ec6c4b6..9a21d0c 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -270,7 +270,7 @@ struct blocked_inverted_index { assert(std::unique(term_ids.begin(), term_ids.end()) == term_ids.end()); - m_iterators.reserve(term_ids.size()); // at most + m_blocks.reserve(term_ids.size()); // at most uint32_t current_block_id = ii->block_id(term_ids.front()); uint32_t i = 0; uint32_t prev_i = 0; @@ -284,7 +284,7 @@ struct blocked_inverted_index { for (; prev_i != i; ++prev_i) { block.term_ids.push_back(term_ids[prev_i]); } - m_iterators.push_back(std::move(block)); + m_blocks.push_back(std::move(block)); } current_block_id = b; } @@ -294,16 +294,15 @@ struct blocked_inverted_index { for (; prev_i != i; ++prev_i) { block.term_ids.push_back(term_ids[prev_i]); } - m_iterators.push_back(std::move(block)); + m_blocks.push_back(std::move(block)); - assert(m_iterators.size() > 0); - std::sort(m_iterators.begin(), m_iterators.end(), + std::sort(m_blocks.begin(), m_blocks.end(), [](auto const& l, auto const& r) { return l.docs_iterator.size() < r.docs_iterator.size(); }); - m_candidate = m_iterators[0].docs_iterator.access(0); + m_candidate = m_blocks[0].docs_iterator.access(0); } else { m_candidate = 0; } @@ -334,10 +333,10 @@ struct blocked_inverted_index { } void operator++() { - assert(m_i == m_iterators.size()); - if (!m_iterators.empty()) { - if (m_iterators.size() > 1) { - m_candidate = m_iterators[0].docs_iterator.next(); + assert(m_i == m_blocks.size()); + if 
(!m_blocks.empty()) { + if (m_blocks.size() > 1) { + m_candidate = m_blocks[0].docs_iterator.next(); } } else { m_candidate += 1; @@ -347,17 +346,16 @@ struct blocked_inverted_index { } bool intersects() { - for (auto& block : m_range) { - uint64_t val = block.docs_iterator.next_geq(m_candidate); + for (auto& b : m_range) { + uint64_t val = b.docs_iterator.next_geq(m_candidate); if (val == m_candidate) { - uint64_t pos = block.docs_iterator.position(); - assert(block.docs_iterator.access(pos) == m_candidate); - uint64_t begin = block.offsets_iterator.access(pos); - uint64_t end = block.offsets_iterator.access(pos + 1); + uint64_t pos = b.docs_iterator.position(); + assert(b.docs_iterator.access(pos) == m_candidate); + uint64_t begin = b.offsets_iterator.access(pos); + uint64_t end = b.offsets_iterator.access(pos + 1); assert(end > begin); - uint32_t lower_bound = block.lower_bound; for (uint64_t i = begin; i != end; ++i) { - auto t = block.terms_iterator.access(i) + lower_bound; + auto t = b.terms_iterator.access(i) + b.lower_bound; if (t > m_suffix.end) break; if (m_suffix.contains(t)) return true; } @@ -370,26 +368,25 @@ struct blocked_inverted_index { id_type m_candidate; size_t m_i; uint64_t m_num_docs; - std::vector m_iterators; + std::vector m_blocks; std::vector m_range; range m_suffix; bool in() { // is candidate doc in intersection? 
- uint64_t pos = m_iterators[m_i].docs_iterator.position(); - if (pos == m_iterators[m_i].docs_iterator.size()) return false; - uint64_t begin = m_iterators[m_i].offsets_iterator.access(pos); - uint64_t end = m_iterators[m_i].offsets_iterator.access(pos + 1); + auto& b = m_blocks[m_i]; + uint64_t pos = b.docs_iterator.position(); + if (pos == b.docs_iterator.size()) return false; + uint64_t begin = b.offsets_iterator.access(pos); + uint64_t end = b.offsets_iterator.access(pos + 1); assert(end > begin); - if (end - begin < m_iterators[m_i].term_ids.size()) return false; + if (end - begin < b.term_ids.size()) return false; uint64_t i = begin; - uint32_t lower_bound = m_iterators[m_i].lower_bound; - for (auto x : m_iterators[m_i].term_ids) { + for (auto x : b.term_ids) { bool found = false; for (; i != end; ++i) { - auto t = - m_iterators[m_i].terms_iterator.access(i) + lower_bound; + auto t = b.terms_iterator.access(i) + b.lower_bound; if (t == x) { found = true; break; @@ -402,18 +399,18 @@ struct blocked_inverted_index { } void next() { - if (m_iterators.empty()) return; - if (m_iterators.size() == 1) { - while (m_candidate < m_num_docs and m_i != m_iterators.size()) { + if (m_blocks.empty()) return; + if (m_blocks.size() == 1) { + while (m_candidate < m_num_docs and m_i != m_blocks.size()) { assert(m_i == 0); - m_candidate = m_iterators[m_i].docs_iterator.next(); + m_candidate = m_blocks[m_i].docs_iterator.next(); if (in()) ++m_i; } } else { - while (m_candidate < m_num_docs and m_i != m_iterators.size()) { + while (m_candidate < m_num_docs and m_i != m_blocks.size()) { // NOTE: since we work with unions of posting lists, // next_geq by scan runs faster - auto val = m_iterators[m_i].docs_iterator.next_geq_by_scan( + auto val = m_blocks[m_i].docs_iterator.next_geq_by_scan( m_candidate); bool is_in = in(); if (val == m_candidate and is_in) { From efeb99f091daae4c547321f33e0e28bd423bf56a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 26 Oct 2019 
12:28:55 +0200 Subject: [PATCH 027/102] using cmd_line_parser --- .gitmodules | 3 ++ external/cmd_line_parser | 1 + external/mongoose | 2 +- include/blocked_inverted_index.hpp | 5 ++- src/build.cpp | 54 ++++++++++++------------------ 5 files changed, 30 insertions(+), 35 deletions(-) create mode 160000 external/cmd_line_parser diff --git a/.gitmodules b/.gitmodules index 60c5af2..5b9dc7e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "external/doctest"] path = external/doctest url = https://github.com/onqtam/doctest.git +[submodule "external/cmd_line_parser"] + path = external/cmd_line_parser + url = https://github.com/jermp/cmd_line_parser.git diff --git a/external/cmd_line_parser b/external/cmd_line_parser new file mode 160000 index 0000000..de6d870 --- /dev/null +++ b/external/cmd_line_parser @@ -0,0 +1 @@ +Subproject commit de6d870f8f01076f671a4eed6bbe55f3b9217d05 diff --git a/external/mongoose b/external/mongoose index c41a221..dce60c6 160000 --- a/external/mongoose +++ b/external/mongoose @@ -1 +1 @@ -Subproject commit c41a22195ceabc02ffd0379f0e71d6c3575337aa +Subproject commit dce60c6dbb096f3b96e1a45cbfdfd55e18b38bb6 diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 9a21d0c..8425e4e 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -23,7 +23,10 @@ struct blocked_inverted_index { : m_num_integers(0) , m_num_docs(params.num_completions) , m_num_terms(params.num_terms) { - assert(c > 0.0); + if (!(c > 0.0 and c <= 1.0)) { + throw std::runtime_error("c must be in (0,1]"); + } + essentials::logger("building blocked_inverted_index with c = " + std::to_string(c) + "..."); diff --git a/src/build.cpp b/src/build.cpp index 732318f..ba73954 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -2,57 +2,48 @@ #include "types.hpp" #include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" using namespace autocomplete; template -void 
build(parameters const& params, char const* output_filename) { +void build(parameters const& params, std::string const& output_filename) { Index index(params); index.print_stats(); - if (output_filename) { + if (output_filename != "") { essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); + essentials::save(index, output_filename.c_str()); essentials::logger("DONE"); } } void build_type4(parameters const& params, const float c, - char const* output_filename) { + std::string const& output_filename) { ef_autocomplete_type4 index(params, c); index.print_stats(); - if (output_filename) { + if (output_filename != "") { essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); + essentials::save(index, output_filename.c_str()); essentials::logger("DONE"); } } int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " [-o output_filename] [-c c]" - << std::endl; - return 1; - } - - std::string type(argv[1]); + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("output_filename", "Output filename.", "-o", false); + parser.add( + "c", + "Value for Bast and Weber's technique: c must be a float in (0,1].", + "-c", false); + if (!parser.parse()) return 1; + + auto type = parser.get("type"); parameters params; - params.collection_basename = argv[2]; + params.collection_basename = parser.get("collection_basename"); params.load(); - - char const* output_filename = nullptr; - float c = 0.0; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } else if (std::string(argv[i]) == "-c") { - ++i; - c = std::stof(argv[i]); - } - } + auto output_filename = parser.get("output_filename"); if (type == "ef_type1") { build(params, output_filename); @@ -61,10 +52,7 @@ int main(int 
argc, char** argv) { } else if (type == "ef_type3") { build(params, output_filename); } else if (type == "ef_type4") { - if (c == 0.0) { - std::cerr << "c must be greater than 0.0" << std::endl; - return 1; - } + auto c = parser.get("c"); build_type4(params, c, output_filename); } else { return 1; From 1465feccf13a08e58c387d38910e347f4f5c78c9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 26 Oct 2019 13:49:38 +0200 Subject: [PATCH 028/102] using cmd_line_parser --- benchmark/benchmark_common.hpp | 14 +++ benchmark/benchmark_conjunctive_topk.cpp | 110 ++++++++---------- benchmark/benchmark_integer_fc_dictionary.cpp | 2 +- benchmark/benchmark_locate_prefix.cpp | 48 ++++---- benchmark/benchmark_prefix_topk.cpp | 106 ++++++++--------- benchmark/benchmark_topk.cpp | 66 ++++------- src/statistics.cpp | 18 +-- 7 files changed, 162 insertions(+), 202 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 0fbcc26..135992d 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -1,5 +1,7 @@ #pragma once +#include "../external/cmd_line_parser/include/parser.hpp" + namespace autocomplete { static const uint32_t runs = 5; @@ -24,4 +26,16 @@ size_t load_queries(std::vector& queries, uint32_t max_num_queries, return queries.size(); } +void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { + parser.add("type", "Index type."); + parser.add("k", "top-k value."); + parser.add("index_filename", "Index filename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query."); + parser.add("breakdown", "Collect timings breakdown.", "--breakdown"); +} + } // namespace autocomplete \ No newline at end of file diff --git a/benchmark/benchmark_conjunctive_topk.cpp 
b/benchmark/benchmark_conjunctive_topk.cpp index 2a04c4c..83e2c99 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -1,110 +1,92 @@ #include #include "types.hpp" -#include "statistics.hpp" #include "benchmark_common.hpp" using namespace autocomplete; template -void benchmark_conjunctive_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, - essentials::json_lines& breakdowns, - bool breakdown) { - Index autocomp; - essentials::logger("loading data structure from disk..."); - essentials::load(autocomp, binary_filename); - essentials::logger("DONE"); - autocomp.print_stats(); +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& breakdowns, bool breakdown) { + Index index; + essentials::load(index, index_filename.c_str()); std::vector queries; - essentials::logger("loading queries..."); uint32_t num_queries = - load_queries(queries, max_num_queries, 0.25, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + load_queries(queries, max_num_queries, keep, std::cin); - auto ns_x_query = [&](double time) { - return uint64_t(time / (runs * num_queries) * 1000); + uint64_t reported_strings = 0; + auto musec_per_query = [&](double time) { + return time / (runs * num_queries); }; - essentials::logger("benchmarking conjunctive_topk queries..."); - uint64_t reported_strings = 0; + breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { std::vector timers(4); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.conjunctive_topk(query, k, timers); + auto it = index.prefix_topk(query, k, timers); reported_strings += it.size(); } } - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("parsing_ns_per_query", - 
std::to_string(ns_x_query(timers[0].elapsed()))); - breakdowns.add("dictionary_search_ns_per_query", - std::to_string(ns_x_query(timers[1].elapsed()))); - breakdowns.add("conjunctive_search_ns_per_query", - std::to_string(ns_x_query(timers[2].elapsed()))); - breakdowns.add("reporting_ns_per_query", - std::to_string(ns_x_query(timers[3].elapsed()))); + breakdowns.add("parsing_musec_per_query", + std::to_string(musec_per_query(timers[0].elapsed()))); + breakdowns.add("dictionary_search_musec_per_query", + std::to_string(musec_per_query(timers[1].elapsed()))); + breakdowns.add("conjunctive_search_musec_per_query", + std::to_string(musec_per_query(timers[2].elapsed()))); + breakdowns.add("reporting_musec_per_query", + std::to_string(musec_per_query(timers[3].elapsed()))); } else { essentials::timer_type timer; timer.start(); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.conjunctive_topk(query, k); + auto it = index.prefix_topk(query, k); reported_strings += it.size(); } } timer.stop(); - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("ns_per_query", - std::to_string(ns_x_query(timer.elapsed()))); + breakdowns.add("musec_per_query", + std::to_string(musec_per_query(timer.elapsed()))); } } int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " --breakdown < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; - std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - - bool breakdown = false; - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; 
- } - } + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto breakdown = parser.get("breakdown"); essentials::json_lines breakdowns; breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); + breakdowns.add("num_terms_per_query", + parser.get("num_terms_per_query")); + breakdowns.add("percentage", std::to_string(keep)); - if (type == "type1") { - benchmark_conjunctive_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type2") { - benchmark_conjunctive_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type3") { - benchmark_conjunctive_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else { - std::cout << "error: unknown type '" << type << "'" << std::endl; return 1; } diff --git a/benchmark/benchmark_integer_fc_dictionary.cpp b/benchmark/benchmark_integer_fc_dictionary.cpp index f1e35d9..3a752eb 100644 --- a/benchmark/benchmark_integer_fc_dictionary.cpp +++ b/benchmark/benchmark_integer_fc_dictionary.cpp @@ -8,7 +8,7 @@ using namespace autocomplete; template void perf_test(Dictionary const& dict, std::vector const& queries) { - completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + static completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; for (uint32_t i = 0; i != runs; ++i) 
{ diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index 6e9a1ab..8d37357 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -7,10 +7,9 @@ using namespace autocomplete; template -void benchmark_locate_prefix(parameters const& params, - fc_dictionary_type const& dict, - uint32_t max_num_queries, float keep, - essentials::json_lines& result) { +void benchmark(parameters const& params, fc_dictionary_type const& dict, + uint32_t max_num_queries, float keep, + essentials::json_lines& result) { Index index; { typename Index::builder builder(params); @@ -24,6 +23,7 @@ void benchmark_locate_prefix(parameters const& params, { num_queries = load_queries(strings, max_num_queries, keep, std::cin); + result.add("num_queries", std::to_string(num_queries)); for (auto const& string : strings) { completion_type prefix; byte_range suffix; @@ -51,26 +51,23 @@ void benchmark_locate_prefix(parameters const& params, } int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " < queries" - << std::endl; - std::cout << " is a float in [0,1] and specifies how much " - "we keep of the last token in a query " - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query."); + if (!parser.parse()) return 1; - std::string type(argv[1]); parameters params; - params.collection_basename = argv[2]; + params.collection_basename = parser.get("collection_basename"); params.load(); - std::string num_terms_per_query(argv[3]); - uint32_t max_num_queries = std::atoi(argv[4]); - float keep = 
std::atof(argv[5]); + auto type = parser.get("type"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); fc_dictionary_type dict; { @@ -80,15 +77,16 @@ int main(int argc, char** argv) { essentials::json_lines result; result.new_line(); - result.add("num_terms_per_query", num_terms_per_query); + result.add("num_terms_per_query", + parser.get("num_terms_per_query")); result.add("percentage", std::to_string(keep)); if (type == "trie") { - benchmark_locate_prefix( - params, dict, max_num_queries, keep, result); + benchmark(params, dict, max_num_queries, keep, + result); } else if (type == "fc") { - benchmark_locate_prefix( - params, dict, max_num_queries, keep, result); + benchmark(params, dict, max_num_queries, + keep, result); } else { return 1; } diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index 2149e03..28046a2 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -1,106 +1,92 @@ #include #include "types.hpp" -#include "statistics.hpp" #include "benchmark_common.hpp" using namespace autocomplete; template -void benchmark_prefix_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, - essentials::json_lines& breakdowns, bool breakdown) { - Index autocomp; - essentials::logger("loading data structure from disk..."); - essentials::load(autocomp, binary_filename); - essentials::logger("DONE"); - autocomp.print_stats(); +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& breakdowns, bool breakdown) { + Index index; + essentials::load(index, index_filename.c_str()); std::vector queries; - essentials::logger("loading queries..."); uint32_t num_queries = - load_queries(queries, max_num_queries, 0.25, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + load_queries(queries, max_num_queries, keep, std::cin); - auto 
ns_x_query = [&](double time) { - return uint64_t(time / (runs * num_queries) * 1000); + uint64_t reported_strings = 0; + auto musec_per_query = [&](double time) { + return time / (runs * num_queries); }; - essentials::logger("benchmarking prefix_topk queries..."); - uint64_t reported_strings = 0; + breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { std::vector timers(4); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.prefix_topk(query, k, timers); + auto it = index.prefix_topk(query, k, timers); reported_strings += it.size(); } } - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("parsing_ns_per_query", - std::to_string(ns_x_query(timers[0].elapsed()))); - breakdowns.add("completions_search_ns_per_query", - std::to_string(ns_x_query(timers[1].elapsed()))); - breakdowns.add("topk_rmq_ns_per_query", - std::to_string(ns_x_query(timers[2].elapsed()))); - breakdowns.add("reporting_ns_per_query", - std::to_string(ns_x_query(timers[3].elapsed()))); + breakdowns.add("parsing_musec_per_query", + std::to_string(musec_per_query(timers[0].elapsed()))); + breakdowns.add("completions_search_musec_per_query", + std::to_string(musec_per_query(timers[1].elapsed()))); + breakdowns.add("topk_rmq_musec_per_query", + std::to_string(musec_per_query(timers[2].elapsed()))); + breakdowns.add("reporting_musec_per_query", + std::to_string(musec_per_query(timers[3].elapsed()))); } else { essentials::timer_type timer; timer.start(); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.prefix_topk(query, k); + auto it = index.prefix_topk(query, k); reported_strings += it.size(); } } timer.stop(); - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("ns_per_query", - 
std::to_string(ns_x_query(timer.elapsed()))); + breakdowns.add("musec_per_query", + std::to_string(musec_per_query(timer.elapsed()))); } } int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " --breakdown < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; - std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - - bool breakdown = false; - for (int i = mandatory + 1; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; - } - } + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto breakdown = parser.get("breakdown"); essentials::json_lines breakdowns; breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); + breakdowns.add("num_terms_per_query", + parser.get("num_terms_per_query")); + breakdowns.add("percentage", std::to_string(keep)); - if (type == "type1") { - benchmark_prefix_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type2") { - benchmark_prefix_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else { - 
std::cout << "error: unknown type '" << type << "'" << std::endl; return 1; } diff --git a/benchmark/benchmark_topk.cpp b/benchmark/benchmark_topk.cpp index a294afe..0ea1e97 100644 --- a/benchmark/benchmark_topk.cpp +++ b/benchmark/benchmark_topk.cpp @@ -1,17 +1,16 @@ #include #include "types.hpp" -#include "statistics.hpp" #include "benchmark_common.hpp" using namespace autocomplete; template -void benchmark_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& breakdowns, bool breakdown) { Index index; - essentials::load(index, binary_filename); + essentials::load(index, index_filename.c_str()); std::vector queries; uint32_t num_queries = @@ -32,9 +31,7 @@ void benchmark_topk(char const* binary_filename, uint32_t k, reported_strings += it.size(); } } - std::cout << reported_strings << std::endl; - breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(timers[0].elapsed()))); breakdowns.add("prefix_search_musec_per_query", @@ -43,7 +40,6 @@ void benchmark_topk(char const* binary_filename, uint32_t k, std::to_string(musec_per_query(timers[2].elapsed()))); breakdowns.add("reporting_musec_per_query", std::to_string(musec_per_query(timers[3].elapsed()))); - } else { essentials::timer_type timer; timer.start(); @@ -54,58 +50,42 @@ void benchmark_topk(char const* binary_filename, uint32_t k, } } timer.stop(); - std::cout << reported_strings << std::endl; - breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); } } int main(int argc, char** argv) { - int mandatory = 6; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " [--breakdown] < queries" - << std::endl; - std::cout << " is a float in [0,1] and specifies how much " - "we keep of the last token in a query " - << std::endl; - return 1; - } - - 
std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - float keep = std::atof(argv[6]); + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; - bool breakdown = false; - for (int i = mandatory + 1; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; - } - } + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto breakdown = parser.get("breakdown"); essentials::json_lines breakdowns; breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); + breakdowns.add("num_terms_per_query", + parser.get("num_terms_per_query")); breakdowns.add("percentage", std::to_string(keep)); if (type == "ef_type1") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else if (type == "ef_type2") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else if (type == "ef_type3") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else if (type == "ef_type4") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else { return 1; } diff --git a/src/statistics.cpp b/src/statistics.cpp index 5b2148f..9dbf689 100644 --- a/src/statistics.cpp +++ b/src/statistics.cpp @@ -2,25 +2,25 @@ #include "types.hpp" 
#include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" using namespace autocomplete; template -void print_stats(char const* index_filename) { +void print_stats(std::string const& index_filename) { Index index; - essentials::load(index, index_filename); + essentials::load(index, index_filename.c_str()); index.print_stats(); } int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory + 1) { - std::cout << argv[0] << " " << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("index_filename", "Index filename."); + if (!parser.parse()) return 1; - std::string type(argv[1]); - char const* index_filename = argv[2]; + auto type = parser.get("type"); + auto index_filename = parser.get("index_filename"); if (type == "ef_type1") { print_stats(index_filename); From 44dab2bbeb59b7466abf2ec2ead7af383855a260 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 27 Oct 2019 13:09:00 +0100 Subject: [PATCH 029/102] added license --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..35abc20 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright 2019 Giulio Ermanno Pibiri + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file From 8d46c1ce9591771162c88b5cca7af4410386b25f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 27 Oct 2019 13:46:09 +0100 Subject: [PATCH 030/102] dependencies updated --- external/cmd_line_parser | 2 +- external/essentials | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/cmd_line_parser b/external/cmd_line_parser index de6d870..70b779f 160000 --- a/external/cmd_line_parser +++ b/external/cmd_line_parser @@ -1 +1 @@ -Subproject commit de6d870f8f01076f671a4eed6bbe55f3b9217d05 +Subproject commit 70b779fbb1c5e1bbdb5949044a6b8824a3044855 diff --git a/external/essentials b/external/essentials index 3721ea2..07db05a 160000 --- a/external/essentials +++ b/external/essentials @@ -1 +1 @@ -Subproject commit 3721ea2b02c24005088cb9efeb89b4090753bbf2 +Subproject commit 07db05abd0c058ee310ff5078eb4ec27d2b3cdcb From 4327f39f0574f49e12386be81343cbefdf55f121 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 6 Nov 2019 10:04:44 +0100 Subject: [PATCH 031/102] queries are just strings, without any id --- benchmark/benchmark_common.hpp | 7 +++---- test_data/partition_queries_by_length.py | 9 +++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 135992d..4309912 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -9,11 +9,10 @@ static const uint32_t runs = 5; size_t load_queries(std::vector& queries, uint32_t 
max_num_queries, float percentage, std::istream& is = std::cin) { assert(percentage >= 0.0 and percentage <= 1.0); - std::string line; + std::string query; queries.reserve(max_num_queries); for (uint32_t i = 0; i != max_num_queries; ++i) { - if (!std::getline(is, line)) break; - auto query = line.substr(line.find(' ') + 1, line.size()); + if (!std::getline(is, query)) break; assert(query.size() > 0); size_t size = query.size() - 1; while (size > 0 and query[size] != ' ') --size; @@ -34,7 +33,7 @@ void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { parser.add("max_num_queries", "Maximum number of queries to execute."); parser.add("percentage", "A float in [0,1] specifying how much we keep of the last token " - "in a query."); + "in a query: n x 100 <=> n%, for n in [0,1]."); parser.add("breakdown", "Collect timings breakdown.", "--breakdown"); } diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index 7f14b42..c2397de 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -16,17 +16,15 @@ for line in f: x = line.rstrip('\n').split() l = len(x) - 1 - + string = ' '.join(x[1:l+1]) + '\n' if l > num_shards: - all_others_strings.append(line) + all_others_strings.append(string) else: - strings[l - 1].append(line) - + strings[l - 1].append(string) lines += 1 if lines % 1000000 == 0: print("processed " + str(lines) + " lines") - for i in range(0, num_shards): random.shuffle(strings[i]) for s in strings[i]: @@ -37,4 +35,3 @@ for s in all_others_strings: all_others.write(s) all_others.close() - From d50e9445cb3aa761d92f870a225fb7f4b3b3f8fe Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 8 Nov 2019 14:12:08 +0100 Subject: [PATCH 032/102] removed comment --- include/autocomplete2.hpp | 5 ----- include/autocomplete3.hpp | 5 ----- include/autocomplete4.hpp | 5 ----- 3 files changed, 15 deletions(-) diff --git a/include/autocomplete2.hpp 
b/include/autocomplete2.hpp index ece6d2e..7216379 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -304,11 +304,6 @@ struct autocomplete2 { assert(m_pool.size() == 0); } - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. void extract_completions(const uint32_t num_completions) { auto const& topk_scores = m_pool.scores(); auto& completions = m_topk_completion_set.completions(); diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 44c1bf4..c015583 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -277,11 +277,6 @@ struct autocomplete3 { assert(m_pool.size() == 0); } - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. void extract_completions(const uint32_t num_completions) { auto const& topk_scores = m_pool.scores(); auto& completions = m_topk_completion_set.completions(); diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index d0f3304..ec88ec3 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -263,11 +263,6 @@ struct autocomplete4 { assert(m_pool.size() == 0); } - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. 
void extract_completions(const uint32_t num_completions) { auto const& topk_scores = m_pool.scores(); auto& completions = m_topk_completion_set.completions(); From 6c79eba738dd5b1f3d67436292cd7b05e9eb4c15 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 9 Nov 2019 15:55:52 +0100 Subject: [PATCH 033/102] removed unused import --- test_data/build_inverted_and_forward.py | 1 - test_data/partition_queries_by_length.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index c47ea17..0634d82 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -1,5 +1,4 @@ import sys -import numpy as np input_filename = sys.argv[1] diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index c2397de..7dfbed6 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -1,6 +1,4 @@ -import sys -import numpy as np -import random +import sys, random input_filename = sys.argv[1] From b2cc9a5cb7ad2a2b5fb04d74121d3681bc03efd1 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 11 Nov 2019 12:13:20 +0100 Subject: [PATCH 034/102] fix --- include/integer_fc_dictionary.hpp | 9 +++------ test/test_integer_fc_dictionary.cpp | 1 + test/test_locate_prefix.cpp | 24 +++++++++++++++--------- test_data/preprocess.sh | 1 + 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index e0b228b..443cc8f 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -270,7 +270,7 @@ struct integer_fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + bucket_id = hi == -1 ? 
0 : hi; h = header(bucket_id); } @@ -288,18 +288,15 @@ struct integer_fc_dictionary { cmp = uint32_range_compare(h, t, n); if (cmp > 0) { hi = mi - 1; - } else if (cmp < 0) { + } else if (cmp <= 0) { lo = mi + 1; - } else { - bucket_id = mi; - return; } } if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + bucket_id = hi == -1 ? 0 : hi; h = header(bucket_id); } } diff --git a/test/test_integer_fc_dictionary.cpp b/test/test_integer_fc_dictionary.cpp index b67879d..d36db82 100644 --- a/test/test_integer_fc_dictionary.cpp +++ b/test/test_integer_fc_dictionary.cpp @@ -48,6 +48,7 @@ TEST_CASE("test integer_fc_dictionary") { id_type got_id = dict.locate({decoded.data(), decoded.data() + size}); + REQUIRE(got_id != global::invalid_term_id); REQUIRE_MESSAGE(got_id == id, "Error in locating the " << id << "-th string: expected id " diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index 8938965..7924899 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -19,11 +19,11 @@ void test_locate_prefix(Dictionary const& dict, Index const& index, suffix_lex_range.end += 1; range got = index.locate_prefix(prefix, suffix_lex_range); - REQUIRE_MESSAGE( - (got.begin == expected.begin and got.end == expected.end), - "Error for query '" << query << "': expected [" << expected.begin - << "," << expected.end << ") but got [" - << got.begin << "," << got.end << ")"); + CHECK_MESSAGE((got.begin == expected.begin and got.end == expected.end), + "Error for query '" + << query << "': expected [" << expected.begin << "," + << expected.end << ") but got [" << got.begin << "," + << got.end << ")"); } } @@ -82,14 +82,20 @@ TEST_CASE("test locate_prefix()") { << num_terms << std::endl; { queries.clear(); - std::ifstream querylog((params.collection_basename + - ".length=" + std::to_string(num_terms)) - .c_str()); + std::string filename = params.collection_basename + + ".length=" + std::to_string(num_terms) + + ".shuffled"; + std::ifstream 
querylog(filename.c_str()); + if (!querylog.is_open()) { + std::cerr << "cannot open file '" << filename << "'" + << std::endl; + return; + } load_queries(queries, max_num_queries, perc, querylog); querylog.close(); } - test_locate_prefix(dict, ct_index, queries, strings); + // test_locate_prefix(dict, ct_index, queries, strings); test_locate_prefix(dict, fc_index, queries, strings); } } diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh index ab4dbeb..24c9488 100755 --- a/test_data/preprocess.sh +++ b/test_data/preprocess.sh @@ -8,4 +8,5 @@ for collection in $collections; do python map_dataset.py $collection python build_stats.py $collection.mapped python build_inverted_and_forward.py $collection + python partition_queries_by_length.py $collection done From cd83f927f0b605075475486569e415f2a47f824c Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 13 Nov 2019 10:40:24 +0100 Subject: [PATCH 035/102] check for terms out of vocabulary --- include/autocomplete_common.hpp | 8 +++++--- include/fc_dictionary.hpp | 6 ++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index 362a706..17b38b4 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -7,14 +7,16 @@ namespace autocomplete { template uint32_t parse(Dictionary const& dict, std::string const& query, completion_type& prefix, byte_range& suffix) { - uint32_t num_terms = 1; + uint32_t num_terms = 1; // for suffix byte_range_iterator it(string_to_byte_range(query)); while (true) { suffix = it.next(); if (!it.has_next()) break; auto term_id = dict.locate(suffix); - prefix.push_back(term_id); - ++num_terms; + if (term_id != global::invalid_term_id) { + prefix.push_back(term_id); + ++num_terms; + } } return num_terms; } diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index bde263e..ed09026 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ 
-223,8 +223,7 @@ struct fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - assert(cmp > 0); - bucket_id = hi; + bucket_id = hi == -1 ? 0 : hi; h = header(bucket_id); } @@ -344,8 +343,7 @@ struct fc_dictionary { if (cmp < 0) return global::invalid_term_id; curr += l - lcp_len + 2; } - assert(false); - __builtin_unreachable(); + return global::invalid_term_id; // term does not exist in dictionary } id_type left_locate(byte_range p, byte_range h, id_type bucket_id) const { From c5086dc7abca7e7ac2b538c24881d5ad998da5f4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 17 Nov 2019 11:07:52 +0100 Subject: [PATCH 036/102] print avg. number of terms x completion --- include/statistics.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/statistics.hpp b/include/statistics.hpp index a863814..f93444f 100644 --- a/include/statistics.hpp +++ b/include/statistics.hpp @@ -74,6 +74,10 @@ void autocomplete(m_forward_index.num_integers()) / + m_completions.size() + << std::endl; print_bpi("data", m_forward_index.data_bytes(), m_forward_index.num_integers()); print_bpi("pointers", m_forward_index.pointer_bytes(), From 95720d8f23274b51046ae92c820707835344c3e8 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 20 Nov 2019 22:32:38 +0100 Subject: [PATCH 037/102] minor changes --- benchmark/benchmark_locate_prefix.cpp | 82 ++++++++++++++++----------- external/cmd_line_parser | 2 +- external/essentials | 2 +- include/fc_dictionary.hpp | 6 ++ include/statistics.hpp | 19 ++++--- 5 files changed, 68 insertions(+), 43 deletions(-) diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index 8d37357..998d8c7 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -6,36 +6,28 @@ using namespace autocomplete; +typedef std::pair query_type; + template -void benchmark(parameters const& params, fc_dictionary_type const& dict, - uint32_t max_num_queries, float keep, - 
essentials::json_lines& result) { +void benchmark(parameters const& params, std::vector& queries, + uint32_t num_queries, uint32_t num_terms_per_query, float keep) { + essentials::json_lines result; + result.new_line(); + result.add("num_terms_per_query", std::to_string(num_terms_per_query)); + result.add("percentage", std::to_string(keep)); + result.add("num_queries", std::to_string(num_queries)); + Index index; { typename Index::builder builder(params); builder.build(index); } - typedef std::pair query_type; - std::vector strings; - std::vector queries; - uint32_t num_queries = 0; - - { - num_queries = load_queries(strings, max_num_queries, keep, std::cin); - result.add("num_queries", std::to_string(num_queries)); - for (auto const& string : strings) { - completion_type prefix; - byte_range suffix; - parse(dict, string, prefix, suffix); - range suffix_lex_range = dict.locate_prefix(suffix); - queries.emplace_back(prefix, suffix_lex_range); - } - } - - auto musec_per_query = [&](double time) { - return time / (runs * num_queries); - }; + result.add("MiB", std::to_string(static_cast(index.bytes()) / + essentials::MiB)); + result.add( + "bytes_per_completion", + std::to_string(static_cast(index.bytes()) / index.size())); essentials::timer_type timer; timer.start(); @@ -47,7 +39,8 @@ void benchmark(parameters const& params, fc_dictionary_type const& dict, } timer.stop(); result.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); + std::to_string(timer.elapsed() / (runs * num_queries))); + result.print(); } int main(int argc, char** argv) { @@ -67,6 +60,7 @@ int main(int argc, char** argv) { auto type = parser.get("type"); auto max_num_queries = parser.get("max_num_queries"); + auto num_terms_per_query = parser.get("num_terms_per_query"); auto keep = parser.get("percentage"); fc_dictionary_type dict; @@ -75,22 +69,42 @@ int main(int argc, char** argv) { builder.build(dict); } - essentials::json_lines result; - result.new_line(); - 
result.add("num_terms_per_query", - parser.get("num_terms_per_query")); - result.add("percentage", std::to_string(keep)); + std::vector strings; + std::vector queries; + uint32_t num_queries = 0; + + { + num_queries = load_queries(strings, max_num_queries, keep, std::cin); + for (auto const& string : strings) { + completion_type prefix; + byte_range suffix; + parse(dict, string, prefix, suffix); + range suffix_lex_range = dict.locate_prefix(suffix); + queries.emplace_back(prefix, suffix_lex_range); + } + } if (type == "trie") { - benchmark(params, dict, max_num_queries, keep, - result); + benchmark(params, queries, num_queries, + num_terms_per_query, keep); } else if (type == "fc") { - benchmark(params, dict, max_num_queries, - keep, result); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); } else { return 1; } - result.print(); return 0; } \ No newline at end of file diff --git a/external/cmd_line_parser b/external/cmd_line_parser index 70b779f..1776808 160000 --- a/external/cmd_line_parser +++ b/external/cmd_line_parser @@ -1 +1 @@ -Subproject commit 70b779fbb1c5e1bbdb5949044a6b8824a3044855 +Subproject commit 1776808718445425dcad42ba2d1b6adf2cb5e496 diff --git a/external/essentials b/external/essentials index 07db05a..da66810 160000 --- a/external/essentials +++ b/external/essentials @@ -1 +1 @@ -Subproject commit 07db05abd0c058ee310ff5078eb4ec27d2b3cdcb +Subproject commit da6681019cbad6bef62804927801dd09832e512e diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index ed09026..1b223be 100644 
--- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -37,14 +37,17 @@ struct fc_dictionary { std::string curr; std::string header; + uint64_t total_characters = 0; for (uint32_t b = 0; b != buckets; ++b) { input >> header; + total_characters += header.size(); write_header(header); m_pointers_to_headers.push_back(m_headers.size()); prev.swap(header); uint32_t size = b != buckets - 1 ? BucketSize : tail; for (uint32_t i = 0; i != size; ++i) { input >> curr; + total_characters += curr.size(); uint32_t l = 0; // |lcp(curr,prev)| while (l != curr.size() and l != prev.size() and curr[l] == prev[l]) { @@ -61,6 +64,9 @@ struct fc_dictionary { m_buckets.push_back(0); } + std::cout << static_cast(total_characters) / m_size + << " characters per string" << std::endl; + input.close(); essentials::logger("DONE"); } diff --git a/include/statistics.hpp b/include/statistics.hpp index f93444f..aa1fbe0 100644 --- a/include/statistics.hpp +++ b/include/statistics.hpp @@ -10,7 +10,8 @@ namespace autocomplete { void print(std::string const& what, size_t bytes, size_t total_bytes, uint64_t num_completions) { - std::cout << " " << what << ": " << convert(bytes, essentials::MiB) + std::cout << " " << what << ": " + << essentials::convert(bytes, essentials::MiB) << " [MiB]: " << static_cast(bytes) / num_completions << " [bytes per completion] "; std::cout << "(" << (bytes * 100.0) / total_bytes << "%)" << std::endl; @@ -31,8 +32,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]" - << std::endl; + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]" << std::endl; print_bps("nodes", nodes_bytes(), size()); print_bps("pointers", pointers_bytes(), size()); print_bps("left extremes", left_extremes_bytes(), size()); @@ -44,7 +45,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, 
essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -89,7 +91,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -124,7 +127,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -149,7 +153,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; From d807990fbddb201c8a364c4f5eb79cd3055034b2 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 21 Nov 2019 15:21:08 +0100 Subject: [PATCH 038/102] map queries --- benchmark/benchmark_locate_prefix.cpp | 24 ++++++------ include/integer_fc_dictionary.hpp | 4 +- include/inverted_index.hpp | 14 +++++++ src/CMakeLists.txt | 3 +- src/map_queries.cpp | 54 +++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 15 deletions(-) create mode 100644 src/map_queries.cpp diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index 998d8c7..f9e6282 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -88,20 +88,20 @@ int main(int argc, 
char** argv) { benchmark(params, queries, num_queries, num_terms_per_query, keep); } else if (type == "fc") { - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); benchmark>(params, queries, num_queries, num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); } else { return 1; } diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index 443cc8f..39e547f 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -174,12 +174,12 @@ struct integer_fc_dictionary { p_end += right_locate(completion_to_uint32_range(prefix), h_end, bucket_id_end); + prefix.pop_back(); + if (p_end < p_begin) { - prefix.pop_back(); return global::invalid_range; } - prefix.pop_back(); if (suffix_lex_range.begin == suffix_lex_range.end) { prefix.pop_back(); } diff --git a/include/inverted_index.hpp b/include/inverted_index.hpp index cd4ad29..0bef228 100644 --- a/include/inverted_index.hpp +++ b/include/inverted_index.hpp @@ -28,10 +28,18 @@ struct inverted_index { std::vector list; m_pointers.push_back(0); + + uint32_t max_list_size = 0; + uint32_t min_list_size = uint32_t(-1); + for (uint64_t i = 0; i != num_terms; 
++i) { list.clear(); uint32_t n = 0; input >> n; + + if (n > max_list_size) max_list_size = n; + if (n < min_list_size) min_list_size = n; + list.reserve(n); m_num_integers += n; for (uint64_t k = 0; k != n; ++k) { @@ -46,6 +54,12 @@ struct inverted_index { m_pointers.push_back(m_bvb.size()); } + std::cout << "avg. list size = " + << static_cast(m_num_integers) / num_terms + << std::endl; + std::cout << "max_list_size = " << max_list_size << std::endl; + std::cout << "min_list_size = " << min_list_size << std::endl; + m_pointers.pop_back(); input.close(); essentials::logger("DONE"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a9e4661..576f34b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,4 +2,5 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) -add_executable(check_topk check_topk.cpp) \ No newline at end of file +add_executable(check_topk check_topk.cpp) +add_executable(map_queries map_queries.cpp) \ No newline at end of file diff --git a/src/map_queries.cpp b/src/map_queries.cpp new file mode 100644 index 0000000..f607d3d --- /dev/null +++ b/src/map_queries.cpp @@ -0,0 +1,54 @@ +#include + +#include "types.hpp" + +using namespace autocomplete; + +template +completion_type parse(Dictionary const& dict, std::string const& query) { + completion_type completion; + byte_range_iterator it(string_to_byte_range(query)); + while (true) { + byte_range term = it.next(); + if (!it.has_next()) break; + auto term_id = dict.locate(term); + assert(term_id > 0); + assert(term_id != global::invalid_term_id); + completion.push_back(term_id - 1); + } + return completion; +} + +int main(int argc, char** argv) { + int mandatory = 2 + 1; + if (argc < mandatory) { + std::cout << argv[0] << " < queries" + << std::endl; + return 1; + } + + parameters params; + params.collection_basename = argv[1]; + 
params.load(); + + uint32_t num_queries = std::atoi(argv[2]); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + std::string query; + for (uint32_t i = 0; i != num_queries; ++i) { + if (!std::getline(std::cin, query)) break; + auto completion = parse(dict, query); + std::cout << completion.front(); + for (size_t i = 1; i != completion.size(); ++i) { + std::cout << "\t" << completion[i]; + } + std::cout << "\n"; + } + + return 0; +} \ No newline at end of file From 7309a0aa3b0dd5e86c934ccb1b3367ae8e666787 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 21 Nov 2019 15:23:36 +0100 Subject: [PATCH 039/102] map queries --- src/map_queries.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/map_queries.cpp b/src/map_queries.cpp index f607d3d..17a460d 100644 --- a/src/map_queries.cpp +++ b/src/map_queries.cpp @@ -8,9 +8,8 @@ template completion_type parse(Dictionary const& dict, std::string const& query) { completion_type completion; byte_range_iterator it(string_to_byte_range(query)); - while (true) { + while (it.has_next()) { byte_range term = it.next(); - if (!it.has_next()) break; auto term_id = dict.locate(term); assert(term_id > 0); assert(term_id != global::invalid_term_id); From a22a83db1d3a69d37589a748fadacff1de518adf Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 21 Nov 2019 15:25:15 +0100 Subject: [PATCH 040/102] map queries --- src/map_queries.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/map_queries.cpp b/src/map_queries.cpp index 17a460d..de43df1 100644 --- a/src/map_queries.cpp +++ b/src/map_queries.cpp @@ -42,11 +42,11 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i != num_queries; ++i) { if (!std::getline(std::cin, query)) break; auto completion = parse(dict, query); - std::cout << completion.front(); + std::cerr << completion.front(); for (size_t i = 1; i != completion.size(); ++i) { - 
std::cout << "\t" << completion[i]; + std::cerr << "\t" << completion[i]; } - std::cout << "\n"; + std::cerr << "\n"; } return 0; From 350df1bd9c345dbcf04cfbfffb1bea84404a4b9a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 22 Nov 2019 11:26:02 +0100 Subject: [PATCH 041/102] fix benchmark_conjunctive_topk --- benchmark/benchmark_conjunctive_topk.cpp | 4 ++-- .../collect_results_by_varying_percentage.py | 24 +++++++++++++++++++ ...lect_topk_results_by_varying_percentage.py | 23 ------------------ 3 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 script/collect_results_by_varying_percentage.py delete mode 100644 script/collect_topk_results_by_varying_percentage.py diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index 83e2c99..ad10ec6 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -27,7 +27,7 @@ void benchmark(std::string const& index_filename, uint32_t k, std::vector timers(4); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, timers); + auto it = index.conjunctive_topk(query, k, timers); reported_strings += it.size(); } } @@ -45,7 +45,7 @@ void benchmark(std::string const& index_filename, uint32_t k, timer.start(); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k); + auto it = index.conjunctive_topk(query, k); reported_strings += it.size(); } } diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py new file mode 100644 index 0000000..baeeb85 --- /dev/null +++ b/script/collect_results_by_varying_percentage.py @@ -0,0 +1,24 @@ +import sys, os + +index_type = sys.argv[1] +query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk +index_filename = sys.argv[3] +dataset_name = sys.argv[4] +k = sys.argv[5] +num_queries = sys.argv[6] + 
+output_filename = dataset_name + "." + index_type + +breakdown = "" +if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": + breakdown = "--breakdown" + output_filename += ".breakdown" + +output_filename += "." + query_mode + ".timings.json" + +percentages = ["0.0", "0.25", "0.50", "0.75"] + +for perc in percentages: + for terms in range(2,8): # (1,8) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) diff --git a/script/collect_topk_results_by_varying_percentage.py b/script/collect_topk_results_by_varying_percentage.py deleted file mode 100644 index f520405..0000000 --- a/script/collect_topk_results_by_varying_percentage.py +++ /dev/null @@ -1,23 +0,0 @@ -import sys, os - -type = sys.argv[1] -index_filename = sys.argv[2] -dataset_name = sys.argv[3] -k = sys.argv[4] -num_queries = sys.argv[5] - -output_filename = dataset_name + "." 
+ type - -breakdown = "" -if len(sys.argv) > 6 and sys.argv[6] == "--breakdown": - breakdown = "--breakdown" - output_filename += ".breakdown" - -output_filename += ".topk.timings.json" - -percentages = ["0.0", "0.25", "0.50", "0.75"] - -for perc in percentages: - for terms in range(2,8): # (1,8) - os.system("../build/benchmark_topk " + type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) - os.system("../build/benchmark_topk " + type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) From 4aca38391216cfc37ac825bff6486d745cf13f84 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 22 Nov 2019 11:30:11 +0100 Subject: [PATCH 042/102] fix benchmark_conjunctive_topk --- include/autocomplete4.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index ec88ec3..d884912 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -88,7 +88,7 @@ struct autocomplete4 { if (suffix_lex_range.is_invalid()) return m_pool.begin(); uint32_t num_completions = - conjunctive_topk(prefix, suffix_lex_range, k, m_pool.scores()); + conjunctive_topk(prefix, suffix_lex_range, k); extract_completions(num_completions); return extract_strings(num_completions); } @@ -217,8 +217,7 @@ struct autocomplete4 { // step 2 timers[2].start(); - num_completions = - conjunctive_topk(prefix, suffix_lex_range, k, m_pool.scores()); + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); timers[2].stop(); // step 3 From cca6b637674b89a1b07e841241e02d53c861d2b9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Nov 2019 
10:45:47 +0100 Subject: [PATCH 043/102] small optimization for block_inv_idx --- include/blocked_inverted_index.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 8425e4e..c9c3bf1 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -251,6 +251,11 @@ struct blocked_inverted_index { return id; } + uint32_t block_boundary(uint32_t block_id) const { + assert(block_id < m_blocks.size()); + return m_blocks[block_id]; + } + struct block_type { docs_iterator_type docs_iterator; offsets_iterator_type offsets_iterator; @@ -312,14 +317,16 @@ struct blocked_inverted_index { { uint32_t current_block_id = ii->block_id(r.begin); - uint32_t i = r.begin; - for (; i != r.end; ++i) { + uint32_t current_block_boundary = + ii->block_boundary(current_block_id); + for (uint32_t i = r.begin; i != r.end; ++i) { assert(i > 0); - uint32_t b = ii->block_id(i); - if (b > current_block_id) { + if (i > current_block_boundary) { m_range.push_back(ii->block(current_block_id)); + current_block_id += 1; + current_block_boundary = + ii->block_boundary(current_block_id); } - current_block_id = b; } m_range.push_back(ii->block(current_block_id)); } From 9cebb3dc20301a87a07532ab0508858f4036b675 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Nov 2019 19:09:57 +0100 Subject: [PATCH 044/102] minor fix: ensure bit width --- include/bit_vector.hpp | 1 + include/blocked_inverted_index.hpp | 1 + include/compact_vector.hpp | 16 +++++++++------- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/bit_vector.hpp b/include/bit_vector.hpp index 676c112..747faef 100644 --- a/include/bit_vector.hpp +++ b/include/bit_vector.hpp @@ -412,6 +412,7 @@ struct bits_getter { , m_base(offset) , m_width(width) , m_mask(-(width == 64) | ((uint64_t(1) << width) - 1)) { + assert(width > 0); util::prefetch(m_data + m_base / 64); } diff 
--git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index c9c3bf1..cf6307e 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -118,6 +118,7 @@ struct blocked_inverted_index { auto max = *std::max_element(term_list.begin(), term_list.end()); uint64_t width = util::ceil_log2(max + 1); + if (width == 0) width = 1; // std::cout << "using " << width << " [bpi]" << std::endl; m_terms.append_bits(width, 6); for (auto t : term_list) m_terms.append_bits(t, width); diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index eb3f9b0..da99182 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -73,24 +73,26 @@ struct compact_vector { }; struct builder { - builder(uint64_t n = 0, uint64_t w = 0) + builder() {} + + builder(uint64_t n, uint64_t w) : m_size(n) - , m_width(!w ? w + 1 : w) + , m_width(w) , m_mask(-(w == 64) | ((1ULL << w) - 1)) , m_back(0) , m_cur_block(0) , m_cur_shift(0) , m_bits(essentials::words_for(m_size * m_width), 0) { - if (m_width > 64) { - throw std::runtime_error("width must be <= 64"); + if (m_width == 0 or m_width > 64) { + throw std::runtime_error("width must be > 0 and <= 64"); } } void resize(size_t n, uint64_t w) { m_size = n; - m_width = !w ? 
w + 1 : w; - if (m_width > 64) { - throw std::runtime_error("width must be <= 64"); + m_width = w; + if (m_width == 0 or m_width > 64) { + throw std::runtime_error("width must be > 0 and <= 64"); } m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); m_bits.resize(essentials::words_for(m_size * m_width), 0); From a52d05d52f2e89691f40e66170a23dcfbe6c4575 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 26 Nov 2019 16:04:53 +0100 Subject: [PATCH 045/102] optimized bast and weber --- benchmark/benchmark_conjunctive_topk.cpp | 8 +++ benchmark/benchmark_prefix_topk.cpp | 18 +++-- include/autocomplete2.hpp | 10 +-- include/autocomplete3.hpp | 16 ++--- include/autocomplete4.hpp | 64 +++++++++++++++-- include/blocked_inverted_index.hpp | 91 ++++++++---------------- 6 files changed, 122 insertions(+), 85 deletions(-) diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index ad10ec6..23f9bba 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -53,6 +53,14 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); + + // for (auto const& query : queries) { + // auto it = index.conjunctive_topk(query, k); + // reported_strings += it.size(); + // } + // breakdowns.add("avg_results_per_query", + // std::to_string(static_cast(reported_strings) / + // queries.size())); } } diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index 28046a2..2c31c68 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -34,12 +34,14 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("completions_search_musec_per_query", 
+ // breakdowns.add("completions_search_musec_per_query", + // std::to_string(musec_per_query(timers[1].elapsed()))); + // breakdowns.add("topk_rmq_musec_per_query", + // std::to_string(musec_per_query(timers[2].elapsed()))); + breakdowns.add("prefix_search_musec_per_query", std::to_string(musec_per_query(timers[1].elapsed()))); - breakdowns.add("topk_rmq_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); + std::to_string(musec_per_query(timers[2].elapsed()))); } else { essentials::timer_type timer; timer.start(); @@ -53,6 +55,14 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); + + // for (auto const& query : queries) { + // auto it = index.prefix_topk(query, k); + // reported_strings += it.size(); + // } + // breakdowns.add("avg_results_per_query", + // std::to_string(static_cast(reported_strings) / + // queries.size())); } } diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 7216379..52b7273 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -209,19 +209,19 @@ struct autocomplete2 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); + // timers[1].stop(); // step 2 - timers[2].start(); + // timers[2].start(); uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); + timers[1].stop(); // step 3 - timers[3].start(); + timers[2].start(); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + timers[2].stop(); return it; } diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index c015583..4faf5a6 100644 --- a/include/autocomplete3.hpp +++ 
b/include/autocomplete3.hpp @@ -320,9 +320,7 @@ struct autocomplete3 { uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; - - bool found = false; - while (!q.empty() and !found) { + while (!q.empty()) { auto& z = q.top(); auto val = *z; if (val > doc_id) break; @@ -334,12 +332,12 @@ struct autocomplete3 { q.heapify(); } } - if (val == doc_id) found = true; - } - - if (found) { - topk_scores[results++] = doc_id; - if (results == k) break; + if (val == doc_id) { // NOTE: putting else here seems to slow + // down the code! + topk_scores[results++] = doc_id; + if (results == k) return results; + break; + } } } diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index d884912..ecab539 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -274,19 +274,75 @@ struct autocomplete4 { } } + typedef typename BlockedInvertedIndex::block_type block_t; + + struct block_type_comparator { + bool operator()(block_t& l, block_t& r) { + return l.docs_iterator.operator*() > r.docs_iterator.operator*(); + } + }; + + typedef min_heap min_priority_queue_type; + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, const uint32_t k) { auto& topk_scores = m_pool.scores(); deduplicate(prefix); + + min_priority_queue_type q; + uint32_t current_block_id = m_inverted_index.block_id(suffix.begin); + uint32_t current_block_boundary = + m_inverted_index.block_boundary(current_block_id); + for (uint32_t i = suffix.begin; i != suffix.end; ++i) { + assert(i > 0); + if (i > current_block_boundary) { + q.push_back(m_inverted_index.block(current_block_id)); + current_block_id += 1; + current_block_boundary = + m_inverted_index.block_boundary(current_block_id); + } + } + q.push_back(m_inverted_index.block(current_block_id)); + q.make_heap(); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); uint32_t results = 0; - for (; it.has_next(); ++it) { + for (; it.has_next() and !q.empty(); ++it) { auto doc_id 
= *it; - if (it.intersects()) { - topk_scores[results++] = doc_id; - if (results == k) break; + + while (!q.empty()) { + auto& z = q.top(); + auto val = z.docs_iterator.operator*(); + if (val > doc_id) break; + if (val < doc_id) { + val = z.docs_iterator.next_geq(doc_id); + if (!z.docs_iterator.has_next()) { + q.pop(); + } else { + q.heapify(); + } + } else { + if (val == doc_id) { + uint64_t pos = z.docs_iterator.position(); + assert(z.docs_iterator.access(pos) == doc_id); + uint64_t begin = z.offsets_iterator.access(pos); + uint64_t end = z.offsets_iterator.access(pos + 1); + assert(end > begin); + for (uint64_t i = begin; i != end; ++i) { + auto t = z.terms_iterator.access(i) + z.lower_bound; + if (t > suffix.end) break; + if (suffix.contains(t)) { + topk_scores[results++] = doc_id; + if (results == k) return results; + break; + } + } + } + break; + } } } + return results; } diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index cf6307e..e87aa32 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -316,22 +316,6 @@ struct blocked_inverted_index { m_candidate = 0; } - { - uint32_t current_block_id = ii->block_id(r.begin); - uint32_t current_block_boundary = - ii->block_boundary(current_block_id); - for (uint32_t i = r.begin; i != r.end; ++i) { - assert(i > 0); - if (i > current_block_boundary) { - m_range.push_back(ii->block(current_block_id)); - current_block_id += 1; - current_block_boundary = - ii->block_boundary(current_block_id); - } - } - m_range.push_back(ii->block(current_block_id)); - } - next(); } @@ -356,25 +340,6 @@ struct blocked_inverted_index { next(); } - bool intersects() { - for (auto& b : m_range) { - uint64_t val = b.docs_iterator.next_geq(m_candidate); - if (val == m_candidate) { - uint64_t pos = b.docs_iterator.position(); - assert(b.docs_iterator.access(pos) == m_candidate); - uint64_t begin = b.offsets_iterator.access(pos); - uint64_t end = 
b.offsets_iterator.access(pos + 1); - assert(end > begin); - for (uint64_t i = begin; i != end; ++i) { - auto t = b.terms_iterator.access(i) + b.lower_bound; - if (t > m_suffix.end) break; - if (m_suffix.contains(t)) return true; - } - } - } - return false; - } - private: id_type m_candidate; size_t m_i; @@ -440,34 +405,6 @@ struct blocked_inverted_index { return intersection_iterator_type(this, term_ids, r); } - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_docs); - visitor.visit(m_num_terms); - visitor.visit(m_blocks); - visitor.visit(m_pointers_to_lists); - visitor.visit(m_lists); - visitor.visit(m_pointers_to_offsets); - visitor.visit(m_offsets); - visitor.visit(m_pointers_to_terms); - visitor.visit(m_terms); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_docs; - uint64_t m_num_terms; - - std::vector m_blocks; - - ef::ef_sequence m_pointers_to_lists; - bit_vector m_lists; - ef::ef_sequence m_pointers_to_offsets; - bit_vector m_offsets; - ef::ef_sequence m_pointers_to_terms; - bit_vector m_terms; - block_type block(uint32_t block_id) const { assert(block_id < num_blocks()); block_type b; @@ -496,6 +433,34 @@ struct blocked_inverted_index { return b; } + + template + void visit(Visitor& visitor) { + visitor.visit(m_num_integers); + visitor.visit(m_num_docs); + visitor.visit(m_num_terms); + visitor.visit(m_blocks); + visitor.visit(m_pointers_to_lists); + visitor.visit(m_lists); + visitor.visit(m_pointers_to_offsets); + visitor.visit(m_offsets); + visitor.visit(m_pointers_to_terms); + visitor.visit(m_terms); + } + +private: + uint64_t m_num_integers; + uint64_t m_num_docs; + uint64_t m_num_terms; + + std::vector m_blocks; + + ef::ef_sequence m_pointers_to_lists; + bit_vector m_lists; + ef::ef_sequence m_pointers_to_offsets; + bit_vector m_offsets; + ef::ef_sequence m_pointers_to_terms; + bit_vector m_terms; }; } // namespace autocomplete \ No newline at end of file From 
802ef16b303284ae09541475efd17f89a370553a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 28 Nov 2019 12:06:34 +0100 Subject: [PATCH 046/102] refactoring --- README.md | 133 ++++-------- TODO.md | 0 benchmark/benchmark_common.hpp | 6 + benchmark/benchmark_conjunctive_topk.cpp | 17 +- include/autocomplete.hpp | 14 +- include/autocomplete2.hpp | 51 ++--- include/autocomplete3.hpp | 35 ++- include/autocomplete4.hpp | 34 ++- include/blocked_inverted_index.hpp | 2 +- include/building_util.hpp | 47 ++-- include/compact_forward_index.hpp | 11 +- include/compact_vector.hpp | 19 +- include/integer_fc_dictionary.hpp | 15 +- include/inverted_index.hpp | 2 +- include/parameters.hpp | 3 + install.sh | 2 +- results/README.md | 22 -- results/conjunctive_topk.md | 107 ---------- results/fc_dictionary.md | 75 ------- results/integer_fc_dictionary.md | 31 --- results/inverted_index_space.md | 19 -- results/prefix_topk.md | 94 -------- results/space.md | 159 -------------- results/topk.md | 201 ------------------ .../collect_results_by_varying_percentage.py | 12 +- src/output_ds2i_format.cpp | 2 +- test/test_blocked_inverted_index.cpp | 4 +- test/test_compact_forward_index.cpp | 4 +- test/test_inverted_index.cpp | 8 +- test/test_locate_prefix.cpp | 8 +- test/test_unsorted_list.cpp | 20 +- test_data/build_inverted_and_forward.py | 7 +- test_data/build_stats.py | 10 + test_data/filter_and_preprocess.sh | 14 ++ test_data/filter_dataset.py | 32 +++ test_data/partition_queries_by_length.py | 23 +- test_data/preprocess.sh | 17 +- 37 files changed, 281 insertions(+), 979 deletions(-) delete mode 100644 TODO.md delete mode 100644 results/README.md delete mode 100644 results/conjunctive_topk.md delete mode 100644 results/fc_dictionary.md delete mode 100644 results/integer_fc_dictionary.md delete mode 100644 results/inverted_index_space.md delete mode 100644 results/prefix_topk.md delete mode 100644 results/space.md delete mode 100644 results/topk.md create mode 100644 
test_data/filter_and_preprocess.sh create mode 100644 test_data/filter_dataset.py diff --git a/README.md b/README.md index 31c1649..f19bd7b 100644 --- a/README.md +++ b/README.md @@ -4,119 +4,65 @@ Autocomplete Query autocompletion in C++. ##### Table of contents -1. [Description](#descr) -2. [Installation and quick start](#install) -3. [Compiling the code](#compiling) -4. [Input data format](#input) -5. [Running the unit tests](#testing) -6. [Building an index](#building) -7. [Benchmarks](#benchmarks) -8. [Live demo](#demo) - -Description ------------ - -We designed two solutions (`autocomplete.hpp` and `autocomplete2.hpp`). -The second solution avoids storing the forward index of the first solution. - -Both solution build on two steps: (1) a prefix search (`prefix_topk`) and (2) a conjunctive search (`conjunctive_topk`). - -Recall that each completion has an associated integer identifier (henceforth, called docID), assigned in *decreasing* score order. - -#### 1. Prefix search - -This step returns the top-k completions that are prefixed by the terms in the query. -For this purposes, we build a dictionary storing all completions seen as (multi-) sets of termIDs. -Solution 1 uses an integer trie data structure (`completion_trie.hpp`); -Solution 2 uses Front Coding (`integer_fc_dictionary.hpp`). -We also materialize the list L of docIDs sorted by the lexicographical order of the completions (`unsorted_list.hpp`). - -During a search, we first map the query terms to their lexicographic IDs by using a string dictionary (implemented as a 2-level index with Front Coding -- `fc_dictionary.hpp`). Then, we search the mapped query, say Q, into the completion trie to obtain the lexicographic range [l,r] of all completions that are children of Q. Then we need to identify the top-k docIDs from L[l,r]. Since the range [l,r] can be very large, we use a RMQ data structure built on L. - -Having retrieved a list of (at most) k docIDs, we then: - -1. 
Solution 1: use a forward index (`forward_index.hpp`) to materialize the identified completions into a string pool (`scored_string_pool.hpp`). -The forward index stores the sorted (multi-) set of the termIDs of each completion, plus also the permutation of such termIDs in order to restore the original completion. The sets are stored in increasing-docID order. -Specifically, we use the forward index to obtain the (permuted) set -of termIDs and the string dictionary to extract the strings. - -2. Solution 2: use a map from docIDs to lexicographic IDs. For every top-k docID, we extract the corresponding completion from the FC-based dictionary. - -#### 2. Conjunctive search - -This step returns the top-k completions using an inverted index (`inverted_index.hpp`). -For this purpose, let us consider a query Q[1,m] as tokenized into m terms (the last one possibly not completed). -In this case we want to return the top-k (smallest) docIDs belonging -to the intersection between the posting lists of the first m-1 terms -and the union between all the postings lists of the terms that are -prefixed by Q[m]. - -To do so, we could trivially materialize the union and then proceed -with the intersection. -The clear problem with this approach is that the number of terms that are prefixed by Q[m] can be very large. Therefore iterating over the union can be overkilling. - -To solve this problem, we first obtain the lexicographic range of Q[m] by the string dictionary, say [l,r]. -We then iterate over the intersection of the first m-1 terms' posting lists and for each docID x we check whether the range [l,r] intersect the forward list of x. This check is done with the forward index. -If the check succeeds, then x is among the top-k documents. -We keep iterating over the intersection and checking the forward lists until we have k completions or we touch every docID in the intersection. - -There is a special case for the case m = 1. 
In this case, we have no term before the last (only) one, thus we would check *all* forward lists for the range [l,r]. This is too expensive. -Therefore, we use another RMQ data structure, built on the list, say M, of all the first (i.e., *minimal*) docIDs of the posting lists (think of it as the "first" column of the inverted index). -A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r] using the RMQ data structure. - -The final string extraction step is identical to that of the -prefix search. +1. [Installation and quick start](#install) +2. [Compiling the code](#compiling) +3. [Input data format](#input) +4. [Running the unit tests](#testing) +5. [Building an index](#building) +6. [Benchmarks](#benchmarks) +7. [Live demo](#demo) Installation and quick start ------------------ Just run - $ bash ./install.sh + bash ./install.sh -from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. +from the parent directory. The script builds the code; prepare the test data in the folder `test_data/trec_05_efficiency_queries` for indexing; executes the unit tests. After that, for having a minimal running example, just run - $ bash ./example.sh + bash ./example.sh and then access the service [here](http://127.0.0.1:8000). Compiling the code ------------------ -The code is tested on Linux with `gcc` 7.4.0 and on Mac 10.14 with `clang` 10.0.0. +The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, 9.0.0 and on Mac 10.14 with `clang` 10.0.0. To build the code, [`CMake`](https://cmake.org/) is required. 
Clone the repository with - $ git clone --recursive https://github.com/jermp/autocomplete.git + git clone --recursive https://github.com/jermp/autocomplete.git If you have cloned the repository without `--recursive`, you will need to perform the following commands before compiling: - $ git submodule init - $ git submodule update + git submodule init + git submodule update To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following: - $ mkdir build - $ cd build - $ cmake .. - $ make + mkdir build + cd build + cmake .. + make -Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs. +Hint: Use `make -j` to compile the library in parallel using all +available threads. For the best of performance, we recommend compiling with: - $ cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On + cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On For a testing environment, use the following instead: - $ mkdir debug_build - $ cd debug_build - $ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On - $ make + mkdir debug_build + cd debug_build + cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On + make Input data format ----------------- @@ -137,7 +83,11 @@ in preparing the data for indexing. Thus, from within the directory `test_data`, it is sufficient to do: - $ bash preprocess.sh + bash preprocess.sh 300 + +The second argument in the example, i.e., 300, represents the +number of completions (per completion size) that are drawn at +random and could be used to query the indexes. If you run the script, you will get: @@ -168,7 +118,7 @@ The unit tests are written using [doctest](https://github.com/onqtam/doctest). 
After compilation and preparation of the data for indexing (see Section [Input data format](#input)), it is advised to run the unit tests with: - $ make test + make test Building an index ----------- @@ -178,31 +128,36 @@ where the index will be written. For example, with - $ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin + ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`. Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`. +Note: the type `ef_type4` requires an extra parameter +to be specified, `c`. Use for example: `-c 0.0001`. Benchmarks ---------- To run the top-k benchmarks in the `/benchmark` directory, we first need some query logs. +They should have been created already if you have run the +script `preprocess.sh`, otherwise +you can use -You can use - - $ python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300 -to partition the input completions by number of query terms. Each partition -of queries is shuffled at random to avoid locality of access. +to partition the input completions by number of query terms +and retain 300 queries at random. +Query files are placed in the output directory +`trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries`. (By default, 8 shards will be created: the ones having [1,7] query terms and the one collecting all completions with >= 8 query terms). 
Then the command - $ ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 + ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 300 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries/queries.length=3.shuffled will execute 1000 top-10 queries with 3 terms, from which only 25% of the prefix of the last token is retained. @@ -210,7 +165,7 @@ of the prefix of the last token is retained. We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run - $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type1.bin trec_05_efficiency_queries 10 5000 + python ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 You can also specify the option `--breakdown` to record timings breakdowns. 
diff --git a/TODO.md b/TODO.md deleted file mode 100644 index e69de29..0000000 diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 4309912..2f12c8a 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -6,6 +6,11 @@ namespace autocomplete { static const uint32_t runs = 5; +// void tolower(std::string& str) { +// std::transform(str.begin(), str.end(), str.begin(), +// [](unsigned char c) { return std::tolower(c); }); +// } + size_t load_queries(std::vector& queries, uint32_t max_num_queries, float percentage, std::istream& is = std::cin) { assert(percentage >= 0.0 and percentage <= 1.0); @@ -20,6 +25,7 @@ size_t load_queries(std::vector& queries, uint32_t max_num_queries, size_t end = size + std::ceil(last_token_size * percentage) + 1 + 1; // retain at least one char for (size = query.size(); size > end; --size) query.pop_back(); + // tolower(query); queries.push_back(query); } return queries.size(); diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index 23f9bba..7d8a7d3 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -16,22 +16,35 @@ void benchmark(std::string const& index_filename, uint32_t k, uint32_t num_queries = load_queries(queries, max_num_queries, keep, std::cin); + uint32_t R = runs; // runs + uint64_t reported_strings = 0; auto musec_per_query = [&](double time) { - return time / (runs * num_queries); + return time / (R * num_queries); }; breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != R; ++run) { for (auto const& query : queries) { auto it = index.conjunctive_topk(query, k, timers); reported_strings += it.size(); } } std::cout << reported_strings << std::endl; + + // breakdowns.add("checked_docids", + // std::to_string(index.checked_docids)); 
breakdowns.add("heap_size", + // std::to_string(index.heap_size)); + + // auto perc_skipped_searches = + // (static_cast(index.skipped_searches) * 100.0) / + // queries.size(); + // breakdowns.add("skipped_searches", + // std::to_string(perc_skipped_searches)); + breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(timers[0].elapsed()))); breakdowns.add("dictionary_search_musec_per_query", diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index 47b4472..616b13f 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -13,6 +13,9 @@ struct autocomplete { typedef scored_string_pool::iterator iterator_type; autocomplete() { + // heap_size = 0; + // checked_docids = 0; + // skipped_searches = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); } @@ -218,7 +221,11 @@ struct autocomplete { // step 1 timers[1].start(); range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); + if (suffix_lex_range.is_invalid()) { + // ++skipped_searches; + // std::cout << "'" << query << "'\n"; + return m_pool.begin(); + } timers[1].stop(); @@ -261,6 +268,10 @@ struct autocomplete { visitor.visit(m_forward_index); } + // uint64_t heap_size; + // uint64_t checked_docids; + // uint64_t skipped_searches; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -294,6 +305,7 @@ struct autocomplete { uint32_t results = 0; for (; it.has_next(); ++it) { auto doc_id = *it; + // ++checked_docids; if (m_forward_index.intersects(doc_id, r)) { topk_scores[results++] = doc_id; if (results == k) break; diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 52b7273..9d05226 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -15,6 +15,8 @@ struct autocomplete2 { typedef scored_string_pool::iterator iterator_type; autocomplete2() { + // heap_size = 0; + // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, 
constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -26,29 +28,13 @@ struct autocomplete2 { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); @@ -287,6 +273,9 @@ struct autocomplete2 { visitor.visit(m_docid_to_lexid); } + // uint64_t heap_size; + // uint64_t checked_docids; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -336,19 +325,17 @@ struct autocomplete2 { for (; it.has_next(); ++it) { auto doc_id = *it; + // ++checked_docids; auto lex_id = m_docid_to_lexid[doc_id]; uint32_t size = m_completions.extract(lex_id, completions[i]); - - bool found = false; - for (uint32_t j = 0; j != size and !found; ++j) { - if (r.contains(completions[i][j])) found = true; - } - - if (found) { - topk_scores[i] = doc_id; - sizes[i] = size; - ++i; - if (i == k) break; + for (uint32_t j = 0; j != size; ++j) { + if (r.contains(completions[i][j])) { + 
topk_scores[i] = doc_id; + sizes[i] = size; + ++i; + if (i == k) return k; + break; + } } } diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 4faf5a6..6165e19 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -25,6 +25,8 @@ struct autocomplete3 { min_priority_queue_type; autocomplete3() { + // heap_size = 0; + // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -36,27 +38,11 @@ struct autocomplete3 { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); cm_builder.build(m_completions); di_builder.build(m_dictionary); @@ -261,6 +247,9 @@ struct autocomplete3 { visitor.visit(m_docid_to_lexid); } + // uint64_t heap_size; + // uint64_t checked_docids; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -317,9 +306,13 @@ struct autocomplete3 { } q.make_heap(); + // heap_size += q.size(); + uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; + // ++checked_docids; + while 
(!q.empty()) { auto& z = q.top(); auto val = *z; diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index ecab539..cd44706 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -18,6 +18,8 @@ struct autocomplete4 { typedef scored_string_pool::iterator iterator_type; autocomplete4() { + // heap_size = 0; + // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -29,27 +31,11 @@ struct autocomplete4 { typename Dictionary::builder di_builder(params); typename BlockedInvertedIndex::builder ii_builder(params, c); - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); cm_builder.build(m_completions); di_builder.build(m_dictionary); @@ -246,6 +232,9 @@ struct autocomplete4 { visitor.visit(m_docid_to_lexid); } + // uint64_t heap_size; + // uint64_t checked_docids; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -305,10 +294,13 @@ struct autocomplete4 { q.push_back(m_inverted_index.block(current_block_id)); q.make_heap(); + // heap_size += q.size(); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); 
uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; + // ++checked_docids; while (!q.empty()) { auto& z = q.top(); diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index e87aa32..519a0bf 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -21,7 +21,7 @@ struct blocked_inverted_index { builder(parameters const& params, float c) : m_num_integers(0) - , m_num_docs(params.num_completions) + , m_num_docs(params.universe) , m_num_terms(params.num_terms) { if (!(c > 0.0 and c <= 1.0)) { throw std::runtime_error("c must be in (0,1]"); diff --git a/include/building_util.hpp b/include/building_util.hpp index 17427b6..0398879 100644 --- a/include/building_util.hpp +++ b/include/building_util.hpp @@ -1,10 +1,22 @@ #pragma once +#include "util.hpp" #include "bit_vector.hpp" namespace autocomplete { namespace util { +std::vector invert(std::vector const& docid_to_lexid, + uint64_t size) { + std::vector lexid_to_docid(size); + for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { + if (docid_to_lexid[doc_id] < size) { + lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; + } + } + return lexid_to_docid; +} + void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { uint64_t mod = bvb.size() % alignment; if (mod) { @@ -23,40 +35,5 @@ void eat_pad(bits_iterator& it, uint64_t alignment = 8) { } } -template -struct first_iterator - : std::iterator { - first_iterator(Iterator it, uint64_t state = 0) - : m_it(it) - , m_state(state) {} - - typename Iterator::value_type::first_type operator*() { - return (*m_it).first; - } - - first_iterator& operator++() { - m_it += 1; - m_state += 1; - return *this; - } - - first_iterator operator+(uint64_t n) { - return {m_it + n, m_state + n}; - } - - bool operator==(first_iterator const& other) const { - return m_state == other.m_state; - } - - bool operator!=(first_iterator const& other) const { - return !(*this == 
other); - } - -private: - Iterator m_it; - uint64_t m_state; -}; - } // namespace util } // namespace autocomplete \ No newline at end of file diff --git a/include/compact_forward_index.hpp b/include/compact_forward_index.hpp index 74ad769..bde4b71 100644 --- a/include/compact_forward_index.hpp +++ b/include/compact_forward_index.hpp @@ -14,20 +14,19 @@ struct compact_forward_index { : m_num_integers(0) , m_num_terms(params.num_terms) { essentials::logger("building forward_index..."); - uint64_t num_completions = params.num_completions; + uint64_t universe = params.universe; std::ifstream input( (params.collection_basename + ".forward").c_str(), std::ios_base::in); - - std::vector terms; - terms.reserve(params.num_completions * + std::vector terms; + terms.reserve(universe * constants::MAX_NUM_TERMS_PER_QUERY); // at most uint64_t size = 0; m_pointers.push_back(0); - for (uint64_t i = 0; i != num_completions; ++i) { + for (uint64_t i = 0; i != universe; ++i) { uint32_t n = 0; input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); + assert(n < constants::MAX_NUM_TERMS_PER_QUERY); m_num_integers += n; size += n; for (uint64_t k = 0; k != n; ++k) { diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index da99182..ac8e275 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -73,7 +73,10 @@ struct compact_vector { }; struct builder { - builder() {} + builder() + : m_back(0) + , m_cur_block(0) + , m_cur_shift(0) {} builder(uint64_t n, uint64_t w) : m_size(n) @@ -95,6 +98,8 @@ struct compact_vector { throw std::runtime_error("width must be > 0 and <= 64"); } m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); + std::cout << "using " << essentials::words_for(m_size * m_width) + << " words" << std::endl; m_bits.resize(essentials::words_for(m_size * m_width), 0); } @@ -110,7 +115,7 @@ struct compact_vector { throw std::runtime_error("width must be greater than 0"); } - for (uint64_t i = 0; i < n; ++i, ++begin) { + for 
(uint64_t i = 0; i != n; ++i, ++begin) { push_back(*begin); } } @@ -222,8 +227,13 @@ struct compact_vector { void build(Iterator begin, uint64_t n) { uint64_t max = *std::max_element(begin, begin + n); uint64_t width = util::ceil_log2(max + 1); - std::cout << "\tusing " << width << " [bpi]" << std::endl; - compact_vector::builder builder(begin, n, width); + build(begin, n, width); + } + + template + void build(Iterator begin, uint64_t n, uint64_t w) { + std::cout << "\tusing " << w << " [bpi]" << std::endl; + compact_vector::builder builder(begin, n, w); builder.build(*this); } @@ -314,4 +324,5 @@ struct compact_vector { uint64_t m_mask; std::vector m_bits; }; + } // namespace autocomplete diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index 39e547f..29d8743 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -19,7 +19,7 @@ struct integer_fc_dictionary { essentials::logger( "building integer_fc_dictionary with bucket size " + std::to_string(BucketSize) + "..."); - m_doc_ids.reserve(params.num_completions); + m_docid_to_lexid.resize(params.universe, id_type(-1)); uint32_t buckets = std::ceil(double(m_size) / (BucketSize + 1)); m_pointers_to_buckets.reserve(buckets + 1); @@ -35,9 +35,10 @@ struct integer_fc_dictionary { std::ios_base::in); completion_iterator it(params, input); + id_type lex_id = 0; for (uint32_t b = 0; b != buckets; ++b) { auto& header = *it; - m_doc_ids.push_back(header.doc_id); + m_docid_to_lexid[header.doc_id] = lex_id++; write_header(header.completion); m_pointers_to_headers.push_back(m_headers.size()); completion_type prev; @@ -47,7 +48,7 @@ struct integer_fc_dictionary { for (uint32_t i = 0; i != size; ++i, ++it) { auto& record = *it; auto& curr = record.completion; - m_doc_ids.push_back(record.doc_id); + m_docid_to_lexid[record.doc_id] = lex_id++; uint32_t l = 0; // |lcp(curr,prev)| while (l != curr.size() and l != prev.size() and curr[l] == prev[l]) { @@ -76,7 +77,7 @@ 
struct integer_fc_dictionary { other.m_pointers_to_buckets.swap(m_pointers_to_buckets); other.m_headers.swap(m_headers); other.m_buckets.swap(m_buckets); - other.m_doc_ids.swap(m_doc_ids); + other.m_docid_to_lexid.swap(m_docid_to_lexid); } void build(integer_fc_dictionary& d) { @@ -88,8 +89,8 @@ struct integer_fc_dictionary { builder().swap(*this); } - std::vector& doc_ids() { - return m_doc_ids; + std::vector& docid_to_lexid() { + return m_docid_to_lexid; } private: @@ -98,7 +99,7 @@ struct integer_fc_dictionary { std::vector m_pointers_to_buckets; std::vector m_headers; std::vector m_buckets; - std::vector m_doc_ids; + std::vector m_docid_to_lexid; void write_header(completion_type const& c) { assert(c.size() > 0 and diff --git a/include/inverted_index.hpp b/include/inverted_index.hpp index 0bef228..900fd96 100644 --- a/include/inverted_index.hpp +++ b/include/inverted_index.hpp @@ -16,7 +16,7 @@ struct inverted_index { builder(parameters const& params) : m_num_integers(0) - , m_num_docs(params.num_completions) { + , m_num_docs(params.universe) { essentials::logger("building inverted_index..."); uint64_t num_terms = params.num_terms; diff --git a/include/parameters.hpp b/include/parameters.hpp index 9d03783..d628d25 100644 --- a/include/parameters.hpp +++ b/include/parameters.hpp @@ -24,10 +24,12 @@ struct parameters { input >> num_terms; input >> max_string_length; input >> num_completions; + input >> universe; input >> num_levels; assert(num_terms > 0); assert(max_string_length > 0); assert(num_completions > 0); + assert(universe >= num_completions); assert(num_levels > 0); if (max_string_length > constants::MAX_NUM_CHARS_PER_QUERY) { @@ -52,6 +54,7 @@ struct parameters { uint32_t num_terms; uint32_t max_string_length; uint32_t num_completions; + uint32_t universe; uint32_t num_levels; std::vector nodes_per_level; std::string collection_basename; diff --git a/install.sh b/install.sh index 9e8da9e..7714147 100644 --- a/install.sh +++ b/install.sh @@ -5,7 +5,7 @@ 
cd build cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On make cd ../test_data -./preprocess.sh +bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 cd ../build make test cd .. diff --git a/results/README.md b/results/README.md deleted file mode 100644 index 7e6ba77..0000000 --- a/results/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Test machine ------------- - -4 Intel i7-7700 cores (@3.6 GHz); 64 GB of RAM DDR3 (@2.133 GHz); running Linux 4.4.0 (64 bits); 32K for both instruction and data L1 cache; 256K for L2 cache; 8192K for L3 cache. - -Compiler --------- - -gcc 7.4.0 - -`cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=OFF -DUSE_INTRINSICS=ON -DUSE_PDEP=ON` - - -Experiments ------------ - -- The file `space.md` reports the space breakdowns. -- The file `prefix_topk.md` reports the timing breakdowns for the prefix_topk step by varying the number of query terms. -- The file `conjunctive_topk.md` reports the timing breakdowns for the conjunctive_topk step by varying the number of query terms. -- The file `topk.md` reports the total time of the `topk` operation (combining the two steps, `prefix_topk` and `conjunctive_topk`) by varying the number of query terms. -- The file `fc_dictionary.md` reports on the `fc_dictionary` benchmark. -- The file `integer_fc_dictionary.md` reports on the `integer_fc_dictionary` benchmark. \ No newline at end of file diff --git a/results/conjunctive_topk.md b/results/conjunctive_topk.md deleted file mode 100644 index 3d9747b..0000000 --- a/results/conjunctive_topk.md +++ /dev/null @@ -1,107 +0,0 @@ -Conjunctive top-k ------------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. - -From the last token of the query, we only retain the first character. 
This means that we spend less in obtaining the lexicographic range of the character (string comparisons are -very fast), but we spend more on the RMQ phase, because the -range obtained from the completion trie can be very large. - -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "3", "conjunctive_search_ns_per_query": "2896", "reporting_ns_per_query": "352"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "52", "dictionary_search_ns_per_query": "10", "conjunctive_search_ns_per_query": "2273", "reporting_ns_per_query": "2333"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "919", "dictionary_search_ns_per_query": "39", "conjunctive_search_ns_per_query": "20478", "reporting_ns_per_query": "1772"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1298", "dictionary_search_ns_per_query": "49", "conjunctive_search_ns_per_query": "27363", "reporting_ns_per_query": "974"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1857", "dictionary_search_ns_per_query": "42", "conjunctive_search_ns_per_query": "25484", "reporting_ns_per_query": "556"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2239", "dictionary_search_ns_per_query": "34", "conjunctive_search_ns_per_query": "22070", "reporting_ns_per_query": "438"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2871", "dictionary_search_ns_per_query": "32", "conjunctive_search_ns_per_query": "18657", "reporting_ns_per_query": "465"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3774", "dictionary_search_ns_per_query": "30", "conjunctive_search_ns_per_query": "13967", "reporting_ns_per_query": "844"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4463"} - {"num_terms_per_query": "2", 
"num_queries": "50000", "ns_per_query": "6677"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "25503"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "31536"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "29973"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "27148"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "23630"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "20511"} - -If we do not check the forward index (thus erronously reporting the first k docids of the intersection), we have: - - {"num_terms_per_query": "3", "num_queries": "50000", "conjunctive_search_ns_per_query": "10362"} - {"num_terms_per_query": "4", "num_queries": "50000", "conjunctive_search_ns_per_query": "21327"} - {"num_terms_per_query": "5", "num_queries": "50000", "conjunctive_search_ns_per_query": "23187"} - {"num_terms_per_query": "6", "num_queries": "50000", "conjunctive_search_ns_per_query": "21259"} - {"num_terms_per_query": "7", "num_queries": "50000", "conjunctive_search_ns_per_query": "18234"} - {"num_terms_per_query": "8+", "num_queries": "50000", "conjunctive_search_ns_per_query": "13912"} - -We can see that the time for the `conjunctive_search` remains the same, except for the case with 3 terms. -This suggests that the time needed to check the forward index is negligible compared to the one -needed to produce the intersection. This can also be observed considering that the time for the case with 2 terms is very small: in this case we check the forward index for each doc in the inverted list of the first term. 
- -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3275", "reporting_ns_per_query": "330"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "109", "dictionary_search_ns_per_query": "36", "conjunctive_search_ns_per_query": "15770", "reporting_ns_per_query": "2485"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "932", "dictionary_search_ns_per_query": "52", "conjunctive_search_ns_per_query": "24290", "reporting_ns_per_query": "1780"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1388", "dictionary_search_ns_per_query": "55", "conjunctive_search_ns_per_query": "29056", "reporting_ns_per_query": "953"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1880", "dictionary_search_ns_per_query": "41", "conjunctive_search_ns_per_query": "26675", "reporting_ns_per_query": "541"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2277", "dictionary_search_ns_per_query": "43", "conjunctive_search_ns_per_query": "22955", "reporting_ns_per_query": "421"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2762", "dictionary_search_ns_per_query": "37", "conjunctive_search_ns_per_query": "19437", "reporting_ns_per_query": "443"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3878", "dictionary_search_ns_per_query": "40", "conjunctive_search_ns_per_query": "14657", "reporting_ns_per_query": "814"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4917"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "20361"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "28619"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "33140"} - {"num_terms_per_query": 
"5", "num_queries": "50000", "ns_per_query": "30410"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "27477"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "24357"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "21042"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3021", "reporting_ns_per_query": "576"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "39", "dictionary_search_ns_per_query": "7", "conjunctive_search_ns_per_query": "2279", "reporting_ns_per_query": "1926"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "810", "dictionary_search_ns_per_query": "15", "conjunctive_search_ns_per_query": "12382", "reporting_ns_per_query": "1078"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1104", "dictionary_search_ns_per_query": "15", "conjunctive_search_ns_per_query": "13534", "reporting_ns_per_query": "526"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1737", "dictionary_search_ns_per_query": "11", "conjunctive_search_ns_per_query": "11424", "reporting_ns_per_query": "305"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2049", "dictionary_search_ns_per_query": "10", "conjunctive_search_ns_per_query": "9565", "reporting_ns_per_query": "252"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2396", "dictionary_search_ns_per_query": "9", "conjunctive_search_ns_per_query": "8020", "reporting_ns_per_query": "324"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3431", "dictionary_search_ns_per_query": "9", "conjunctive_search_ns_per_query": "6199", "reporting_ns_per_query": "738"} - - {"num_terms_per_query": "1", "num_queries": 
"50000", "ns_per_query": "4982"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6176"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "16236"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "17306"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "15591"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "13961"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "12980"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "12311"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3722", "reporting_ns_per_query": "511"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "56", "dictionary_search_ns_per_query": "20", "conjunctive_search_ns_per_query": "15134", "reporting_ns_per_query": "2043"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "835", "dictionary_search_ns_per_query": "20", "conjunctive_search_ns_per_query": "15310", "reporting_ns_per_query": "1072"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1117", "dictionary_search_ns_per_query": "19", "conjunctive_search_ns_per_query": "14672", "reporting_ns_per_query": "517"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1704", "dictionary_search_ns_per_query": "14", "conjunctive_search_ns_per_query": "12384", "reporting_ns_per_query": "300"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2164", "dictionary_search_ns_per_query": "13", "conjunctive_search_ns_per_query": "10222", "reporting_ns_per_query": "246"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2567", "dictionary_search_ns_per_query": "12", "conjunctive_search_ns_per_query": 
"8579", "reporting_ns_per_query": "305"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3670", "dictionary_search_ns_per_query": "12", "conjunctive_search_ns_per_query": "6644", "reporting_ns_per_query": "714"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5667"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "19144"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "18886"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "18109"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "16030"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "14423"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "13418"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "12779"} \ No newline at end of file diff --git a/results/fc_dictionary.md b/results/fc_dictionary.md deleted file mode 100644 index 37ff080..0000000 --- a/results/fc_dictionary.md +++ /dev/null @@ -1,75 +0,0 @@ -#### Results on the AOL querylog. - - pibiri@rubino:~/autocomplete/build$ ./benchmark_fc_dictionary ../test_data/aol/aol.completions 1000000 < ../test_data/aol/aol.completions.dict_queries.1M.shuffled - 2019-10-24 11:11:49: loading queries... - 2019-10-24 11:11:49: loaded 1000000 queries - 2019-10-24 11:11:49: building fc_dictionary with bucket size 4... - 2019-10-24 11:11:50: DONE - using 42938890 bytes - locate: 557.091 [ns/string] - extract: 168.772 [ns/string] - locate_prefix-0%: 213.453 [ns/string] - locate_prefix-25%: 794.612 [ns/string] - locate_prefix-50%: 1064.44 [ns/string] - locate_prefix-75%: 912.04 [ns/string] - locate_prefix-100%: 702.745 [ns/string] - 2019-10-24 11:12:12: building fc_dictionary with bucket size 8... 
- 2019-10-24 11:12:12: DONE - using 38111527 bytes - locate: 511.503 [ns/string] - extract: 152.331 [ns/string] - locate_prefix-0%: 223.374 [ns/string] - locate_prefix-25%: 686.093 [ns/string] - locate_prefix-50%: 873.161 [ns/string] - locate_prefix-75%: 758.029 [ns/string] - locate_prefix-100%: 638.576 [ns/string] - 2019-10-24 11:12:32: building fc_dictionary with bucket size 16... - 2019-10-24 11:12:32: DONE - using 35270205 bytes - locate: 478.592 [ns/string] - extract: 139.109 [ns/string] - locate_prefix-0%: 228.416 [ns/string] - locate_prefix-25%: 662.483 [ns/string] - locate_prefix-50%: 769.227 [ns/string] - locate_prefix-75%: 685.358 [ns/string] - locate_prefix-100%: 615.757 [ns/string] - 2019-10-24 11:12:51: building fc_dictionary with bucket size 32... - 2019-10-24 11:12:51: DONE - using 33722303 bytes - locate: 484.72 [ns/string] - extract: 150.21 [ns/string] - locate_prefix-0%: 273.595 [ns/string] - locate_prefix-25%: 717.559 [ns/string] - locate_prefix-50%: 790.342 [ns/string] - locate_prefix-75%: 728.409 [ns/string] - locate_prefix-100%: 681.921 [ns/string] - 2019-10-24 11:13:11: building fc_dictionary with bucket size 64... - 2019-10-24 11:13:11: DONE - using 32910194 bytes - locate: 585.835 [ns/string] - extract: 194.183 [ns/string] - locate_prefix-0%: 667.159 [ns/string] - locate_prefix-25%: 962.096 [ns/string] - locate_prefix-50%: 1056.04 [ns/string] - locate_prefix-75%: 1014.63 [ns/string] - locate_prefix-100%: 978.718 [ns/string] - 2019-10-24 11:13:39: building fc_dictionary with bucket size 128... - 2019-10-24 11:13:39: DONE - using 32496375 bytes - locate: 810.282 [ns/string] - extract: 286.967 [ns/string] - locate_prefix-0%: 574.352 [ns/string] - locate_prefix-25%: 1248.92 [ns/string] - locate_prefix-50%: 1435.28 [ns/string] - locate_prefix-75%: 1419.18 [ns/string] - locate_prefix-100%: 1398.48 [ns/string] - 2019-10-24 11:14:16: building fc_dictionary with bucket size 256... 
- 2019-10-24 11:14:16: DONE - using 32286042 bytes - locate: 1281.09 [ns/string] - extract: 470.922 [ns/string] - locate_prefix-0%: 1065.07 [ns/string] - locate_prefix-25%: 2099.35 [ns/string] - locate_prefix-50%: 2387.39 [ns/string] - locate_prefix-75%: 2407.04 [ns/string] - locate_prefix-100%: 2403.04 [ns/string] \ No newline at end of file diff --git a/results/integer_fc_dictionary.md b/results/integer_fc_dictionary.md deleted file mode 100644 index 955afe0..0000000 --- a/results/integer_fc_dictionary.md +++ /dev/null @@ -1,31 +0,0 @@ -#### Results on the AOL querylog. - - pibiri@rubino:~/autocomplete/build$ ./benchmark_integer_fc_dictionary ../test_data/aol/aol.completions 1000000 - 2019-10-14 15:28:12: building integer_fc_dictionary with bucket size 4... - 2019-10-14 15:28:14: DONE - using 129855836 bytes - extract: 102.787 [ns/string] - 2019-10-14 15:28:15: building integer_fc_dictionary with bucket size 8... - 2019-10-14 15:28:18: DONE - using 112779868 bytes - extract: 98.9981 [ns/string] - 2019-10-14 15:28:19: building integer_fc_dictionary with bucket size 16... - 2019-10-14 15:28:21: DONE - using 102740006 bytes - extract: 103.745 [ns/string] - 2019-10-14 15:28:22: building integer_fc_dictionary with bucket size 32... - 2019-10-14 15:28:24: DONE - using 97266766 bytes - extract: 136.042 [ns/string] - 2019-10-14 15:28:26: building integer_fc_dictionary with bucket size 64... - 2019-10-14 15:28:28: DONE - using 94397632 bytes - extract: 207.699 [ns/string] - 2019-10-14 15:28:30: building integer_fc_dictionary with bucket size 128... - 2019-10-14 15:28:32: DONE - using 92933198 bytes - extract: 354.622 [ns/string] - 2019-10-14 15:28:36: building integer_fc_dictionary with bucket size 256... 
- 2019-10-14 15:28:38: DONE - using 92192244 bytes - extract: 651.357 [ns/string] \ No newline at end of file diff --git a/results/inverted_index_space.md b/results/inverted_index_space.md deleted file mode 100644 index f3acd81..0000000 --- a/results/inverted_index_space.md +++ /dev/null @@ -1,19 +0,0 @@ -Inverted index compression ----- - -#### AOL - - EF -- 17.1495 bits per element - PEF uniform -- 16.5788 bits per element - PEF opt -- 15.0967 bits per element - PFOR -- 15.2661 bits per element - BIC -- 14.1396 bits per element - Simple9 -- 21.8895 bits per element - Simple16 -- 21.7385 bits per element - VByte -- 20.9531 bits per element - Varint -- 21.996 bits per element - Gamma -- 23.6305 bits per element - Delta -- 19.2088 bits per element - Rice -- 19.4145 bits per element - DINT single -- 15.4204 bits per element - DINT multi -- 15.084 bits per element \ No newline at end of file diff --git a/results/prefix_topk.md b/results/prefix_topk.md deleted file mode 100644 index 6404bc4..0000000 --- a/results/prefix_topk.md +++ /dev/null @@ -1,94 +0,0 @@ -Prefix top-k ------------- - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. - -From the last token of the query, we only retain the first character. This means that we spend less in obtaining the lexicographic range of the character (string comparisons are -very fast), but we spend more on the RMQ phase, because the -range obtained from the completion trie can be very large. 
- -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "279", "topk_rmq_ns_per_query": "2887", "reporting_ns_per_query": "317"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "47", "completions_search_ns_per_query": "853", "topk_rmq_ns_per_query": "576", "reporting_ns_per_query": "1851"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "706", "completions_search_ns_per_query": "945", "topk_rmq_ns_per_query": "95", "reporting_ns_per_query": "717"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1055", "completions_search_ns_per_query": "1057", "topk_rmq_ns_per_query": "22", "reporting_ns_per_query": "332"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1495", "completions_search_ns_per_query": "1215", "topk_rmq_ns_per_query": "9", "reporting_ns_per_query": "325"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1957", "completions_search_ns_per_query": "1434", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "425"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2410", "completions_search_ns_per_query": "1581", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "611"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3360", "completions_search_ns_per_query": "1888", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "913"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5027"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4974"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3984"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4137"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4660"} - {"num_terms_per_query": 
"6", "num_queries": "50000", "ns_per_query": "5335"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5785"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "7394"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "493", "topk_rmq_ns_per_query": "3072", "reporting_ns_per_query": "628"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "60", "completions_search_ns_per_query": "1078", "topk_rmq_ns_per_query": "589", "reporting_ns_per_query": "1897"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "675", "completions_search_ns_per_query": "1053", "topk_rmq_ns_per_query": "96", "reporting_ns_per_query": "730"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1047", "completions_search_ns_per_query": "1081", "topk_rmq_ns_per_query": "21", "reporting_ns_per_query": "320"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1367", "completions_search_ns_per_query": "1112", "topk_rmq_ns_per_query": "8", "reporting_ns_per_query": "244"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1886", "completions_search_ns_per_query": "1139", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "300"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2242", "completions_search_ns_per_query": "1166", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "455"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3229", "completions_search_ns_per_query": "1205", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "809"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5768"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "5625"} - {"num_terms_per_query": "3", "num_queries": "50000", 
"ns_per_query": "4389"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4421"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4830"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5336"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5963"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "7104"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "403", "topk_rmq_ns_per_query": "3211", "reporting_ns_per_query": "509"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "33", "completions_search_ns_per_query": "784", "topk_rmq_ns_per_query": "312", "reporting_ns_per_query": "1287"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "596", "completions_search_ns_per_query": "906", "topk_rmq_ns_per_query": "49", "reporting_ns_per_query": "423"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1026", "completions_search_ns_per_query": "1015", "topk_rmq_ns_per_query": "11", "reporting_ns_per_query": "206"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1434", "completions_search_ns_per_query": "1114", "topk_rmq_ns_per_query": "5", "reporting_ns_per_query": "217"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1938", "completions_search_ns_per_query": "1273", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "330"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2362", "completions_search_ns_per_query": "1437", "topk_rmq_ns_per_query": "0", "reporting_ns_per_query": "545"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3186", "completions_search_ns_per_query": "1737", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "873"} - - 
{"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5804"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4006"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3456"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "3873"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4587"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5030"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5617"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "6957"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "697", "topk_rmq_ns_per_query": "3495", "reporting_ns_per_query": "1114"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "32", "completions_search_ns_per_query": "1038", "topk_rmq_ns_per_query": "321", "reporting_ns_per_query": "1384"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "547", "completions_search_ns_per_query": "1029", "topk_rmq_ns_per_query": "51", "reporting_ns_per_query": "455"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1012", "completions_search_ns_per_query": "1038", "topk_rmq_ns_per_query": "11", "reporting_ns_per_query": "210"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1318", "completions_search_ns_per_query": "1066", "topk_rmq_ns_per_query": "5", "reporting_ns_per_query": "172"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1922", "completions_search_ns_per_query": "1077", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "242"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2213", "completions_search_ns_per_query": "1099", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": 
"425"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3228", "completions_search_ns_per_query": "1124", "topk_rmq_ns_per_query": "0", "reporting_ns_per_query": "799"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "6772"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4646"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3831"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4108"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4594"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5080"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5621"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "6775"} \ No newline at end of file diff --git a/results/space.md b/results/space.md deleted file mode 100644 index 64ac1a2..0000000 --- a/results/space.md +++ /dev/null @@ -1,159 +0,0 @@ -AOL 2006 query log ------------------- - -10,142,395 distinct queries, whose ids have been assigned -in decreasing frequency order (ties broken lexicographically). 
- -#### Solution 1 - - using 1.05555 [GiB] - completions: 0.520278 [GiB] (49.2899%) - unsorted docs list: 0.0409812 [GiB] (3.88246%) - unsorted minimal docs list: 0.0154568 [GiB] (1.46434%) - dictionary: 0.0328479 [GiB] (3.11194%) - inverted index: 0.144273 [GiB] (13.6681%) - data: 33.0401 [bpi] - pointers: 8.13526 [bpi] - forward index: 0.30171 [GiB] (28.5833%) - data: 42.6801 [bpi] - pointers: 42.8379 [bpi] - - - + Elias-Fano - using 0.370675 [GiB] - completions: 0.0867222 [GiB] (23.3958%) - unsorted docs list: 0.0409812 [GiB] (11.0558%) - unsorted minimal docs list: 0.0154568 [GiB] (4.1699%) - dictionary: 0.0328479 [GiB] (8.86166%) - inverted index: 0.0595939 [GiB] (16.0771%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.135073 [GiB] (36.4397%) - data: 32.866 [bpi] - pointers: 5.41964 [bpi] - - + Elias-Fano and compact_forward_index - using 0.318008 [GiB] - completions: 0.0867222 [GiB] (27.2704%) - unsorted docs list: 0.0409812 [GiB] (12.8868%) - unsorted minimal docs list: 0.0154568 [GiB] (4.86049%) - dictionary: 0.0328479 [GiB] (10.3293%) - inverted index: 0.0595939 [GiB] (18.7397%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.0824065 [GiB] (25.9133%) - data: 22 [bpi] - pointers: 1.35762 [bpi] - - + Elias-Fano and delta_forward_index - using 0.350595 [GiB] - completions: 0.086722 [GiB] (24.7356%) - unsorted docs list: 0.0409812 [GiB] (11.689%) - unsorted minimal docs list: 0.0154568 [GiB] (4.40872%) - dictionary: 0.0328479 [GiB] (9.36919%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (16.9979%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.114994 [GiB] (32.7995%) - data: 29.6008 [bpi] - pointers: 2.99348 [bpi] - - + Elias-Fano + compact_forward_index + compact_unsorted_lists - using 0.304999 [GiB] - completions: 0.086722 [GiB] (28.4335%) - unsorted docs list: 0.0315353 [GiB] (10.3395%) - unsorted minimal docs list: 0.0118937 [GiB] (3.89958%) - dictionary: 
0.0328479 [GiB] (10.7698%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (19.539%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.0824065 [GiB] (27.0186%) - data: 22 [bpi] - pointers: 1.35762 [bpi] - -#### Solution 2 - - using 0.377843 [GiB] - completions: 0.0956838 [GiB] (25.3237%) - unsorted docs list: 0.0409812 [GiB] (10.8461%) - unsorted minimal docs list: 0.0154568 [GiB] (4.09079%) - dictionary: 0.0330574 [GiB] (8.74898%) - inverted index: 0.154881 [GiB] (40.9907%) - map from docid to lexid: 0.0377834 [GiB] (9.99975%) - - - + Elias-Fano - using 0.259893 [GiB] - completions: 0.0956841 [GiB] (36.8168%) - data: 73.5086 [bps] - pointers: 7.52944 [bps] - unsorted docs list: 0.0315353 [GiB] (12.134%) - unsorted minimal docs list: 0.0118937 [GiB] (4.57639%) - dictionary: 0.0328479 [GiB] (12.639%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (22.9302%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - map from docid to lexid: 0.0283376 [GiB] (10.9036%) - - -MSN 2006 query log ------------------- - -7,083,363 distinct queries, whose ids have been assigned -in decreasing frequency order (ties broken lexicographically). 
- -#### Solution 1 - - using 0.769592 [GiB] - completion trie: 0.370163 [GiB] (48.0986%) - unsorted docs list: 0.0286179 [GiB] (3.71858%) - unsorted minimal docs list: 0.0104689 [GiB] (1.36031%) - dictionary: 0.0220881 [GiB] (2.87011%) - inverted index: 0.107578 [GiB] (13.9785%) - forward index: 0.230677 [GiB] (29.9739%) - - + compression - using 0.213269 [GiB] - completions: 0.0617906 [GiB] (28.973%) - unsorted docs list: 0.0211964 [GiB] (9.9388%) - unsorted minimal docs list: 0.00775427 [GiB] (3.6359%) - dictionary: 0.0219463 [GiB] (10.2904%) - data: 68.9954 [bps] - pointers: 3.7648 [bps] - inverted index: 0.0429281 [GiB] (20.1286%) - data: 16.2938 [bpi] - pointers: 1.1785 [bpi] - forward index: 0.0576538 [GiB] (27.0333%) - data: 22 [bpi] - pointers: 1.35605 [bpi] - -#### Solution 2 - - using 0.263256 [GiB] - completions: 0.0681158 [GiB] (25.8744%) - unsorted docs list: 0.0286179 [GiB] (10.8708%) - unsorted minimal docs list: 0.0104689 [GiB] (3.97669%) - dictionary: 0.0220881 [GiB] (8.39036%) - inverted index: 0.107578 [GiB] (40.8643%) - map from docid to lexid: 0.0263876 [GiB] (10.0236%) - - + compression - using 0.180907 [GiB] - completions: 0.0681161 [GiB] (37.6525%) - data: 75.0743 [bps] - pointers: 7.52946 [bps] - unsorted docs list: 0.0211964 [GiB] (11.7167%) - unsorted minimal docs list: 0.00775427 [GiB] (4.28633%) - dictionary: 0.0219463 [GiB] (12.1312%) - data: 68.9954 [bps] - pointers: 3.7648 [bps] - inverted index: 0.0429281 [GiB] (23.7293%) - data: 16.2938 [bpi] - pointers: 1.1785 [bpi] - map from docid to lexid: 0.0189661 [GiB] (10.4839%) \ No newline at end of file diff --git a/results/topk.md b/results/topk.md deleted file mode 100644 index b101b43..0000000 --- a/results/topk.md +++ /dev/null @@ -1,201 +0,0 @@ -Top-k ------------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. 
- -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5062"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6725"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "24960"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "32761"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "31450"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "28812"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "25978"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "22785"} - - + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5614"} - {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "9767"} - {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "26999"} - {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "35428"} - {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "36073"} - {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "31718"} - {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "29992"} - {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "27313"} - - + Elias-Fano and forward_index2 - {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5336"} - {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "7573"} - {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "26278"} - {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "35664"} - {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "35189"} - {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "32033"} - {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "29950"} - {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": 
"27332"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5812"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "12703"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "27307"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "33476"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "31403"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "28718"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "25728"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "22419"} - - + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5609"} - {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "10894"} - {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "27311"} - {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "34780"} - {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "33849"} - {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "30319"} - {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "28181"} - {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "24757"} - -#### Solution 3 - - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "5899"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "12282007"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "18393403"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "15212918"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "11852012"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "7781194"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "7939661"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "6980226"} - 
- + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "6024"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "20553345"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "32495295"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "30929833"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "27103519"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "19912460"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "20956205"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "19643570"} - -#### Solution 4 - - c = 0.005 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6593"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "756944"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2188766"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "1920720"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2398355"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "1711205"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2195672"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2115028"} - - c = 0.01 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6610"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "739838"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2147339"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "1988980"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2440435"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "1858965"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2304761"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2254481"} - - c = 
0.01, + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "5879"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "1754176"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "3435481"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "4442784"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "4946228"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "4818169"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "5157776"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "5431935"} - - c = 0.025 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6528"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "828082"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2422803"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "2482018"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2970064"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "2542134"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2972710"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2924603"} - - c = 0.05 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6508"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1059938"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "3046716"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "3528723"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "4037290"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "3850329"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "4371489"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "4648349"} - - c = 0.1 - 
{"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6584"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1600869"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "4501125"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "5562030"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "6634491"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "6768321"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "7124462"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "7733525"} - - c = 0.2 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6589"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "2831409"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "7641806"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "9881857"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "11138148"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "11643908"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "11966417"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "12460833"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5823"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6251"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "16502"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "18380"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "17044"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "15622"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "14709"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "14323"} - -#### Solution 2 - - 
{"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "6837"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "14469"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "18670"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "19144"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "17109"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "15738"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "14810"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "14260"} - - -#### Solution 3 - - - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "6666"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "6635754"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "8612266"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "5290905"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "3939319"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "3035556"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "3106875"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "3089917"} - -#### Solution 4 with c = 0.1 - - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "7496"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1280652"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "3181191"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "3722226"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "4056810"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "4130288"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "4282750"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "4205507"} \ No newline at 
end of file diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index baeeb85..b474d7a 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -3,22 +3,22 @@ index_type = sys.argv[1] query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk index_filename = sys.argv[3] -dataset_name = sys.argv[4] +dataset_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered k = sys.argv[5] num_queries = sys.argv[6] -output_filename = dataset_name + "." + index_type +output_filename = dataset_basename + "." + index_type breakdown = "" if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": breakdown = "--breakdown" output_filename += ".breakdown" -output_filename += "." + query_mode + ".timings.json" +output_filename += "." + query_mode + ".json" +query_filename_prefix = dataset_basename + ".queries/queries." percentages = ["0.0", "0.25", "0.50", "0.75"] - for perc in percentages: for terms in range(2,8): # (1,8) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type 
+ " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/src/output_ds2i_format.cpp b/src/output_ds2i_format.cpp index cc139c4..eb92509 100644 --- a/src/output_ds2i_format.cpp +++ b/src/output_ds2i_format.cpp @@ -27,7 +27,7 @@ int main(int argc, char** argv) { { // write ds2i header uint32_t n = 1; - uint32_t universe = params.num_completions; + uint32_t universe = params.universe; docs.write(reinterpret_cast(&n), sizeof(uint32_t)); docs.write(reinterpret_cast(&universe), sizeof(uint32_t)); } diff --git a/test/test_blocked_inverted_index.cpp b/test/test_blocked_inverted_index.cpp index 80a9bc1..a2ede74 100644 --- a/test/test_blocked_inverted_index.cpp +++ b/test/test_blocked_inverted_index.cpp @@ -15,7 +15,7 @@ TEST_CASE("test blocked_inverted_index::intersection_iterator") { { inverted_index_type::builder ii_builder(params); ii_builder.build(ii); - REQUIRE(ii.num_docs() == params.num_completions); + REQUIRE(ii.num_docs() == params.universe); REQUIRE(ii.num_terms() == params.num_terms); } @@ -37,7 +37,7 @@ TEST_CASE("test blocked_inverted_index::intersection_iterator") { blocked_ii_builder.build(blocked_ii); } - REQUIRE(blocked_ii.num_docs() == params.num_completions); + REQUIRE(blocked_ii.num_docs() == params.universe); REQUIRE(blocked_ii.num_terms() == params.num_terms); for (auto& q : queries) { diff --git a/test/test_compact_forward_index.cpp b/test/test_compact_forward_index.cpp index aa09403..dc78c07 100644 --- a/test/test_compact_forward_index.cpp +++ b/test/test_compact_forward_index.cpp @@ -12,7 +12,7 @@ TEST_CASE("test compact_forward_index::iterator") { compact_forward_index::builder builder(params); compact_forward_index index; builder.build(index); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); essentials::save(index, 
output_filename); } @@ -20,7 +20,7 @@ TEST_CASE("test compact_forward_index::iterator") { { compact_forward_index index; essentials::load(index, output_filename); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); std::ifstream input((params.collection_basename + ".forward").c_str(), diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp index b96b708..5faa823 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -14,7 +14,7 @@ TEST_CASE("test inverted_index::iterator") { inverted_index_type::builder builder(params); inverted_index_type index; builder.build(index); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); essentials::save(index, output_filename); } @@ -22,7 +22,7 @@ TEST_CASE("test inverted_index::iterator") { { inverted_index_type index; essentials::load(index, output_filename); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); std::ifstream input((params.collection_basename + ".inverted").c_str(), @@ -58,7 +58,7 @@ TEST_CASE("test inverted_index::intersection_iterator") { inverted_index_type::builder builder(params); inverted_index_type index; builder.build(index); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); essentials::save(index, output_filename); } @@ -66,7 +66,7 @@ TEST_CASE("test inverted_index::intersection_iterator") { { inverted_index_type index; essentials::load(index, output_filename); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); static const uint32_t num_queries = 1000000; diff --git 
a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index 7924899..ae99a6b 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -82,9 +82,9 @@ TEST_CASE("test locate_prefix()") { << num_terms << std::endl; { queries.clear(); - std::string filename = params.collection_basename + - ".length=" + std::to_string(num_terms) + - ".shuffled"; + std::string filename = + params.collection_basename + + ".queries/queries.length=" + std::to_string(num_terms); std::ifstream querylog(filename.c_str()); if (!querylog.is_open()) { std::cerr << "cannot open file '" << filename << "'" @@ -95,7 +95,7 @@ TEST_CASE("test locate_prefix()") { querylog.close(); } - // test_locate_prefix(dict, ct_index, queries, strings); + test_locate_prefix(dict, ct_index, queries, strings); test_locate_prefix(dict, fc_index, queries, strings); } } diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 8e791bb..8b1ce0f 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -62,15 +62,17 @@ TEST_CASE("test unsorted_list on doc_ids") { } input.close(); - { - // must have all ids from 0 to doc_ids.size() - 1 - std::vector tmp = doc_ids; - std::sort(tmp.begin(), tmp.end()); - for (id_type id = 0; id != doc_ids.size(); ++id) { - REQUIRE_MESSAGE(tmp[id] == id, - "Error: id " << id << " not found"); - } - } + // { + // // must have all ids from 0 to doc_ids.size() - 1 + // // NOTE: not true if we filter out some strings to be used as + // // queries + // std::vector tmp = doc_ids; + // std::sort(tmp.begin(), tmp.end()); + // for (id_type id = 0; id != doc_ids.size(); ++id) { + // REQUIRE_MESSAGE(tmp[id] == id, + // "Error: id " << id << " not found"); + // } + // } succinct_rmq list; list.build(doc_ids); diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 0634d82..acf4b8e 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -19,10 +19,11 
@@ num_docs = 0 with open(input_filename + ".mapped.stats") as f: num_terms = int(f.readline()) - print num_terms - f.readline() # skip line containing max num. of query terms + print("terms: " + str(num_terms)) + f.readline() # skip line: max num. of query terms + f.readline() # skip line: num. of completions num_docs = int(f.readline()) - print num_docs + print("universe: " + str(num_docs)) inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned forward_index = [[] for i in range(num_docs)] diff --git a/test_data/build_stats.py b/test_data/build_stats.py index 5fdfdb7..8e60a39 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -8,10 +8,17 @@ output_file = open(input_filename + ".stats", 'a') prev = [] +universe = 0; with open(input_filename, 'r') as f: for line in f: x = line.rstrip('\n').split() + docid = int(x[0]) + + if docid > universe: + universe = docid + q = x[1:len(x)] + level_id = 0 while level_id < len(q) and level_id < len(prev) and q[level_id] == prev[level_id]: level_id += 1 @@ -31,7 +38,10 @@ # number of completions # number of levels in the trie # number of nodes for each level +print("universe: " + str(universe + 1)) +print("completions: " + str(lines)) output_file.write(str(lines) + "\n") +output_file.write(str(universe + 1) + "\n") output_file.write(str(len(nodes_per_level)) + "\n") for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") diff --git a/test_data/filter_and_preprocess.sh b/test_data/filter_and_preprocess.sh new file mode 100644 index 0000000..38425d7 --- /dev/null +++ b/test_data/filter_and_preprocess.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +echo $1 # input filename + +# number of completions to exclude per completion size, +# e.g., if it is 100, then at most 8 x 100 completions are filtered out +echo $2 + +python partition_queries_by_length.py $1 $1.filtered.queries $2 +python filter_dataset.py $1 $1.filtered.queries +python 
extract_dict.py $1.filtered +python map_dataset.py $1.filtered +python build_stats.py $1.filtered.mapped +python build_inverted_and_forward.py $1.filtered diff --git a/test_data/filter_dataset.py b/test_data/filter_dataset.py new file mode 100644 index 0000000..4481cbe --- /dev/null +++ b/test_data/filter_dataset.py @@ -0,0 +1,32 @@ +import sys +from sets import Set + +input_filename = sys.argv[1] +queries_directory = sys.argv[2] + +to_filter = Set({}) +print("loading strings to filter...") +for i in range(1,8): + with open(queries_directory + "/queries.length=" + str(i)) as f: + for line in f: + s = line.rstrip('\n') + to_filter.add(s) +with open(queries_directory + "/queries.length=8+") as f: + for line in f: + s = line.rstrip('\n') + to_filter.add(s) + +lines = 0 +print("filtering dataset...") + +output_file = open(input_filename + ".filtered", 'w') +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + string = ' '.join(x[1:len(x)]) + if string not in to_filter: + output_file.write(line) + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") +output_file.close() \ No newline at end of file diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index 7dfbed6..eb9b95d 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -1,12 +1,17 @@ -import sys, random +import sys, os, random input_filename = sys.argv[1] +output_directory = sys.argv[2] +n = int(sys.argv[3]) + +if not os.path.exists(output_directory): + os.makedirs(output_directory) num_shards = 7 -files = [open(input_filename + ".length=" + str(i) + ".shuffled", "w") for i in range(1,num_shards + 1)] -all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+.shuffled", "w") +files = [open(output_directory + "/queries.length=" + str(i), "w") for i in range(1,num_shards + 1)] +all_others = open(output_directory + "/queries.length=" + 
str(num_shards + 1) + "+", "w") -strings = [[] for i in range(0, num_shards)] +strings = [[] for i in range(num_shards)] all_others_strings = [] lines = 0 @@ -23,13 +28,13 @@ if lines % 1000000 == 0: print("processed " + str(lines) + " lines") -for i in range(0, num_shards): +for i in range(num_shards): random.shuffle(strings[i]) - for s in strings[i]: - files[i].write(s) + for k in range(min(n, len(strings[i]))): + files[i].write(strings[i][k]) files[i].close() random.shuffle(all_others_strings) -for s in all_others_strings: - all_others.write(s) +for k in range(min(n, len(all_others_strings))): + all_others.write(all_others_strings[k]) all_others.close() diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh index 24c9488..e3d96f7 100755 --- a/test_data/preprocess.sh +++ b/test_data/preprocess.sh @@ -1,12 +1,9 @@ #!/bin/bash -collections=`find . | grep "\\.completions$"` - -for collection in $collections; do - echo $collection - python extract_dict.py $collection - python map_dataset.py $collection - python build_stats.py $collection.mapped - python build_inverted_and_forward.py $collection - python partition_queries_by_length.py $collection -done +echo $1 # input filename +echo $2 # number of queries for each size +python extract_dict.py $1 +python map_dataset.py $1 +python build_stats.py $1.mapped +python build_inverted_and_forward.py $1 +python partition_queries_by_length.py $1 $1.queries $2 From 78f27ed61b143ad1a927fd30a4f2c227726f61aa Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 28 Nov 2019 14:32:50 +0100 Subject: [PATCH 047/102] script to automate benchmarking of dictionaries --- benchmark/benchmark_fc_dictionary.cpp | 33 +++++++++---------- script/benchmark_dictionaries.sh | 7 ++++ ...te_prefix_results_by_varying_percentage.py | 11 +++---- .../collect_results_by_varying_percentage.py | 6 ++-- 4 files changed, 30 insertions(+), 27 deletions(-) create mode 100644 script/benchmark_dictionaries.sh diff --git 
a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index 1d94c8e..36882c9 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -41,12 +41,10 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "extract: " << (timer.average() * 1000.0) / ids.size() - << " [ns/string]" << std::endl; + std::cout << "extract: " << timer.average() / ids.size() + << " [musec/string]" << std::endl; static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; - // static std::vector percentages = {0.1, 0.2, 0.3, 0.4, 0.5, - // 0.6, 0.7, 0.8, 0.9, 1.0}; for (auto p : percentages) { timer.reset(); for (uint32_t i = 0; i != runs; ++i) { @@ -64,8 +62,8 @@ void perf_test(Dictionary const& dict, } std::cout << "\tlocate_prefix-" << p * 100.0 - << "%: " << (timer.average() * 1000.0) / queries.size() - << " [ns/string]" << std::endl; + << "%: " << timer.average() / queries.size() + << " [musec/string]" << std::endl; } } @@ -81,30 +79,29 @@ void perf_test(Dictionary const& dict, } int main(int argc, char** argv) { - int mandatory = 2 + 1; - if (argc < mandatory) { - std::cout << argv[0] << " < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("collection_basename", "Collection basename."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + if (!parser.parse()) return 1; parameters params; - params.collection_basename = argv[1]; + params.collection_basename = parser.get("collection_basename"); params.load(); - uint32_t num_queries = std::atoi(argv[2]); + auto max_num_queries = parser.get("max_num_queries"); essentials::logger("loading queries..."); std::vector queries; - queries.reserve(num_queries); + queries.reserve(max_num_queries); std::string query; query.reserve(2 * constants::MAX_NUM_CHARS_PER_QUERY); - for (uint32_t i = 0; i != num_queries; ++i) { + for (uint32_t i = 0; i != max_num_queries; ++i) { if 
(!std::getline(std::cin, query)) break; queries.push_back(std::move(query)); } - num_queries = queries.size(); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + max_num_queries = queries.size(); + essentials::logger("loaded " + std::to_string(max_num_queries) + + " queries"); exe(4) exe(8) exe(16) exe(32) exe(64) exe(128) exe(256) return 0; } \ No newline at end of file diff --git a/script/benchmark_dictionaries.sh b/script/benchmark_dictionaries.sh new file mode 100644 index 0000000..88c0254 --- /dev/null +++ b/script/benchmark_dictionaries.sh @@ -0,0 +1,7 @@ +cd ../test_data +bash preprocess.sh aol/aol.completions 100000 +cd ../build +python ../script/collect_locate_prefix_results_by_varying_percentage.py fc ../test_data/aol/aol.completions 100000 +python ../script/collect_locate_prefix_results_by_varying_percentage.py trie ../test_data/aol/aol.completions 100000 +./benchmark_fc_dictionary ../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 +cd ../script \ No newline at end of file diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/script/collect_locate_prefix_results_by_varying_percentage.py index e9142d9..305fafa 100644 --- a/script/collect_locate_prefix_results_by_varying_percentage.py +++ b/script/collect_locate_prefix_results_by_varying_percentage.py @@ -2,14 +2,13 @@ type = sys.argv[1] # 'trie' or 'fc' collection_basename = sys.argv[2] -dataset_name = sys.argv[3] -num_queries = sys.argv[4] +num_queries = sys.argv[3] -output_filename = dataset_name + "." + type + ".locate_prefix.timings.json" +output_filename = collection_basename + "." + type + ".locate_prefix.json" +query_filename_prefix = collection_basename + ".queries/queries." 
percentages = ["0.0", "0.25", "0.50", "0.75"] - for perc in percentages: for terms in range(1,8): - os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) - os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index b474d7a..48a7dd1 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -3,11 +3,11 @@ index_type = sys.argv[1] query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk index_filename = sys.argv[3] -dataset_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered +collection_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered k = sys.argv[5] num_queries = sys.argv[6] -output_filename = dataset_basename + "." + index_type +output_filename = collection_basename + "." + index_type breakdown = "" if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": @@ -15,7 +15,7 @@ output_filename += ".breakdown" output_filename += "." + query_mode + ".json" -query_filename_prefix = dataset_basename + ".queries/queries." 
+query_filename_prefix = collection_basename + ".queries/queries." percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: From 4e37b944b75c9eb4d753bffc2566b60015259116 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 28 Nov 2019 14:50:44 +0100 Subject: [PATCH 048/102] script to automate benchmarking of dictionaries --- README.md | 5 +++++ benchmark/benchmark_fc_dictionary.cpp | 4 ++-- script/benchmark_dictionaries.sh | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f19bd7b..50e111f 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,11 @@ From within the `/build` directory, run You can also specify the option `--breakdown` to record timings breakdowns. +To benchmark the dictionaries (Front-Coding and trie), just run the following script from within +the `script` directory: + + bash benchmark_dictionaries.sh + Live demo ---------- diff --git a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index 36882c9..ce71f67 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -20,8 +20,8 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "locate: " << (timer.average() * 1000.0) / queries.size() - << " [ns/string]" << std::endl; + std::cout << "locate: " << timer.average() / queries.size() + << " [musec/string]" << std::endl; std::vector ids; ids.reserve(queries.size()); diff --git a/script/benchmark_dictionaries.sh b/script/benchmark_dictionaries.sh index 88c0254..29c9a84 100644 --- a/script/benchmark_dictionaries.sh +++ b/script/benchmark_dictionaries.sh @@ -3,5 +3,5 @@ bash preprocess.sh aol/aol.completions 100000 cd ../build python ../script/collect_locate_prefix_results_by_varying_percentage.py fc ../test_data/aol/aol.completions 100000 python ../script/collect_locate_prefix_results_by_varying_percentage.py trie ../test_data/aol/aol.completions 100000 -./benchmark_fc_dictionary 
../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 +./benchmark_fc_dictionary ../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 > ../test_data/aol/aol.completions.dictionary_benchmark.txt cd ../script \ No newline at end of file From e4fb185dcf131df1f1df788c533d9b9e52cc3291 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Tue, 10 Dec 2019 00:19:58 -0800 Subject: [PATCH 049/102] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 50e111f..ce69cb7 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ in preparing the data for indexing. Thus, from within the directory `test_data`, it is sufficient to do: - bash preprocess.sh 300 + bash preprocess.sh 300 The second argument in the example, i.e., 300, represents the number of completions (per completion size) that are drawn at From fae328f7cd6dbbf503b0368a302ea2000285016d Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 29 Feb 2020 19:52:54 +0100 Subject: [PATCH 050/102] effectiveness benchmark --- benchmark/CMakeLists.txt | 3 +- benchmark/effectiveness.cpp | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 benchmark/effectiveness.cpp diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index d7f9433..6275079 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -3,4 +3,5 @@ add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) -add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) \ No newline at end of file +add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) 
+add_executable(effectiveness effectiveness.cpp) \ No newline at end of file diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp new file mode 100644 index 0000000..3fae9a8 --- /dev/null +++ b/benchmark/effectiveness.cpp @@ -0,0 +1,104 @@ +#include + +#include "types.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; + +template +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& stats, bool verbose) { + Index index1, index2; + essentials::load(index1, index_filename.c_str()); + essentials::load(index2, index_filename.c_str()); + + std::vector queries; + uint32_t num_queries = + load_queries(queries, max_num_queries, keep, std::cin); + uint64_t strings_reported_by_prefix_search = 0; + uint64_t better_scored_strings_reported_by_conjunctive_search = 0; + + stats.add("num_queries", std::to_string(num_queries)); + + for (auto const& query : queries) { + auto it1 = index1.prefix_topk(query, k); + auto it2 = index2.conjunctive_topk(query, k); + strings_reported_by_prefix_search += it1.size(); + + uint64_t more = 0; + if (it2.size() >= it1.size()) { + more = it2.size() - it1.size(); + } + + if (verbose) { + { + auto it = it1; + std::cout << "prefix search scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; + } + { + auto it = it2; + std::cout << "conjunctive search scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; + } + std::cout << "more: " << more << std::endl; + } + + better_scored_strings_reported_by_conjunctive_search += more; + } + + stats.add("strings_reported_by_prefix_search", + std::to_string(strings_reported_by_prefix_search)); + stats.add( + "better_scored_strings_reported_by_conjunctive_search", + std::to_string(better_scored_strings_reported_by_conjunctive_search)); + 
stats.add( + "better_scored_strings_reported_by_conjunctive_search_in_percentage", + std::to_string(better_scored_strings_reported_by_conjunctive_search * + 100.0 / strings_reported_by_prefix_search)); +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; + + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto verbose = parser.get("verbose"); + + essentials::json_lines stats; + stats.new_line(); + stats.add("num_terms_per_query", + parser.get("num_terms_per_query")); + stats.add("percentage", std::to_string(keep)); + + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else { + return 1; + } + + stats.print(); + return 0; +} \ No newline at end of file From 87ac6f8f44e12062e4a5eec942ba83309d2c41ab Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 29 Feb 2020 19:57:48 +0100 Subject: [PATCH 051/102] effectiveness benchmark --- benchmark/effectiveness.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index 3fae9a8..49969a1 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -67,7 +67,15 @@ void benchmark(std::string const& index_filename, uint32_t k, int main(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); + parser.add("type", "Index 
type."); + parser.add("k", "top-k value."); + parser.add("index_filename", "Index filename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query: n x 100 <=> n%, for n in [0,1]."); + parser.add("verbose", "Verbose output.", "--verbose"); if (!parser.parse()) return 1; auto type = parser.get("type"); From a5dbf289f239c51493a24fd49b3572023f4fa9e4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 2 Mar 2020 12:28:25 +0100 Subject: [PATCH 052/102] effectiveness --- benchmark/effectiveness.cpp | 54 ++++++++++++------- include/scored_string_pool.hpp | 8 +++ ...ctiveness_results_by_varying_percentage.py | 18 +++++++ 3 files changed, 61 insertions(+), 19 deletions(-) create mode 100644 script/collect_effectiveness_results_by_varying_percentage.py diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index 49969a1..7abb179 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -21,6 +21,9 @@ void benchmark(std::string const& index_filename, uint32_t k, stats.add("num_queries", std::to_string(num_queries)); + std::vector difference; + difference.reserve(k); + for (auto const& query : queries) { auto it1 = index1.prefix_topk(query, k); auto it2 = index2.conjunctive_topk(query, k); @@ -28,30 +31,43 @@ void benchmark(std::string const& index_filename, uint32_t k, uint64_t more = 0; if (it2.size() >= it1.size()) { - more = it2.size() - it1.size(); - } + auto const& prefix_search_scores = it1.pool()->const_scores(); + auto const& conjunctive_search_scores = it2.pool()->const_scores(); + assert(std::is_sorted(prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size())); + assert( + std::is_sorted(conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size())); - if (verbose) { - { - auto it = 
it1; - std::cout << "prefix search scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; + if (verbose) { + { + auto it = it1; + std::cout << "prefix_search_scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; } - std::cout << std::endl; - } - { - auto it = it2; - std::cout << "conjunctive search scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; + { + auto it = it2; + std::cout << "conjunctive_search_scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; } - std::cout << std::endl; } - std::cout << "more: " << more << std::endl; - } - better_scored_strings_reported_by_conjunctive_search += more; + difference.clear(); + auto it = std::set_difference( + conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size(), + prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size(), difference.begin()); + more = std::distance(difference.begin(), it); + if (verbose) std::cout << "more: " << more << std::endl; + better_scored_strings_reported_by_conjunctive_search += more; + } } stats.add("strings_reported_by_prefix_search", diff --git a/include/scored_string_pool.hpp b/include/scored_string_pool.hpp index f834453..c679aeb 100644 --- a/include/scored_string_pool.hpp +++ b/include/scored_string_pool.hpp @@ -39,6 +39,10 @@ struct scored_string_pool { return m_scores; } + std::vector const& const_scores() const { + return m_scores; + } + scored_byte_range operator[](size_t i) const { assert(i < size()); scored_byte_range sbr; @@ -69,6 +73,10 @@ struct scored_string_pool { return m_pool->operator[](m_pos); } + scored_string_pool const* pool() const { + return m_pool; + } + private: scored_string_pool const* m_pool; size_t m_pos; diff --git 
a/script/collect_effectiveness_results_by_varying_percentage.py b/script/collect_effectiveness_results_by_varying_percentage.py new file mode 100644 index 0000000..4fc7683 --- /dev/null +++ b/script/collect_effectiveness_results_by_varying_percentage.py @@ -0,0 +1,18 @@ +import sys, os + +index_type = sys.argv[1] +index_filename = sys.argv[2] +collection_basename = sys.argv[3] # e.g., aol/aol.completions or aol/aol.completions.filtered +k = sys.argv[4] +num_queries = sys.argv[5] + +output_filename = collection_basename + "." + index_type + +output_filename += ".effectiveness.json" +query_filename_prefix = collection_basename + ".queries/queries." + +percentages = ["0.0", "0.25", "0.50", "0.75"] +for perc in percentages: + for terms in range(2,8): # (1,8) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) From 7f34276714316eae0729c1f29120705466e92468 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 3 Mar 2020 13:21:34 +0100 Subject: [PATCH 053/102] minor fix --- benchmark/effectiveness.cpp | 76 +++++++++++++++++-------------- include/autocomplete.hpp | 6 ++- include/autocomplete2.hpp | 6 ++- include/autocomplete3.hpp | 6 ++- include/autocomplete4.hpp | 6 ++- include/autocomplete_common.hpp | 5 +- include/compact_forward_index.hpp | 2 + include/ef/ef_sequence.hpp | 20 +++----- include/fc_dictionary.hpp | 1 + 9 files changed, 75 insertions(+), 53 deletions(-) diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index 7abb179..e7eb7b7 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -30,44 +30,52 @@ void benchmark(std::string 
const& index_filename, uint32_t k, strings_reported_by_prefix_search += it1.size(); uint64_t more = 0; - if (it2.size() >= it1.size()) { - auto const& prefix_search_scores = it1.pool()->const_scores(); - auto const& conjunctive_search_scores = it2.pool()->const_scores(); - assert(std::is_sorted(prefix_search_scores.begin(), - prefix_search_scores.begin() + it1.size())); - assert( - std::is_sorted(conjunctive_search_scores.begin(), - conjunctive_search_scores.begin() + it2.size())); - - if (verbose) { - { - auto it = it1; - std::cout << "prefix_search_scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; - } - std::cout << std::endl; + assert(it2.size() >= it1.size()); + + auto const& prefix_search_scores = it1.pool()->const_scores(); + auto const& conjunctive_search_scores = it2.pool()->const_scores(); + assert(std::is_sorted(prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size())); + assert(std::is_sorted(conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size())); + + if (verbose) { + std::cout << "query: '" << query << "'" << std::endl; + { + auto it = it1; + std::cout << "prefix_search results: " << it.size() + << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << completion.score << ": " + << std::string(completion.string.begin, + completion.string.end) + << std::endl; } - { - auto it = it2; - std::cout << "conjunctive_search_scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; - } - std::cout << std::endl; + } + { + auto it = it2; + std::cout << "conjunctive_search results: " << it.size() + << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << completion.score << ": " + << std::string(completion.string.begin, + completion.string.end) + << std::endl; } } - - difference.clear(); - auto it = 
std::set_difference( - conjunctive_search_scores.begin(), - conjunctive_search_scores.begin() + it2.size(), - prefix_search_scores.begin(), - prefix_search_scores.begin() + it1.size(), difference.begin()); - more = std::distance(difference.begin(), it); - if (verbose) std::cout << "more: " << more << std::endl; - better_scored_strings_reported_by_conjunctive_search += more; } + + difference.clear(); + auto it = std::set_difference( + conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size(), + prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size(), difference.begin()); + more = std::distance(difference.begin(), it); + if (verbose) std::cout << "more: " << more << std::endl; + better_scored_strings_reported_by_conjunctive_search += more; } stats.add("strings_reported_by_prefix_search", diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index 616b13f..dd085fa 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -40,7 +40,9 @@ struct autocomplete { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -76,6 +78,8 @@ struct autocomplete { true // must return unique results ); } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 9d05226..cd6f411 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -45,7 +45,9 @@ struct autocomplete2 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = 
m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -82,6 +84,8 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 6165e19..a166d9f 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -54,7 +54,9 @@ struct autocomplete3 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -82,6 +84,8 @@ struct autocomplete3 { range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(num_terms, prefix, suffix_lex_range, k); extract_completions(num_completions); diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index cd44706..3006592 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -47,7 +47,9 @@ struct autocomplete4 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -73,6 +75,8 @@ struct autocomplete4 { range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; uint32_t num_completions = conjunctive_topk(prefix, suffix_lex_range, k); extract_completions(num_completions); 
diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index 17b38b4..bd49934 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -6,7 +6,8 @@ namespace autocomplete { template uint32_t parse(Dictionary const& dict, std::string const& query, - completion_type& prefix, byte_range& suffix) { + completion_type& prefix, byte_range& suffix, + bool must_find_prefix = false) { uint32_t num_terms = 1; // for suffix byte_range_iterator it(string_to_byte_range(query)); while (true) { @@ -16,6 +17,8 @@ uint32_t parse(Dictionary const& dict, std::string const& query, if (term_id != global::invalid_term_id) { prefix.push_back(term_id); ++num_terms; + } else { + if (must_find_prefix) return 0; } } return num_terms; diff --git a/include/compact_forward_index.hpp b/include/compact_forward_index.hpp index bde4b71..21aaa7c 100644 --- a/include/compact_forward_index.hpp +++ b/include/compact_forward_index.hpp @@ -32,6 +32,7 @@ struct compact_forward_index { for (uint64_t k = 0; k != n; ++k) { id_type x; input >> x; + assert(x > 0); terms.push_back(x); } m_pointers.push_back(size); @@ -89,6 +90,7 @@ struct compact_forward_index { bool intersects(const range r) const { for (uint64_t i = 0; i != size(); ++i) { auto val = m_cv[m_base + i]; + assert(val > 0); if (r.contains(val)) return true; } return false; diff --git a/include/ef/ef_sequence.hpp b/include/ef/ef_sequence.hpp index 0d1f436..0632f83 100644 --- a/include/ef/ef_sequence.hpp +++ b/include/ef/ef_sequence.hpp @@ -152,23 +152,15 @@ struct ef_sequence { assert(r.is_valid()); assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); - - uint64_t begin = - util::next_geq(*this, lex.begin + prev_upper, r.begin, r.end - 1); - if (begin == global::not_found) { + uint64_t id_begin = lex.begin + prev_upper; + uint64_t id_end = lex.end + prev_upper; + uint64_t begin = util::next_geq(*this, id_begin, r.begin, r.end - 1); + if (begin == global::not_found or 
access(begin) > id_end) { return {r.end, r.end}; } - - if (lex.begin == lex.end) { - return {begin, begin + 1}; - } - - uint64_t id_end = lex.end + prev_upper; + if (lex.begin == lex.end) return {begin, begin + 1}; uint64_t end = util::next_geq(*this, id_end, begin, r.end - 1); - if (end == global::not_found) { - return {begin, r.end}; - } - + if (end == global::not_found) return {begin, r.end}; return {begin, access(end) != id_end ? end : end + 1}; } diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index 1b223be..52e3971 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -115,6 +115,7 @@ struct fc_dictionary { fc_dictionary() {} // NOTE: return inclusive ranges, i.e., [a,b] + // 0-based ids range locate_prefix(byte_range p) const { if (p.end - p.begin == 0) return {0, size() - 1}; auto bucket_id = locate_buckets(p); From f04f127d7a7df1558347775d6754afcc5491fee9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 4 Mar 2020 16:40:15 +0100 Subject: [PATCH 054/102] changed css style --- web/styles.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/styles.css b/web/styles.css index 5db5234..b540533 100644 --- a/web/styles.css +++ b/web/styles.css @@ -9,4 +9,4 @@ .autocomplete-group { padding: 2px 5px; } .autocomplete-group strong { font-weight: bold; font-size: 16px; color: #000; display: block; border-bottom: 1px solid #000; } -input { font-size: 28px; padding: 10px; border: 1px solid #CCC; display: block; margin: 20px 0; } +input { font-size: 18px; padding: 10px; border: 1px solid #CCC; display: block; margin: 20px 0; } From ad7e5845370803e200e0faee0a0d639d4da2b972 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 10 Mar 2020 23:31:20 +0100 Subject: [PATCH 055/102] single-token queries with few completions switch to heap-based algorithm; less code --- .gitignore | 2 +- benchmark/CMakeLists.txt | 2 +- benchmark/benchmark_common.hpp | 5 +- benchmark/benchmark_conjunctive_topk.cpp 
| 46 +-- benchmark/benchmark_fc_dictionary.cpp | 6 +- benchmark/benchmark_integer_fc_dictionary.cpp | 2 +- benchmark/benchmark_locate_prefix.cpp | 9 +- benchmark/benchmark_prefix_topk.cpp | 33 +-- benchmark/effectiveness.cpp | 5 +- include/autocomplete.hpp | 261 +++++------------ include/autocomplete2.hpp | 260 +++++------------ include/autocomplete3.hpp | 211 ++++---------- include/autocomplete4.hpp | 268 ++++++------------ include/autocomplete_common.hpp | 44 ++- include/blocked_inverted_index.hpp | 79 +++--- include/probe.hpp | 36 +++ include/util_types.hpp | 21 -- src/CMakeLists.txt | 2 +- src/web_server.cpp | 8 +- test/test_autocomplete.cpp | 6 +- test/test_common.hpp | 1 + test/test_locate_prefix.cpp | 2 +- 22 files changed, 456 insertions(+), 853 deletions(-) create mode 100644 include/probe.hpp diff --git a/.gitignore b/.gitignore index 3094469..51855af 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ .DS_Store -build +build* diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 6275079..8f2c632 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(benchmark_topk benchmark_topk.cpp) +# add_executable(benchmark_topk benchmark_topk.cpp) add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 2f12c8a..e7f9160 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -1,10 +1,13 @@ #pragma once #include "../external/cmd_line_parser/include/parser.hpp" +#include "probe.hpp" namespace autocomplete { -static const uint32_t runs = 5; +namespace benchmarking { +static const uint32_t runs = 1; +} // void tolower(std::string& str) { // std::transform(str.begin(), str.end(), str.begin(), diff --git a/benchmark/benchmark_conjunctive_topk.cpp 
b/benchmark/benchmark_conjunctive_topk.cpp index 7d8a7d3..ae73512 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -16,64 +16,42 @@ void benchmark(std::string const& index_filename, uint32_t k, uint32_t num_queries = load_queries(queries, max_num_queries, keep, std::cin); - uint32_t R = runs; // runs - uint64_t reported_strings = 0; auto musec_per_query = [&](double time) { - return time / (R * num_queries); + return time / (benchmarking::runs * num_queries); }; breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != R; ++run) { + timer_probe probe(3); + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k, timers); + auto it = index.conjunctive_topk(query, k, probe); reported_strings += it.size(); } } - std::cout << reported_strings << std::endl; - - // breakdowns.add("checked_docids", - // std::to_string(index.checked_docids)); breakdowns.add("heap_size", - // std::to_string(index.heap_size)); - - // auto perc_skipped_searches = - // (static_cast(index.skipped_searches) * 100.0) / - // queries.size(); - // breakdowns.add("skipped_searches", - // std::to_string(perc_skipped_searches)); - + std::cout << "#ignore: " << reported_strings << std::endl; breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("dictionary_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); + std::to_string(musec_per_query(probe.get(0).elapsed()))); breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); + std::to_string(musec_per_query(probe.get(1).elapsed()))); breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); + std::to_string(musec_per_query(probe.get(2).elapsed()))); } else { 
essentials::timer_type timer; + nop_probe probe; timer.start(); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k); + auto it = index.conjunctive_topk(query, k, probe); reported_strings += it.size(); } } timer.stop(); - std::cout << reported_strings << std::endl; + std::cout << "#ignore: " << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); - - // for (auto const& query : queries) { - // auto it = index.conjunctive_topk(query, k); - // reported_strings += it.size(); - // } - // breakdowns.add("avg_results_per_query", - // std::to_string(static_cast(reported_strings) / - // queries.size())); } } diff --git a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index ce71f67..d3e66b5 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -11,7 +11,7 @@ void perf_test(Dictionary const& dict, static std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& query : queries) { id_type id = dict.locate(string_to_byte_range(query)); @@ -32,7 +32,7 @@ void perf_test(Dictionary const& dict, timer.reset(); - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& id : ids) { uint8_t string_len = dict.extract(id, decoded.data()); @@ -47,7 +47,7 @@ void perf_test(Dictionary const& dict, static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; for (auto p : percentages) { timer.reset(); - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& query : queries) { size_t size = query.size(); diff --git 
a/benchmark/benchmark_integer_fc_dictionary.cpp b/benchmark/benchmark_integer_fc_dictionary.cpp index 3a752eb..8cb2b32 100644 --- a/benchmark/benchmark_integer_fc_dictionary.cpp +++ b/benchmark/benchmark_integer_fc_dictionary.cpp @@ -11,7 +11,7 @@ void perf_test(Dictionary const& dict, std::vector const& queries) { static completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& id : queries) { uint8_t string_len = dict.extract(id, decoded); diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index f9e6282..a9e374a 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -31,15 +31,16 @@ void benchmark(parameters const& params, std::vector& queries, essentials::timer_type timer; timer.start(); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto& query : queries) { auto r = index.locate_prefix(query.first, query.second); essentials::do_not_optimize_away(r.end - r.begin); } } timer.stop(); - result.add("musec_per_query", - std::to_string(timer.elapsed() / (runs * num_queries))); + result.add( + "musec_per_query", + std::to_string(timer.elapsed() / (benchmarking::runs * num_queries))); result.print(); } @@ -78,7 +79,7 @@ int main(int argc, char** argv) { for (auto const& string : strings) { completion_type prefix; byte_range suffix; - parse(dict, string, prefix, suffix); + parse(dict, string, prefix, suffix, true); range suffix_lex_range = dict.locate_prefix(suffix); queries.emplace_back(prefix, suffix_lex_range); } diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index 2c31c68..f09d3dc 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -18,36 +18,33 @@ void benchmark(std::string 
const& index_filename, uint32_t k, uint64_t reported_strings = 0; auto musec_per_query = [&](double time) { - return time / (runs * num_queries); + return time / (benchmarking::runs * num_queries); }; breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { + timer_probe probe(3); + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, timers); + auto it = index.prefix_topk(query, k, probe); reported_strings += it.size(); } } - std::cout << reported_strings << std::endl; + std::cout << "#ignore: " << reported_strings << std::endl; breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - // breakdowns.add("completions_search_musec_per_query", - // std::to_string(musec_per_query(timers[1].elapsed()))); - // breakdowns.add("topk_rmq_musec_per_query", - // std::to_string(musec_per_query(timers[2].elapsed()))); + std::to_string(musec_per_query(probe.get(0).elapsed()))); breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); + std::to_string(musec_per_query(probe.get(1).elapsed()))); breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); + std::to_string(musec_per_query(probe.get(2).elapsed()))); } else { essentials::timer_type timer; + nop_probe probe; timer.start(); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k); + auto it = index.prefix_topk(query, k, probe); reported_strings += it.size(); } } @@ -55,14 +52,6 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); - - // for (auto const& 
query : queries) { - // auto it = index.prefix_topk(query, k); - // reported_strings += it.size(); - // } - // breakdowns.add("avg_results_per_query", - // std::to_string(static_cast(reported_strings) / - // queries.size())); } } diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index e7eb7b7..e9c6590 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -23,10 +23,11 @@ void benchmark(std::string const& index_filename, uint32_t k, std::vector difference; difference.reserve(k); + nop_probe probe; for (auto const& query : queries) { - auto it1 = index1.prefix_topk(query, k); - auto it2 = index2.conjunctive_topk(query, k); + auto it1 = index1.prefix_topk(query, k, probe); + auto it2 = index2.conjunctive_topk(query, k, probe); strings_reported_by_prefix_search += it1.size(); uint64_t more = 0; diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index dd085fa..f55b9e5 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -13,9 +13,6 @@ struct autocomplete { typedef scored_string_pool::iterator iterator_type; autocomplete() { - // heap_size = 0; - // checked_docids = 0; - // skipped_searches = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); } @@ -25,235 +22,126 @@ struct autocomplete { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); typename ForwardIndex::builder fi_builder(params); - m_unsorted_docs_list.build(cm_builder.doc_ids()); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); fi_builder.build(m_forward_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, 
query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - // NOTE: because the completion_trie works with 1-based ids - // (id 0 is reserved for null terminator) suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - uint32_t num_completions = 0; - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - if (num_terms == 1) { // special case - suffix_lex_range.end += 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = 
m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } - - return extract_strings(num_completions); - } + probe.stop(1); - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - // step 1: parsing - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - // step 2: prefix search - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - // step 3: conjunctive search - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } - timers[2].stop(); - - // step 4: reporting - timers[3].start(); + probe.start(2); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type prefix_topk(std::string const& query, 
uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); - // step 1 - timers[1].start(); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) { - // ++skipped_searches; - // std::cout << "'" << query << "'\n"; - return m_pool.begin(); - } - - timers[1].stop(); - - // step 2 - timers[2].start(); - if (num_terms == 1) { // special case + if (prefix.size() == 0) { suffix_lex_range.end += 1; + constexpr bool must_return_unique_results = true; num_completions = 
m_unsorted_minimal_docs_list.topk( suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); + must_return_unique_results); + if (num_completions < k) { + suffix_lex_range.begin += 1; + num_completions = heap_topk(m_inverted_index, suffix_lex_range, + k, m_pool.scores()); + } } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } - timers[2].stop(); + probe.stop(1); - // step 3 - timers[3].start(); + probe.start(2); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // if (num_terms == 1) { // special case + // suffix_lex_range.begin -= 1; + // num_completions = m_unsorted_minimal_docs_list.topk( + // suffix_lex_range, k, m_pool.scores(), + // true // must return unique results + // ); + // } else { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, + // k); + // } + // } + + // return extract_strings(num_completions); + // } + size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() + @@ -272,10 +160,6 @@ struct autocomplete { visitor.visit(m_forward_index); } - // uint64_t heap_size; - // uint64_t 
checked_docids; - // uint64_t skipped_searches; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -309,7 +193,6 @@ struct autocomplete { uint32_t results = 0; for (; it.has_next(); ++it) { auto doc_id = *it; - // ++checked_docids; if (m_forward_index.intersects(doc_id, r)) { topk_scores[results++] = doc_id; if (results == k) break; diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index cd6f411..f713043 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -15,8 +15,6 @@ struct autocomplete2 { typedef scored_string_pool::iterator iterator_type; autocomplete2() { - // heap_size = 0; - // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -27,237 +25,133 @@ struct autocomplete2 { typename Completions::builder cm_builder(params); typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& docid_to_lexid = cm_builder.docid_to_lexid(); m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), util::ceil_log2(params.num_completions + 1)); m_unsorted_docs_list.build( util::invert(docid_to_lexid, params.num_completions)); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = 
m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); + probe.stop(1); + + probe.start(2); extract_completions(num_completions); - return extract_strings(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; } - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - uint32_t num_completions = 0; - - if (num_terms == 1) { // special case + if (prefix.size() == 0) { suffix_lex_range.end += 1; + constexpr bool must_return_unique_results = true; num_completions = m_unsorted_minimal_docs_list.topk( suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); + must_return_unique_results); + if (num_completions < k) { + suffix_lex_range.begin += 1; + num_completions = heap_topk(m_inverted_index, suffix_lex_range, + k, m_pool.scores()); + } extract_completions(num_completions); } else { suffix_lex_range.begin += 1; suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } + probe.stop(1); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= 
constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } else { - extract_completions(num_completions); - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return 
unique results - ); - extract_completions(num_completions); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } else { - extract_completions(num_completions); - } - timers[2].stop(); - - timers[3].start(); + probe.start(2); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - // timers[1].stop(); - - // step 2 - // timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[1].stop(); - - // step 3 - timers[2].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[2].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - if (num_terms == 1) { // special case - suffix_lex_range.end += 
1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // if (num_terms == 1) { // special case + // suffix_lex_range.begin -= 1; + // num_completions = m_unsorted_minimal_docs_list.topk( + // suffix_lex_range, k, m_pool.scores(), + // true // must return unique results + // ); + // extract_completions(num_completions); + // } else { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, + // k); + // } + // } else { + // extract_completions(num_completions); + // } + + // return extract_strings(num_completions); + // } size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + @@ -277,9 +171,6 @@ struct autocomplete2 { visitor.visit(m_docid_to_lexid); } - // uint64_t heap_size; - // uint64_t checked_docids; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -329,7 +220,6 @@ struct autocomplete2 { for (; it.has_next(); ++it) { auto doc_id = *it; - // 
++checked_docids; auto lex_id = m_docid_to_lexid[doc_id]; uint32_t size = m_completions.extract(lex_id, completions[i]); for (uint32_t j = 0; j != size; ++j) { diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index a166d9f..b6b76b4 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -5,7 +5,6 @@ #include "compact_vector.hpp" #include "autocomplete_common.hpp" #include "scored_string_pool.hpp" -#include "min_heap.hpp" #include "constants.hpp" namespace autocomplete { @@ -25,8 +24,6 @@ struct autocomplete3 { min_priority_queue_type; autocomplete3() { - // heap_size = 0; - // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -37,202 +34,109 @@ struct autocomplete3 { typename Completions::builder cm_builder(params); typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& docid_to_lexid = cm_builder.docid_to_lexid(); m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), util::ceil_log2(params.num_completions + 1)); m_unsorted_docs_list.build( util::invert(docid_to_lexid, params.num_completions)); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin 
+= 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - uint32_t num_completions = 0; - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); + probe.stop(1); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); + probe.start(2); extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); - } + auto it = extract_strings(num_completions); + probe.stop(2); - extract_completions(num_completions); - return extract_strings(num_completions); + return it; } - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { + template 
+ iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); - timers[0].start(); + probe.start(0); init(); completion_type prefix; byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); + probe.start(1); uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); - } - timers[2].stop(); - - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); + 
num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + probe.stop(1); - // step 3 - timers[3].start(); + probe.start(2); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); - uint32_t num_completions = 0; + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); - // step 2 - timers[2].start(); - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); - timers[2].stop(); + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); + // if (num_completions < k) { + // num_completions = + // conjunctive_topk(num_terms, prefix, suffix_lex_range, k); + // } - return it; - } + // extract_completions(num_completions); + // return 
extract_strings(num_completions); + // } size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + @@ -251,9 +155,6 @@ struct autocomplete3 { visitor.visit(m_docid_to_lexid); } - // uint64_t heap_size; - // uint64_t checked_docids; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -282,11 +183,11 @@ struct autocomplete3 { } } - uint32_t conjunctive_topk(uint32_t num_terms, completion_type& prefix, + uint32_t conjunctive_topk(completion_type& prefix, const range suffix_lex_range, const uint32_t k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - return conjunctive_topk(it, suffix_lex_range, k); + if (prefix.size() == 0) { // we've got nothing to intersect + return heap_topk(m_inverted_index, suffix_lex_range, k, + m_pool.scores()); } deduplicate(prefix); if (prefix.size() == 1) { // we've got nothing to intersect @@ -310,13 +211,9 @@ struct autocomplete3 { } q.make_heap(); - // heap_size += q.size(); - uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; - // ++checked_docids; - while (!q.empty()) { auto& z = q.top(); auto val = *z; diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index 3006592..d478683 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -5,7 +5,6 @@ #include "compact_vector.hpp" #include "autocomplete_common.hpp" #include "scored_string_pool.hpp" -#include "min_heap.hpp" #include "constants.hpp" namespace autocomplete { @@ -18,8 +17,6 @@ struct autocomplete4 { typedef scored_string_pool::iterator iterator_type; autocomplete4() { - // heap_size = 0; - // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -30,194 +27,107 @@ struct autocomplete4 { typename Completions::builder cm_builder(params); typename Dictionary::builder di_builder(params); 
typename BlockedInvertedIndex::builder ii_builder(params, c); - auto const& docid_to_lexid = cm_builder.docid_to_lexid(); m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), util::ceil_log2(params.num_completions + 1)); m_unsorted_docs_list.build( util::invert(docid_to_lexid, params.num_completions)); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - uint32_t num_completions = - conjunctive_topk(prefix, suffix_lex_range, k); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string 
const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + probe.stop(1); - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - timers[2].stop(); - - timers[3].start(); + probe.start(2); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); + template + iterator_type conjunctive_topk(std::string const& query, const 
uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); - // step 1 - timers[1].start(); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); + conjunctive_topk(prefix, suffix_lex_range, k); + probe.stop(1); - // step 3 - timers[3].start(); + probe.start(2); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // parse(m_dictionary, query, prefix, suffix); - uint32_t num_completions = 0; + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); + // suffix_lex_range.begin 
+= 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); - // step 2 - timers[2].start(); - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - timers[2].stop(); + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); + // if (num_completions < k) { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + // } - return it; - } + // extract_completions(num_completions); + // return extract_strings(num_completions); + // } size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + @@ -236,9 +146,6 @@ struct autocomplete4 { visitor.visit(m_docid_to_lexid); } - // uint64_t heap_size; - // uint64_t checked_docids; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -275,13 +182,12 @@ struct autocomplete4 { } }; - typedef min_heap min_priority_queue_type; - uint32_t conjunctive_topk(completion_type& prefix, const range suffix, const uint32_t k) { auto& topk_scores = m_pool.scores(); - deduplicate(prefix); + typedef min_heap + min_priority_queue_type; min_priority_queue_type q; uint32_t current_block_id = m_inverted_index.block_id(suffix.begin); uint32_t current_block_boundary = @@ -298,43 +204,57 @@ struct autocomplete4 { q.push_back(m_inverted_index.block(current_block_id)); q.make_heap(); - // heap_size += q.size(); - - auto it = m_inverted_index.intersection_iterator(prefix, suffix); uint32_t results = 0; - for (; it.has_next() and !q.empty(); ++it) { - auto doc_id = *it; - // ++checked_docids; + auto check = [&](block_t& block, id_type doc_id) { + uint64_t pos = block.docs_iterator.position(); + assert(block.docs_iterator.access(pos) == doc_id); + uint64_t begin = block.offsets_iterator.access(pos); + uint64_t 
end = block.offsets_iterator.access(pos + 1); + assert(end > begin); + for (uint64_t i = begin; i != end; ++i) { + auto t = block.terms_iterator.access(i) + block.lower_bound; + if (t > suffix.end) break; + if (suffix.contains(t)) { + topk_scores[results++] = doc_id; + break; + } + } + }; + + if (prefix.size() == 0) { while (!q.empty()) { auto& z = q.top(); - auto val = z.docs_iterator.operator*(); - if (val > doc_id) break; - if (val < doc_id) { - val = z.docs_iterator.next_geq(doc_id); - if (!z.docs_iterator.has_next()) { - q.pop(); + auto doc_id = z.docs_iterator.operator*(); + check(z, doc_id); + if (results == k) return results; + z.docs_iterator.next(); + if (!z.docs_iterator.has_next()) q.pop(); + q.heapify(); + } + } else { + deduplicate(prefix); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); + for (; it.has_next() and !q.empty(); ++it) { + auto doc_id = *it; + while (!q.empty()) { + auto& z = q.top(); + auto val = z.docs_iterator.operator*(); + if (val > doc_id) break; + if (val < doc_id) { + val = z.docs_iterator.next_geq(doc_id); + if (!z.docs_iterator.has_next()) { + q.pop(); + } else { + q.heapify(); + } } else { - q.heapify(); - } - } else { - if (val == doc_id) { - uint64_t pos = z.docs_iterator.position(); - assert(z.docs_iterator.access(pos) == doc_id); - uint64_t begin = z.offsets_iterator.access(pos); - uint64_t end = z.offsets_iterator.access(pos + 1); - assert(end > begin); - for (uint64_t i = begin; i != end; ++i) { - auto t = z.terms_iterator.access(i) + z.lower_bound; - if (t > suffix.end) break; - if (suffix.contains(t)) { - topk_scores[results++] = doc_id; - if (results == k) return results; - break; - } + if (val == doc_id) { + check(z, doc_id); + if (results == k) return results; } + break; } - break; } } } diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index bd49934..f655d3f 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -1,14 +1,13 @@ #pragma 
once #include "util_types.hpp" +#include "min_heap.hpp" namespace autocomplete { template -uint32_t parse(Dictionary const& dict, std::string const& query, - completion_type& prefix, byte_range& suffix, - bool must_find_prefix = false) { - uint32_t num_terms = 1; // for suffix +bool parse(Dictionary const& dict, std::string const& query, + completion_type& prefix, byte_range& suffix, bool must_find_prefix) { byte_range_iterator it(string_to_byte_range(query)); while (true) { suffix = it.next(); @@ -16,12 +15,11 @@ uint32_t parse(Dictionary const& dict, std::string const& query, auto term_id = dict.locate(suffix); if (term_id != global::invalid_term_id) { prefix.push_back(term_id); - ++num_terms; } else { - if (must_find_prefix) return 0; + if (must_find_prefix) return false; } } - return num_terms; + return true; } void deduplicate(completion_type& c) { @@ -30,4 +28,36 @@ void deduplicate(completion_type& c) { c.resize(std::distance(c.begin(), end)); } +template +uint32_t heap_topk(InvertedIndex const& index, const range r, const uint32_t k, + std::vector& topk_scores) { + assert(r.is_valid()); + + typedef min_heap> + min_priority_queue_type; + + min_priority_queue_type q; + q.reserve(r.end - r.begin + 1); // inclusive range + assert(r.begin > 0); + for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) { + q.push_back(index.iterator(term_id - 1)); + } + q.make_heap(); + + uint32_t results = 0; + + while (!q.empty()) { + auto& z = q.top(); + auto doc_id = *z; + topk_scores[results++] = doc_id; + if (results == k) return results; + z.next(); + if (!z.has_next()) q.pop(); + q.heapify(); + } + + return results; +} + } // namespace autocomplete \ No newline at end of file diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 519a0bf..2f1af3a 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -273,48 +273,44 @@ struct blocked_inverted_index { , m_num_docs(ii->num_docs()) , m_suffix(r) { 
assert(r.is_valid()); - - if (!term_ids.empty()) { - assert(std::is_sorted(term_ids.begin(), term_ids.end())); - assert(std::unique(term_ids.begin(), term_ids.end()) == - term_ids.end()); - - m_blocks.reserve(term_ids.size()); // at most - uint32_t current_block_id = ii->block_id(term_ids.front()); - uint32_t i = 0; - uint32_t prev_i = 0; - for (; i != term_ids.size(); ++i) { - auto term_id = term_ids[i]; - assert(term_id > 0); - uint32_t b = ii->block_id(term_id); - if (b > current_block_id) { - auto block = ii->block(current_block_id); - block.term_ids.reserve(term_ids.size()); // at most - for (; prev_i != i; ++prev_i) { - block.term_ids.push_back(term_ids[prev_i]); - } - m_blocks.push_back(std::move(block)); + assert(!term_ids.empty()); + assert(std::is_sorted(term_ids.begin(), term_ids.end())); + assert(std::unique(term_ids.begin(), term_ids.end()) == + term_ids.end()); + + m_blocks.reserve(term_ids.size()); // at most + uint32_t current_block_id = ii->block_id(term_ids.front()); + uint32_t i = 0; + uint32_t prev_i = 0; + for (; i != term_ids.size(); ++i) { + auto term_id = term_ids[i]; + assert(term_id > 0); + uint32_t b = ii->block_id(term_id); + if (b > current_block_id) { + auto block = ii->block(current_block_id); + block.term_ids.reserve(term_ids.size()); // at most + for (; prev_i != i; ++prev_i) { + block.term_ids.push_back(term_ids[prev_i]); } - current_block_id = b; + m_blocks.push_back(std::move(block)); } + current_block_id = b; + } - auto block = ii->block(current_block_id); - block.term_ids.reserve(term_ids.size()); // at most - for (; prev_i != i; ++prev_i) { - block.term_ids.push_back(term_ids[prev_i]); - } - m_blocks.push_back(std::move(block)); + auto block = ii->block(current_block_id); + block.term_ids.reserve(term_ids.size()); // at most + for (; prev_i != i; ++prev_i) { + block.term_ids.push_back(term_ids[prev_i]); + } + m_blocks.push_back(std::move(block)); - std::sort(m_blocks.begin(), m_blocks.end(), - [](auto const& l, auto const& r) 
{ - return l.docs_iterator.size() < - r.docs_iterator.size(); - }); + std::sort(m_blocks.begin(), m_blocks.end(), + [](auto const& l, auto const& r) { + return l.docs_iterator.size() < + r.docs_iterator.size(); + }); - m_candidate = m_blocks[0].docs_iterator.access(0); - } else { - m_candidate = 0; - } + m_candidate = m_blocks[0].docs_iterator.access(0); next(); } @@ -329,12 +325,8 @@ struct blocked_inverted_index { void operator++() { assert(m_i == m_blocks.size()); - if (!m_blocks.empty()) { - if (m_blocks.size() > 1) { - m_candidate = m_blocks[0].docs_iterator.next(); - } - } else { - m_candidate += 1; + if (m_blocks.size() > 1) { + m_candidate = m_blocks[0].docs_iterator.next(); } m_i = 0; next(); @@ -375,7 +367,6 @@ struct blocked_inverted_index { } void next() { - if (m_blocks.empty()) return; if (m_blocks.size() == 1) { while (m_candidate < m_num_docs and m_i != m_blocks.size()) { assert(m_i == 0); diff --git a/include/probe.hpp b/include/probe.hpp new file mode 100644 index 0000000..955a939 --- /dev/null +++ b/include/probe.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include +#include "util_types.hpp" + +namespace autocomplete { + +struct nop_probe { + inline void start(uint64_t) {} + inline void stop(uint64_t) {} +}; + +struct timer_probe { + timer_probe(uint64_t n) + : m_timers(n) {} + + inline void start(uint64_t i) { + assert(i < m_timers.size()); + m_timers[i].start(); + } + + inline void stop(uint64_t i) { + assert(i < m_timers.size()); + m_timers[i].stop(); + } + + timer_type const& get(uint64_t i) { + assert(i < m_timers.size()); + return m_timers[i]; + } + +private: + std::vector m_timers; +}; + +} // namespace autocomplete diff --git a/include/util_types.hpp b/include/util_types.hpp index e056bb6..531e65d 100644 --- a/include/util_types.hpp +++ b/include/util_types.hpp @@ -242,25 +242,4 @@ struct timer { typedef timer timer_type; -struct iterator { - iterator(id_type begin, id_type end) - : m_begin(begin) - , m_end(end) {} - - bool has_next() const { - 
return m_begin < m_end; - } - - id_type operator*() const { - return m_begin; - } - - void operator++() { - ++m_begin; - } - -private: - id_type m_begin, m_end; -}; - } // namespace autocomplete diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 576f34b..1c5a82d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,5 +2,5 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) -add_executable(check_topk check_topk.cpp) +# add_executable(check_topk check_topk.cpp) add_executable(map_queries map_queries.cpp) \ No newline at end of file diff --git a/src/web_server.cpp b/src/web_server.cpp index 7a0a61c..db317fa 100644 --- a/src/web_server.cpp +++ b/src/web_server.cpp @@ -5,6 +5,7 @@ #include "constants.hpp" #include "types.hpp" +#include "probe.hpp" #include "../external/mongoose/mongoose.h" @@ -53,9 +54,10 @@ static void ev_handler(struct mg_connection* nc, int ev, void* p) { } std::string data; - auto it = topk_index.topk(query, k); - // auto it = topk_index.prefix_topk(query, k); - // auto it = topk_index.conjunctive_topk(query, k); + nop_probe probe; + // auto it = topk_index.topk(query, k probe); + // auto it = topk_index.prefix_topk(query, k, probe); + auto it = topk_index.conjunctive_topk(query, k, probe); if (it.empty()) { data = "{\"suggestions\":[\"value\":\"\",\"data\":\"\"]}\n"; } else { diff --git a/test/test_autocomplete.cpp b/test/test_autocomplete.cpp index 964a451..8fe49cc 100644 --- a/test/test_autocomplete.cpp +++ b/test/test_autocomplete.cpp @@ -36,8 +36,9 @@ TEST_CASE("test autocomplete topk functions") { "florir", "fly", "the starting l", "floridaaa"}; + nop_probe probe; for (auto& query : queries) { - auto it = index.prefix_topk(query, k); + auto it = index.prefix_topk(query, k, probe); std::cout << "top-" << it.size() << " completions for '" << query << "':\n"; for (uint32_t i 
= 0; i != it.size(); ++i, ++it) { @@ -61,8 +62,9 @@ TEST_CASE("test autocomplete topk functions") { "fo", "f", "matt", "fl", "flor", "fly", "the starting l"}; + nop_probe probe; for (auto& query : queries) { - auto it = index.conjunctive_topk(query, k); + auto it = index.conjunctive_topk(query, k, probe); std::cout << "top-" << it.size() << " completions for '" << query << "':\n"; for (uint32_t i = 0; i != it.size(); ++i, ++it) { diff --git a/test/test_common.hpp b/test/test_common.hpp index 24f4540..c17283f 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -6,6 +6,7 @@ #include #include "types.hpp" +#include "probe.hpp" #include "../benchmark/benchmark_common.hpp" namespace autocomplete { diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index ae99a6b..1a81693 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -12,7 +12,7 @@ void test_locate_prefix(Dictionary const& dict, Index const& index, range expected = testing::locate_prefix(strings, query); completion_type prefix; byte_range suffix; - parse(dict, query, prefix, suffix); + parse(dict, query, prefix, suffix, true); range suffix_lex_range = dict.locate_prefix(suffix); suffix_lex_range.begin += 1; From 9adfac2a91b668f5022b0dfad5780b55d532fc79 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 11:13:02 +0100 Subject: [PATCH 056/102] up --- benchmark/benchmark_common.hpp | 2 +- benchmark/benchmark_conjunctive_topk.cpp | 4 ++++ script/collect_results_by_varying_percentage.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index e7f9160..14dca8a 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -6,7 +6,7 @@ namespace autocomplete { namespace benchmarking { -static const uint32_t runs = 1; +static const uint32_t runs = 5; } // void tolower(std::string& str) { diff --git a/benchmark/benchmark_conjunctive_topk.cpp 
b/benchmark/benchmark_conjunctive_topk.cpp index ae73512..5ab37b7 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -32,6 +32,8 @@ void benchmark(std::string const& index_filename, uint32_t k, } } std::cout << "#ignore: " << reported_strings << std::endl; + breakdowns.add("reported_strings", + std::to_string(reported_strings / benchmarking::runs)); breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(probe.get(0).elapsed()))); breakdowns.add("conjunctive_search_musec_per_query", @@ -50,6 +52,8 @@ void benchmark(std::string const& index_filename, uint32_t k, } timer.stop(); std::cout << "#ignore: " << reported_strings << std::endl; + breakdowns.add("reported_strings", + std::to_string(reported_strings / benchmarking::runs)); breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); } diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index 48a7dd1..f268443 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -19,6 +19,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(2,8): # (1,8) + for terms in range(1,8): os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) From 2732f639f253dbd4c27d9cf06d30bdbdc42b36c2 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 12:27:39 +0100 Subject: [PATCH 057/102] refactored benchmarking suite --- 
README.md | 6 +- benchmark/benchmark_common.hpp | 84 ++++++++++++++++- benchmark/benchmark_conjunctive_topk.cpp | 93 +------------------ benchmark/benchmark_prefix_topk.cpp | 89 +----------------- benchmark/benchmark_topk.cpp | 90 +----------------- ...ctiveness_results_by_varying_percentage.py | 3 +- .../collect_results_by_varying_percentage.py | 10 +- 7 files changed, 94 insertions(+), 281 deletions(-) diff --git a/README.md b/README.md index ce69cb7..12b6328 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,11 @@ in preparing the data for indexing. Thus, from within the directory `test_data`, it is sufficient to do: - bash preprocess.sh 300 + bash preprocess.sh + +Therefore, for our example with `trec_05_efficiency_queries`, it would be: + + bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 The second argument in the example, i.e., 300, represents the number of completions (per completion size) that are drawn at diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 14dca8a..1a96333 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -43,7 +43,89 @@ void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { parser.add("percentage", "A float in [0,1] specifying how much we keep of the last token " "in a query: n x 100 <=> n%, for n in [0,1]."); - parser.add("breakdown", "Collect timings breakdown.", "--breakdown"); } +#define BENCHMARK(what) \ + template \ + void benchmark(std::string const& index_filename, uint32_t k, \ + uint32_t max_num_queries, float keep, \ + essentials::json_lines& breakdowns) { \ + Index index; \ + essentials::load(index, index_filename.c_str()); \ + \ + std::vector queries; \ + uint32_t num_queries = \ + load_queries(queries, max_num_queries, keep, std::cin); \ + \ + uint64_t reported_strings = 0; \ + auto musec_per_query = [&](double time) { \ + return time / (benchmarking::runs * num_queries); \ + }; \ + \ + 
breakdowns.add("num_queries", std::to_string(num_queries)); \ + \ + timer_probe probe(3); \ + for (uint32_t run = 0; run != benchmarking::runs; ++run) { \ + for (auto const& query : queries) { \ + auto it = index.what##topk(query, k, probe); \ + reported_strings += it.size(); \ + } \ + } \ + std::cout << "#ignore: " << reported_strings << std::endl; \ + \ + breakdowns.add("reported_strings", \ + std::to_string(reported_strings / benchmarking::runs)); \ + breakdowns.add( \ + "parsing_musec_per_query", \ + std::to_string(musec_per_query(probe.get(0).elapsed()))); \ + breakdowns.add( \ + std::string(#what) + "search_musec_per_query", \ + std::to_string(musec_per_query(probe.get(1).elapsed()))); \ + breakdowns.add( \ + "reporting_musec_per_query", \ + std::to_string(musec_per_query(probe.get(2).elapsed()))); \ + breakdowns.add( \ + "total_musec_per_query", \ + std::to_string(musec_per_query(probe.get(0).elapsed()) + \ + musec_per_query(probe.get(1).elapsed()) + \ + musec_per_query(probe.get(2).elapsed()))); \ + } \ + \ + int main(int argc, char** argv) { \ + cmd_line_parser::parser parser(argc, argv); \ + configure_parser_for_benchmarking(parser); \ + if (!parser.parse()) return 1; \ + \ + auto type = parser.get("type"); \ + auto k = parser.get("k"); \ + auto index_filename = parser.get("index_filename"); \ + auto max_num_queries = parser.get("max_num_queries"); \ + auto keep = parser.get("percentage"); \ + \ + essentials::json_lines breakdowns; \ + breakdowns.new_line(); \ + breakdowns.add("num_terms_per_query", \ + parser.get("num_terms_per_query")); \ + breakdowns.add("percentage", std::to_string(keep)); \ + \ + if (type == "ef_type1") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type2") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type3") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type4") { \ 
+ benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else { \ + return 1; \ + } \ + \ + breakdowns.print(); \ + return 0; \ + } + } // namespace autocomplete \ No newline at end of file diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index 5ab37b7..df14c84 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -4,95 +4,4 @@ #include "benchmark_common.hpp" using namespace autocomplete; - -template -void benchmark(std::string const& index_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; - essentials::load(index, index_filename.c_str()); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (benchmarking::runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - timer_probe probe(3); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k, probe); - reported_strings += it.size(); - } - } - std::cout << "#ignore: " << reported_strings << std::endl; - breakdowns.add("reported_strings", - std::to_string(reported_strings / benchmarking::runs)); - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(probe.get(0).elapsed()))); - breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(probe.get(1).elapsed()))); - breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(probe.get(2).elapsed()))); - } else { - essentials::timer_type timer; - nop_probe probe; - timer.start(); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k, 
probe); - reported_strings += it.size(); - } - } - timer.stop(); - std::cout << "#ignore: " << reported_strings << std::endl; - breakdowns.add("reported_strings", - std::to_string(reported_strings / benchmarking::runs)); - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - auto k = parser.get("k"); - auto index_filename = parser.get("index_filename"); - auto max_num_queries = parser.get("max_num_queries"); - auto keep = parser.get("percentage"); - auto breakdown = parser.get("breakdown"); - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", - parser.get("num_terms_per_query")); - breakdowns.add("percentage", std::to_string(keep)); - - if (type == "ef_type1") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file +BENCHMARK(conjunctive_) \ No newline at end of file diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index f09d3dc..69a0bc1 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -4,91 +4,4 @@ #include "benchmark_common.hpp" using namespace autocomplete; - -template -void benchmark(std::string const& index_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; 
- essentials::load(index, index_filename.c_str()); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (benchmarking::runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - timer_probe probe(3); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, probe); - reported_strings += it.size(); - } - } - std::cout << "#ignore: " << reported_strings << std::endl; - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(probe.get(0).elapsed()))); - breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(probe.get(1).elapsed()))); - breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(probe.get(2).elapsed()))); - } else { - essentials::timer_type timer; - nop_probe probe; - timer.start(); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, probe); - reported_strings += it.size(); - } - } - timer.stop(); - std::cout << reported_strings << std::endl; - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - auto k = parser.get("k"); - auto index_filename = parser.get("index_filename"); - auto max_num_queries = parser.get("max_num_queries"); - auto keep = parser.get("percentage"); - auto breakdown = parser.get("breakdown"); - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", - parser.get("num_terms_per_query")); - breakdowns.add("percentage", 
std::to_string(keep)); - - if (type == "ef_type1") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file +BENCHMARK(prefix_) \ No newline at end of file diff --git a/benchmark/benchmark_topk.cpp b/benchmark/benchmark_topk.cpp index 0ea1e97..98d208c 100644 --- a/benchmark/benchmark_topk.cpp +++ b/benchmark/benchmark_topk.cpp @@ -4,92 +4,4 @@ #include "benchmark_common.hpp" using namespace autocomplete; - -template -void benchmark(std::string const& index_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; - essentials::load(index, index_filename.c_str()); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = index.topk(query, k, timers); - reported_strings += it.size(); - } - } - std::cout << reported_strings << std::endl; - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); - breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); - 
breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); - } else { - essentials::timer_type timer; - timer.start(); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = index.topk(query, k); - reported_strings += it.size(); - } - } - timer.stop(); - std::cout << reported_strings << std::endl; - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - auto k = parser.get("k"); - auto index_filename = parser.get("index_filename"); - auto max_num_queries = parser.get("max_num_queries"); - auto keep = parser.get("percentage"); - auto breakdown = parser.get("breakdown"); - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", - parser.get("num_terms_per_query")); - breakdowns.add("percentage", std::to_string(keep)); - - if (type == "ef_type1") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file +BENCHMARK("") \ No newline at end of file diff --git a/script/collect_effectiveness_results_by_varying_percentage.py b/script/collect_effectiveness_results_by_varying_percentage.py index 4fc7683..b1cfe40 100644 --- a/script/collect_effectiveness_results_by_varying_percentage.py +++ b/script/collect_effectiveness_results_by_varying_percentage.py @@ -7,12 
+7,11 @@ num_queries = sys.argv[5] output_filename = collection_basename + "." + index_type - output_filename += ".effectiveness.json" query_filename_prefix = collection_basename + ".queries/queries." percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(2,8): # (1,8) + for terms in range(1,8): os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index f268443..d565689 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -8,17 +8,11 @@ num_queries = sys.argv[6] output_filename = collection_basename + "." + index_type - -breakdown = "" -if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": - breakdown = "--breakdown" - output_filename += ".breakdown" - output_filename += "." + query_mode + ".json" query_filename_prefix = collection_basename + ".queries/queries." 
percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: for terms in range(1,8): - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) From 58f6cb187f3b037e797e9b65abcd168b6c6729b3 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 15:13:35 +0100 Subject: [PATCH 058/102] scripts updated --- script/collect_effectiveness_results_by_varying_percentage.py | 4 ++-- script/collect_results_by_varying_percentage.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/script/collect_effectiveness_results_by_varying_percentage.py b/script/collect_effectiveness_results_by_varying_percentage.py index b1cfe40..2693e70 100644 --- a/script/collect_effectiveness_results_by_varying_percentage.py +++ b/script/collect_effectiveness_results_by_varying_percentage.py @@ -12,6 +12,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(1,8): + for terms in range(1,7): os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + 
str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) - os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index d565689..c639032 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -13,6 +13,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(1,8): + for terms in range(1,7): os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename) From 05298907a0342eb0dd35b9ab8df853fa7049633b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 15:15:47 +0100 Subject: [PATCH 059/102] scripts updated --- test_data/filter_and_preprocess.sh | 2 +- test_data/filter_dataset.py | 4 ++-- test_data/partition_queries_by_length.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/test_data/filter_and_preprocess.sh b/test_data/filter_and_preprocess.sh index 38425d7..9a5d787 100644 --- a/test_data/filter_and_preprocess.sh +++ b/test_data/filter_and_preprocess.sh @@ -3,7 +3,7 @@ echo $1 # input filename # number of completions to exclude per completion size, -# e.g., if it is 100, then at most 8 x 100 completions are filtered out +# e.g., if it is 100, then at most 7 x 100 completions are filtered out echo $2 python partition_queries_by_length.py $1 $1.filtered.queries $2 diff --git a/test_data/filter_dataset.py b/test_data/filter_dataset.py index 4481cbe..dc68a28 100644 --- a/test_data/filter_dataset.py +++ b/test_data/filter_dataset.py @@ -6,12 +6,12 @@ to_filter = Set({}) print("loading strings to filter...") -for i in range(1,8): +for i in range(1,7): with open(queries_directory + "/queries.length=" + str(i)) as f: for line in f: s = line.rstrip('\n') to_filter.add(s) -with open(queries_directory + "/queries.length=8+") as f: +with open(queries_directory + "/queries.length=7+") as f: for line in f: s = line.rstrip('\n') to_filter.add(s) diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index eb9b95d..3d3823b 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -7,7 +7,7 @@ if not os.path.exists(output_directory): os.makedirs(output_directory) -num_shards = 7 +num_shards = 6 files = [open(output_directory + "/queries.length=" + str(i), "w") for i in range(1,num_shards + 1)] all_others = open(output_directory + "/queries.length=" + str(num_shards + 1) + "+", "w") From bac98f457efff320e881bf31defa6a163de790b9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 15:17:02 +0100 Subject: [PATCH 060/102] readme updated --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 12b6328..1247d50 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,8 @@ to partition the 
input completions by number of query terms and retain 300 queries at random. Query files are placed in the output directory `trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries`. -(By default, 8 shards will be created: the ones having [1,7] query terms and -the one collecting all completions with >= 8 query terms). +(By default, 7 shards will be created: the ones having [1,6] query terms and +the one collecting all completions with *at least* 7 query terms). Then the command @@ -171,8 +171,6 @@ From within the `/build` directory, run python ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 -You can also specify the option `--breakdown` to record timings breakdowns. - To benchmark the dictionaries (Front-Coding and trie), just run the following script from within the `script` directory: From 13c04333110ef2eac4f948d86d434f9fa9bb0f8e Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 16 Mar 2020 17:59:21 +0100 Subject: [PATCH 061/102] minimal_docids --- include/autocomplete.hpp | 20 ++--- include/autocomplete2.hpp | 17 ++-- include/autocomplete3.hpp | 5 +- include/autocomplete4.hpp | 4 +- include/autocomplete_common.hpp | 13 ++- include/compact_forward_index.hpp | 1 + include/minimal_docids.hpp | 131 ++++++++++++++++++++++++++++++ include/scored_string_pool.hpp | 5 ++ include/statistics.hpp | 26 +++--- include/types.hpp | 46 ++--------- include/unsorted_list.hpp | 74 ++++++++--------- include/util_types.hpp | 49 +++++++++++ test/test_unsorted_list.cpp | 12 +-- 13 files changed, 275 insertions(+), 128 deletions(-) create mode 100644 include/minimal_docids.hpp diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index f55b9e5..78e54ad 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -7,8 +7,8 @@ namespace autocomplete { -template +template struct autocomplete { typedef 
scored_string_pool::iterator iterator_type; @@ -22,8 +22,10 @@ struct autocomplete { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); typename ForwardIndex::builder fi_builder(params); + m_unsorted_docs_list.build(cm_builder.doc_ids()); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); + cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); @@ -82,15 +84,8 @@ struct autocomplete { uint32_t num_completions = 0; if (prefix.size() == 0) { suffix_lex_range.end += 1; - constexpr bool must_return_unique_results = true; num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - must_return_unique_results); - if (num_completions < k) { - suffix_lex_range.begin += 1; - num_completions = heap_topk(m_inverted_index, suffix_lex_range, - k, m_pool.scores()); - } + m_inverted_index, suffix_lex_range, k, m_pool.scores()); } else { suffix_lex_range.begin += 1; suffix_lex_range.end += 1; @@ -162,8 +157,9 @@ struct autocomplete { private: Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - UnsortedDocsList m_unsorted_minimal_docs_list; + unsorted_list_type m_unsorted_docs_list; + typedef minimal_docids minimal_docids_type; + minimal_docids_type m_unsorted_minimal_docs_list; Dictionary m_dictionary; InvertedIndex m_inverted_index; ForwardIndex m_forward_index; diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index f713043..eb3f994 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -9,8 +9,7 @@ namespace autocomplete { -template +template struct autocomplete2 { typedef scored_string_pool::iterator iterator_type; @@ -89,15 +88,8 @@ struct autocomplete2 { uint32_t num_completions = 0; if (prefix.size() == 0) { suffix_lex_range.end += 1; - constexpr bool must_return_unique_results = true; num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - 
must_return_unique_results); - if (num_completions < k) { - suffix_lex_range.begin += 1; - num_completions = heap_topk(m_inverted_index, suffix_lex_range, - k, m_pool.scores()); - } + m_inverted_index, suffix_lex_range, k, m_pool.scores()); extract_completions(num_completions); } else { suffix_lex_range.begin += 1; @@ -173,8 +165,9 @@ struct autocomplete2 { private: Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - UnsortedDocsList m_unsorted_minimal_docs_list; + unsorted_list_type m_unsorted_docs_list; + typedef minimal_docids minimal_docids_type; + minimal_docids_type m_unsorted_minimal_docs_list; Dictionary m_dictionary; InvertedIndex m_inverted_index; compact_vector m_docid_to_lexid; diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index b6b76b4..6765ad6 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -15,8 +15,7 @@ one iterator for each termID in the lexicographic range of the last token of the query. */ -template +template struct autocomplete3 { typedef scored_string_pool::iterator iterator_type; typedef min_heap struct autocomplete4 { typedef scored_string_pool::iterator iterator_type; @@ -148,7 +148,7 @@ struct autocomplete4 { private: Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; + unsorted_list_type m_unsorted_docs_list; Dictionary m_dictionary; BlockedInvertedIndex m_inverted_index; compact_vector m_docid_to_lexid; diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index f655d3f..21d952b 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -2,9 +2,14 @@ #include "util_types.hpp" #include "min_heap.hpp" +#include "unsorted_list.hpp" +#include "minimal_docids.hpp" +#include "succinct_rmq/cartesian_tree.hpp" namespace autocomplete { +typedef unsorted_list unsorted_list_type; + template bool parse(Dictionary const& dict, std::string const& query, completion_type& prefix, byte_range& suffix, bool must_find_prefix) 
{ @@ -50,8 +55,12 @@ uint32_t heap_topk(InvertedIndex const& index, const range r, const uint32_t k, while (!q.empty()) { auto& z = q.top(); auto doc_id = *z; - topk_scores[results++] = doc_id; - if (results == k) return results; + bool alread_present = std::binary_search( + topk_scores.begin(), topk_scores.begin() + results, doc_id); + if (!alread_present) { + topk_scores[results++] = doc_id; + if (results == k) return results; + } z.next(); if (!z.has_next()) q.pop(); q.heapify(); diff --git a/include/compact_forward_index.hpp b/include/compact_forward_index.hpp index 21aaa7c..50267f4 100644 --- a/include/compact_forward_index.hpp +++ b/include/compact_forward_index.hpp @@ -104,6 +104,7 @@ struct compact_forward_index { }; forward_list_iterator_type iterator(id_type doc_id) { + assert(doc_id < num_docs()); uint64_t pos = m_pointers.access(doc_id); uint64_t n = m_pointers.access(doc_id + 1) - pos; return {m_data, pos, n}; diff --git a/include/minimal_docids.hpp b/include/minimal_docids.hpp new file mode 100644 index 0000000..a7cb8f8 --- /dev/null +++ b/include/minimal_docids.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include "compact_vector.hpp" +#include "util_types.hpp" + +namespace autocomplete { + +template +struct minimal_docids { + static const uint32_t SCAN_THRESHOLD = 64; + typedef scored_range_with_list_iterator< + typename InvertedIndex::iterator_type> + range_type; + typedef scored_range_with_list_iterator_comparator< + typename range_type::iterator_type> + comparator_range_type; + + minimal_docids() {} + + void build(std::vector const& list) { + essentials::logger("building minimal_docids..."); + m_rmq.build(list, std::less()); + m_list.build(list.begin(), list.size()); + essentials::logger("DONE"); + } + + uint32_t topk(InvertedIndex const& index, const range r, const uint32_t k, + std::vector& topk_scores) { + range_type sr; + sr.r = {r.begin, r.end - 1}; // rmq needs inclusive ranges + sr.min_pos = m_rmq.rmq(sr.r.begin, sr.r.end); + sr.min_val = 
m_list.access(sr.min_pos); + + m_q.clear(); + m_q.push(sr); + + uint32_t results = 0; + while (!m_q.empty()) { + auto& min = m_q.top(); + auto docid = min.minimum(); + bool alread_present = std::binary_search( + topk_scores.begin(), topk_scores.begin() + results, docid); + if (!alread_present) { + topk_scores[results++] = docid; + if (results == k) break; + } + + if (min.is_open()) { + min.iterator.next(); + if (!min.iterator.has_next()) { + m_q.pop(); + } + m_q.heapify(); + } else { + // save + auto min_range = min.r; + auto min_pos = min.min_pos; + + min.set_iterator(index); + min.iterator.next(); + if (!min.iterator.has_next()) { + m_q.pop(); + } + + m_q.heapify(); + + if (min_pos > 0 and min_pos - 1 >= min_range.begin) { + range_type left; + left.r = {min_range.begin, min_pos - 1}; + if (left.r.end - left.r.begin <= SCAN_THRESHOLD) { + left.min_pos = rmq(left.r.begin, left.r.end); + } else { + left.min_pos = m_rmq.rmq(left.r.begin, left.r.end); + } + left.min_val = m_list.access(left.min_pos); + m_q.push(left); + } + + if (min_pos < size() - 1 and min_range.end >= min_pos + 1) { + range_type right; + right.r = {min_pos + 1, min_range.end}; + if (right.r.end - right.r.begin <= SCAN_THRESHOLD) { + right.min_pos = rmq(right.r.begin, right.r.end); + } else { + right.min_pos = m_rmq.rmq(right.r.begin, right.r.end); + } + right.min_val = m_list.access(right.min_pos); + m_q.push(right); + } + } + } + + return results; + } + + size_t size() const { + return m_list.size(); + } + + size_t bytes() const { + return m_rmq.bytes() + m_list.bytes(); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_rmq); + visitor.visit(m_list); + } + +private: + typedef min_heap min_priority_queue_type; + min_priority_queue_type m_q; + + RMQ m_rmq; + compact_vector m_list; + + uint64_t rmq(uint64_t lo, uint64_t hi) { // inclusive endpoints + uint64_t pos = lo; + id_type min = id_type(-1); + for (uint64_t i = lo; i <= hi; ++i) { + id_type val = m_list.access(i); + if (val < 
min) { + min = val; + pos = i; + } + } + return pos; + } +}; + +} // namespace autocomplete \ No newline at end of file diff --git a/include/scored_string_pool.hpp b/include/scored_string_pool.hpp index c679aeb..3f03f06 100644 --- a/include/scored_string_pool.hpp +++ b/include/scored_string_pool.hpp @@ -4,6 +4,11 @@ namespace autocomplete { +struct scored_byte_range { + byte_range string; + id_type score; +}; + struct scored_string_pool { void init() { push_back_offset(0); diff --git a/include/statistics.hpp b/include/statistics.hpp index aa1fbe0..42654ae 100644 --- a/include/statistics.hpp +++ b/include/statistics.hpp @@ -40,9 +40,9 @@ void completion_trie::print_stats() print_bps("sizes", sizes_bytes(), size()); } -template -void autocomplete +void autocomplete::print_stats() const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) @@ -86,10 +86,9 @@ void autocomplete -void autocomplete2::print_stats() const { +template +void autocomplete2::print_stats() + const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) << " [MiB]: " @@ -122,10 +121,9 @@ void autocomplete2 -void autocomplete3::print_stats() const { +template +void autocomplete3::print_stats() + const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) << " [MiB]: " @@ -148,10 +146,10 @@ void autocomplete3 -void autocomplete4::print_stats() const { +void autocomplete4::print_stats() + const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) << " [MiB]: " diff --git a/include/types.hpp b/include/types.hpp index 6481276..659199d 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -3,75 +3,45 @@ #include "completion_trie.hpp" #include "fc_dictionary.hpp" #include "integer_fc_dictionary.hpp" -#include "unsorted_list.hpp" - -// #include "uint_vec.hpp" -// #include 
"uncompressed_list.hpp" - #include "compact_forward_index.hpp" - #include "inverted_index.hpp" #include "blocked_inverted_index.hpp" - #include "autocomplete.hpp" #include "autocomplete2.hpp" #include "autocomplete3.hpp" #include "autocomplete4.hpp" - #include "compact_vector.hpp" #include "ef/ef_sequence.hpp" #include "ef/compact_ef.hpp" -#include "succinct_rmq/cartesian_tree.hpp" namespace autocomplete { typedef uint_vec uint32_vec; typedef uint_vec uint64_vec; -// typedef completion_trie -// uint64_completion_trie; - typedef completion_trie ef_completion_trie; - typedef fc_dictionary<> fc_dictionary_type; typedef integer_fc_dictionary<> integer_fc_dictionary_type; - -typedef unsorted_list succinct_rmq; -// typedef uncompressed_list uncompressed_list32_t; - -// typedef inverted_index uncompressed_inverted_index; typedef inverted_index ef_inverted_index; - -// typedef blocked_inverted_index -// uncompressed_blocked_inverted_index; typedef blocked_inverted_index ef_blocked_inverted_index; -// typedef autocomplete -// uncompressed_autocomplete_type; - -// typedef autocomplete2 -// uncompressed_autocomplete_type2; - /* compressed indexes */ -typedef autocomplete +typedef autocomplete ef_autocomplete_type1; -typedef autocomplete2 +typedef autocomplete2 ef_autocomplete_type2; -typedef autocomplete3 +typedef autocomplete3 ef_autocomplete_type3; -typedef autocomplete4 +typedef autocomplete4 ef_autocomplete_type4; } // namespace autocomplete \ No newline at end of file diff --git a/include/unsorted_list.hpp b/include/unsorted_list.hpp index e7cfddd..bb06a86 100644 --- a/include/unsorted_list.hpp +++ b/include/unsorted_list.hpp @@ -1,48 +1,10 @@ #pragma once #include "compact_vector.hpp" +#include "util_types.hpp" namespace autocomplete { -struct scored_byte_range { - byte_range string; - id_type score; -}; - -typedef std::function - scored_range_comparator_type; -scored_range_comparator_type scored_range_comparator = - [](scored_range const& l, scored_range const& r) { - 
return l.min_val > r.min_val; - }; - -struct topk_queue { - void push(scored_range sr) { - m_q.push_back(sr); - std::push_heap(m_q.begin(), m_q.end(), scored_range_comparator); - } - - scored_range top() { - return m_q.front(); - } - - void pop() { - std::pop_heap(m_q.begin(), m_q.end(), scored_range_comparator); - m_q.pop_back(); - } - - void clear() { - m_q.clear(); - } - - bool empty() const { - return m_q.empty(); - } - -private: - std::vector m_q; -}; - template struct unsorted_list { static const uint32_t SCAN_THRESHOLD = 64; @@ -132,6 +94,40 @@ struct unsorted_list { } private: + struct topk_queue { + void push(scored_range sr) { + m_q.push_back(sr); + std::push_heap(m_q.begin(), m_q.end(), m_comparator); + } + + scored_range top() { + return m_q.front(); + } + + void pop() { + std::pop_heap(m_q.begin(), m_q.end(), m_comparator); + m_q.pop_back(); + } + + void clear() { + m_q.clear(); + } + + bool empty() const { + return m_q.empty(); + } + + private: + std::vector m_q; + + typedef std::function + scrored_range_comparator_type; + scrored_range_comparator_type m_comparator = [](scored_range const& l, + scored_range const& r) { + return scored_range::greater(l, r); + }; + }; + topk_queue m_q; RMQ m_rmq; compact_vector m_list; diff --git a/include/util_types.hpp b/include/util_types.hpp index 531e65d..0890002 100644 --- a/include/util_types.hpp +++ b/include/util_types.hpp @@ -62,6 +62,55 @@ struct scored_range { range r; uint32_t min_pos; id_type min_val; + + static bool greater(scored_range const& l, scored_range const& r) { + return l.min_val > r.min_val; + } +}; + +template +struct scored_range_with_list_iterator { + typedef Iterator iterator_type; + + scored_range_with_list_iterator() + : min_pos(global::invalid_term_id) + , m_open(false) {} + + range r; + uint32_t min_pos; + id_type min_val; + Iterator iterator; + + bool is_open() const { + return m_open; + } + + template + void set_iterator(InvertedIndex const& index) { + assert(min_pos != 
global::invalid_term_id); + m_open = true; + iterator = index.iterator(min_pos); + } + + id_type minimum() const { + return is_open() ? *iterator : min_val; + } + + // static bool greater(scored_range_with_list_iterator const& l, + // scored_range_with_list_iterator const& r) { + // return l.minimum() > r.minimum(); + // } + +private: + bool m_open; +}; + +template +struct scored_range_with_list_iterator_comparator { + bool operator()(scored_range_with_list_iterator const& l, + scored_range_with_list_iterator const& r) { + return l.minimum() > r.minimum(); + } }; struct byte_range { diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 8b1ce0f..2760532 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -74,14 +74,14 @@ TEST_CASE("test unsorted_list on doc_ids") { // } // } - succinct_rmq list; + unsorted_list_type list; list.build(doc_ids); REQUIRE(list.size() == doc_ids.size()); - essentials::save(list, output_filename); + essentials::save(list, output_filename); } { - succinct_rmq list; + unsorted_list_type list; essentials::load(list, output_filename); std::vector topk(constants::MAX_K); @@ -137,14 +137,14 @@ TEST_CASE("test unsorted_list on minimal doc_ids") { input.close(); REQUIRE(doc_ids.size() == params.num_terms); - succinct_rmq list; + unsorted_list_type list; list.build(doc_ids); REQUIRE(list.size() == doc_ids.size()); - essentials::save(list, output_filename); + essentials::save(list, output_filename); } { - succinct_rmq list; + unsorted_list_type list; essentials::load(list, output_filename); std::vector topk(constants::MAX_K); From cba9599c74f84cbd6ef375abf1e957d809fab0da Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 12 May 2020 11:12:38 +0200 Subject: [PATCH 062/102] updated README with paper information --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1247d50..2ab2e1c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ 
Autocomplete ------------ -Query autocompletion in C++. +A Query Auto-Completion system based on the paper [Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf), by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +published in ACM SIGIR 2020. ##### Table of contents 1. [Installation and quick start](#install) From ff80c3ac41e3bc5b25a7457089d115a6fa4d413f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 12 May 2020 11:14:02 +0200 Subject: [PATCH 063/102] updated README with paper information --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ab2e1c..df9c825 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Autocomplete ------------ -A Query Auto-Completion system based on the paper [Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf), by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, published in ACM SIGIR 2020. 
##### Table of contents From 8154ee69aafe1aad5e103e1994f4440536215b18 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 21:48:36 +0530 Subject: [PATCH 064/102] ci: github actions setup Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 37 ++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/continuous_integration.yml diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml new file mode 100644 index 0000000..7daf050 --- /dev/null +++ b/.github/workflows/continuous_integration.yml @@ -0,0 +1,37 @@ +name: Continuous Integration + +on: + [ push,pull_request ] + +jobs: + build: + name: Continuous Integration + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + steps: + + - name: Checkout code + uses: actions/checkout@v2 + + - name: Checkout submodules + run: git submodule update --init --recursive + + - name: Check cmake version + run: cmake --version + + - name: Creating build directory + run: cmake -E make_directory ./build + + - name: Precompilation + working-directory: ./build + run: cmake .. -DCMAKE_BUILD_TYPE=Release + + - name: Compilation + working-directory: ./build + run: cmake --build . --config Release + + - name: Testing + working-directory: ./build + run: ctest From bb224284fadb62adecdcec33e01a0c341e5bb3bb Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 21:57:51 +0530 Subject: [PATCH 065/102] ci: added data preprocessing step Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 7daf050..b044478 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -32,6 +32,10 @@ jobs: working-directory: ./build run: cmake --build . 
--config Release + - name: Data preprocessing + working-directory: ./test_data + run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 + - name: Testing working-directory: ./build run: ctest From 36a78e5799fe8b7bd1de12733780da1f8a3bb1e7 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 22:07:42 +0530 Subject: [PATCH 066/102] ci: using python2 for data preprocessing Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index b044478..8cd6890 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -33,6 +33,10 @@ jobs: run: cmake --build . --config Release - name: Data preprocessing + uses: actions/setup-python@v2 + with: + python-version: 2.x + architecture: x64 working-directory: ./test_data run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 From e3ed47846cc2d7f817cc01f455bdd5ff3c0faae2 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 22:11:02 +0530 Subject: [PATCH 067/102] ci: fixed yml error Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 8cd6890..c2c76d8 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -32,11 +32,13 @@ jobs: working-directory: ./build run: cmake --build . 
--config Release - - name: Data preprocessing + - name: Setup python uses: actions/setup-python@v2 - with: - python-version: 2.x - architecture: x64 + with: + python-version: 2.x + architecture: x64 + + - name: Data preprocessing working-directory: ./test_data run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 From 57be3895383f89f22fdc75b3c92a12ef201a1630 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 22:14:31 +0530 Subject: [PATCH 068/102] ci: fixed yaml indentation Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index c2c76d8..4a8e6ce 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -34,9 +34,9 @@ jobs: - name: Setup python uses: actions/setup-python@v2 - with: - python-version: 2.x - architecture: x64 + with: + python-version: '2.x' + architecture: 'x64' - name: Data preprocessing working-directory: ./test_data From 35acea671e8e70cb79a58d3a93b03c419d14e0fc Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Thu, 5 Aug 2021 23:30:31 +0530 Subject: [PATCH 069/102] ci: added dockerfile and ci steps Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 9 +++++++++ Dockerfile | 11 +++++++++++ 2 files changed, 20 insertions(+) create mode 100644 Dockerfile diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 4a8e6ce..f605183 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -45,3 +45,12 @@ jobs: - name: Testing working-directory: ./build run: ctest + + - name: Building docker image + run: docker build -t ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest . 
+ + - name: Dockerhub Authentication + run: docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} + + - name: Publishing image to Container Registry + run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0aefcb7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM alpine:latest + +RUN addgroup -S appgroup && adduser -S appuser -G appgroup + +USER appuser + +COPY ./build /app + +WORKDIR /app + +CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] \ No newline at end of file From 23f585163819af61ba83203897584d7c6b8d9ef2 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Fri, 6 Aug 2021 08:56:50 +0530 Subject: [PATCH 070/102] ci: docker base image changed from alpine to ubuntu Signed-off-by: Rajdeep Roy Chowdhury --- Dockerfile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0aefcb7..fe19d62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,15 @@ -FROM alpine:latest +FROM ubuntu:latest -RUN addgroup -S appgroup && adduser -S appuser -G appgroup +EXPOSE 8000 -USER appuser +RUN groupadd appgroup && useradd appuser -G appgroup COPY ./build /app WORKDIR /app -CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] \ No newline at end of file +RUN chmod +x web_server + +USER appuser + +CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] From 6b6f28a1a1f39ff9ae7a54a44f2240c8ed2667df Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Fri, 6 Aug 2021 21:50:09 +0530 Subject: [PATCH 071/102] ci: added binary dict building step Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index f605183..76645b2 100644 --- a/.github/workflows/continuous_integration.yml +++ 
b/.github/workflows/continuous_integration.yml @@ -46,6 +46,10 @@ jobs: working-directory: ./build run: ctest + - name: Build binary dictionary + working-directory: build + run: chmod +x build && ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin + - name: Building docker image run: docker build -t ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest . From c58dee45ae1acfb0ad88982d1faf3fbd8e29ddb7 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 12:36:52 +0530 Subject: [PATCH 072/102] ci: using root user in docker Signed-off-by: Rajdeep Roy Chowdhury --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index fe19d62..b9acc33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,6 @@ WORKDIR /app RUN chmod +x web_server -USER appuser +# USER appuser CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] From 05e3055d136edf8fcecf4911fcc72e4d3b1f5f13 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 19:12:35 +0530 Subject: [PATCH 073/102] ci: compiling in the dockerfile Signed-off-by: Rajdeep Roy Chowdhury --- Dockerfile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9acc33..1c462ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,12 +4,22 @@ EXPOSE 8000 RUN groupadd appgroup && useradd appuser -G appgroup -COPY ./build /app +COPY . /src WORKDIR /app -RUN chmod +x web_server +RUN apt update && apt install -y cmake g++ python -# USER appuser +RUN cmake /src && cmake --build . 
+ +RUN chmod +x web_server && chmod +x build + +RUN ./build ef_type1 /src/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin + +RUN apt purge -y cmake g++ python + +RUN rm -rf /src + +USER appuser CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] From ce76e9a23a3dd190d0f6d3fd19e20bbcf0b4e175 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 21:15:56 +0530 Subject: [PATCH 074/102] ci(workflow): docker image pushed to container registry only on master Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 76645b2..f45a3dd 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -57,4 +57,5 @@ jobs: run: docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} - name: Publishing image to Container Registry + if: github.ref == 'refs/heads/master' run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest From 7e78b9e3991df0e5dad6b5e657a40fe7988b2ca5 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 21:29:31 +0530 Subject: [PATCH 075/102] docs: docker instructions added in readme Signed-off-by: Rajdeep Roy Chowdhury --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index df9c825..4031280 100644 --- a/README.md +++ b/README.md @@ -182,3 +182,15 @@ Live demo Start the web server with the program `./web_server ` and access the demo at `localhost:`. + +Use a prebuilt docker image +---------- + +The following command pulls a prebuilt docker image and runs it locally. 
+ +```bash +docker pull jermp/autocomplete +docker run -p 8000:8000 -d jermp/autocomplete +``` + +The demo can be accessed at [http://localhost:8000](http://localhost:8000) From 1fa8497ce4e727439f0266e7c8019eeb577d7d72 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 10 Aug 2021 15:31:29 +0200 Subject: [PATCH 076/102] commented out docker information from readme --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4031280..7747a7d 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,18 @@ After that, for having a minimal running example, just run and then access the service [here](http://127.0.0.1:8000). + + Compiling the code ------------------ @@ -181,16 +193,4 @@ Live demo ---------- Start the web server with the program `./web_server ` and access the demo at -`localhost:`. - -Use a prebuilt docker image ----------- - -The following command pulls a prebuilt docker image and runs it locally. - -```bash -docker pull jermp/autocomplete -docker run -p 8000:8000 -d jermp/autocomplete -``` - -The demo can be accessed at [http://localhost:8000](http://localhost:8000) +`localhost:`. \ No newline at end of file From c91dd28fcd065e7279b91ce3de558d520e5c1c2f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 26 Aug 2021 11:10:12 +0200 Subject: [PATCH 077/102] added instructions for using a Docker image (credits to Razdeep) --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7747a7d..40d263e 100644 --- a/README.md +++ b/README.md @@ -26,19 +26,16 @@ After that, for having a minimal running example, just run bash ./example.sh -and then access the service [here](http://127.0.0.1:8000). +and then access the service [from localhost](http://localhost:8000). - +And then access the service [from localhost](http://localhost:8000). 
Compiling the code ------------------ From 2bb6a118eb506519eb72227230e75fdb261bec7f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 3 Nov 2021 22:04:26 +0100 Subject: [PATCH 078/102] removed unused copy assignment operator --- include/bit_vector.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/bit_vector.hpp b/include/bit_vector.hpp index 747faef..4afb7dd 100644 --- a/include/bit_vector.hpp +++ b/include/bit_vector.hpp @@ -242,12 +242,6 @@ struct bit_vector { build(in); } - bit_vector& operator=(bit_vector const& other) { - bit_vector tmp(other); - tmp.swap(*this); - return *this; - } - void swap(bit_vector& other) { std::swap(other.m_size, m_size); other.m_bits.swap(m_bits); From 83b921667dcb8064b02cee2cb9a8b72d7d616469 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 5 Nov 2021 09:07:13 +0100 Subject: [PATCH 079/102] added quality code badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 40d263e..12e7f61 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Language grade: C/C++](https://img.shields.io/lgtm/grade/cpp/g/jermp/autocomplete.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/jermp/autocomplete/context:cpp) + Autocomplete ------------ From 1e611a67718a89a1e56b5cb95a874c3880b33353 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Mon, 7 Feb 2022 14:45:18 +0530 Subject: [PATCH 080/102] fix: util::find() issue fixed When linear scanning doesn't find the target element, the control must not go back to the binary search logic. 
Signed-off-by: Rajdeep Roy Chowdhury --- include/util.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/util.hpp b/include/util.hpp index bb20bdb..27942d3 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -45,6 +45,7 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) { return pos; } } + break; } uint64_t pos = lo + ((hi - lo) >> 1); uint64_t val = sequence.access(pos); From 9627eefbd01e79dd0ad55a47de5d1faf4c1bb91c Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Tue, 8 Feb 2022 12:38:46 +0530 Subject: [PATCH 081/102] fix: unsigned underflow handled Signed-off-by: Rajdeep Roy Chowdhury --- include/util.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/util.hpp b/include/util.hpp index 27942d3..b08b3b7 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -45,13 +45,17 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) { return pos; } } - break; +// break; } uint64_t pos = lo + ((hi - lo) >> 1); uint64_t val = sequence.access(pos); if (val == id) { return pos; } else if (val > id) { + // Rescuing hi from unsigned underflow + if (pos == 0) { + return global::not_found; + } hi = pos - 1; } else { lo = pos + 1; From 475450a5879fb390557fc4c3252b461cee8afd79 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 8 Feb 2022 12:37:31 +0100 Subject: [PATCH 082/102] style --- include/util.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/util.hpp b/include/util.hpp index b08b3b7..4f0b89e 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -45,17 +45,13 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) { return pos; } } -// break; } uint64_t pos = lo + ((hi - lo) >> 1); uint64_t val = sequence.access(pos); if (val == id) { return pos; } else if (val > id) { - // Rescuing hi from unsigned underflow - if (pos == 0) { - return global::not_found; - } + if (pos == 0) return global::not_found; hi = pos - 1; } else { lo 
= pos + 1; From 5ddded0c6652ed25622d74eab98859bbb68cdbb8 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 14 Jul 2022 15:26:46 +0200 Subject: [PATCH 083/102] added script to build indexes for test --- script/build_indexes.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 script/build_indexes.py diff --git a/script/build_indexes.py b/script/build_indexes.py new file mode 100644 index 0000000..e01e1db --- /dev/null +++ b/script/build_indexes.py @@ -0,0 +1,6 @@ +import sys, os + +dataset_name = sys.argv[1] # e.g., aol +types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] +for t in types: + os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." + dataset_name + ".bin -c 0.0001") \ No newline at end of file From 3320ae51689fdc2123261f140913510ee981035c Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 8 Jan 2023 10:28:32 +0100 Subject: [PATCH 084/102] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 12e7f61..eec59b1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![Language grade: C/C++](https://img.shields.io/lgtm/grade/cpp/g/jermp/autocomplete.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/jermp/autocomplete/context:cpp) - Autocomplete ------------ @@ -192,4 +190,4 @@ Live demo ---------- Start the web server with the program `./web_server ` and access the demo at -`localhost:`. \ No newline at end of file +`localhost:`. 
From ced049a0fae414320c4ce459955719415d2b12e4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:01:52 +0200 Subject: [PATCH 085/102] updated scripts in test_data to python3; added one extra assert (redundant) --- README.md | 6 +++--- include/ef/ef_sequence.hpp | 1 + test_data/build_inverted_and_forward.py | 2 +- test_data/build_stats.py | 2 +- test_data/extract_dict.py | 5 ++--- test_data/preprocess.sh | 10 +++++----- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index eec59b1..e77b01d 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ After that, for having a minimal running example, just run and then access the service [from localhost](http://localhost:8000). -### Or you can use a prebuilt Docker image +### Or you can use a prebuilt Docker image The following command pulls a prebuilt Docker image and runs it locally. @@ -160,7 +160,7 @@ They should have been created already if you have run the script `preprocess.sh`, otherwise you can use - python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300 + python3 partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300 to partition the input completions by number of query terms and retain 300 queries at random. @@ -179,7 +179,7 @@ of the prefix of the last token is retained. We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. 
From within the `/build` directory, run - python ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 + python3 ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 To benchmark the dictionaries (Front-Coding and trie), just run the following script from within the `script` directory: diff --git a/include/ef/ef_sequence.hpp b/include/ef/ef_sequence.hpp index 0632f83..2e9e293 100644 --- a/include/ef/ef_sequence.hpp +++ b/include/ef/ef_sequence.hpp @@ -49,6 +49,7 @@ struct ef_sequence { ++within; } assert(values.size() == n); + assert(std::is_sorted(values.begin(), values.end())); compress(values.begin(), values.size(), values.back()); } diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index acf4b8e..0966d99 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -52,7 +52,7 @@ if not discard: # NOTE: not sorted! 
if doc_id >= num_docs: - print doc_id,num_docs + print(doc_id,num_docs) forward_index[doc_id] = mapped; lines += 1 diff --git a/test_data/build_stats.py b/test_data/build_stats.py index 8e60a39..880bcd3 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -43,7 +43,7 @@ output_file.write(str(lines) + "\n") output_file.write(str(universe + 1) + "\n") output_file.write(str(len(nodes_per_level)) + "\n") -for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): +for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") output_file.close() diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index e3c05b5..e9b48d0 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -1,9 +1,8 @@ import sys -from sets import Set input_filename = sys.argv[1] -tokens = Set({}) +tokens = set() lines = 0 print("parsing input file...") @@ -14,7 +13,7 @@ tokens.add(x[i]) lines += 1 if lines % 1000000 == 0: - print "processed " + str(lines) + " lines" + print("processed " + str(lines) + " lines") print("processed " + str(lines) + " lines") print("dictionary has " + str(len(tokens)) + " keys") diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh index e3d96f7..b795bfe 100755 --- a/test_data/preprocess.sh +++ b/test_data/preprocess.sh @@ -2,8 +2,8 @@ echo $1 # input filename echo $2 # number of queries for each size -python extract_dict.py $1 -python map_dataset.py $1 -python build_stats.py $1.mapped -python build_inverted_and_forward.py $1 -python partition_queries_by_length.py $1 $1.queries $2 +python3 extract_dict.py $1 +python3 map_dataset.py $1 +python3 build_stats.py $1.mapped +python3 build_inverted_and_forward.py $1 +python3 partition_queries_by_length.py $1 $1.queries $2 From ee542b4afe133f81888be609adc05069bbe27840 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:14:51 +0200 Subject: [PATCH 086/102] updated doctest and 
CMake version --- CMakeLists.txt | 2 +- external/doctest | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2908d2c..9b3c162 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.5) project(AUTOCOMPLETE) if(CMAKE_BUILD_TYPE MATCHES Debug) diff --git a/external/doctest b/external/doctest index 7ac22cc..ae7a135 160000 --- a/external/doctest +++ b/external/doctest @@ -1 +1 @@ -Subproject commit 7ac22cc2190eb090ff66509015fb2d995bce957e +Subproject commit ae7a13539fb71f270b87eb2e874fbac80bc8dda2 From 36db40f5c8263d27afb8499c0ce5016bd515b2f6 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:25:36 +0200 Subject: [PATCH 087/102] updated python version in workflow file --- .github/workflows/continuous_integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index f45a3dd..bf625be 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -33,9 +33,9 @@ jobs: run: cmake --build . --config Release - name: Setup python - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: - python-version: '2.x' + python-version: '3.x' architecture: 'x64' - name: Data preprocessing From 5a4d67dda8abca3a5da2ff6210e38389064bead6 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:32:42 +0200 Subject: [PATCH 088/102] updated python version in docker file --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1c462ab..f29c164 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ COPY . /src WORKDIR /app -RUN apt update && apt install -y cmake g++ python +RUN apt update && apt install -y cmake g++ python3 RUN cmake /src && cmake --build . 
@@ -16,7 +16,7 @@ RUN chmod +x web_server && chmod +x build RUN ./build ef_type1 /src/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin -RUN apt purge -y cmake g++ python +RUN apt purge -y cmake g++ python3 RUN rm -rf /src From 134e1a97cc864e334405f45cdb1eb0f23710633d Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 24 Sep 2023 08:28:32 +0200 Subject: [PATCH 089/102] minor to readme --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e77b01d..69fe339 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ Autocomplete ------------ -A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](https://dl.acm.org/doi/10.1145/3397271.3401432)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, published in ACM SIGIR 2020. +Please, cite the paper if you use the data structures from this library. + ##### Table of contents 1. [Installation and quick start](#install) 2. [Compiling the code](#compiling) @@ -40,7 +42,8 @@ And then access the service [from localhost](http://localhost:8000). Compiling the code ------------------ -The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, 9.0.0 and on Mac 10.14 with `clang` 10.0.0. +The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, 9.0.0, on Mac OS 10.14 and 12.4 with `clang` 10.0.0 and 13.0.0. + To build the code, [`CMake`](https://cmake.org/) is required. 
Clone the repository with From 3dfb83af25f52b90f2f8845762a1af0d1783c73e Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:24:03 -0400 Subject: [PATCH 090/102] initial commit, porting primitives --- .../workflows/continuous_integration.yml | 0 CMakeLists.txt => archive/CMakeLists.txt | 0 Dockerfile => archive/Dockerfile | 0 .../benchmark}/CMakeLists.txt | 0 .../benchmark}/benchmark_common.hpp | 0 .../benchmark}/benchmark_conjunctive_topk.cpp | 0 .../benchmark}/benchmark_fc_dictionary.cpp | 0 .../benchmark_integer_fc_dictionary.cpp | 0 .../benchmark}/benchmark_locate_prefix.cpp | 0 .../benchmark}/benchmark_prefix_topk.cpp | 0 .../benchmark}/benchmark_topk.cpp | 0 .../benchmark}/effectiveness.cpp | 0 example.sh => archive/example.sh | 0 {external => archive/external}/CMakeLists.txt | 0 {include => archive/include}/autocomplete.hpp | 0 .../include}/autocomplete2.hpp | 0 .../include}/autocomplete3.hpp | 0 .../include}/autocomplete4.hpp | 0 .../include}/autocomplete_common.hpp | 0 {include => archive/include}/bit_vector.hpp | 0 .../include}/blocked_inverted_index.hpp | 0 
.../include}/succinct_rmq/cartesian_tree.hpp | 0 .../include}/succinct_rmq/rs_bit_vector.hpp | 0 {include => archive/include}/types.hpp | 0 {include => archive/include}/uint_vec.hpp | 0 .../include}/uncompressed_list.hpp | 0 .../include}/unsorted_list.hpp | 0 {include => archive/include}/util.hpp | 0 {include => archive/include}/util_types.hpp | 0 install.sh => archive/install.sh | 0 .../script}/benchmark_dictionaries.sh | 0 ...ctiveness_results_by_varying_percentage.py | 0 ...te_prefix_results_by_varying_percentage.py | 0 .../collect_results_by_varying_percentage.py | 0 {src => archive/src}/CMakeLists.txt | 0 {src => archive/src}/check_topk.cpp | 0 {src => archive/src}/map_queries.cpp | 0 {src => archive/src}/output_ds2i_format.cpp | 0 {src => archive/src}/statistics.cpp | 0 {src => archive/src}/web_server.cpp | 0 {test => archive/test}/test_autocomplete.cpp | 0 .../test}/test_blocked_inverted_index.cpp | 0 {test => archive/test}/test_common.hpp | 0 .../test}/test_compact_forward_index.cpp | 0 .../test}/test_completion_trie.cpp | 0 {test => archive/test}/test_fc_dictionary.cpp | 0 .../test}/test_integer_fc_dictionary.cpp | 0 .../test}/test_inverted_index.cpp | 0 {test => archive/test}/test_locate_prefix.cpp | 0 {test => archive/test}/test_unsorted_list.cpp | 0 .../test_data}/extract_dict.py | 0 .../test_data}/filter_and_preprocess.sh | 0 .../test_data}/filter_dataset.py | 0 .../test_data}/map_dataset.py | 0 .../test_data}/partition_queries_by_length.py | 0 .../test_data}/preprocess.sh | 0 .../trec_05_efficiency_queries.completions | 0 {web => archive/web}/index.html | 0 {web => archive/web}/jquery-1.8.2.min.js | 0 {web => archive/web}/jquery.autocomplete.js | 0 {web => archive/web}/styles.css | 0 {web => archive/web}/topkcomp.js | 0 autocomplete-rs/Cargo.lock | 191 ++++++++++++++++++ autocomplete-rs/Cargo.toml | 9 + autocomplete-rs/README.md | 44 ++++ autocomplete-rs/src/constants.rs | 8 + autocomplete-rs/src/lib.rs | 7 + autocomplete-rs/src/main.rs | 3 + 
autocomplete-rs/src/parameters.rs | 115 +++++++++++ autocomplete-rs/src/probe.rs | 81 ++++++++ autocomplete-rs/tests/constants_tests.rs | 21 ++ autocomplete-rs/tests/parameters_tests.rs | 98 +++++++++ autocomplete-rs/tests/probe_tests.rs | 79 ++++++++ external/cmd_line_parser | 1 - external/doctest | 1 - external/essentials | 1 - external/jQuery-Autocomplete | 1 - external/mongoose | 1 - include/building_util.hpp | 39 ---- script/build_indexes.py | 6 - src/build.cpp | 62 ------ test_data/build_inverted_and_forward.py | 74 ------- test_data/build_stats.py | 49 ----- 104 files changed, 656 insertions(+), 235 deletions(-) rename {.github => archive/.github}/workflows/continuous_integration.yml (100%) rename CMakeLists.txt => archive/CMakeLists.txt (100%) rename Dockerfile => archive/Dockerfile (100%) rename {benchmark => archive/benchmark}/CMakeLists.txt (100%) rename {benchmark => archive/benchmark}/benchmark_common.hpp (100%) rename {benchmark => archive/benchmark}/benchmark_conjunctive_topk.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_fc_dictionary.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_integer_fc_dictionary.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_locate_prefix.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_prefix_topk.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_topk.cpp (100%) rename {benchmark => archive/benchmark}/effectiveness.cpp (100%) rename example.sh => archive/example.sh (100%) rename {external => archive/external}/CMakeLists.txt (100%) rename {include => archive/include}/autocomplete.hpp (100%) rename {include => archive/include}/autocomplete2.hpp (100%) rename {include => archive/include}/autocomplete3.hpp (100%) rename {include => archive/include}/autocomplete4.hpp (100%) rename {include => archive/include}/autocomplete_common.hpp (100%) rename {include => archive/include}/bit_vector.hpp (100%) rename {include => archive/include}/blocked_inverted_index.hpp (100%) 
rename {include => archive/include}/compact_forward_index.hpp (100%) rename {include => archive/include}/compact_vector.hpp (100%) rename {include => archive/include}/completion_trie.hpp (100%) rename {include => archive/include}/constants.hpp (100%) rename {include => archive/include}/ef/compact_ef.hpp (100%) rename {include => archive/include}/ef/darray.hpp (100%) rename {include => archive/include}/ef/ef_parameters.hpp (100%) rename {include => archive/include}/ef/ef_sequence.hpp (100%) rename {include => archive/include}/fc_dictionary.hpp (100%) rename {include => archive/include}/integer_codes.hpp (100%) rename {include => archive/include}/integer_fc_dictionary.hpp (100%) rename {include => archive/include}/inverted_index.hpp (100%) rename {include => archive/include}/min_heap.hpp (100%) rename {include => archive/include}/minimal_docids.hpp (100%) rename {include => archive/include}/parameters.hpp (100%) rename {include => archive/include}/probe.hpp (100%) rename {include => archive/include}/scored_string_pool.hpp (100%) rename {include => archive/include}/statistics.hpp (100%) rename {include => archive/include}/succinct_rmq/README.md (100%) rename {include => archive/include}/succinct_rmq/bp_vector.hpp (100%) rename {include => archive/include}/succinct_rmq/bp_vector_support.hpp (100%) rename {include => archive/include}/succinct_rmq/cartesian_tree.hpp (100%) rename {include => archive/include}/succinct_rmq/rs_bit_vector.hpp (100%) rename {include => archive/include}/types.hpp (100%) rename {include => archive/include}/uint_vec.hpp (100%) rename {include => archive/include}/uncompressed_list.hpp (100%) rename {include => archive/include}/unsorted_list.hpp (100%) rename {include => archive/include}/util.hpp (100%) rename {include => archive/include}/util_types.hpp (100%) rename install.sh => archive/install.sh (100%) rename {script => archive/script}/benchmark_dictionaries.sh (100%) rename {script => 
archive/script}/collect_effectiveness_results_by_varying_percentage.py (100%) rename {script => archive/script}/collect_locate_prefix_results_by_varying_percentage.py (100%) rename {script => archive/script}/collect_results_by_varying_percentage.py (100%) rename {src => archive/src}/CMakeLists.txt (100%) rename {src => archive/src}/check_topk.cpp (100%) rename {src => archive/src}/map_queries.cpp (100%) rename {src => archive/src}/output_ds2i_format.cpp (100%) rename {src => archive/src}/statistics.cpp (100%) rename {src => archive/src}/web_server.cpp (100%) rename {test => archive/test}/test_autocomplete.cpp (100%) rename {test => archive/test}/test_blocked_inverted_index.cpp (100%) rename {test => archive/test}/test_common.hpp (100%) rename {test => archive/test}/test_compact_forward_index.cpp (100%) rename {test => archive/test}/test_completion_trie.cpp (100%) rename {test => archive/test}/test_fc_dictionary.cpp (100%) rename {test => archive/test}/test_integer_fc_dictionary.cpp (100%) rename {test => archive/test}/test_inverted_index.cpp (100%) rename {test => archive/test}/test_locate_prefix.cpp (100%) rename {test => archive/test}/test_unsorted_list.cpp (100%) rename {test_data => archive/test_data}/extract_dict.py (100%) rename {test_data => archive/test_data}/filter_and_preprocess.sh (100%) rename {test_data => archive/test_data}/filter_dataset.py (100%) rename {test_data => archive/test_data}/map_dataset.py (100%) rename {test_data => archive/test_data}/partition_queries_by_length.py (100%) rename {test_data => archive/test_data}/preprocess.sh (100%) rename {test_data => archive/test_data}/trec_05_efficiency_queries/trec_05_efficiency_queries.completions (100%) rename {web => archive/web}/index.html (100%) rename {web => archive/web}/jquery-1.8.2.min.js (100%) rename {web => archive/web}/jquery.autocomplete.js (100%) rename {web => archive/web}/styles.css (100%) rename {web => archive/web}/topkcomp.js (100%) create mode 100644 autocomplete-rs/Cargo.lock 
create mode 100644 autocomplete-rs/Cargo.toml create mode 100644 autocomplete-rs/README.md create mode 100644 autocomplete-rs/src/constants.rs create mode 100644 autocomplete-rs/src/lib.rs create mode 100644 autocomplete-rs/src/main.rs create mode 100644 autocomplete-rs/src/parameters.rs create mode 100644 autocomplete-rs/src/probe.rs create mode 100644 autocomplete-rs/tests/constants_tests.rs create mode 100644 autocomplete-rs/tests/parameters_tests.rs create mode 100644 autocomplete-rs/tests/probe_tests.rs delete mode 160000 external/cmd_line_parser delete mode 160000 external/doctest delete mode 160000 external/essentials delete mode 160000 external/jQuery-Autocomplete delete mode 160000 external/mongoose delete mode 100644 include/building_util.hpp delete mode 100644 script/build_indexes.py delete mode 100644 src/build.cpp delete mode 100644 test_data/build_inverted_and_forward.py delete mode 100644 test_data/build_stats.py diff --git a/.github/workflows/continuous_integration.yml b/archive/.github/workflows/continuous_integration.yml similarity index 100% rename from .github/workflows/continuous_integration.yml rename to archive/.github/workflows/continuous_integration.yml diff --git a/CMakeLists.txt b/archive/CMakeLists.txt similarity index 100% rename from CMakeLists.txt rename to archive/CMakeLists.txt diff --git a/Dockerfile b/archive/Dockerfile similarity index 100% rename from Dockerfile rename to archive/Dockerfile diff --git a/benchmark/CMakeLists.txt b/archive/benchmark/CMakeLists.txt similarity index 100% rename from benchmark/CMakeLists.txt rename to archive/benchmark/CMakeLists.txt diff --git a/benchmark/benchmark_common.hpp b/archive/benchmark/benchmark_common.hpp similarity index 100% rename from benchmark/benchmark_common.hpp rename to archive/benchmark/benchmark_common.hpp diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/archive/benchmark/benchmark_conjunctive_topk.cpp similarity index 100% rename from 
benchmark/benchmark_conjunctive_topk.cpp rename to archive/benchmark/benchmark_conjunctive_topk.cpp diff --git a/benchmark/benchmark_fc_dictionary.cpp b/archive/benchmark/benchmark_fc_dictionary.cpp similarity index 100% rename from benchmark/benchmark_fc_dictionary.cpp rename to archive/benchmark/benchmark_fc_dictionary.cpp diff --git a/benchmark/benchmark_integer_fc_dictionary.cpp b/archive/benchmark/benchmark_integer_fc_dictionary.cpp similarity index 100% rename from benchmark/benchmark_integer_fc_dictionary.cpp rename to archive/benchmark/benchmark_integer_fc_dictionary.cpp diff --git a/benchmark/benchmark_locate_prefix.cpp b/archive/benchmark/benchmark_locate_prefix.cpp similarity index 100% rename from benchmark/benchmark_locate_prefix.cpp rename to archive/benchmark/benchmark_locate_prefix.cpp diff --git a/benchmark/benchmark_prefix_topk.cpp b/archive/benchmark/benchmark_prefix_topk.cpp similarity index 100% rename from benchmark/benchmark_prefix_topk.cpp rename to archive/benchmark/benchmark_prefix_topk.cpp diff --git a/benchmark/benchmark_topk.cpp b/archive/benchmark/benchmark_topk.cpp similarity index 100% rename from benchmark/benchmark_topk.cpp rename to archive/benchmark/benchmark_topk.cpp diff --git a/benchmark/effectiveness.cpp b/archive/benchmark/effectiveness.cpp similarity index 100% rename from benchmark/effectiveness.cpp rename to archive/benchmark/effectiveness.cpp diff --git a/example.sh b/archive/example.sh similarity index 100% rename from example.sh rename to archive/example.sh diff --git a/external/CMakeLists.txt b/archive/external/CMakeLists.txt similarity index 100% rename from external/CMakeLists.txt rename to archive/external/CMakeLists.txt diff --git a/include/autocomplete.hpp b/archive/include/autocomplete.hpp similarity index 100% rename from include/autocomplete.hpp rename to archive/include/autocomplete.hpp diff --git a/include/autocomplete2.hpp b/archive/include/autocomplete2.hpp similarity index 100% rename from 
include/autocomplete2.hpp rename to archive/include/autocomplete2.hpp diff --git a/include/autocomplete3.hpp b/archive/include/autocomplete3.hpp similarity index 100% rename from include/autocomplete3.hpp rename to archive/include/autocomplete3.hpp diff --git a/include/autocomplete4.hpp b/archive/include/autocomplete4.hpp similarity index 100% rename from include/autocomplete4.hpp rename to archive/include/autocomplete4.hpp diff --git a/include/autocomplete_common.hpp b/archive/include/autocomplete_common.hpp similarity index 100% rename from include/autocomplete_common.hpp rename to archive/include/autocomplete_common.hpp diff --git a/include/bit_vector.hpp b/archive/include/bit_vector.hpp similarity index 100% rename from include/bit_vector.hpp rename to archive/include/bit_vector.hpp diff --git a/include/blocked_inverted_index.hpp b/archive/include/blocked_inverted_index.hpp similarity index 100% rename from include/blocked_inverted_index.hpp rename to archive/include/blocked_inverted_index.hpp diff --git a/include/compact_forward_index.hpp b/archive/include/compact_forward_index.hpp similarity index 100% rename from include/compact_forward_index.hpp rename to archive/include/compact_forward_index.hpp diff --git a/include/compact_vector.hpp b/archive/include/compact_vector.hpp similarity index 100% rename from include/compact_vector.hpp rename to archive/include/compact_vector.hpp diff --git a/include/completion_trie.hpp b/archive/include/completion_trie.hpp similarity index 100% rename from include/completion_trie.hpp rename to archive/include/completion_trie.hpp diff --git a/include/constants.hpp b/archive/include/constants.hpp similarity index 100% rename from include/constants.hpp rename to archive/include/constants.hpp diff --git a/include/ef/compact_ef.hpp b/archive/include/ef/compact_ef.hpp similarity index 100% rename from include/ef/compact_ef.hpp rename to archive/include/ef/compact_ef.hpp diff --git a/include/ef/darray.hpp 
b/archive/include/ef/darray.hpp similarity index 100% rename from include/ef/darray.hpp rename to archive/include/ef/darray.hpp diff --git a/include/ef/ef_parameters.hpp b/archive/include/ef/ef_parameters.hpp similarity index 100% rename from include/ef/ef_parameters.hpp rename to archive/include/ef/ef_parameters.hpp diff --git a/include/ef/ef_sequence.hpp b/archive/include/ef/ef_sequence.hpp similarity index 100% rename from include/ef/ef_sequence.hpp rename to archive/include/ef/ef_sequence.hpp diff --git a/include/fc_dictionary.hpp b/archive/include/fc_dictionary.hpp similarity index 100% rename from include/fc_dictionary.hpp rename to archive/include/fc_dictionary.hpp diff --git a/include/integer_codes.hpp b/archive/include/integer_codes.hpp similarity index 100% rename from include/integer_codes.hpp rename to archive/include/integer_codes.hpp diff --git a/include/integer_fc_dictionary.hpp b/archive/include/integer_fc_dictionary.hpp similarity index 100% rename from include/integer_fc_dictionary.hpp rename to archive/include/integer_fc_dictionary.hpp diff --git a/include/inverted_index.hpp b/archive/include/inverted_index.hpp similarity index 100% rename from include/inverted_index.hpp rename to archive/include/inverted_index.hpp diff --git a/include/min_heap.hpp b/archive/include/min_heap.hpp similarity index 100% rename from include/min_heap.hpp rename to archive/include/min_heap.hpp diff --git a/include/minimal_docids.hpp b/archive/include/minimal_docids.hpp similarity index 100% rename from include/minimal_docids.hpp rename to archive/include/minimal_docids.hpp diff --git a/include/parameters.hpp b/archive/include/parameters.hpp similarity index 100% rename from include/parameters.hpp rename to archive/include/parameters.hpp diff --git a/include/probe.hpp b/archive/include/probe.hpp similarity index 100% rename from include/probe.hpp rename to archive/include/probe.hpp diff --git a/include/scored_string_pool.hpp b/archive/include/scored_string_pool.hpp 
similarity index 100% rename from include/scored_string_pool.hpp rename to archive/include/scored_string_pool.hpp diff --git a/include/statistics.hpp b/archive/include/statistics.hpp similarity index 100% rename from include/statistics.hpp rename to archive/include/statistics.hpp diff --git a/include/succinct_rmq/README.md b/archive/include/succinct_rmq/README.md similarity index 100% rename from include/succinct_rmq/README.md rename to archive/include/succinct_rmq/README.md diff --git a/include/succinct_rmq/bp_vector.hpp b/archive/include/succinct_rmq/bp_vector.hpp similarity index 100% rename from include/succinct_rmq/bp_vector.hpp rename to archive/include/succinct_rmq/bp_vector.hpp diff --git a/include/succinct_rmq/bp_vector_support.hpp b/archive/include/succinct_rmq/bp_vector_support.hpp similarity index 100% rename from include/succinct_rmq/bp_vector_support.hpp rename to archive/include/succinct_rmq/bp_vector_support.hpp diff --git a/include/succinct_rmq/cartesian_tree.hpp b/archive/include/succinct_rmq/cartesian_tree.hpp similarity index 100% rename from include/succinct_rmq/cartesian_tree.hpp rename to archive/include/succinct_rmq/cartesian_tree.hpp diff --git a/include/succinct_rmq/rs_bit_vector.hpp b/archive/include/succinct_rmq/rs_bit_vector.hpp similarity index 100% rename from include/succinct_rmq/rs_bit_vector.hpp rename to archive/include/succinct_rmq/rs_bit_vector.hpp diff --git a/include/types.hpp b/archive/include/types.hpp similarity index 100% rename from include/types.hpp rename to archive/include/types.hpp diff --git a/include/uint_vec.hpp b/archive/include/uint_vec.hpp similarity index 100% rename from include/uint_vec.hpp rename to archive/include/uint_vec.hpp diff --git a/include/uncompressed_list.hpp b/archive/include/uncompressed_list.hpp similarity index 100% rename from include/uncompressed_list.hpp rename to archive/include/uncompressed_list.hpp diff --git a/include/unsorted_list.hpp b/archive/include/unsorted_list.hpp similarity 
index 100% rename from include/unsorted_list.hpp rename to archive/include/unsorted_list.hpp diff --git a/include/util.hpp b/archive/include/util.hpp similarity index 100% rename from include/util.hpp rename to archive/include/util.hpp diff --git a/include/util_types.hpp b/archive/include/util_types.hpp similarity index 100% rename from include/util_types.hpp rename to archive/include/util_types.hpp diff --git a/install.sh b/archive/install.sh similarity index 100% rename from install.sh rename to archive/install.sh diff --git a/script/benchmark_dictionaries.sh b/archive/script/benchmark_dictionaries.sh similarity index 100% rename from script/benchmark_dictionaries.sh rename to archive/script/benchmark_dictionaries.sh diff --git a/script/collect_effectiveness_results_by_varying_percentage.py b/archive/script/collect_effectiveness_results_by_varying_percentage.py similarity index 100% rename from script/collect_effectiveness_results_by_varying_percentage.py rename to archive/script/collect_effectiveness_results_by_varying_percentage.py diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/archive/script/collect_locate_prefix_results_by_varying_percentage.py similarity index 100% rename from script/collect_locate_prefix_results_by_varying_percentage.py rename to archive/script/collect_locate_prefix_results_by_varying_percentage.py diff --git a/script/collect_results_by_varying_percentage.py b/archive/script/collect_results_by_varying_percentage.py similarity index 100% rename from script/collect_results_by_varying_percentage.py rename to archive/script/collect_results_by_varying_percentage.py diff --git a/src/CMakeLists.txt b/archive/src/CMakeLists.txt similarity index 100% rename from src/CMakeLists.txt rename to archive/src/CMakeLists.txt diff --git a/src/check_topk.cpp b/archive/src/check_topk.cpp similarity index 100% rename from src/check_topk.cpp rename to archive/src/check_topk.cpp diff --git a/src/map_queries.cpp 
b/archive/src/map_queries.cpp similarity index 100% rename from src/map_queries.cpp rename to archive/src/map_queries.cpp diff --git a/src/output_ds2i_format.cpp b/archive/src/output_ds2i_format.cpp similarity index 100% rename from src/output_ds2i_format.cpp rename to archive/src/output_ds2i_format.cpp diff --git a/src/statistics.cpp b/archive/src/statistics.cpp similarity index 100% rename from src/statistics.cpp rename to archive/src/statistics.cpp diff --git a/src/web_server.cpp b/archive/src/web_server.cpp similarity index 100% rename from src/web_server.cpp rename to archive/src/web_server.cpp diff --git a/test/test_autocomplete.cpp b/archive/test/test_autocomplete.cpp similarity index 100% rename from test/test_autocomplete.cpp rename to archive/test/test_autocomplete.cpp diff --git a/test/test_blocked_inverted_index.cpp b/archive/test/test_blocked_inverted_index.cpp similarity index 100% rename from test/test_blocked_inverted_index.cpp rename to archive/test/test_blocked_inverted_index.cpp diff --git a/test/test_common.hpp b/archive/test/test_common.hpp similarity index 100% rename from test/test_common.hpp rename to archive/test/test_common.hpp diff --git a/test/test_compact_forward_index.cpp b/archive/test/test_compact_forward_index.cpp similarity index 100% rename from test/test_compact_forward_index.cpp rename to archive/test/test_compact_forward_index.cpp diff --git a/test/test_completion_trie.cpp b/archive/test/test_completion_trie.cpp similarity index 100% rename from test/test_completion_trie.cpp rename to archive/test/test_completion_trie.cpp diff --git a/test/test_fc_dictionary.cpp b/archive/test/test_fc_dictionary.cpp similarity index 100% rename from test/test_fc_dictionary.cpp rename to archive/test/test_fc_dictionary.cpp diff --git a/test/test_integer_fc_dictionary.cpp b/archive/test/test_integer_fc_dictionary.cpp similarity index 100% rename from test/test_integer_fc_dictionary.cpp rename to archive/test/test_integer_fc_dictionary.cpp diff 
--git a/test/test_inverted_index.cpp b/archive/test/test_inverted_index.cpp similarity index 100% rename from test/test_inverted_index.cpp rename to archive/test/test_inverted_index.cpp diff --git a/test/test_locate_prefix.cpp b/archive/test/test_locate_prefix.cpp similarity index 100% rename from test/test_locate_prefix.cpp rename to archive/test/test_locate_prefix.cpp diff --git a/test/test_unsorted_list.cpp b/archive/test/test_unsorted_list.cpp similarity index 100% rename from test/test_unsorted_list.cpp rename to archive/test/test_unsorted_list.cpp diff --git a/test_data/extract_dict.py b/archive/test_data/extract_dict.py similarity index 100% rename from test_data/extract_dict.py rename to archive/test_data/extract_dict.py diff --git a/test_data/filter_and_preprocess.sh b/archive/test_data/filter_and_preprocess.sh similarity index 100% rename from test_data/filter_and_preprocess.sh rename to archive/test_data/filter_and_preprocess.sh diff --git a/test_data/filter_dataset.py b/archive/test_data/filter_dataset.py similarity index 100% rename from test_data/filter_dataset.py rename to archive/test_data/filter_dataset.py diff --git a/test_data/map_dataset.py b/archive/test_data/map_dataset.py similarity index 100% rename from test_data/map_dataset.py rename to archive/test_data/map_dataset.py diff --git a/test_data/partition_queries_by_length.py b/archive/test_data/partition_queries_by_length.py similarity index 100% rename from test_data/partition_queries_by_length.py rename to archive/test_data/partition_queries_by_length.py diff --git a/test_data/preprocess.sh b/archive/test_data/preprocess.sh similarity index 100% rename from test_data/preprocess.sh rename to archive/test_data/preprocess.sh diff --git a/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions b/archive/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions similarity index 100% rename from 
test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions rename to archive/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions diff --git a/web/index.html b/archive/web/index.html similarity index 100% rename from web/index.html rename to archive/web/index.html diff --git a/web/jquery-1.8.2.min.js b/archive/web/jquery-1.8.2.min.js similarity index 100% rename from web/jquery-1.8.2.min.js rename to archive/web/jquery-1.8.2.min.js diff --git a/web/jquery.autocomplete.js b/archive/web/jquery.autocomplete.js similarity index 100% rename from web/jquery.autocomplete.js rename to archive/web/jquery.autocomplete.js diff --git a/web/styles.css b/archive/web/styles.css similarity index 100% rename from web/styles.css rename to archive/web/styles.css diff --git a/web/topkcomp.js b/archive/web/topkcomp.js similarity index 100% rename from web/topkcomp.js rename to archive/web/topkcomp.js diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock new file mode 100644 index 0000000..6cb35bc --- /dev/null +++ b/autocomplete-rs/Cargo.lock @@ -0,0 +1,191 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "autocomplete-rs" +version = "0.1.0" +dependencies = [ + "tempfile", +] + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + +[[package]] +name = "rustix" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "tempfile" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + 
+[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml new file mode 100644 index 0000000..7d62c58 --- /dev/null +++ b/autocomplete-rs/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "autocomplete-rs" +version = "0.1.0" +edition = "2021" + +[dependencies] + +[dev-dependencies] +tempfile = "3.8" diff --git a/autocomplete-rs/README.md b/autocomplete-rs/README.md new file mode 100644 index 0000000..801e4b2 --- /dev/null +++ b/autocomplete-rs/README.md @@ -0,0 +1,44 @@ +# Autocomplete-rs + 
+This project is a Rust port of the original C++ autocomplete system. The goal is to maintain the same functionality while leveraging Rust's safety guarantees and modern tooling. + +## Project Status + +Currently, we are in the process of porting the core components from C++ to Rust. The following components have been ported: + +- Basic constants and configuration +- Parameters management +- Performance measurement probes + +## Next Steps + +1. Continue porting core components: + - Scored string pool + - Completion trie + - Blocked inverted index + - Front-coded dictionary + +2. Port and adapt unit tests to ensure functionality matches the original implementation + +3. Containerize the application using Docker for easy deployment and testing + +## Building and Testing + +```bash +# Build the project +cargo build + +# Run tests +cargo test + +# Run with specific test +cargo test test_name -- --nocapture +``` + +## Original Project + +This is a port of the original C++ autocomplete system, which provides efficient string completion functionality. The original implementation can be found in the `archive` directory. + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. 
\ No newline at end of file diff --git a/autocomplete-rs/src/constants.rs b/autocomplete-rs/src/constants.rs new file mode 100644 index 0000000..b949eb7 --- /dev/null +++ b/autocomplete-rs/src/constants.rs @@ -0,0 +1,8 @@ +// Constants for the autocomplete system +pub const MAX_K: u32 = 15; +pub const MAX_NUM_TERMS_PER_QUERY: u32 = 64; +pub const MAX_NUM_CHARS_PER_QUERY: u32 = 128; +pub const POOL_SIZE: usize = (MAX_K as usize) * (MAX_NUM_CHARS_PER_QUERY as usize); + +// Compile-time assertion +const _: () = assert!(MAX_NUM_TERMS_PER_QUERY < 256, "MAX_NUM_TERMS_PER_QUERY must be < 256"); \ No newline at end of file diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs new file mode 100644 index 0000000..c5c3755 --- /dev/null +++ b/autocomplete-rs/src/lib.rs @@ -0,0 +1,7 @@ +pub mod constants; +pub mod parameters; +pub mod probe; + +pub use constants::*; +pub use parameters::*; +pub use probe::*; \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs new file mode 100644 index 0000000..e7a11a9 --- /dev/null +++ b/autocomplete-rs/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} diff --git a/autocomplete-rs/src/parameters.rs b/autocomplete-rs/src/parameters.rs new file mode 100644 index 0000000..38d5fec --- /dev/null +++ b/autocomplete-rs/src/parameters.rs @@ -0,0 +1,115 @@ +use std::fs::File; +use std::io::{self, BufRead, BufReader}; +use std::path::Path; + +use crate::constants::{MAX_NUM_CHARS_PER_QUERY, MAX_NUM_TERMS_PER_QUERY}; + +/// Parameters for the autocomplete system +#[derive(Debug, Default)] +pub struct Parameters { + pub num_terms: u32, + pub max_string_length: u32, + pub num_completions: u32, + pub universe: u32, + pub num_levels: u32, + pub nodes_per_level: Vec, + pub collection_basename: String, +} + +impl Parameters { + /// Creates a new empty Parameters instance + pub fn new() -> Self { + Self::default() + } + + /// Loads parameters from a statistics file + pub fn 
load(&mut self) -> io::Result<()> { + let stats_path = if self.collection_basename.ends_with(".mapped.stats") { + Path::new(&self.collection_basename).to_path_buf() + } else { + Path::new(&self.collection_basename).with_extension("mapped.stats") + }; + + let file = File::open(stats_path)?; + let reader = BufReader::new(file); + let mut lines = reader.lines(); + + // Read basic statistics + self.num_terms = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_terms"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.max_string_length = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing max_string_length"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.num_completions = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_completions"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.universe = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing universe"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.num_levels = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_levels"))?? 
+ .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + // Validate basic statistics + if self.num_terms == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "num_terms must be > 0")); + } + if self.max_string_length == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "max_string_length must be > 0")); + } + if self.num_completions == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "num_completions must be > 0")); + } + if self.universe < self.num_completions { + return Err(io::Error::new(io::ErrorKind::InvalidData, "universe must be >= num_completions")); + } + if self.num_levels == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "num_levels must be > 0")); + } + + // Validate against constants + if self.max_string_length > MAX_NUM_CHARS_PER_QUERY { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("max_string_length ({}) exceeds MAX_NUM_CHARS_PER_QUERY ({})", + self.max_string_length, MAX_NUM_CHARS_PER_QUERY) + )); + } + if self.num_levels > MAX_NUM_TERMS_PER_QUERY { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("num_levels ({}) exceeds MAX_NUM_TERMS_PER_QUERY ({})", + self.num_levels, MAX_NUM_TERMS_PER_QUERY) + )); + } + + // Read nodes per level + self.nodes_per_level = Vec::with_capacity(self.num_levels as usize); + for _ in 0..self.num_levels { + let count = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing nodes_per_level data"))?? 
+ .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + self.nodes_per_level.push(count); + } + + if self.nodes_per_level.len() != self.num_levels as usize { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "File with statistics may be truncated or malformed" + )); + } + + Ok(()) + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/probe.rs b/autocomplete-rs/src/probe.rs new file mode 100644 index 0000000..c82f825 --- /dev/null +++ b/autocomplete-rs/src/probe.rs @@ -0,0 +1,81 @@ +use std::time::{Duration, Instant}; + +/// A trait for performance measurement probes +pub trait Probe { + /// Start timing an operation + fn start(&mut self, id: u64); + /// Stop timing an operation + fn stop(&mut self, id: u64); +} + +/// A no-operation probe that does nothing +#[derive(Debug, Default)] +pub struct NopProbe; + +impl Probe for NopProbe { + fn start(&mut self, _id: u64) {} + fn stop(&mut self, _id: u64) {} +} + +/// A timer probe that measures operation durations +#[derive(Debug)] +pub struct TimerProbe { + timers: Vec, +} + +#[derive(Debug, Default, Clone)] +struct Timer { + start_time: Option, + total_duration: Duration, +} + +impl Timer { + fn new() -> Self { + Self { + start_time: None, + total_duration: Duration::default(), + } + } + + fn start(&mut self) { + self.start_time = Some(Instant::now()); + } + + fn stop(&mut self) { + if let Some(start) = self.start_time { + self.total_duration += start.elapsed(); + self.start_time = None; + } + } + + fn get_duration(&self) -> Duration { + self.total_duration + } +} + +impl TimerProbe { + /// Creates a new TimerProbe with the specified number of timers + pub fn new(num_timers: u64) -> Self { + Self { + timers: vec![Timer::new(); num_timers as usize], + } + } + + /// Gets the total duration for a specific timer + pub fn get_duration(&self, id: u64) -> Duration { + assert!(id < self.timers.len() as u64, "Timer ID out of bounds"); + self.timers[id as usize].get_duration() + } +} 
+ +impl Probe for TimerProbe { + fn start(&mut self, id: u64) { + assert!(id < self.timers.len() as u64, "Timer ID out of bounds"); + self.timers[id as usize].start(); + } + + fn stop(&mut self, id: u64) { + assert!(id < self.timers.len() as u64, "Timer ID out of bounds"); + self.timers[id as usize].stop(); + } +} \ No newline at end of file diff --git a/autocomplete-rs/tests/constants_tests.rs b/autocomplete-rs/tests/constants_tests.rs new file mode 100644 index 0000000..94123cc --- /dev/null +++ b/autocomplete-rs/tests/constants_tests.rs @@ -0,0 +1,21 @@ +use autocomplete_rs::constants::*; + +#[test] +fn test_constants() { + // Test MAX_K + assert!(MAX_K > 0, "MAX_K should be positive"); + assert!(MAX_K <= 100, "MAX_K should be reasonably small"); + + // Test MAX_NUM_TERMS_PER_QUERY + assert!(MAX_NUM_TERMS_PER_QUERY > 0, "MAX_NUM_TERMS_PER_QUERY should be positive"); + assert!(MAX_NUM_TERMS_PER_QUERY < 256, "MAX_NUM_TERMS_PER_QUERY must be < 256"); + + // Test MAX_NUM_CHARS_PER_QUERY + assert!(MAX_NUM_CHARS_PER_QUERY > 0, "MAX_NUM_CHARS_PER_QUERY should be positive"); + assert!(MAX_NUM_CHARS_PER_QUERY >= MAX_K, "MAX_NUM_CHARS_PER_QUERY should be >= MAX_K"); + + // Test POOL_SIZE + assert!(POOL_SIZE > 0, "POOL_SIZE should be positive"); + assert_eq!(POOL_SIZE, (MAX_K as usize) * (MAX_NUM_CHARS_PER_QUERY as usize), + "POOL_SIZE should be MAX_K * MAX_NUM_CHARS_PER_QUERY"); +} \ No newline at end of file diff --git a/autocomplete-rs/tests/parameters_tests.rs b/autocomplete-rs/tests/parameters_tests.rs new file mode 100644 index 0000000..2bd6762 --- /dev/null +++ b/autocomplete-rs/tests/parameters_tests.rs @@ -0,0 +1,98 @@ +use std::fs::File; +use std::io::Write; +use std::path::Path; +use tempfile::NamedTempFile; +use autocomplete_rs::parameters::Parameters; +use autocomplete_rs::constants::{MAX_NUM_CHARS_PER_QUERY, MAX_NUM_TERMS_PER_QUERY}; + +fn create_test_stats_file() -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, 
"1000").unwrap(); // num_terms + writeln!(file, "50").unwrap(); // max_string_length + writeln!(file, "500").unwrap(); // num_completions + writeln!(file, "1000").unwrap(); // universe + writeln!(file, "3").unwrap(); // num_levels + writeln!(file, "100").unwrap(); // nodes_per_level[0] + writeln!(file, "200").unwrap(); // nodes_per_level[1] + writeln!(file, "300").unwrap(); // nodes_per_level[2] + file +} + +#[test] +fn test_parameters_load_valid() { + let test_file = create_test_stats_file(); + let mut params = Parameters::new(); + let path = test_file.path().to_str().unwrap().to_string(); + println!("Test file path: {}", path); + params.collection_basename = path; + + match params.load() { + Ok(_) => println!("Load succeeded"), + Err(e) => println!("Load failed: {}", e), + } + + assert!(params.load().is_ok()); + assert_eq!(params.num_terms, 1000); + assert_eq!(params.max_string_length, 50); + assert_eq!(params.num_completions, 500); + assert_eq!(params.universe, 1000); + assert_eq!(params.num_levels, 3); + assert_eq!(params.nodes_per_level, vec![100, 200, 300]); +} + +#[test] +fn test_parameters_load_invalid_file() { + let mut params = Parameters::new(); + params.collection_basename = "nonexistent_file".to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_invalid_data() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "0").unwrap(); // invalid num_terms + writeln!(file, "50").unwrap(); + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + writeln!(file, "200").unwrap(); + writeln!(file, "300").unwrap(); + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_invalid_constants() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "{}", 
MAX_NUM_CHARS_PER_QUERY + 1).unwrap(); // exceeds MAX_NUM_CHARS_PER_QUERY + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + writeln!(file, "200").unwrap(); + writeln!(file, "300").unwrap(); + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_truncated() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "50").unwrap(); + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + // Missing nodes_per_level entries + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} \ No newline at end of file diff --git a/autocomplete-rs/tests/probe_tests.rs b/autocomplete-rs/tests/probe_tests.rs new file mode 100644 index 0000000..7e869e1 --- /dev/null +++ b/autocomplete-rs/tests/probe_tests.rs @@ -0,0 +1,79 @@ +use std::thread; +use std::time::Duration; +use autocomplete_rs::probe::{Probe, NopProbe, TimerProbe}; + +#[test] +fn test_nop_probe() { + let mut probe = NopProbe; + // These should not panic + probe.start(0); + probe.stop(0); +} + +#[test] +fn test_timer_probe_single() { + let mut probe = TimerProbe::new(1); + + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + let duration = probe.get_duration(0); + assert!(duration >= Duration::from_millis(100)); +} + +#[test] +fn test_timer_probe_multiple() { + let mut probe = TimerProbe::new(3); + + // Timer 0 + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + // Timer 1 + probe.start(1); + thread::sleep(Duration::from_millis(200)); + probe.stop(1); + + // Timer 2 + probe.start(2); + thread::sleep(Duration::from_millis(300)); + 
probe.stop(2); + + assert!(probe.get_duration(0) >= Duration::from_millis(100)); + assert!(probe.get_duration(1) >= Duration::from_millis(200)); + assert!(probe.get_duration(2) >= Duration::from_millis(300)); +} + +#[test] +fn test_timer_probe_accumulation() { + let mut probe = TimerProbe::new(1); + + // First interval + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + // Second interval + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + let duration = probe.get_duration(0); + assert!(duration >= Duration::from_millis(200)); +} + +#[test] +#[should_panic(expected = "Timer ID out of bounds")] +fn test_timer_probe_invalid_id() { + let mut probe = TimerProbe::new(1); + probe.start(1); // Should panic as we only have timer 0 +} + +#[test] +#[should_panic(expected = "Timer ID out of bounds")] +fn test_timer_probe_get_invalid_id() { + let probe = TimerProbe::new(1); + probe.get_duration(1); // Should panic as we only have timer 0 +} \ No newline at end of file diff --git a/external/cmd_line_parser b/external/cmd_line_parser deleted file mode 160000 index 1776808..0000000 --- a/external/cmd_line_parser +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1776808718445425dcad42ba2d1b6adf2cb5e496 diff --git a/external/doctest b/external/doctest deleted file mode 160000 index ae7a135..0000000 --- a/external/doctest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ae7a13539fb71f270b87eb2e874fbac80bc8dda2 diff --git a/external/essentials b/external/essentials deleted file mode 160000 index da66810..0000000 --- a/external/essentials +++ /dev/null @@ -1 +0,0 @@ -Subproject commit da6681019cbad6bef62804927801dd09832e512e diff --git a/external/jQuery-Autocomplete b/external/jQuery-Autocomplete deleted file mode 160000 index 0ba2565..0000000 --- a/external/jQuery-Autocomplete +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0ba256501bc365814f43066999f51f0619e739a9 diff --git a/external/mongoose b/external/mongoose deleted file 
mode 160000 index dce60c6..0000000 --- a/external/mongoose +++ /dev/null @@ -1 +0,0 @@ -Subproject commit dce60c6dbb096f3b96e1a45cbfdfd55e18b38bb6 diff --git a/include/building_util.hpp b/include/building_util.hpp deleted file mode 100644 index 0398879..0000000 --- a/include/building_util.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "util.hpp" -#include "bit_vector.hpp" - -namespace autocomplete { -namespace util { - -std::vector invert(std::vector const& docid_to_lexid, - uint64_t size) { - std::vector lexid_to_docid(size); - for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { - if (docid_to_lexid[doc_id] < size) { - lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; - } - } - return lexid_to_docid; -} - -void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { - uint64_t mod = bvb.size() % alignment; - if (mod) { - uint64_t pad = alignment - mod; - bvb.append_bits(0, pad); - assert(bvb.size() % alignment == 0); - } -} - -void eat_pad(bits_iterator& it, uint64_t alignment = 8) { - uint64_t mod = it.position() % alignment; - if (mod) { - uint64_t pad = alignment - mod; - it.get_bits(pad); - assert(it.position() % alignment == 0); - } -} - -} // namespace util -} // namespace autocomplete \ No newline at end of file diff --git a/script/build_indexes.py b/script/build_indexes.py deleted file mode 100644 index e01e1db..0000000 --- a/script/build_indexes.py +++ /dev/null @@ -1,6 +0,0 @@ -import sys, os - -dataset_name = sys.argv[1] # e.g., aol -types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] -for t in types: - os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." 
+ dataset_name + ".bin -c 0.0001") \ No newline at end of file diff --git a/src/build.cpp b/src/build.cpp deleted file mode 100644 index ba73954..0000000 --- a/src/build.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" -#include "../external/cmd_line_parser/include/parser.hpp" - -using namespace autocomplete; - -template -void build(parameters const& params, std::string const& output_filename) { - Index index(params); - index.print_stats(); - if (output_filename != "") { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename.c_str()); - essentials::logger("DONE"); - } -} - -void build_type4(parameters const& params, const float c, - std::string const& output_filename) { - ef_autocomplete_type4 index(params, c); - index.print_stats(); - if (output_filename != "") { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename.c_str()); - essentials::logger("DONE"); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - parser.add("type", "Index type."); - parser.add("collection_basename", "Collection basename."); - parser.add("output_filename", "Output filename.", "-o", false); - parser.add( - "c", - "Value for Bast and Weber's technique: c must be a float in (0,1].", - "-c", false); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - parameters params; - params.collection_basename = parser.get("collection_basename"); - params.load(); - auto output_filename = parser.get("output_filename"); - - if (type == "ef_type1") { - build(params, output_filename); - } else if (type == "ef_type2") { - build(params, output_filename); - } else if (type == "ef_type3") { - build(params, output_filename); - } else if (type == "ef_type4") { - auto c = parser.get("c"); - build_type4(params, c, output_filename); - } else { - return 1; - } - - return 0; -} \ No newline at end of file diff --git 
a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py deleted file mode 100644 index 0966d99..0000000 --- a/test_data/build_inverted_and_forward.py +++ /dev/null @@ -1,74 +0,0 @@ -import sys - -input_filename = sys.argv[1] - -tokens = {} -print("building dictionary...") -id = 1 # reserve id 0 to mark the end of a string -with open(input_filename + ".dict") as f: - for line in f: - t = line.rstrip('\n') - tokens[t] = id - id += 1 - -lines = 0 -inverted = open(input_filename + ".inverted", 'w') -forward = open(input_filename + ".forward", 'w') - -num_terms = 0 -num_docs = 0 -with open(input_filename + ".mapped.stats") as f: - num_terms = int(f.readline()) - print("terms: " + str(num_terms)) - f.readline() # skip line: max num. of query terms - f.readline() # skip line: num. of completions - num_docs = int(f.readline()) - print("universe: " + str(num_docs)) - -inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned -forward_index = [[] for i in range(num_docs)] - -with open(input_filename, 'r') as f: - for line in f: - x = line.rstrip('\n').split() - mapped = [] - doc_id = int(x[0]) - discard = False - for i in range(1, len(x)): - try: - term = x[i] - try: - term_id = tokens[term] - if term_id not in mapped: - inverted_index[term_id].append(doc_id) - mapped.append(term_id) - except KeyError: - print("'" + term + "' not found in dictionary") - print(line) - exit() - except UnicodeDecodeError: - discard = True - - if not discard: - # NOTE: not sorted! 
- if doc_id >= num_docs: - print(doc_id,num_docs) - forward_index[doc_id] = mapped; - - lines += 1 - if lines % 1000000 == 0: - print("processed " + str(lines) + " lines") - -for i in range(0, num_docs): - s = [str(k) for k in forward_index[i]] - forward.write(str(len(forward_index[i])) + " ") - forward.write(" ".join(s) + "\n") -forward.close() - -for i in range(1, num_terms + 1): - posting_list = inverted_index[i] - unique = sorted(set(posting_list)); - s = [str(i) for i in unique] # remove any possible duplicate - inverted.write(str(len(unique)) + " ") - inverted.write(" ".join(s) + "\n") -inverted.close() diff --git a/test_data/build_stats.py b/test_data/build_stats.py deleted file mode 100644 index 880bcd3..0000000 --- a/test_data/build_stats.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys - -input_filename = sys.argv[1] # e.g., "completions.mapped" - -nodes_per_level = {} # (level_id, num_nodes) -lines = 0 -print("building stats...") - -output_file = open(input_filename + ".stats", 'a') -prev = [] -universe = 0; -with open(input_filename, 'r') as f: - for line in f: - x = line.rstrip('\n').split() - docid = int(x[0]) - - if docid > universe: - universe = docid - - q = x[1:len(x)] - - level_id = 0 - while level_id < len(q) and level_id < len(prev) and q[level_id] == prev[level_id]: - level_id += 1 - - while level_id < len(q): - if level_id in nodes_per_level: - nodes_per_level[level_id] += 1 - else: - nodes_per_level[level_id] = 1 - level_id += 1 - - prev = q - lines += 1 - if lines % 1000000 == 0: - print("processed " + str(lines) + " lines") - -# number of completions -# number of levels in the trie -# number of nodes for each level -print("universe: " + str(universe + 1)) -print("completions: " + str(lines)) -output_file.write(str(lines) + "\n") -output_file.write(str(universe + 1) + "\n") -output_file.write(str(len(nodes_per_level)) + "\n") -for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): - output_file.write(str(value) + "\n") 
-output_file.close() - From 6d159eba0efc798417ff03d15f0d4467d10c9072 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:26:59 -0400 Subject: [PATCH 091/102] initial commit, porting primitieves --- .gitignore | 84 ++++++++++++++++++- README.md | 42 ++++++++++ archive/include/building_util.hpp | 39 +++++++++ archive/script/build_indexes.py | 6 ++ archive/src/build.cpp | 62 ++++++++++++++ .../test_data/build_inverted_and_forward.py | 74 ++++++++++++++++ archive/test_data/build_stats.py | 49 +++++++++++ 7 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 archive/include/building_util.hpp create mode 100644 archive/script/build_indexes.py create mode 100644 archive/src/build.cpp create mode 100644 archive/test_data/build_inverted_and_forward.py create mode 100644 archive/test_data/build_stats.py diff --git a/.gitignore b/.gitignore index 51855af..69bd68e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,84 @@ +# Rust specific +/target/ +**/*.rs.bk +Cargo.lock +*.pdb + +# C++ specific +*.o +*.obj +*.exe +*.out +*.app +*.dll +*.so +*.dylib +*.a +*.lib +*.d +*.lo +*.la +*.lai +*.Plo +*.Pla +*.l +*.o +*.obj +*.elf +*.bin +*.hex +*.map +*.lst +*.sym +*.lss +*.eep +*.elf +*.hex +*.bin +*.map +*.lst +*.sym +*.lss +*.eep +*.elf +*.hex +*.bin +*.map +*.lst +*.sym +*.lss +*.eep + +# Build directories +/build/ +/debug_build/ +/CMakeFiles/ +/CMakeCache.txt +/CMakeScripts/ +/Testing/ +/Makefile +/cmake_install.cmake +/install_manifest.txt +/compile_commands.json +/CTestTestfile.cmake +/_deps +/.cmake + +# IDE specific +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS specific .DS_Store -build* +Thumbs.db + +# Project specific +*.mapped +*.mapped.stats +*.dict +*.inverted +*.forward +*.bin diff --git a/README.md b/README.md index 69fe339..624670f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,45 @@ +# Autocomplete System + +This repository contains an autocomplete system implementation. 
The original C++ implementation is being ported to Rust and will be containerized for easier deployment and testing. + +## Project Structure + +- `autocomplete-rs/`: The Rust port of the original C++ implementation +- `archive/`: Original C++ implementation and related files + +## Goals + +1. Port the C++ implementation to Rust while maintaining the same functionality +2. Leverage Rust's safety guarantees and modern tooling +3. Containerize the application using Docker for easy deployment and testing + +## Current Status + +The porting process is ongoing. The following components have been ported to Rust: + +- Basic constants and configuration +- Parameters management +- Performance measurement probes + +## Building and Testing + +### Original C++ Implementation +```bash +cd archive +make +``` + +### Rust Implementation +```bash +cd autocomplete-rs +cargo build +cargo test +``` + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + Autocomplete ------------ diff --git a/archive/include/building_util.hpp b/archive/include/building_util.hpp new file mode 100644 index 0000000..0398879 --- /dev/null +++ b/archive/include/building_util.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "util.hpp" +#include "bit_vector.hpp" + +namespace autocomplete { +namespace util { + +std::vector invert(std::vector const& docid_to_lexid, + uint64_t size) { + std::vector lexid_to_docid(size); + for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { + if (docid_to_lexid[doc_id] < size) { + lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; + } + } + return lexid_to_docid; +} + +void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { + uint64_t mod = bvb.size() % alignment; + if (mod) { + uint64_t pad = alignment - mod; + bvb.append_bits(0, pad); + assert(bvb.size() % alignment == 0); + } +} + +void eat_pad(bits_iterator& it, uint64_t alignment = 8) { + uint64_t mod = it.position() % alignment; + if (mod) { + uint64_t pad 
= alignment - mod; + it.get_bits(pad); + assert(it.position() % alignment == 0); + } +} + +} // namespace util +} // namespace autocomplete \ No newline at end of file diff --git a/archive/script/build_indexes.py b/archive/script/build_indexes.py new file mode 100644 index 0000000..e01e1db --- /dev/null +++ b/archive/script/build_indexes.py @@ -0,0 +1,6 @@ +import sys, os + +dataset_name = sys.argv[1] # e.g., aol +types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] +for t in types: + os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." + dataset_name + ".bin -c 0.0001") \ No newline at end of file diff --git a/archive/src/build.cpp b/archive/src/build.cpp new file mode 100644 index 0000000..ba73954 --- /dev/null +++ b/archive/src/build.cpp @@ -0,0 +1,62 @@ +#include + +#include "types.hpp" +#include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" + +using namespace autocomplete; + +template +void build(parameters const& params, std::string const& output_filename) { + Index index(params); + index.print_stats(); + if (output_filename != "") { + essentials::logger("saving data structure to disk..."); + essentials::save(index, output_filename.c_str()); + essentials::logger("DONE"); + } +} + +void build_type4(parameters const& params, const float c, + std::string const& output_filename) { + ef_autocomplete_type4 index(params, c); + index.print_stats(); + if (output_filename != "") { + essentials::logger("saving data structure to disk..."); + essentials::save(index, output_filename.c_str()); + essentials::logger("DONE"); + } +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("output_filename", "Output filename.", "-o", false); + parser.add( + "c", + "Value for Bast and Weber's technique: c must be a float in (0,1].", + "-c", false); + if 
(!parser.parse()) return 1; + + auto type = parser.get("type"); + parameters params; + params.collection_basename = parser.get("collection_basename"); + params.load(); + auto output_filename = parser.get("output_filename"); + + if (type == "ef_type1") { + build(params, output_filename); + } else if (type == "ef_type2") { + build(params, output_filename); + } else if (type == "ef_type3") { + build(params, output_filename); + } else if (type == "ef_type4") { + auto c = parser.get("c"); + build_type4(params, c, output_filename); + } else { + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/archive/test_data/build_inverted_and_forward.py b/archive/test_data/build_inverted_and_forward.py new file mode 100644 index 0000000..0966d99 --- /dev/null +++ b/archive/test_data/build_inverted_and_forward.py @@ -0,0 +1,74 @@ +import sys + +input_filename = sys.argv[1] + +tokens = {} +print("building dictionary...") +id = 1 # reserve id 0 to mark the end of a string +with open(input_filename + ".dict") as f: + for line in f: + t = line.rstrip('\n') + tokens[t] = id + id += 1 + +lines = 0 +inverted = open(input_filename + ".inverted", 'w') +forward = open(input_filename + ".forward", 'w') + +num_terms = 0 +num_docs = 0 +with open(input_filename + ".mapped.stats") as f: + num_terms = int(f.readline()) + print("terms: " + str(num_terms)) + f.readline() # skip line: max num. of query terms + f.readline() # skip line: num. 
of completions + num_docs = int(f.readline()) + print("universe: " + str(num_docs)) + +inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned +forward_index = [[] for i in range(num_docs)] + +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + mapped = [] + doc_id = int(x[0]) + discard = False + for i in range(1, len(x)): + try: + term = x[i] + try: + term_id = tokens[term] + if term_id not in mapped: + inverted_index[term_id].append(doc_id) + mapped.append(term_id) + except KeyError: + print("'" + term + "' not found in dictionary") + print(line) + exit() + except UnicodeDecodeError: + discard = True + + if not discard: + # NOTE: not sorted! + if doc_id >= num_docs: + print(doc_id,num_docs) + forward_index[doc_id] = mapped; + + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") + +for i in range(0, num_docs): + s = [str(k) for k in forward_index[i]] + forward.write(str(len(forward_index[i])) + " ") + forward.write(" ".join(s) + "\n") +forward.close() + +for i in range(1, num_terms + 1): + posting_list = inverted_index[i] + unique = sorted(set(posting_list)); + s = [str(i) for i in unique] # remove any possible duplicate + inverted.write(str(len(unique)) + " ") + inverted.write(" ".join(s) + "\n") +inverted.close() diff --git a/archive/test_data/build_stats.py b/archive/test_data/build_stats.py new file mode 100644 index 0000000..880bcd3 --- /dev/null +++ b/archive/test_data/build_stats.py @@ -0,0 +1,49 @@ +import sys + +input_filename = sys.argv[1] # e.g., "completions.mapped" + +nodes_per_level = {} # (level_id, num_nodes) +lines = 0 +print("building stats...") + +output_file = open(input_filename + ".stats", 'a') +prev = [] +universe = 0; +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + docid = int(x[0]) + + if docid > universe: + universe = docid + + q = x[1:len(x)] + + level_id = 0 + while level_id < len(q) and level_id < len(prev) 
and q[level_id] == prev[level_id]: + level_id += 1 + + while level_id < len(q): + if level_id in nodes_per_level: + nodes_per_level[level_id] += 1 + else: + nodes_per_level[level_id] = 1 + level_id += 1 + + prev = q + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") + +# number of completions +# number of levels in the trie +# number of nodes for each level +print("universe: " + str(universe + 1)) +print("completions: " + str(lines)) +output_file.write(str(lines) + "\n") +output_file.write(str(universe + 1) + "\n") +output_file.write(str(len(nodes_per_level)) + "\n") +for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): + output_file.write(str(value) + "\n") +output_file.close() + From 423c6d409c782ae137162afefc1aa509c4e45aba Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:29:16 -0400 Subject: [PATCH 092/102] add doc --- doc/component_diagram.md | 45 ++++++++++++ doc/cpp_structure.md | 153 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 doc/component_diagram.md create mode 100644 doc/cpp_structure.md diff --git a/doc/component_diagram.md b/doc/component_diagram.md new file mode 100644 index 0000000..5c9fd83 --- /dev/null +++ b/doc/component_diagram.md @@ -0,0 +1,45 @@ +# Component Relationships + +```mermaid +graph TD + subgraph Core + Constants[Constants] + Parameters[Parameters] + Probe[Performance Probe] + end + + subgraph Data Structures + StringPool[String Pool] + Trie[Completion Trie] + Dictionary[Front-Coded Dictionary] + Index[Blocked Inverted Index] + end + + subgraph Pipeline + Input[Input Processing] + Build[Index Building] + Query[Query Processing] + end + + %% Core Dependencies + Constants --> Parameters + Parameters --> StringPool + Parameters --> Trie + Parameters --> Dictionary + Parameters --> Index + Probe --> Query + + %% Data Structure Dependencies + Dictionary --> Trie + Trie --> Index + StringPool --> Dictionary + 
StringPool --> Trie + StringPool --> Index + + %% Pipeline Dependencies + Input --> Build + Build --> Query + Query --> Trie + Query --> Index + Query --> Dictionary +``` \ No newline at end of file diff --git a/doc/cpp_structure.md b/doc/cpp_structure.md new file mode 100644 index 0000000..bfa42d0 --- /dev/null +++ b/doc/cpp_structure.md @@ -0,0 +1,153 @@ +# C++ Code Structure Documentation + +This document outlines the structure of the original C++ implementation that is being ported to Rust. + +## Core Components + +### 1. Constants and Configuration +- **File**: `constants.hpp` +- **Purpose**: Defines system-wide constants and limits +- **Key Constants**: + - `MAX_K`: Maximum number of completions + - `MAX_NUM_TERMS_PER_QUERY`: Maximum terms per query + - `MAX_NUM_CHARS_PER_QUERY`: Maximum characters per query + - `POOL_SIZE`: Size of the string pool + +### 2. Parameters Management +- **File**: `parameters.hpp` +- **Purpose**: Manages system configuration parameters +- **Key Struct**: `parameters` + - `num_terms`: Total number of terms + - `max_string_length`: Maximum string length + - `num_completions`: Number of completions + - `universe`: Size of the universe + - `num_levels`: Number of levels in the index + - `nodes_per_level`: Vector of nodes per level + - `collection_basename`: Base name for collection files + +### 3. Performance Measurement +- **File**: `probe.hpp` +- **Purpose**: Performance measurement and timing +- **Key Structs**: + - `nop_probe`: No-operation probe + - `timer_probe`: Timer-based performance measurement + +### 4. String Pool Management +- **File**: `scored_string_pool.hpp` +- **Purpose**: Manages a pool of scored strings +- **Key Components**: + - String storage + - Score management + - Pool operations + +### 5. Completion Trie +- **File**: `completion_trie.hpp` +- **Purpose**: Implements the completion trie data structure +- **Key Features**: + - Prefix-based completion + - Node management + - Traversal operations + +### 6. 
Blocked Inverted Index +- **File**: `blocked_inverted_index.hpp` +- **Purpose**: Implements blocked inverted indexing +- **Key Components**: + - Block management + - Index operations + - Query processing + +### 7. Front-Coded Dictionary +- **File**: `fc_dictionary.hpp` +- **Purpose**: Implements front-coding for dictionary compression +- **Key Features**: + - String compression + - Dictionary operations + - Lookup functionality + +## Data Pipeline + +1. **Input Processing** + - Read input completions + - Sort lexicographically + - Generate statistics + +2. **Index Building** + - Build front-coded dictionary + - Construct completion trie + - Create blocked inverted index + +3. **Query Processing** + - Parse input query + - Traverse completion trie + - Search inverted index + - Return top-k completions + +## Key Methods and Operations + +### Dictionary Operations +```cpp +// Front-coded dictionary +void build_dictionary(); +void compress_strings(); +std::string lookup(uint32_t id); +``` + +### Trie Operations +```cpp +// Completion trie +void insert(const std::string& completion); +std::vector complete(const std::string& prefix); +``` + +### Index Operations +```cpp +// Blocked inverted index +void build_index(); +std::vector search(const std::vector& terms); +``` + +### Query Processing +```cpp +// Query handling +std::vector process_query(const std::string& query); +void rank_completions(std::vector& completions); +``` + +## Dependencies and Relationships + +1. **Core Dependencies** + - Constants → Parameters + - Parameters → All major components + - Probe → Performance measurement + +2. **Data Structure Dependencies** + - Front-coded Dictionary → Completion Trie + - Completion Trie → Blocked Inverted Index + - All components → String Pool + +3. **Pipeline Dependencies** + - Input Processing → Index Building + - Index Building → Query Processing + - Query Processing → All components + +## Porting Strategy + +1. 
**Phase 1: Core Components** + - Constants and configuration + - Parameters management + - Performance measurement + +2. **Phase 2: Data Structures** + - String pool + - Completion trie + - Front-coded dictionary + +3. **Phase 3: Index and Query** + - Blocked inverted index + - Query processing + - Pipeline integration + +4. **Phase 4: Testing and Optimization** + - Unit tests + - Integration tests + - Performance optimization \ No newline at end of file From 64a1902a17c077f7da087cea4ecd984d5632229c Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:32:23 -0400 Subject: [PATCH 093/102] add ds doc --- doc/data_structures.md | 253 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 doc/data_structures.md diff --git a/doc/data_structures.md b/doc/data_structures.md new file mode 100644 index 0000000..9da8761 --- /dev/null +++ b/doc/data_structures.md @@ -0,0 +1,253 @@ +# Data Structures Documentation + +This document details the key data structures used in the autocomplete system. + +## 1. Scored String Pool + +### Purpose +Manages a fixed-size pool of strings with associated scores, optimized for fast retrieval and updates. 
+ +### Structure +```cpp +struct scored_string_pool { + std::vector strings; // String storage + std::vector scores; // Associated scores + size_t size; // Current pool size + size_t capacity; // Maximum capacity +}; +``` + +### Visualization +```mermaid +graph TD + subgraph String Pool + direction LR + S1[String 1] --> SC1[Score 0.8] + S2[String 2] --> SC2[Score 0.6] + S3[String 3] --> SC3[Score 0.9] + S4[String 4] --> SC4[Score 0.7] + end + style String Pool fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `insert(string, score)`: Add a new string with its score +- `get_score(index)`: Retrieve score for a string +- `get_string(index)`: Retrieve string by index +- `update_score(index, score)`: Update score for a string +- `clear()`: Reset the pool + +### Memory Management +- Fixed-size allocation to prevent reallocations +- Contiguous memory layout for cache efficiency +- Score and string data stored separately for better cache utilization + +## 2. Completion Trie + +### Purpose +Efficient prefix-based string completion using a trie data structure. 
+ +### Structure +```cpp +struct trie_node { + std::unordered_map children; + bool is_terminal; + std::vector completion_ids; +}; + +struct completion_trie { + trie_node* root; + size_t num_nodes; + size_t num_completions; +}; +``` + +### Visualization +```mermaid +graph TD + Root((Root)) --> H((h)) + H --> HE((e)) + HE --> HEL((l)) + HEL --> HELL((l)) + HELL --> HELLO((o)) + HELLO --> HELLOW((w)) + HELLOW --> HELLOWO((o)) + HELLOWO --> HELLOWOR((r)) + HELLOWOR --> HELLOWORL((l)) + HELLOWORL --> HELLOWORLD((d)) + + style Root fill:#f9f,stroke:#333,stroke-width:2px + style HELLOWORLD fill:#9f9,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `insert(completion)`: Add a new completion string +- `complete(prefix)`: Find all completions for a prefix +- `remove(completion)`: Remove a completion string +- `clear()`: Reset the trie + +### Optimizations +- Path compression for common prefixes +- Node sharing for similar completions +- Lazy deletion for better performance + +## 3. Front-Coded Dictionary + +### Purpose +Compressed string dictionary using front-coding technique. 
+ +### Structure +```cpp +struct fc_dictionary { + std::vector data; // Compressed string data + std::vector offsets; // String offsets + size_t num_strings; // Number of strings + size_t total_size; // Total compressed size +}; +``` + +### Visualization +```mermaid +graph LR + subgraph Front-Coded Dictionary + direction LR + S1[hello] --> |shared prefix| S2[helloworld] + S2 --> |shared prefix| S3[hellothere] + S3 --> |shared prefix| S4[hellokitty] + end + style Front-Coded Dictionary fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `build(strings)`: Build dictionary from string list +- `lookup(id)`: Retrieve string by ID +- `compress()`: Apply front-coding compression +- `decompress(id)`: Decompress a specific string + +### Compression Details +- Common prefixes shared between consecutive strings +- Variable-length encoding for shared prefix lengths +- Delta encoding for string differences + +## 4. Blocked Inverted Index + +### Purpose +Efficient term-based search using blocked inverted indexing. 
+ +### Structure +```cpp +struct block { + std::vector doc_ids; // Document IDs in block + uint32_t min_doc_id; // Minimum doc ID in block + uint32_t max_doc_id; // Maximum doc ID in block +}; + +struct inverted_index { + std::vector blocks; // Index blocks + std::unordered_map> term_to_blocks; + size_t block_size; // Size of each block +}; +``` + +### Visualization +```mermaid +graph TD + subgraph Inverted Index + direction TB + T1[Term 1] --> B1[Block 1] + T1 --> B2[Block 2] + T2[Term 2] --> B2 + T2 --> B3[Block 3] + T3[Term 3] --> B1 + T3 --> B3 + + subgraph Block 1 + D1[Doc 1] + D2[Doc 2] + D3[Doc 3] + end + + subgraph Block 2 + D4[Doc 4] + D5[Doc 5] + end + + subgraph Block 3 + D6[Doc 6] + D7[Doc 7] + end + end + style Inverted Index fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `add_document(doc_id, terms)`: Add document to index +- `search(terms)`: Find documents containing terms +- `merge_blocks()`: Optimize block structure +- `clear()`: Reset the index + +### Blocking Strategy +- Fixed-size blocks for predictable memory usage +- Block-level compression for space efficiency +- Skip pointers for faster traversal + +## Memory and Performance Considerations + +### Memory Layout +1. **Contiguous Storage** + - Strings stored in contiguous memory + - Scores aligned for SIMD operations + - Block data packed efficiently + +2. **Cache Optimization** + - Hot data kept together + - Cold data separated + - Alignment for cache lines + +### Performance Optimizations +1. **String Operations** + - String interning for deduplication + - Small string optimization + - Custom string comparison + +2. **Search Optimizations** + - Block-level skipping + - Term frequency caching + - Result set intersection optimization + +3. 
**Memory Management** + - Custom allocators for specific structures + - Memory pooling for frequent allocations + - Lazy initialization where appropriate + +## Usage Examples + +### String Pool Usage +```cpp +scored_string_pool pool(POOL_SIZE); +pool.insert("completion1", 0.8); +pool.insert("completion2", 0.6); +auto completions = pool.get_top_k(10); +``` + +### Trie Usage +```cpp +completion_trie trie; +trie.insert("hello world"); +trie.insert("hello there"); +auto results = trie.complete("hello"); +``` + +### Dictionary Usage +```cpp +fc_dictionary dict; +dict.build(strings); +auto str = dict.lookup(42); +``` + +### Index Usage +```cpp +inverted_index index; +index.add_document(1, {"term1", "term2"}); +auto docs = index.search({"term1", "term2"}); +``` \ No newline at end of file From 79987bdfdcc103899de9a5deecab171a116dcdec Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 13:55:27 -0400 Subject: [PATCH 094/102] add class diagram --- doc/class_diagram.md | 219 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 doc/class_diagram.md diff --git a/doc/class_diagram.md b/doc/class_diagram.md new file mode 100644 index 0000000..6561e4a --- /dev/null +++ b/doc/class_diagram.md @@ -0,0 +1,219 @@ +# C++ Class Diagram + +This document provides a comprehensive view of all classes in the C++ implementation and their relationships. 
+ +## Main Class Diagram + +```mermaid +classDiagram + class Parameters { + +uint32_t num_terms + +uint32_t max_string_length + +uint32_t num_completions + +uint32_t universe + +uint32_t num_levels + +vector~uint32_t~ nodes_per_level + +string collection_basename + +load() + } + + class Probe { + <> + +start(id: uint64_t) + +stop(id: uint64_t) + } + + class NopProbe { + +start(id: uint64_t) + +stop(id: uint64_t) + } + + class TimerProbe { + -vector~Timer~ timers + +start(id: uint64_t) + +stop(id: uint64_t) + +get_duration(id: uint64_t) + } + + class Timer { + -Instant start_time + -Duration total_duration + +start() + +stop() + +get_duration() + } + + class ScoredStringPool { + -vector~string~ strings + -vector~float~ scores + -size_t size + -size_t capacity + +insert(string, float) + +get_score(size_t) + +get_string(size_t) + +update_score(size_t, float) + +clear() + } + + class TrieNode { + -unordered_map~char, TrieNode*~ children + -bool is_terminal + -vector~uint32_t~ completion_ids + +add_child(char) + +get_child(char) + +is_terminal() + } + + class CompletionTrie { + -TrieNode* root + -size_t num_nodes + -size_t num_completions + +insert(string) + +complete(string) + +remove(string) + +clear() + } + + class FCDictionary { + -vector~char~ data + -vector~uint32_t~ offsets + -size_t num_strings + -size_t total_size + +build(vector~string~) + +lookup(uint32_t) + +compress() + +decompress(uint32_t) + } + + class Block { + -vector~uint32_t~ doc_ids + -uint32_t min_doc_id + -uint32_t max_doc_id + +add_doc(uint32_t) + +get_docs() + +get_range() + } + + class InvertedIndex { + -vector~Block~ blocks + -unordered_map~string, vector~uint32_t~~ term_to_blocks + -size_t block_size + +add_document(uint32_t, vector~string~) + +search(vector~string~) + +merge_blocks() + +clear() + } + + class Autocomplete { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + +build_index(string) + +complete(string) + 
+search(vector~string~) + } + + %% Relationships + Probe <|-- NopProbe + Probe <|-- TimerProbe + TimerProbe *-- Timer + Autocomplete *-- Parameters + Autocomplete *-- ScoredStringPool + Autocomplete *-- CompletionTrie + Autocomplete *-- FCDictionary + Autocomplete *-- InvertedIndex + CompletionTrie *-- TrieNode + InvertedIndex *-- Block +``` + +## Component Dependencies + +```mermaid +graph TD + subgraph Core + Parameters + Probe + end + + subgraph Data Structures + ScoredStringPool + CompletionTrie + FCDictionary + InvertedIndex + end + + subgraph Implementation + Autocomplete + end + + %% Dependencies + Parameters --> ScoredStringPool + Parameters --> CompletionTrie + Parameters --> FCDictionary + Parameters --> InvertedIndex + + ScoredStringPool --> Autocomplete + CompletionTrie --> Autocomplete + FCDictionary --> Autocomplete + InvertedIndex --> Autocomplete + + style Core fill:#f9f,stroke:#333,stroke-width:2px + style Data Structures fill:#9f9,stroke:#333,stroke-width:2px + style Implementation fill:#99f,stroke:#333,stroke-width:2px +``` + +## Memory Layout + +```mermaid +graph TD + subgraph Memory Organization + direction TB + Stack[Stack Memory] --> Heap[Heap Memory] + Heap --> Data[Data Structures] + Data --> Strings[String Pool] + Data --> Trie[Trie Nodes] + Data --> Dict[Dictionary] + Data --> Index[Inverted Index] + end + + style Memory Organization fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Key Features and Methods + +### Core Components +- **Parameters**: Configuration management +- **Probe**: Performance measurement interface +- **Timer**: Time tracking implementation + +### Data Structures +- **ScoredStringPool**: String and score management +- **CompletionTrie**: Prefix-based completion +- **FCDictionary**: String compression +- **InvertedIndex**: Term-based search + +### Main Implementation +- **Autocomplete**: Orchestrates all components + +## Usage Example + +```cpp +// Initialize components +Parameters params; 
+params.load("config.stats"); + +ScoredStringPool pool(POOL_SIZE); +CompletionTrie trie; +FCDictionary dict; +InvertedIndex index; + +// Build autocomplete system +Autocomplete ac(params, pool, trie, dict, index); +ac.build_index("data.txt"); + +// Use the system +auto completions = ac.complete("hello"); +auto results = ac.search({"hello", "world"}); +``` \ No newline at end of file From 601fad7cbac54eb6631e814099d0067e67a7a084 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 13:58:58 -0400 Subject: [PATCH 095/102] add more details --- doc/activity_diagram.md | 157 ++++++++++++++++++++++++++++++++++++++++ doc/class_diagram.md | 132 ++++++++++++++++++++++++++++++--- 2 files changed, 280 insertions(+), 9 deletions(-) create mode 100644 doc/activity_diagram.md diff --git a/doc/activity_diagram.md b/doc/activity_diagram.md new file mode 100644 index 0000000..993101b --- /dev/null +++ b/doc/activity_diagram.md @@ -0,0 +1,157 @@ +# Activity Diagrams + +This document provides activity diagrams for the main workflows in the autocomplete system. 
+ +## System Initialization and Index Building + +```mermaid +graph TD + Start([Start]) --> LoadParams[Load Parameters] + LoadParams --> InitComponents[Initialize Components] + InitComponents --> BuildTrie[Build Completion Trie] + BuildTrie --> BuildDict[Build Front-Coded Dictionary] + BuildDict --> BuildIndex[Build Inverted Index] + BuildIndex --> BuildForwardIndex[Build Forward Index] + BuildForwardIndex --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Autocomplete Query Processing + +```mermaid +graph TD + Start([Start]) --> InputQuery[Input Query] + InputQuery --> ParseQuery[Parse Query Terms] + ParseQuery --> CheckPrefix[Check Prefix in Trie] + + CheckPrefix -->|Prefix Found| GetCompletions[Get Completions] + CheckPrefix -->|No Prefix| ReturnEmpty[Return Empty Results] + + GetCompletions --> ScoreCompletions[Score Completions] + ScoreCompletions --> SortResults[Sort by Score] + SortResults --> ReturnResults[Return Top-K Results] + + ReturnEmpty --> End([End]) + ReturnResults --> End + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Search Operation Flow + +```mermaid +graph TD + Start([Start]) --> InputTerms[Input Search Terms] + InputTerms --> ParseTerms[Parse Search Terms] + ParseTerms --> LookupTerms[Lookup Terms in Dictionary] + + LookupTerms -->|All Terms Found| GetPostings[Get Posting Lists] + LookupTerms -->|Terms Not Found| ReturnEmpty[Return Empty Results] + + GetPostings --> IntersectLists[Intersect Posting Lists] + IntersectLists --> ScoreDocs[Score Documents] + ScoreDocs --> SortResults[Sort by Score] + SortResults --> ReturnResults[Return Top-K Results] + + ReturnEmpty --> End([End]) + ReturnResults --> End + + style Start fill:#f9f,stroke:#333,stroke-width:2px +``` + +## String Pool Management + +```mermaid +graph TD + Start([Start]) --> CheckCapacity[Check Pool Capacity] + CheckCapacity -->|Full| 
RemoveLowest[Remove Lowest Score] + CheckCapacity -->|Space Available| AddString[Add New String] + + RemoveLowest --> AddString + AddString --> UpdateScores[Update Scores] + UpdateScores --> SortPool[Sort Pool by Score] + SortPool --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Blocked Inverted Index Operations + +```mermaid +graph TD + Start([Start]) --> InputDoc[Input Document] + InputDoc --> ExtractTerms[Extract Terms] + ExtractTerms --> CheckBlocks[Check Existing Blocks] + + CheckBlocks -->|Block Found| UpdateBlock[Update Block] + CheckBlocks -->|New Block| CreateBlock[Create New Block] + + UpdateBlock --> MergeCheck[Check Merge Condition] + CreateBlock --> MergeCheck + + MergeCheck -->|Merge Needed| MergeBlocks[Merge Blocks] + MergeCheck -->|No Merge| UpdateIndex[Update Index] + + MergeBlocks --> UpdateIndex + UpdateIndex --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Performance Measurement Flow + +```mermaid +graph TD + Start([Start]) --> StartTimer[Start Timer] + StartTimer --> Operation[Perform Operation] + Operation --> StopTimer[Stop Timer] + StopTimer --> RecordMetrics[Record Metrics] + RecordMetrics --> AnalyzePerformance[Analyze Performance] + AnalyzePerformance --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Key Operations Description + +### System Initialization +1. Load configuration parameters +2. Initialize core components +3. Build data structures +4. Set up indexes + +### Query Processing +1. Parse and validate input +2. Check prefix in trie +3. Retrieve and score completions +4. Sort and return results + +### Search Operations +1. Process search terms +2. Lookup in dictionary +3. Retrieve and intersect posting lists +4. Score and rank results + +### String Pool Management +1. 
Maintain fixed-size pool +2. Handle insertions and removals +3. Update and sort scores +4. Manage memory efficiently + +### Blocked Index Operations +1. Process document updates +2. Manage block structure +3. Handle block merges +4. Maintain index consistency + +### Performance Measurement +1. Track operation timing +2. Record performance metrics +3. Analyze system behavior +4. Optimize based on results \ No newline at end of file diff --git a/doc/class_diagram.md b/doc/class_diagram.md index 6561e4a..4a5f4db 100644 --- a/doc/class_diagram.md +++ b/doc/class_diagram.md @@ -44,15 +44,23 @@ classDiagram } class ScoredStringPool { - -vector~string~ strings - -vector~float~ scores - -size_t size - -size_t capacity - +insert(string, float) - +get_score(size_t) - +get_string(size_t) - +update_score(size_t, float) + -vector~id_type~ m_scores + -vector~size_t~ m_offsets + -vector~uint8_t~ m_data + +init() + +resize(size_t, uint32_t) +clear() + +size() + +bytes() + +data() + +push_back_offset(size_t) + +scores() + +const_scores() + } + + class ScoredByteRange { + +byte_range string + +id_type score } class TrieNode { @@ -85,6 +93,15 @@ classDiagram +decompress(uint32_t) } + class IntegerFCDictionary { + -vector~uint32_t~ m_headers + -vector~uint8_t~ m_buckets + -size_t m_size + +build(vector~string~) + +lookup(uint32_t) + +extract(id_type, completion_type) + } + class Block { -vector~uint32_t~ doc_ids -uint32_t min_doc_id @@ -104,6 +121,37 @@ classDiagram +clear() } + class CompactVector { + -vector~uint64_t~ m_bits + -uint8_t m_width + -uint64_t m_mask + +build(vector~uint64_t~) + +access(uint64_t) + +size() + } + + class BitVector { + -vector~uint64_t~ m_bits + -size_t m_size + +build(bit_vector_builder*) + +size() + +bytes() + +operator[](uint64_t) + +get_bits(uint64_t, uint64_t) + } + + class MinHeap { + -vector~T~ m_q + -Comparator m_comparator + +reserve(uint64_t) + +top() + +push(T) + +pop() + +clear() + +empty() + +size() + } + class Autocomplete { -Parameters 
params -ScoredStringPool string_pool @@ -115,6 +163,41 @@ classDiagram +search(vector~string~) } + class Autocomplete2 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + -CompactVector docid_to_lexid + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete3 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + -MinHeap min_priority_queue + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete4 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -BlockedInvertedIndex index + +build_index(string) + +complete(string) + +search(vector~string~) + } + %% Relationships Probe <|-- NopProbe Probe <|-- TimerProbe @@ -126,6 +209,13 @@ classDiagram Autocomplete *-- InvertedIndex CompletionTrie *-- TrieNode InvertedIndex *-- Block + ScoredStringPool *-- ScoredByteRange + Autocomplete2 --|> Autocomplete + Autocomplete3 --|> Autocomplete + Autocomplete4 --|> Autocomplete + Autocomplete3 *-- MinHeap + Autocomplete2 *-- CompactVector + Autocomplete4 *-- BlockedInvertedIndex ``` ## Component Dependencies @@ -135,17 +225,26 @@ graph TD subgraph Core Parameters Probe + Timer end subgraph Data Structures ScoredStringPool CompletionTrie FCDictionary + IntegerFCDictionary InvertedIndex + BlockedInvertedIndex + CompactVector + BitVector + MinHeap end subgraph Implementation Autocomplete + Autocomplete2 + Autocomplete3 + Autocomplete4 end %% Dependencies @@ -153,11 +252,16 @@ graph TD Parameters --> CompletionTrie Parameters --> FCDictionary Parameters --> InvertedIndex + Parameters --> IntegerFCDictionary ScoredStringPool --> Autocomplete CompletionTrie --> Autocomplete FCDictionary --> Autocomplete InvertedIndex --> Autocomplete + IntegerFCDictionary --> Autocomplete2 + CompactVector --> Autocomplete2 + 
MinHeap --> Autocomplete3 + BlockedInvertedIndex --> Autocomplete4 style Core fill:#f9f,stroke:#333,stroke-width:2px style Data Structures fill:#9f9,stroke:#333,stroke-width:2px @@ -176,6 +280,8 @@ graph TD Data --> Trie[Trie Nodes] Data --> Dict[Dictionary] Data --> Index[Inverted Index] + Data --> Compact[Compact Vectors] + Data --> BitVec[Bit Vectors] end style Memory Organization fill:#f9f,stroke:#333,stroke-width:2px @@ -192,10 +298,18 @@ graph TD - **ScoredStringPool**: String and score management - **CompletionTrie**: Prefix-based completion - **FCDictionary**: String compression +- **IntegerFCDictionary**: Integer-based dictionary - **InvertedIndex**: Term-based search +- **BlockedInvertedIndex**: Blocked term-based search +- **CompactVector**: Space-efficient vector +- **BitVector**: Bit-level operations +- **MinHeap**: Priority queue implementation ### Main Implementation -- **Autocomplete**: Orchestrates all components +- **Autocomplete**: Base implementation +- **Autocomplete2**: Integer-based optimization +- **Autocomplete3**: Min-heap based optimization +- **Autocomplete4**: Blocked index optimization ## Usage Example From 8921c936396149c06d83f1ca390986aa51260933 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 14:30:31 -0400 Subject: [PATCH 096/102] unstable --- .gitignore | 28 + autocomplete-rs/Cargo.lock | 1047 +++++++++++++++++++++- autocomplete-rs/Cargo.toml | 7 + autocomplete-rs/build.rs | 4 + autocomplete-rs/examples/client.rs | 36 + autocomplete-rs/proto/autocomplete.proto | 58 ++ autocomplete-rs/src/autocomplete.rs | 222 +++++ autocomplete-rs/src/dictionary.rs | 199 ++++ autocomplete-rs/src/index.rs | 204 +++++ autocomplete-rs/src/lib.rs | 14 +- autocomplete-rs/src/server.rs | 89 ++ autocomplete-rs/src/string_pool.rs | 151 ++++ autocomplete-rs/src/trie.rs | 182 ++++ autocomplete-rs/src/types.rs | 92 ++ 14 files changed, 2325 insertions(+), 8 deletions(-) create mode 100644 autocomplete-rs/build.rs create mode 100644 
autocomplete-rs/examples/client.rs create mode 100644 autocomplete-rs/proto/autocomplete.proto create mode 100644 autocomplete-rs/src/autocomplete.rs create mode 100644 autocomplete-rs/src/dictionary.rs create mode 100644 autocomplete-rs/src/index.rs create mode 100644 autocomplete-rs/src/server.rs create mode 100644 autocomplete-rs/src/string_pool.rs create mode 100644 autocomplete-rs/src/trie.rs create mode 100644 autocomplete-rs/src/types.rs diff --git a/.gitignore b/.gitignore index 69bd68e..2d7573c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,24 @@ # Rust specific /target/ +**/target/ **/*.rs.bk Cargo.lock *.pdb +# Protocol Buffers +*.pb.h +*.pb.cc +*.pb.go +*.pb.swift +*.pb.dart +*.pb.js +*.pb.ts +*.pb.rs + +# Generated Rust files +/src/autocomplete_proto.rs +/src/autocomplete_proto/*.rs + # C++ specific *.o *.obj @@ -51,18 +66,31 @@ Cargo.lock # Build directories /build/ +**/build/ /debug_build/ +**/debug_build/ /CMakeFiles/ +**/CMakeFiles/ /CMakeCache.txt +**/CMakeCache.txt /CMakeScripts/ +**/CMakeScripts/ /Testing/ +**/Testing/ /Makefile +**/Makefile /cmake_install.cmake +**/cmake_install.cmake /install_manifest.txt +**/install_manifest.txt /compile_commands.json +**/compile_commands.json /CTestTestfile.cmake +**/CTestTestfile.cmake /_deps +**/_deps /.cmake +**/.cmake # IDE specific .vscode/ diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index 6cb35bc..d1b8fd2 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -2,25 +2,189 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + [[package]] name = "autocomplete-rs" version = "0.1.0" dependencies = [ + "futures", + "prost", "tempfile", + "tokio", 
+ "tonic", + "tonic-build", +] + +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "errno" version = "0.3.12" @@ -28,7 +192,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -37,6 +201,118 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -46,9 +322,157 @@ dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasi 0.14.2+wasi-0.2.4", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "h2" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 2.9.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + 
"bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", ] +[[package]] +name = "indexmap" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +dependencies = [ + "equivalent", + "hashbrown 0.15.3", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + [[package]] name = "libc" version = "0.2.172" @@ -61,31 +485,418 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.9.0", +] + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + [[package]] name = "r-efi" version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] + +[[package]] +name = "redox_syscall" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +dependencies = [ + "bitflags 2.9.1", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + [[package]] name = "rustix" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags", + "bitflags 2.9.1", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" + +[[package]] +name = "socket2" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + [[package]] name = "tempfile" version = "3.20.0" @@ -93,12 +904,205 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ "fastrand", - "getrandom", + "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio" +version = "1.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" 
+dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" +dependencies = [ + 
"prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", ] +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -108,6 +1112,15 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -187,5 +1200,25 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags", + "bitflags 2.9.1", +] + +[[package]] +name = "zerocopy" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index 7d62c58..fc1d1f9 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -4,6 +4,13 @@ version = "0.1.0" edition = "2021" [dependencies] +tonic = "0.10" +prost = "0.12" +tokio = { version = "1.0", features = ["full"] } 
+futures = "0.3" [dev-dependencies] tempfile = "3.8" + +[build-dependencies] +tonic-build = "0.10" diff --git a/autocomplete-rs/build.rs b/autocomplete-rs/build.rs new file mode 100644 index 0000000..7d082f1 --- /dev/null +++ b/autocomplete-rs/build.rs @@ -0,0 +1,4 @@ +fn main() -> Result<(), Box> { + tonic_build::compile_protos("proto/autocomplete.proto")?; + Ok(()) +} \ No newline at end of file diff --git a/autocomplete-rs/examples/client.rs b/autocomplete-rs/examples/client.rs new file mode 100644 index 0000000..cbdb2c9 --- /dev/null +++ b/autocomplete-rs/examples/client.rs @@ -0,0 +1,36 @@ +use autocomplete_proto::{ + autocomplete_service_client::AutocompleteServiceClient, + CompleteRequest, InitRequest, StringScore, +}; + +pub mod autocomplete_proto { + tonic::include_proto!("autocomplete"); +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let mut client = AutocompleteServiceClient::connect("http://[::1]:50051").await?; + + // Initialize with some test data + let init_request = InitRequest { + strings: vec![ + StringScore { text: "hello".to_string(), score: 1.0 }, + StringScore { text: "help".to_string(), score: 0.8 }, + StringScore { text: "hell".to_string(), score: 0.6 }, + ], + }; + + let response = client.init(init_request).await?; + println!("INIT RESPONSE: {:?}", response); + + // Get completions + let request = CompleteRequest { + prefix: "hel".to_string(), + max_results: 10, + }; + + let response = client.complete(request).await?; + println!("COMPLETE RESPONSE: {:?}", response); + + Ok(()) +} \ No newline at end of file diff --git a/autocomplete-rs/proto/autocomplete.proto b/autocomplete-rs/proto/autocomplete.proto new file mode 100644 index 0000000..12c2e74 --- /dev/null +++ b/autocomplete-rs/proto/autocomplete.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package autocomplete; + +// The autocomplete service definition +service AutocompleteService { + // Get completions for a prefix + rpc Complete (CompleteRequest) returns 
(CompleteResponse) {} + + // Initialize the autocomplete system with strings and scores + rpc Init (InitRequest) returns (InitResponse) {} + + // Get system statistics + rpc GetStats (StatsRequest) returns (StatsResponse) {} +} + +// Request message for completion +message CompleteRequest { + string prefix = 1; + int32 max_results = 2; // Optional: limit number of results +} + +// Response message containing completions +message CompleteResponse { + repeated Completion completions = 1; +} + +// A single completion result +message Completion { + string text = 1; + float score = 2; +} + +// Request message for initialization +message InitRequest { + repeated StringScore strings = 1; +} + +// A string with its score +message StringScore { + string text = 1; + float score = 2; +} + +// Response message for initialization +message InitResponse { + bool success = 1; + string error = 2; // Empty if success is true +} + +// Request message for stats +message StatsRequest {} + +// Response message containing system statistics +message StatsResponse { + int32 num_terms = 1; + int64 memory_bytes = 2; +} \ No newline at end of file diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs new file mode 100644 index 0000000..1191a75 --- /dev/null +++ b/autocomplete-rs/src/autocomplete.rs @@ -0,0 +1,222 @@ +use std::collections::HashMap; +use crate::types::{IdType, ByteRange, global}; +use crate::string_pool::ScoredStringPool; +use crate::trie::CompletionTrie; +use crate::dictionary::{FCDictionary, IntegerFCDictionary}; +use crate::index::{BlockedInvertedIndex, CompactVector, BitVector}; + +const BLOCK_SIZE: usize = 1024; + +/// Main autocomplete implementation +pub struct Autocomplete { + string_pool: ScoredStringPool, + trie: CompletionTrie, + dictionary: FCDictionary, + index: BlockedInvertedIndex, + term_to_id: HashMap, + id_to_term: Vec, + num_terms: usize, +} + +impl Autocomplete { + /// Create a new autocomplete instance + pub fn new() -> Self 
{ + Self { + string_pool: ScoredStringPool::new(), + trie: CompletionTrie::new(), + dictionary: FCDictionary::new(), + index: BlockedInvertedIndex::new(BLOCK_SIZE), + term_to_id: HashMap::new(), + id_to_term: Vec::new(), + num_terms: 0, + } + } + + /// Initialize the autocomplete system + pub fn init(&mut self, strings: &[String], scores: &[IdType]) { + assert_eq!(strings.len(), scores.len()); + + // Build string pool + self.string_pool = ScoredStringPool::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + let mut all_scores = Vec::with_capacity(strings.len()); + let mut total_bytes = 0; + offsets.push(0); + for (string, &score) in strings.iter().zip(scores) { + total_bytes += string.len(); + offsets.push(total_bytes); + all_scores.push(score); + } + self.string_pool.set_offsets(offsets); + self.string_pool.set_scores(all_scores); + self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); + + // Build dictionary + self.dictionary.build(strings); + + // Build term mappings + self.term_to_id.clear(); + self.id_to_term.clear(); + for (i, string) in strings.iter().enumerate() { + self.term_to_id.insert(string.clone(), (i + 1) as IdType); + self.id_to_term.push(string.clone()); + } + self.num_terms = strings.len(); + + // Build trie + self.trie.clear(); + for (i, string) in strings.iter().enumerate() { + self.trie.insert(string, (i + 1) as IdType); + } + + // Build index + self.index.clear(); + for (i, _string) in strings.iter().enumerate() { + let term_id = (i + 1) as IdType; + self.index.add_doc(term_id, term_id); + } + } + + /// Find completions for a prefix + pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { + let mut results = Vec::new(); + + // Get completion IDs from trie + let completion_ids = self.trie.complete(prefix); + + // Look up strings and scores + for &id in &completion_ids { + if let Some(string) = self.dictionary.lookup(id) { + let scored_range = self.string_pool.get(id as usize); + 
results.push((string, scored_range.score)); + } + } + + // Sort by score (descending) + results.sort_by(|a, b| b.1.cmp(&a.1)); + results + } + + /// Get the number of terms + pub fn num_terms(&self) -> usize { + self.num_terms + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.string_pool.bytes() + + self.trie.num_nodes() * std::mem::size_of::() + + self.dictionary.bytes() + + self.index.num_blocks() * std::mem::size_of::() + + self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + + self.id_to_term.capacity() * std::mem::size_of::() + } +} + +/// Integer-based autocomplete implementation +pub struct Autocomplete2 { + string_pool: ScoredStringPool, + trie: CompletionTrie, + dictionary: IntegerFCDictionary, + index: BlockedInvertedIndex, + term_to_id: HashMap, + id_to_term: Vec, + num_terms: usize, +} + +impl Autocomplete2 { + /// Create a new integer-based autocomplete instance + pub fn new() -> Self { + Self { + string_pool: ScoredStringPool::new(), + trie: CompletionTrie::new(), + dictionary: IntegerFCDictionary::new(), + index: BlockedInvertedIndex::new(BLOCK_SIZE), + term_to_id: HashMap::new(), + id_to_term: Vec::new(), + num_terms: 0, + } + } + + /// Initialize the autocomplete system + pub fn init(&mut self, strings: &[String], scores: &[IdType]) { + assert_eq!(strings.len(), scores.len()); + + // Build string pool + self.string_pool = ScoredStringPool::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + let mut all_scores = Vec::with_capacity(strings.len()); + let mut total_bytes = 0; + offsets.push(0); + for (string, &score) in strings.iter().zip(scores) { + total_bytes += string.len(); + offsets.push(total_bytes); + all_scores.push(score); + } + self.string_pool.set_offsets(offsets); + self.string_pool.set_scores(all_scores); + self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); + + // Build dictionary + self.dictionary.build(strings); + + // Build term mappings + 
self.term_to_id.clear(); + self.id_to_term.clear(); + for (i, string) in strings.iter().enumerate() { + self.term_to_id.insert(string.clone(), (i + 1) as IdType); + self.id_to_term.push(string.clone()); + } + self.num_terms = strings.len(); + + // Build trie + self.trie.clear(); + for (i, string) in strings.iter().enumerate() { + self.trie.insert(string, (i + 1) as IdType); + } + + // Build index + self.index.clear(); + for (i, _string) in strings.iter().enumerate() { + let term_id = (i + 1) as IdType; + self.index.add_doc(term_id, term_id); + } + } + + /// Find completions for a prefix + pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { + let mut results = Vec::new(); + let mut completion = Vec::new(); + + // Get completion IDs from trie + let completion_ids = self.trie.complete(prefix); + + // Look up strings and scores + for &id in &completion_ids { + let len = self.dictionary.extract(id, &mut completion); + if len > 0 { + let scored_range = self.string_pool.get(id as usize); + let string = String::from_utf8_lossy(&completion).into_owned(); + results.push((string, scored_range.score)); + } + } + + // Sort by score (descending) + results.sort_by(|a, b| b.1.cmp(&a.1)); + results + } + + /// Get the number of terms + pub fn num_terms(&self) -> usize { + self.num_terms + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.string_pool.bytes() + + self.trie.num_nodes() * std::mem::size_of::() + + self.index.num_blocks() * std::mem::size_of::() + + self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + + self.id_to_term.capacity() * std::mem::size_of::() + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/dictionary.rs b/autocomplete-rs/src/dictionary.rs new file mode 100644 index 0000000..99a37b2 --- /dev/null +++ b/autocomplete-rs/src/dictionary.rs @@ -0,0 +1,199 @@ +use std::collections::HashMap; +use crate::types::{ByteRange, IdType, global}; + +/// Front-coded dictionary for string compression +pub 
struct FCDictionary { + data: Vec, + offsets: Vec, + num_strings: usize, + total_size: usize, +} + +impl FCDictionary { + /// Create a new front-coded dictionary + pub fn new() -> Self { + Self { + data: Vec::new(), + offsets: Vec::new(), + num_strings: 0, + total_size: 0, + } + } + + /// Build the dictionary from a list of strings + pub fn build(&mut self, strings: &[String]) { + if strings.is_empty() { + return; + } + + self.num_strings = strings.len(); + self.offsets.clear(); + self.data.clear(); + self.total_size = 0; + + // Sort strings for better compression + let mut sorted_strings: Vec<_> = strings.iter().collect(); + sorted_strings.sort(); + + // First string is stored completely + let first = sorted_strings[0]; + self.offsets.push(0); + self.data.extend_from_slice(first.as_bytes()); + self.total_size += first.len(); + + // Process remaining strings + for i in 1..sorted_strings.len() { + let prev = sorted_strings[i - 1]; + let curr = sorted_strings[i]; + + // Find common prefix + let lcp = self.longest_common_prefix(prev, curr); + + // Store offset and remaining string + self.offsets.push(self.total_size as u32); + self.data.push(lcp as u8); + self.data.extend_from_slice(&curr.as_bytes()[lcp..]); + self.total_size += 1 + curr.len() - lcp; + } + } + + /// Find the longest common prefix between two strings + fn longest_common_prefix(&self, a: &str, b: &str) -> usize { + a.bytes() + .zip(b.bytes()) + .take_while(|(x, y)| x == y) + .count() + } + + /// Look up a string in the dictionary + pub fn lookup(&self, id: IdType) -> Option { + if id == 0 || id > self.num_strings as IdType { + return None; + } + + let id = (id - 1) as usize; + let offset = self.offsets[id] as usize; + + if id == 0 { + // First string is stored completely + let end = if id + 1 < self.offsets.len() { + self.offsets[id + 1] as usize + } else { + self.data.len() + }; + Some(String::from_utf8_lossy(&self.data[offset..end]).into_owned()) + } else { + // Other strings are front-coded + let lcp 
= self.data[offset] as usize; + let prev = self.lookup(id as IdType - 1)?; + let mut result = prev[..lcp].to_string(); + let end = if id + 1 < self.offsets.len() { + self.offsets[id + 1] as usize + } else { + self.data.len() + }; + result.push_str(std::str::from_utf8(&self.data[offset + 1..end]).unwrap()); + Some(result) + } + } + + /// Get the number of strings in the dictionary + pub fn size(&self) -> usize { + self.num_strings + } + + /// Get the total size of the compressed data + pub fn total_size(&self) -> usize { + self.total_size + } + + /// Get the size of the dictionary in bytes + pub fn bytes(&self) -> usize { + std::mem::size_of_val(&self.num_strings) + + std::mem::size_of_val(&self.total_size) + + self.offsets.len() * std::mem::size_of::() + + self.data.len() + } +} + +/// Integer-based front-coded dictionary +pub struct IntegerFCDictionary { + headers: Vec, + buckets: Vec, + size: usize, +} + +impl IntegerFCDictionary { + /// Create a new integer-based front-coded dictionary + pub fn new() -> Self { + Self { + headers: Vec::new(), + buckets: Vec::new(), + size: 0, + } + } + + /// Build the dictionary from a list of strings + pub fn build(&mut self, strings: &[String]) { + if strings.is_empty() { + return; + } + + self.size = strings.len(); + self.headers.clear(); + self.buckets.clear(); + + // Sort strings for better compression + let mut sorted_strings: Vec<_> = strings.iter().collect(); + sorted_strings.sort(); + + // Process strings + for i in 0..sorted_strings.len() { + let curr = sorted_strings[i]; + let lcp = if i > 0 { + self.longest_common_prefix(sorted_strings[i - 1], curr) + } else { + 0 + }; + + // Store header + self.headers.extend_from_slice(curr.as_bytes()); + + // Store bucket + self.buckets.push(lcp as u8); + self.buckets.push((curr.len() - lcp) as u8); + self.buckets.extend_from_slice(&curr.as_bytes()[lcp..]); + } + } + + /// Find the longest common prefix between two strings + fn longest_common_prefix(&self, a: &str, b: &str) -> 
usize { + a.bytes() + .zip(b.bytes()) + .take_while(|(x, y)| x == y) + .count() + } + + /// Extract a string from the dictionary + pub fn extract(&self, id: IdType, completion: &mut Vec) -> u8 { + if id == 0 || id > self.size as IdType { + return 0; + } + + let id = (id - 1) as usize; + let bucket_start = id * 2; + let lcp = self.buckets[bucket_start] as usize; + let remaining = self.buckets[bucket_start + 1] as usize; + + completion.clear(); + completion.extend_from_slice(&self.headers[id..id + lcp]); + completion.extend_from_slice(&self.buckets[bucket_start + 2..bucket_start + 2 + remaining]); + + (lcp + remaining) as u8 + } + + /// Get the number of strings in the dictionary + pub fn size(&self) -> usize { + self.size + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs new file mode 100644 index 0000000..47d176e --- /dev/null +++ b/autocomplete-rs/src/index.rs @@ -0,0 +1,204 @@ +use std::collections::HashMap; +use crate::types::{IdType, global}; + +/// Block in the inverted index +struct Block { + term_id: IdType, + num_docs: usize, + docs: Vec, +} + +impl Block { + /// Create a new block + fn new(term_id: IdType) -> Self { + Self { + term_id, + num_docs: 0, + docs: Vec::new(), + } + } + + /// Add a document to the block + fn add_doc(&mut self, doc_id: IdType) { + self.docs.push(doc_id); + self.num_docs += 1; + } + + /// Get the number of documents in the block + fn size(&self) -> usize { + self.num_docs + } +} + +/// Blocked inverted index for efficient document retrieval +pub struct BlockedInvertedIndex { + blocks: Vec, + term_to_block: HashMap, + block_size: usize, +} + +impl BlockedInvertedIndex { + /// Create a new blocked inverted index + pub fn new(block_size: usize) -> Self { + Self { + blocks: Vec::new(), + term_to_block: HashMap::new(), + block_size, + } + } + + /// Add a document to the index + pub fn add_doc(&mut self, term_id: IdType, doc_id: IdType) { + let block_idx = 
self.term_to_block.entry(term_id).or_insert_with(|| { + self.blocks.push(Block::new(term_id)); + self.blocks.len() - 1 + }); + + let block = &mut self.blocks[*block_idx]; + block.add_doc(doc_id); + + // If block is full, create a new one + if block.size() >= self.block_size { + self.blocks.push(Block::new(term_id)); + *block_idx = self.blocks.len() - 1; + } + } + + /// Get documents for a term + pub fn get_docs(&self, term_id: IdType) -> Vec { + let mut docs = Vec::new(); + + // Find all blocks for the term + let mut current_idx = self.term_to_block.get(&term_id).copied(); + while let Some(idx) = current_idx { + let block = &self.blocks[idx]; + docs.extend_from_slice(&block.docs); + + // Check if there's a next block for the same term + current_idx = if idx + 1 < self.blocks.len() && self.blocks[idx + 1].term_id == term_id { + Some(idx + 1) + } else { + None + }; + } + + docs + } + + /// Get the number of blocks + pub fn num_blocks(&self) -> usize { + self.blocks.len() + } + + /// Get the total number of documents + pub fn num_docs(&self) -> usize { + self.blocks.iter().map(|b| b.size()).sum() + } + + /// Clear the index + pub fn clear(&mut self) { + self.blocks.clear(); + self.term_to_block.clear(); + } +} + +/// Compact vector for efficient storage +pub struct CompactVector { + data: Vec, + element_size: usize, + num_elements: usize, +} + +impl CompactVector { + /// Create a new compact vector + pub fn new(element_size: usize) -> Self { + Self { + data: Vec::new(), + element_size, + num_elements: 0, + } + } + + /// Add an element to the vector + pub fn push(&mut self, element: &[u8]) { + assert_eq!(element.len(), self.element_size); + self.data.extend_from_slice(element); + self.num_elements += 1; + } + + /// Get an element from the vector + pub fn get(&self, index: usize) -> Option<&[u8]> { + if index >= self.num_elements { + return None; + } + let start = index * self.element_size; + let end = start + self.element_size; + Some(&self.data[start..end]) + } + + 
/// Get the number of elements + pub fn size(&self) -> usize { + self.num_elements + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.data.len() + } +} + +/// Bit vector for efficient bit-level operations +pub struct BitVector { + data: Vec, + num_bits: usize, +} + +impl BitVector { + /// Create a new bit vector + pub fn new(num_bits: usize) -> Self { + let num_bytes = (num_bits + 7) / 8; + Self { + data: vec![0; num_bytes], + num_bits, + } + } + + /// Set a bit + pub fn set(&mut self, index: usize) { + if index < self.num_bits { + let byte_idx = index / 8; + let bit_idx = index % 8; + self.data[byte_idx] |= 1 << bit_idx; + } + } + + /// Clear a bit + pub fn clear(&mut self, index: usize) { + if index < self.num_bits { + let byte_idx = index / 8; + let bit_idx = index % 8; + self.data[byte_idx] &= !(1 << bit_idx); + } + } + + /// Test a bit + pub fn test(&self, index: usize) -> bool { + if index < self.num_bits { + let byte_idx = index / 8; + let bit_idx = index % 8; + (self.data[byte_idx] & (1 << bit_idx)) != 0 + } else { + false + } + } + + /// Get the number of bits + pub fn size(&self) -> usize { + self.num_bits + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.data.len() + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index c5c3755..4004de9 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -1,7 +1,19 @@ pub mod constants; pub mod parameters; pub mod probe; +pub mod types; +pub mod string_pool; +pub mod trie; +pub mod dictionary; +pub mod index; +pub mod autocomplete; pub use constants::*; pub use parameters::*; -pub use probe::*; \ No newline at end of file +pub use probe::*; +pub use types::*; +pub use string_pool::*; +pub use trie::*; +pub use dictionary::*; +pub use index::*; +pub use autocomplete::*; \ No newline at end of file diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs new file mode 100644 index 
0000000..6ddacbd --- /dev/null +++ b/autocomplete-rs/src/server.rs @@ -0,0 +1,89 @@ +use tonic::{transport::Server, Request, Response, Status}; +use crate::autocomplete::{Autocomplete, Autocomplete2}; + +pub mod autocomplete_proto { + tonic::include_proto!("autocomplete"); +} + +use autocomplete_proto::{ + autocomplete_service_server::{AutocompleteService, AutocompleteServiceServer}, + CompleteRequest, CompleteResponse, Completion, + InitRequest, InitResponse, + StatsRequest, StatsResponse, +}; + +pub struct AutocompleteServiceImpl { + autocomplete: Autocomplete, +} + +#[tonic::async_trait] +impl AutocompleteService for AutocompleteServiceImpl { + async fn complete( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let completions = self.autocomplete.complete(&req.prefix); + + let response = CompleteResponse { + completions: completions.into_iter() + .map(|(text, score)| Completion { + text, + score, + }) + .collect(), + }; + + Ok(Response::new(response)) + } + + async fn init( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let strings: Vec<(String, f32)> = req.strings + .into_iter() + .map(|s| (s.text, s.score)) + .collect(); + + match self.autocomplete.init(&strings) { + Ok(_) => Ok(Response::new(InitResponse { + success: true, + error: String::new(), + })), + Err(e) => Ok(Response::new(InitResponse { + success: false, + error: e.to_string(), + })), + } + } + + async fn get_stats( + &self, + _request: Request, + ) -> Result, Status> { + let response = StatsResponse { + num_terms: self.autocomplete.num_terms() as i32, + memory_bytes: self.autocomplete.bytes() as i64, + }; + + Ok(Response::new(response)) + } +} + +pub async fn run_server(addr: &str) -> Result<(), Box> { + let addr = addr.parse()?; + let service = AutocompleteServiceImpl { + autocomplete: Autocomplete::new(), + }; + + println!("Autocomplete server listening on {}", addr); + + Server::builder() + 
.add_service(AutocompleteServiceServer::new(service)) + .serve(addr) + .await?; + + Ok(()) +} \ No newline at end of file diff --git a/autocomplete-rs/src/string_pool.rs b/autocomplete-rs/src/string_pool.rs new file mode 100644 index 0000000..0dc8ea5 --- /dev/null +++ b/autocomplete-rs/src/string_pool.rs @@ -0,0 +1,151 @@ +use crate::types::{ByteRange, IdType}; + +/// Represents a scored byte range +#[derive(Debug, Clone)] +pub struct ScoredByteRange { + pub string: ByteRange, + pub score: IdType, +} + +/// Manages a pool of scored strings +pub struct ScoredStringPool { + scores: Vec, + offsets: Vec, + data: Vec, +} + +impl ScoredStringPool { + /// Create a new empty string pool + pub fn new() -> Self { + let mut pool = Self { + scores: Vec::new(), + offsets: Vec::new(), + data: Vec::new(), + }; + pool.init(); + pool + } + + /// Initialize the pool + pub fn init(&mut self) { + self.push_back_offset(0); + } + + /// Resize the pool + pub fn resize(&mut self, num_bytes: usize, k: u32) { + self.scores.resize(k as usize, 0); + self.data.resize(num_bytes, 0); + } + + /// Clear the pool + pub fn clear(&mut self) { + self.offsets.clear(); + } + + /// Get the number of strings in the pool + pub fn size(&self) -> usize { + assert!(!self.offsets.is_empty()); + self.offsets.len() - 1 + } + + /// Get the total number of bytes used + pub fn bytes(&self) -> usize { + self.offsets.last().copied().unwrap_or(0) + } + + /// Get a mutable reference to the data + pub fn data_mut(&mut self) -> &mut [u8] { + &mut self.data + } + + /// Add a new offset + pub fn push_back_offset(&mut self, offset: usize) { + self.offsets.push(offset); + } + + /// Get a mutable reference to the scores + pub fn scores_mut(&mut self) -> &mut [IdType] { + &mut self.scores + } + + /// Get a reference to the scores + pub fn scores(&self) -> &[IdType] { + &self.scores + } + + /// Get a scored byte range at the given index + pub fn get(&self, i: usize) -> ScoredByteRange { + assert!(i < self.size()); + 
ScoredByteRange { + string: ByteRange { + begin: unsafe { self.data.as_ptr().add(self.offsets[i]) }, + end: unsafe { self.data.as_ptr().add(self.offsets[i + 1]) }, + }, + score: self.scores[i], + } + } + + /// Set the offsets vector + pub fn set_offsets(&mut self, offsets: Vec) { + self.offsets = offsets; + } + + /// Set the scores vector + pub fn set_scores(&mut self, scores: Vec) { + self.scores = scores; + } + + /// Set the data vector + pub fn set_data(&mut self, data: Vec) { + self.data = data; + } +} + +/// Iterator over scored strings in the pool +pub struct ScoredStringPoolIterator<'a> { + pool: &'a ScoredStringPool, + pos: usize, +} + +impl<'a> ScoredStringPoolIterator<'a> { + /// Create a new iterator + pub fn new(pool: &'a ScoredStringPool, pos: usize) -> Self { + Self { pool, pos } + } + + /// Check if the iterator is empty + pub fn empty(&self) -> bool { + self.size() == 0 + } + + /// Get the number of strings + pub fn size(&self) -> usize { + self.pool.size() + } + + /// Get the pool + pub fn pool(&self) -> &ScoredStringPool { + self.pool + } +} + +impl<'a> Iterator for ScoredStringPoolIterator<'a> { + type Item = ScoredByteRange; + + fn next(&mut self) -> Option { + if self.pos < self.pool.size() { + let item = self.pool.get(self.pos); + self.pos += 1; + Some(item) + } else { + None + } + } +} + +impl ScoredStringPool { + /// Get an iterator over the scored strings + pub fn iter(&self) -> ScoredStringPoolIterator { + ScoredStringPoolIterator::new(self, 0) + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/trie.rs b/autocomplete-rs/src/trie.rs new file mode 100644 index 0000000..1b24c73 --- /dev/null +++ b/autocomplete-rs/src/trie.rs @@ -0,0 +1,182 @@ +use std::collections::HashMap; +use crate::types::{IdType, CompletionType}; + +/// A node in the completion trie +pub struct TrieNode { + children: HashMap, + is_terminal: bool, + completion_ids: Vec, +} + +impl TrieNode { + /// Create a new trie node + pub fn new() -> Self { + Self { + 
children: HashMap::new(), + is_terminal: false, + completion_ids: Vec::new(), + } + } + + /// Add a child node + pub fn add_child(&mut self, c: char) -> &mut TrieNode { + self.children.entry(c).or_insert_with(TrieNode::new) + } + + /// Get a child node + pub fn get_child(&self, c: char) -> Option<&TrieNode> { + self.children.get(&c) + } + + /// Check if this is a terminal node + pub fn is_terminal(&self) -> bool { + self.is_terminal + } + + /// Set this node as terminal + pub fn set_terminal(&mut self) { + self.is_terminal = true; + } + + /// Add a completion ID + pub fn add_completion_id(&mut self, id: IdType) { + self.completion_ids.push(id); + } + + /// Get completion IDs + pub fn completion_ids(&self) -> &[IdType] { + &self.completion_ids + } +} + +/// A trie for prefix-based completion +pub struct CompletionTrie { + root: TrieNode, + num_nodes: usize, + num_completions: usize, +} + +impl CompletionTrie { + /// Create a new completion trie + pub fn new() -> Self { + Self { + root: TrieNode::new(), + num_nodes: 1, + num_completions: 0, + } + } + + /// Insert a completion string + pub fn insert(&mut self, completion: &str, id: IdType) { + let mut node = &mut self.root; + for c in completion.chars() { + node = node.add_child(c); + self.num_nodes += 1; + } + node.set_terminal(); + node.add_completion_id(id); + self.num_completions += 1; + } + + /// Find all completions for a prefix + pub fn complete(&self, prefix: &str) -> Vec { + let mut node = &self.root; + for c in prefix.chars() { + match node.get_child(c) { + Some(next) => node = next, + None => return Vec::new(), + } + } + self.collect_completions(node) + } + + /// Collect all completion IDs from a node and its children + fn collect_completions(&self, node: &TrieNode) -> Vec { + let mut completions = Vec::new(); + self.collect_completions_recursive(node, &mut completions); + completions + } + + /// Recursive helper for collecting completions + fn collect_completions_recursive(&self, node: &TrieNode, 
completions: &mut Vec) { + if node.is_terminal() { + completions.extend_from_slice(node.completion_ids()); + } + for child in node.children.values() { + self.collect_completions_recursive(child, completions); + } + } + + /// Remove a completion string + pub fn remove(&mut self, completion: &str) -> bool { + let mut chars: Vec = completion.chars().collect(); + if chars.is_empty() { + return false; + } + + // First, find if the completion exists and build the path + let mut path = Vec::new(); + let mut current = &self.root; + + for &c in &chars { + match current.get_child(c) { + Some(next) => { + path.push(c); + current = next; + } + None => return false, + } + } + + if !current.is_terminal() { + return false; + } + + // Now remove it by traversing the path again + let mut current = &mut self.root; + let mut parent = None; + + for &c in &path { + if let Some(next) = current.children.get_mut(&c) { + parent = Some((c, current)); + current = next; + } + } + + // Remove the completion + current.completion_ids.clear(); + current.is_terminal = false; + self.num_completions -= 1; + + // Clean up empty nodes + while let Some((c, p)) = parent { + if current.children.is_empty() && !current.is_terminal() { + p.children.remove(&c); + self.num_nodes -= 1; + current = p; + parent = None; + } else { + break; + } + } + + true + } + + /// Clear the trie + pub fn clear(&mut self) { + self.root = TrieNode::new(); + self.num_nodes = 1; + self.num_completions = 0; + } + + /// Get the number of nodes + pub fn num_nodes(&self) -> usize { + self.num_nodes + } + + /// Get the number of completions + pub fn num_completions(&self) -> usize { + self.num_completions + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/types.rs b/autocomplete-rs/src/types.rs new file mode 100644 index 0000000..5490d59 --- /dev/null +++ b/autocomplete-rs/src/types.rs @@ -0,0 +1,92 @@ +use std::ops::Range; + +/// Type alias for document and term IDs +pub type IdType = u32; + +/// Type alias for 
completion type (vector of term IDs) +pub type CompletionType = Vec; + +/// Represents a range of values +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ValueRange { + pub begin: u64, + pub end: u64, +} + +impl ValueRange { + /// Check if the range is invalid + pub fn is_invalid(&self) -> bool { + self.begin == u64::MAX || self.end == u64::MAX || self.begin > self.end + } + + /// Check if the range is valid + pub fn is_valid(&self) -> bool { + !self.is_invalid() + } + + /// Check if a value is contained in the range + pub fn contains(&self, val: u64) -> bool { + val >= self.begin && val <= self.end + } +} + +/// Represents a scored range +#[derive(Debug, Clone)] +pub struct ScoredRange { + pub range: ValueRange, + pub min_pos: u32, + pub min_val: IdType, +} + +impl ScoredRange { + /// Compare two scored ranges + pub fn greater(l: &ScoredRange, r: &ScoredRange) -> bool { + l.min_val > r.min_val + } +} + +/// Represents a byte range +#[derive(Debug, Clone, Copy)] +pub struct ByteRange { + pub begin: *const u8, + pub end: *const u8, +} + +/// Represents a range of 32-bit integers +#[derive(Debug, Clone, Copy)] +pub struct Uint32Range { + pub begin: *const u32, + pub end: *const u32, +} + +/// Global constants +pub mod global { + use super::IdType; + + /// Invalid term ID + pub const INVALID_TERM_ID: IdType = IdType::MAX; + + /// Terminator value + pub const TERMINATOR: IdType = 0; + + /// Not found value + pub const NOT_FOUND: u64 = u64::MAX; + + /// Linear scan threshold + pub const LINEAR_SCAN_THRESHOLD: u64 = 8; +} + +/// Convert a string to a byte range +pub fn string_to_byte_range(s: &str) -> ByteRange { + let begin = s.as_ptr(); + let end = unsafe { begin.add(s.len()) }; + ByteRange { begin, end } +} + +/// Convert a completion to a uint32 range +pub fn completion_to_uint32_range(c: &CompletionType) -> Uint32Range { + Uint32Range { + begin: c.as_ptr(), + end: unsafe { c.as_ptr().add(c.len()) }, + } +} \ No newline at end of file From 
97293449a961c4c42ac6e5ffa1b20afce99d3903 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 14:34:04 -0400 Subject: [PATCH 097/102] define entry points --- autocomplete-rs/Cargo.lock | 1194 ++++++++++++++++++++++++- autocomplete-rs/Cargo.toml | 4 + autocomplete-rs/schema/schema.graphql | 41 + autocomplete-rs/src/graphql.rs | 90 ++ autocomplete-rs/src/main.rs | 39 +- autocomplete-rs/src/server.rs | 59 +- 6 files changed, 1386 insertions(+), 41 deletions(-) create mode 100644 autocomplete-rs/schema/schema.graphql create mode 100644 autocomplete-rs/src/graphql.rs diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index d1b8fd2..486a6ac 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -2,6 +2,16 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "Inflector" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "addr2line" version = "0.24.2" @@ -26,12 +36,160 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" 
+version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.59.0", +] + [[package]] name = "anyhow" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +[[package]] +name = "ascii_utils" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71938f30533e4d95a6d17aa530939da3842c2ab6f4f84b9dae68447e4129f74a" + +[[package]] +name = "async-graphql" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298a5d587d6e6fdb271bf56af2dc325a80eb291fd0fc979146584b9a05494a8c" +dependencies = [ + "async-graphql-derive", + "async-graphql-parser", + "async-graphql-value", + "async-stream", + "async-trait", + "base64 0.13.1", + "bytes", + "fast_chemail", + "fnv", + "futures-util", + "handlebars", + "http 0.2.12", + "indexmap 2.9.0", + "mime", + "multer", + "num-traits", + "once_cell", + "pin-project-lite", + "regex", + "serde", + "serde_json", + "serde_urlencoded", + "static_assertions", + "tempfile", + "thiserror 1.0.69", +] + +[[package]] +name = "async-graphql-axum" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a1c20a2059bffbc95130715b23435a05168c518fba9709c81fa2a38eed990c" +dependencies = [ + "async-graphql", + "async-trait", + "axum 0.6.20", + "bytes", + "futures-util", + "serde_json", + "tokio", + "tokio-stream", + "tokio-util", + "tower-service", +] + +[[package]] +name = "async-graphql-derive" +version = 
"6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f329c7eb9b646a72f70c9c4b516c70867d356ec46cb00dcac8ad343fd006b0" +dependencies = [ + "Inflector", + "async-graphql-parser", + "darling", + "proc-macro-crate", + "proc-macro2", + "quote", + "strum", + "syn", + "thiserror 1.0.69", +] + +[[package]] +name = "async-graphql-parser" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6139181845757fd6a73fbb8839f3d036d7150b798db0e9bb3c6e83cdd65bd53b" +dependencies = [ + "async-graphql-value", + "pest", + "serde", + "serde_json", +] + +[[package]] +name = "async-graphql-value" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "323a5143f5bdd2030f45e3f2e0c821c9b1d36e79cf382129c64299c50a7f3750" +dependencies = [ + "bytes", + "indexmap 2.9.0", + "serde", + "serde_json", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -75,6 +233,10 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "autocomplete-rs" version = "0.1.0" dependencies = [ + "async-graphql", + "async-graphql-axum", + "axum 0.7.9", + "clap", "futures", "prost", "tempfile", @@ -90,13 +252,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core", + "axum-core 0.3.4", + "base64 0.21.7", "bitflags 1.3.2", "bytes", "futures-util", - "http", - "http-body", - "hyper", + "headers", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "itoa", "matchit", "memchr", @@ -105,12 +269,52 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper", - "tower", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sha1", + "sync_wrapper 0.1.2", + "tokio", + "tokio-tungstenite", + "tower 0.4.13", "tower-layer", "tower-service", ] +[[package]] +name = "axum" +version = "0.7.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper 1.6.0", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper 1.0.2", + "tokio", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "axum-core" version = "0.3.4" @@ -120,12 +324,33 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", "mime", + "pin-project-lite", "rustversion", + "sync_wrapper 1.0.2", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -143,6 +368,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -161,11 +392,29 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 
+dependencies = [ + "generic-array", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +dependencies = [ + "serde", +] [[package]] name = "cfg-if" @@ -173,12 +422,148 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "clap" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "data-encoding" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "either" version = "1.15.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -195,6 +580,15 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fast_chemail" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "495a39d30d624c2caabe6312bfead73e7717692b44e0b32df168c275a2e8e9e4" +dependencies = [ + "ascii_utils", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -213,6 +607,15 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futures" version = "0.3.31" @@ -302,6 +705,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -342,7 +755,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.12", "indexmap 2.9.0", "slab", "tokio", @@ -350,6 +763,20 @@ dependencies = [ "tracing", ] +[[package]] +name = "handlebars" +version = "4.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faa67bab9ff362228eb3d00bd024a4965d8231bbb7921167f0cfa66c6626b225" 
+dependencies = [ + "log", + "pest", + "pest_derive", + "serde", + "serde_json", + "thiserror 1.0.69", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -362,6 +789,36 @@ version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +[[package]] +name = "headers" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06683b93020a07e3dbcf5f8c0f6d40080d725bea7936fc01ad345c01b97dc270" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http 0.2.12", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http 0.2.12", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -379,6 +836,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.6" @@ -386,7 +854,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.3.1", +] + +[[package]] +name = 
"http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.3.1", + "http-body 1.0.1", "pin-project-lite", ] @@ -413,8 +904,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -426,18 +917,166 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.32", "pin-project-lite", "tokio", "tokio-io-timeout", ] +[[package]] +name = "hyper-util" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710" +dependencies = [ + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "hyper 1.6.0", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -456,8 +1095,15 @@ checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown 0.15.3", + "serde", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.12.1" @@ -473,6 +1119,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.172" @@ -485,6 +1137,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + [[package]] name = "lock_api" version = "0.4.12" @@ -539,12 +1197,39 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "multer" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01acbdc23469fd8fe07ab135923371d5f5a422fbf9c522158677c8eb15bc51c2" 
+dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http 0.2.12", + "httparse", + "log", + "memchr", + "mime", + "spin", + "version_check", +] + [[package]] name = "multimap" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "object" version = "0.36.7" @@ -560,6 +1245,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + [[package]] name = "parking_lot" version = "0.12.3" @@ -574,20 +1265,65 @@ dependencies = [ name = "parking_lot_core" version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pest" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" +dependencies = [ + "memchr", + "thiserror 2.0.12", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version 
= "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d725d9cfd79e87dccc9341a2ef39d1b6f6353d68c4b33c177febbe1a402c97c5" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db7d01726be8ab66ab32f9df467ae8b1148906685bbe75c82d1e65d7f5b3f841" dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets", + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "percent-encoding" -version = "2.3.1" +name = "pest_meta" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "7f9f832470494906d1fca5329f8ab5791cc60beb230c74815dff541cbd2b5ca0" +dependencies = [ + "once_cell", + "pest", + "sha2", +] [[package]] name = "petgraph" @@ -631,6 +1367,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -650,6 +1395,16 @@ dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" +dependencies = [ + "once_cell", + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.95" @@ -676,7 +1431,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - 
"heck", + "heck 0.5.0", "itertools", "log", "multimap", @@ -820,6 +1575,12 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + [[package]] name = "scopeguard" version = "1.2.0" @@ -846,6 +1607,62 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" +dependencies = [ + "itoa", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "signal-hook-registry" version = "1.4.5" @@ -880,6 +1697,52 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "spin" 
+version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" version = "2.0.101" @@ -897,6 +1760,23 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + 
[[package]] name = "tempfile" version = "3.20.0" @@ -910,6 +1790,56 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl 2.0.12", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tokio" version = "1.45.0" @@ -960,6 +1890,18 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-tungstenite" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "212d5dcb2a1ce06d81107c3d0ffa3121fe974b73f068c8282cb1c32328113b6c" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite", +] + [[package]] name = "tokio-util" version = "0.7.15" @@ -968,11 +1910,29 @@ checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", 
"pin-project-lite", "tokio", ] +[[package]] +name = "toml_datetime" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" + +[[package]] +name = "toml_edit" +version = "0.19.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" +dependencies = [ + "indexmap 2.9.0", + "toml_datetime", + "winnow", +] + [[package]] name = "tonic" version = "0.10.2" @@ -981,20 +1941,20 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" dependencies = [ "async-stream", "async-trait", - "axum", - "base64", + "axum 0.6.20", + "base64 0.21.7", "bytes", "h2", - "http", - "http-body", - "hyper", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "hyper-timeout", "percent-encoding", "pin-project", "prost", "tokio", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -1033,6 +1993,22 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.2", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tower-layer" version = "0.3.3" @@ -1051,6 +2027,7 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -1082,12 +2059,78 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tungstenite" +version = "0.20.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 0.2.12", + "httparse", + "log", + "rand", + "sha1", + "thiserror 1.0.69", + "url", + "utf-8", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "want" 
version = "0.3.1" @@ -1194,6 +2237,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen-rt" version = "0.39.0" @@ -1203,6 +2255,36 @@ dependencies = [ "bitflags 2.9.1", ] +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.25" @@ -1222,3 +2304,57 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index fc1d1f9..74c25c0 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -8,6 +8,10 @@ tonic = "0.10" prost = "0.12" tokio = { version = "1.0", features = ["full"] } futures = "0.3" +async-graphql = "6.0" +async-graphql-axum = "6.0" +axum = "0.7" +clap = { version = "4.4", features = ["derive"] } [dev-dependencies] tempfile = "3.8" diff --git a/autocomplete-rs/schema/schema.graphql b/autocomplete-rs/schema/schema.graphql new file mode 100644 index 0000000..70da230 --- /dev/null +++ b/autocomplete-rs/schema/schema.graphql @@ -0,0 +1,41 @@ +type Query { + # Get completions for a prefix + complete(prefix: String!, maxResults: Int): CompleteResponse! + + # Get system statistics + stats: Stats! +} + +type Mutation { + # Initialize the autocomplete system with strings and scores + init(strings: [StringScoreInput!]!): InitResponse! +} + +# Input type for string with score +input StringScoreInput { + text: String! + score: Float! +} + +# Response type for completions +type CompleteResponse { + completions: [Completion!]! +} + +# A single completion result +type Completion { + text: String! + score: Float! 
+} + +# Response type for initialization +type InitResponse { + success: Boolean! + error: String +} + +# System statistics +type Stats { + numTerms: Int! + memoryBytes: Int! +} \ No newline at end of file diff --git a/autocomplete-rs/src/graphql.rs b/autocomplete-rs/src/graphql.rs new file mode 100644 index 0000000..197c180 --- /dev/null +++ b/autocomplete-rs/src/graphql.rs @@ -0,0 +1,90 @@ +use async_graphql::{Object, Schema, SimpleObject, InputObject}; +use crate::autocomplete::Autocomplete; + +#[derive(SimpleObject)] +struct Completion { + text: String, + score: f32, +} + +#[derive(SimpleObject)] +struct CompleteResponse { + completions: Vec, +} + +#[derive(SimpleObject)] +struct Stats { + num_terms: i32, + memory_bytes: i64, +} + +#[derive(SimpleObject)] +struct InitResponse { + success: bool, + error: Option, +} + +#[derive(InputObject)] +struct StringScoreInput { + text: String, + score: f32, +} + +pub struct QueryRoot { + autocomplete: Autocomplete, +} + +#[Object] +impl QueryRoot { + async fn complete(&self, prefix: String, max_results: Option) -> CompleteResponse { + let completions = self.autocomplete.complete(&prefix); + let completions = completions.into_iter() + .map(|(text, score)| Completion { text, score }) + .collect(); + + CompleteResponse { completions } + } + + async fn stats(&self) -> Stats { + Stats { + num_terms: self.autocomplete.num_terms() as i32, + memory_bytes: self.autocomplete.bytes() as i64, + } + } +} + +pub struct MutationRoot { + autocomplete: Autocomplete, +} + +#[Object] +impl MutationRoot { + async fn init(&self, strings: Vec) -> InitResponse { + let strings: Vec<(String, f32)> = strings + .into_iter() + .map(|s| (s.text, s.score)) + .collect(); + + match self.autocomplete.init(&strings) { + Ok(_) => InitResponse { + success: true, + error: None, + }, + Err(e) => InitResponse { + success: false, + error: Some(e.to_string()), + }, + } + } +} + +pub type AppSchema = Schema; + +pub fn create_schema(autocomplete: Autocomplete) -> 
AppSchema { + Schema::build( + QueryRoot { autocomplete: autocomplete.clone() }, + MutationRoot { autocomplete }, + async_graphql::EmptySubscription, + ) + .finish() +} \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index e7a11a9..ee606d7 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -1,3 +1,38 @@ -fn main() { - println!("Hello, world!"); +use std::error::Error; +use clap::Parser; + +mod autocomplete; +mod graphql; +mod server; +mod string_pool; +mod trie; +mod types; +mod utils; + +/// Autocomplete service with gRPC and GraphQL support +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// gRPC server address + #[arg(short, long, default_value = "[::1]:50051")] + grpc_addr: String, + + /// GraphQL server address + #[arg(short, long, default_value = "[::1]:8000")] + graphql_addr: String, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + + println!("Starting Autocomplete Service..."); + println!("gRPC server will listen on: {}", args.grpc_addr); + println!("GraphQL server will listen on: {}", args.graphql_addr); + println!("GraphQL Playground available at: http://{}/playground", args.graphql_addr); + + // Start both servers + server::run_server(&args.grpc_addr, &args.graphql_addr).await?; + + Ok(()) } diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs index 6ddacbd..733afef 100644 --- a/autocomplete-rs/src/server.rs +++ b/autocomplete-rs/src/server.rs @@ -1,5 +1,15 @@ -use tonic::{transport::Server, Request, Response, Status}; +use std::net::SocketAddr; +use tonic::{transport::Server as TonicServer, Request, Response, Status}; +use axum::{ + routing::{get, post}, + Router, + extract::State, + response::IntoResponse, + Json, +}; +use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; use crate::autocomplete::{Autocomplete, Autocomplete2}; +use 
crate::graphql::{create_schema, AppSchema}; pub mod autocomplete_proto { tonic::include_proto!("autocomplete"); @@ -72,18 +82,47 @@ impl AutocompleteService for AutocompleteServiceImpl { } } -pub async fn run_server(addr: &str) -> Result<(), Box> { - let addr = addr.parse()?; - let service = AutocompleteServiceImpl { - autocomplete: Autocomplete::new(), +async fn graphql_handler( + State(schema): State, + req: GraphQLRequest, +) -> GraphQLResponse { + schema.execute(req.into_inner()).await.into() +} + +async fn graphql_playground() -> impl IntoResponse { + async_graphql::http::playground_source( + async_graphql::http::GraphQLPlaygroundConfig::new("/graphql") + ) +} + +pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box> { + let autocomplete = Autocomplete::new(); + let schema = create_schema(autocomplete.clone()); + + // Create gRPC service + let grpc_service = AutocompleteServiceImpl { + autocomplete: autocomplete.clone(), }; - println!("Autocomplete server listening on {}", addr); + // Create GraphQL router + let app = Router::new() + .route("/graphql", post(graphql_handler)) + .route("/playground", get(graphql_playground)) + .with_state(schema); + + // Start both servers + let grpc_addr = grpc_addr.parse()?; + let graphql_addr = graphql_addr.parse()?; + + println!("gRPC server listening on {}", grpc_addr); + println!("GraphQL server listening on {}", graphql_addr); - Server::builder() - .add_service(AutocompleteServiceServer::new(service)) - .serve(addr) - .await?; + tokio::join!( + TonicServer::builder() + .add_service(AutocompleteServiceServer::new(grpc_service)) + .serve(grpc_addr), + axum::Server::bind(&graphql_addr).serve(app.into_make_service()) + ); Ok(()) } \ No newline at end of file From 4d3cb52425949714ca31900c71dd2293e8de1277 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 14:52:34 -0400 Subject: [PATCH 098/102] partial fix --- autocomplete-rs/Cargo.lock | 199 ++++++------------------ 
autocomplete-rs/Cargo.toml | 7 +- autocomplete-rs/LICENSE | 21 +++ autocomplete-rs/README.md | 122 ++++++++++++--- autocomplete-rs/src/autocomplete.rs | 225 +++------------------------ autocomplete-rs/src/dictionary.rs | 205 ++++-------------------- autocomplete-rs/src/graphql.rs | 27 ++-- autocomplete-rs/src/index.rs | 56 ++----- autocomplete-rs/src/lib.rs | 2 + autocomplete-rs/src/main.rs | 1 - autocomplete-rs/src/server.rs | 25 +-- autocomplete-rs/src/string_pool.rs | 50 +++--- autocomplete-rs/src/trie.rs | 233 ++++++++++++---------------- autocomplete-rs/src/types.rs | 26 +++- 14 files changed, 411 insertions(+), 788 deletions(-) create mode 100644 autocomplete-rs/LICENSE diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index 486a6ac..bd45602 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -115,7 +115,7 @@ dependencies = [ "fnv", "futures-util", "handlebars", - "http 0.2.12", + "http", "indexmap 2.9.0", "mime", "multer", @@ -139,7 +139,7 @@ checksum = "01a1c20a2059bffbc95130715b23435a05168c518fba9709c81fa2a38eed990c" dependencies = [ "async-graphql", "async-trait", - "axum 0.6.20", + "axum", "bytes", "futures-util", "serde_json", @@ -235,14 +235,17 @@ version = "0.1.0" dependencies = [ "async-graphql", "async-graphql-axum", - "axum 0.7.9", + "axum", "clap", "futures", + "hyper", "prost", "tempfile", "tokio", "tonic", "tonic-build", + "tower", + "tower-http", ] [[package]] @@ -252,15 +255,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core 0.3.4", + "axum-core", + "axum-macros", "base64 0.21.7", "bitflags 1.3.2", "bytes", "futures-util", "headers", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.32", + "http", + "http-body", + "hyper", "itoa", "matchit", "memchr", @@ -273,48 +277,14 @@ dependencies = [ "serde_path_to_error", "serde_urlencoded", "sha1", - "sync_wrapper 
0.1.2", + "sync_wrapper", "tokio", "tokio-tungstenite", - "tower 0.4.13", + "tower", "tower-layer", "tower-service", ] -[[package]] -name = "axum" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" -dependencies = [ - "async-trait", - "axum-core 0.4.5", - "bytes", - "futures-util", - "http 1.3.1", - "http-body 1.0.1", - "http-body-util", - "hyper 1.6.0", - "hyper-util", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sync_wrapper 1.0.2", - "tokio", - "tower 0.5.2", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "axum-core" version = "0.3.4" @@ -324,8 +294,8 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 0.2.12", - "http-body 0.4.6", + "http", + "http-body", "mime", "rustversion", "tower-layer", @@ -333,24 +303,15 @@ dependencies = [ ] [[package]] -name = "axum-core" -version = "0.4.5" +name = "axum-macros" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "cdca6a10ecad987bda04e95606ef85a5417dcaac1a78455242d72e031e2b6b62" dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http 1.3.1", - "http-body 1.0.1", - "http-body-util", - "mime", - "pin-project-lite", - "rustversion", - "sync_wrapper 1.0.2", - "tower-layer", - "tower-service", - "tracing", + "heck 0.4.1", + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -755,7 +716,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http 0.2.12", + "http", "indexmap 2.9.0", "slab", "tokio", @@ -798,7 +759,7 @@ dependencies = [ "base64 0.21.7", "bytes", "headers-core", - "http 0.2.12", + "http", "httpdate", "mime", "sha1", @@ -810,7 +771,7 @@ version = "0.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" dependencies = [ - "http 0.2.12", + "http", ] [[package]] @@ -836,17 +797,6 @@ dependencies = [ "itoa", ] -[[package]] -name = "http" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - [[package]] name = "http-body" version = "0.4.6" @@ -854,32 +804,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http 0.2.12", + "http", "pin-project-lite", ] [[package]] -name = "http-body" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" -dependencies = [ - "bytes", - "http 1.3.1", -] - -[[package]] -name = "http-body-util" -version = "0.1.3" +name = "http-range-header" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" -dependencies = [ - "bytes", - "futures-core", - "http 1.3.1", - "http-body 1.0.1", - "pin-project-lite", -] +checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" [[package]] name = "httparse" @@ -904,8 +837,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 0.2.12", - "http-body 0.4.6", + "http", + "http-body", "httparse", "httpdate", "itoa", @@ -917,53 +850,18 @@ dependencies = [ "want", ] -[[package]] -name = "hyper" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http 1.3.1", - "http-body 
1.0.1", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "smallvec", - "tokio", -] - [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper 0.14.32", + "hyper", "pin-project-lite", "tokio", "tokio-io-timeout", ] -[[package]] -name = "hyper-util" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710" -dependencies = [ - "bytes", - "futures-util", - "http 1.3.1", - "http-body 1.0.1", - "hyper 1.6.0", - "pin-project-lite", - "tokio", - "tower-service", -] - [[package]] name = "icu_collections" version = "2.0.0" @@ -1206,7 +1104,7 @@ dependencies = [ "bytes", "encoding_rs", "futures-util", - "http 0.2.12", + "http", "httparse", "log", "memchr", @@ -1760,12 +1658,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" -[[package]] -name = "sync_wrapper" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" - [[package]] name = "synstructure" version = "0.13.2" @@ -1941,20 +1833,20 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" dependencies = [ "async-stream", "async-trait", - "axum 0.6.20", + "axum", "base64 0.21.7", "bytes", "h2", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.32", + "http", + "http-body", + "hyper", "hyper-timeout", "percent-encoding", "pin-project", "prost", "tokio", "tokio-stream", - "tower 0.4.13", + "tower", "tower-layer", "tower-service", "tracing", @@ -1994,16 +1886,19 @@ dependencies = [ ] [[package]] -name = "tower" -version = "0.5.2" +name = "tower-http" +version = "0.4.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" dependencies = [ + "bitflags 2.9.1", + "bytes", "futures-core", "futures-util", + "http", + "http-body", + "http-range-header", "pin-project-lite", - "sync_wrapper 1.0.2", - "tokio", "tower-layer", "tower-service", "tracing", @@ -2068,7 +1963,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http 0.2.12", + "http", "httparse", "log", "rand", diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index 74c25c0..b799bbe 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -4,13 +4,16 @@ version = "0.1.0" edition = "2021" [dependencies] -tonic = "0.10" +tonic = { version = "0.10", features = ["transport"] } prost = "0.12" tokio = { version = "1.0", features = ["full"] } futures = "0.3" async-graphql = "6.0" async-graphql-axum = "6.0" -axum = "0.7" +axum = { version = "0.6", features = ["macros"] } +tower = "0.4" +tower-http = { version = "0.4", features = ["trace"] } +hyper = { version = "0.14", features = ["full"] } clap = { version = "4.4", features = ["derive"] } [dev-dependencies] diff --git a/autocomplete-rs/LICENSE b/autocomplete-rs/LICENSE new file mode 100644 index 0000000..d874d0b --- /dev/null +++ b/autocomplete-rs/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Autocomplete Service Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice 
shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/autocomplete-rs/README.md b/autocomplete-rs/README.md index 801e4b2..4c11811 100644 --- a/autocomplete-rs/README.md +++ b/autocomplete-rs/README.md @@ -1,43 +1,117 @@ -# Autocomplete-rs +# Autocomplete Service -This project is a Rust port of the original C++ autocomplete system. The goal is to maintain the same functionality while leveraging Rust's safety guarantees and modern tooling. +A high-performance autocomplete service written in Rust, supporting both gRPC and GraphQL interfaces. -## Project Status +## Features + +- **Dual API Support** + - gRPC interface for high-performance RPC calls + - GraphQL interface for flexible querying + - Shared backend implementation for both APIs -Currently, we are in the process of porting the core components from C++ to Rust. The following components have been ported: +- **Core Features** + - Fast prefix-based autocomplete + - Score-based ranking of suggestions + - Memory-efficient string storage + - Concurrent request handling -- Basic constants and configuration -- Parameters management -- Performance measurement probes +- **API Endpoints** + - gRPC: `[::1]:50051` (configurable) + - GraphQL: `[::1]:8000/graphql` (configurable) + - GraphQL Playground: `[::1]:8000/playground` -## Next Steps +## Project Status -1. 
Continue porting core components: - - Scored string pool - - Completion trie - - Blocked inverted index - - Front-coded dictionary +### Completed +- ✅ Basic autocomplete implementation +- ✅ gRPC server implementation +- ✅ GraphQL server implementation +- ✅ Command-line configuration +- ✅ Shared backend between APIs -2. Port and adapt unit tests to ensure functionality matches the original implementation +### In Progress +- 🔄 Documentation +- 🔄 Testing suite +- 🔄 Performance benchmarks -3. Containerize the application using Docker for easy deployment and testing +### Planned +- ⏳ Authentication +- ⏳ Rate limiting +- ⏳ Metrics and monitoring +- ⏳ Docker support +- ⏳ Client examples in multiple languages -## Building and Testing +## Getting Started +### Prerequisites +- Rust 1.70 or later +- Cargo + +### Building ```bash -# Build the project -cargo build +cargo build --release +``` -# Run tests -cargo test +### Running +```bash +# Default configuration +cargo run + +# Custom addresses +cargo run -- --grpc-addr 127.0.0.1:50051 --graphql-addr 127.0.0.1:8000 + +# Show help +cargo run -- --help +``` -# Run with specific test -cargo test test_name -- --nocapture +## API Usage + +### gRPC +```protobuf +service AutocompleteService { + rpc Complete(CompleteRequest) returns (CompleteResponse); + rpc Init(InitRequest) returns (InitResponse); + rpc GetStats(StatsRequest) returns (StatsResponse); +} +``` + +### GraphQL +```graphql +type Query { + complete(prefix: String!, maxResults: Int): CompleteResponse! + stats: StatsResponse! +} + +type Mutation { + init(strings: [StringInput!]!): InitResponse! 
+} +``` + +## Project Structure + +``` +autocomplete-rs/ +├── src/ +│ ├── main.rs # Entry point and CLI +│ ├── autocomplete.rs # Core autocomplete logic +│ ├── graphql.rs # GraphQL schema and resolvers +│ ├── server.rs # Server implementations +│ ├── string_pool.rs # String interning +│ ├── trie.rs # Trie data structure +│ └── types.rs # Common types +├── proto/ +│ └── autocomplete.proto # gRPC service definition +└── schema/ + └── schema.graphql # GraphQL schema ``` -## Original Project +## Contributing -This is a port of the original C++ autocomplete system, which provides efficient string completion functionality. The original implementation can be found in the `archive` directory. +1. Fork the repository +2. Create your feature branch (`git checkout -b feature/amazing-feature`) +3. Commit your changes (`git commit -m 'Add amazing feature'`) +4. Push to the branch (`git push origin feature/amazing-feature`) +5. Open a Pull Request ## License diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs index 1191a75..b910078 100644 --- a/autocomplete-rs/src/autocomplete.rs +++ b/autocomplete-rs/src/autocomplete.rs @@ -1,222 +1,45 @@ -use std::collections::HashMap; -use crate::types::{IdType, ByteRange, global}; -use crate::string_pool::ScoredStringPool; -use crate::trie::CompletionTrie; -use crate::dictionary::{FCDictionary, IntegerFCDictionary}; -use crate::index::{BlockedInvertedIndex, CompactVector, BitVector}; +use crate::types::ScoreType; +use crate::trie::Trie; +use crate::dictionary::Dictionary; -const BLOCK_SIZE: usize = 1024; - -/// Main autocomplete implementation +#[derive(Clone)] pub struct Autocomplete { - string_pool: ScoredStringPool, - trie: CompletionTrie, - dictionary: FCDictionary, - index: BlockedInvertedIndex, - term_to_id: HashMap, - id_to_term: Vec, - num_terms: usize, + trie: Trie, + dictionary: Dictionary, } impl Autocomplete { - /// Create a new autocomplete instance - pub fn new() -> Self { - Self { - 
string_pool: ScoredStringPool::new(), - trie: CompletionTrie::new(), - dictionary: FCDictionary::new(), - index: BlockedInvertedIndex::new(BLOCK_SIZE), - term_to_id: HashMap::new(), - id_to_term: Vec::new(), - num_terms: 0, - } - } - - /// Initialize the autocomplete system - pub fn init(&mut self, strings: &[String], scores: &[IdType]) { - assert_eq!(strings.len(), scores.len()); - - // Build string pool - self.string_pool = ScoredStringPool::new(); - let mut offsets = Vec::with_capacity(strings.len() + 1); - let mut all_scores = Vec::with_capacity(strings.len()); - let mut total_bytes = 0; - offsets.push(0); - for (string, &score) in strings.iter().zip(scores) { - total_bytes += string.len(); - offsets.push(total_bytes); - all_scores.push(score); - } - self.string_pool.set_offsets(offsets); - self.string_pool.set_scores(all_scores); - self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); - - // Build dictionary - self.dictionary.build(strings); - - // Build term mappings - self.term_to_id.clear(); - self.id_to_term.clear(); - for (i, string) in strings.iter().enumerate() { - self.term_to_id.insert(string.clone(), (i + 1) as IdType); - self.id_to_term.push(string.clone()); - } - self.num_terms = strings.len(); - - // Build trie - self.trie.clear(); - for (i, string) in strings.iter().enumerate() { - self.trie.insert(string, (i + 1) as IdType); - } - - // Build index - self.index.clear(); - for (i, _string) in strings.iter().enumerate() { - let term_id = (i + 1) as IdType; - self.index.add_doc(term_id, term_id); - } - } - - /// Find completions for a prefix - pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { - let mut results = Vec::new(); - - // Get completion IDs from trie - let completion_ids = self.trie.complete(prefix); - - // Look up strings and scores - for &id in &completion_ids { - if let Some(string) = self.dictionary.lookup(id) { - let scored_range = self.string_pool.get(id as usize); - 
results.push((string, scored_range.score)); - } - } - - // Sort by score (descending) - results.sort_by(|a, b| b.1.cmp(&a.1)); - results - } - - /// Get the number of terms - pub fn num_terms(&self) -> usize { - self.num_terms - } - - /// Get the size in bytes - pub fn bytes(&self) -> usize { - self.string_pool.bytes() + - self.trie.num_nodes() * std::mem::size_of::() + - self.dictionary.bytes() + - self.index.num_blocks() * std::mem::size_of::() + - self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + - self.id_to_term.capacity() * std::mem::size_of::() - } -} - -/// Integer-based autocomplete implementation -pub struct Autocomplete2 { - string_pool: ScoredStringPool, - trie: CompletionTrie, - dictionary: IntegerFCDictionary, - index: BlockedInvertedIndex, - term_to_id: HashMap, - id_to_term: Vec, - num_terms: usize, -} - -impl Autocomplete2 { - /// Create a new integer-based autocomplete instance pub fn new() -> Self { Self { - string_pool: ScoredStringPool::new(), - trie: CompletionTrie::new(), - dictionary: IntegerFCDictionary::new(), - index: BlockedInvertedIndex::new(BLOCK_SIZE), - term_to_id: HashMap::new(), - id_to_term: Vec::new(), - num_terms: 0, + trie: Trie::new(), + dictionary: Dictionary::new(), } } - /// Initialize the autocomplete system - pub fn init(&mut self, strings: &[String], scores: &[IdType]) { - assert_eq!(strings.len(), scores.len()); - - // Build string pool - self.string_pool = ScoredStringPool::new(); - let mut offsets = Vec::with_capacity(strings.len() + 1); - let mut all_scores = Vec::with_capacity(strings.len()); - let mut total_bytes = 0; - offsets.push(0); - for (string, &score) in strings.iter().zip(scores) { - total_bytes += string.len(); - offsets.push(total_bytes); - all_scores.push(score); - } - self.string_pool.set_offsets(offsets); - self.string_pool.set_scores(all_scores); - self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); - - // Build dictionary - 
self.dictionary.build(strings); - - // Build term mappings - self.term_to_id.clear(); - self.id_to_term.clear(); - for (i, string) in strings.iter().enumerate() { - self.term_to_id.insert(string.clone(), (i + 1) as IdType); - self.id_to_term.push(string.clone()); - } - self.num_terms = strings.len(); - - // Build trie - self.trie.clear(); - for (i, string) in strings.iter().enumerate() { - self.trie.insert(string, (i + 1) as IdType); - } - - // Build index - self.index.clear(); - for (i, _string) in strings.iter().enumerate() { - let term_id = (i + 1) as IdType; - self.index.add_doc(term_id, term_id); + pub fn init(&mut self, strings: &[(String, ScoreType)]) -> Result<(), String> { + for (string, score) in strings { + let id = self.dictionary.insert(string.clone()); + self.trie.insert(string, id, *score); } + Ok(()) } - /// Find completions for a prefix - pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { - let mut results = Vec::new(); - let mut completion = Vec::new(); - - // Get completion IDs from trie - let completion_ids = self.trie.complete(prefix); - - // Look up strings and scores - for &id in &completion_ids { - let len = self.dictionary.extract(id, &mut completion); - if len > 0 { - let scored_range = self.string_pool.get(id as usize); - let string = String::from_utf8_lossy(&completion).into_owned(); - results.push((string, scored_range.score)); - } - } - - // Sort by score (descending) - results.sort_by(|a, b| b.1.cmp(&a.1)); - results + pub fn complete(&self, prefix: &str) -> Vec<(String, ScoreType)> { + let completions = self.trie.complete(prefix); + completions + .into_iter() + .filter_map(|(id, score)| { + self.dictionary.get(id).map(|text| (text.to_string(), score)) + }) + .collect() } - /// Get the number of terms pub fn num_terms(&self) -> usize { - self.num_terms + self.dictionary.len() } - /// Get the size in bytes pub fn bytes(&self) -> usize { - self.string_pool.bytes() + - self.trie.num_nodes() * std::mem::size_of::() + - 
self.index.num_blocks() * std::mem::size_of::() + - self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + - self.id_to_term.capacity() * std::mem::size_of::() + // TODO: Implement actual memory usage calculation + 0 } } \ No newline at end of file diff --git a/autocomplete-rs/src/dictionary.rs b/autocomplete-rs/src/dictionary.rs index 99a37b2..a09adda 100644 --- a/autocomplete-rs/src/dictionary.rs +++ b/autocomplete-rs/src/dictionary.rs @@ -1,199 +1,46 @@ -use std::collections::HashMap; -use crate::types::{ByteRange, IdType, global}; +use crate::types::IdType; -/// Front-coded dictionary for string compression -pub struct FCDictionary { - data: Vec, - offsets: Vec, - num_strings: usize, - total_size: usize, +#[derive(Clone)] +pub struct Dictionary { + strings: Vec, + id_map: std::collections::HashMap, + next_id: IdType, } -impl FCDictionary { - /// Create a new front-coded dictionary +impl Dictionary { pub fn new() -> Self { Self { - data: Vec::new(), - offsets: Vec::new(), - num_strings: 0, - total_size: 0, + strings: Vec::new(), + id_map: std::collections::HashMap::new(), + next_id: 0, } } - /// Build the dictionary from a list of strings - pub fn build(&mut self, strings: &[String]) { - if strings.is_empty() { - return; + pub fn insert(&mut self, string: String) -> IdType { + if let Some(&id) = self.id_map.get(&string) { + return id; } - self.num_strings = strings.len(); - self.offsets.clear(); - self.data.clear(); - self.total_size = 0; - - // Sort strings for better compression - let mut sorted_strings: Vec<_> = strings.iter().collect(); - sorted_strings.sort(); - - // First string is stored completely - let first = sorted_strings[0]; - self.offsets.push(0); - self.data.extend_from_slice(first.as_bytes()); - self.total_size += first.len(); - - // Process remaining strings - for i in 1..sorted_strings.len() { - let prev = sorted_strings[i - 1]; - let curr = sorted_strings[i]; - - // Find common prefix - let lcp = 
self.longest_common_prefix(prev, curr); - - // Store offset and remaining string - self.offsets.push(self.total_size as u32); - self.data.push(lcp as u8); - self.data.extend_from_slice(&curr.as_bytes()[lcp..]); - self.total_size += 1 + curr.len() - lcp; - } - } - - /// Find the longest common prefix between two strings - fn longest_common_prefix(&self, a: &str, b: &str) -> usize { - a.bytes() - .zip(b.bytes()) - .take_while(|(x, y)| x == y) - .count() - } - - /// Look up a string in the dictionary - pub fn lookup(&self, id: IdType) -> Option { - if id == 0 || id > self.num_strings as IdType { - return None; - } - - let id = (id - 1) as usize; - let offset = self.offsets[id] as usize; - - if id == 0 { - // First string is stored completely - let end = if id + 1 < self.offsets.len() { - self.offsets[id + 1] as usize - } else { - self.data.len() - }; - Some(String::from_utf8_lossy(&self.data[offset..end]).into_owned()) - } else { - // Other strings are front-coded - let lcp = self.data[offset] as usize; - let prev = self.lookup(id as IdType - 1)?; - let mut result = prev[..lcp].to_string(); - let end = if id + 1 < self.offsets.len() { - self.offsets[id + 1] as usize - } else { - self.data.len() - }; - result.push_str(std::str::from_utf8(&self.data[offset + 1..end]).unwrap()); - Some(result) - } + let id = self.next_id; + self.next_id += 1; + self.strings.push(string.clone()); + self.id_map.insert(string, id); + id } - /// Get the number of strings in the dictionary - pub fn size(&self) -> usize { - self.num_strings + pub fn get(&self, id: IdType) -> Option<&str> { + self.strings.get(id as usize).map(|s| s.as_str()) } - /// Get the total size of the compressed data - pub fn total_size(&self) -> usize { - self.total_size + pub fn get_id(&self, string: &str) -> Option { + self.id_map.get(string).copied() } - /// Get the size of the dictionary in bytes - pub fn bytes(&self) -> usize { - std::mem::size_of_val(&self.num_strings) + - std::mem::size_of_val(&self.total_size) + 
- self.offsets.len() * std::mem::size_of::() + - self.data.len() - } -} - -/// Integer-based front-coded dictionary -pub struct IntegerFCDictionary { - headers: Vec, - buckets: Vec, - size: usize, -} - -impl IntegerFCDictionary { - /// Create a new integer-based front-coded dictionary - pub fn new() -> Self { - Self { - headers: Vec::new(), - buckets: Vec::new(), - size: 0, - } - } - - /// Build the dictionary from a list of strings - pub fn build(&mut self, strings: &[String]) { - if strings.is_empty() { - return; - } - - self.size = strings.len(); - self.headers.clear(); - self.buckets.clear(); - - // Sort strings for better compression - let mut sorted_strings: Vec<_> = strings.iter().collect(); - sorted_strings.sort(); - - // Process strings - for i in 0..sorted_strings.len() { - let curr = sorted_strings[i]; - let lcp = if i > 0 { - self.longest_common_prefix(sorted_strings[i - 1], curr) - } else { - 0 - }; - - // Store header - self.headers.extend_from_slice(curr.as_bytes()); - - // Store bucket - self.buckets.push(lcp as u8); - self.buckets.push((curr.len() - lcp) as u8); - self.buckets.extend_from_slice(&curr.as_bytes()[lcp..]); - } - } - - /// Find the longest common prefix between two strings - fn longest_common_prefix(&self, a: &str, b: &str) -> usize { - a.bytes() - .zip(b.bytes()) - .take_while(|(x, y)| x == y) - .count() - } - - /// Extract a string from the dictionary - pub fn extract(&self, id: IdType, completion: &mut Vec) -> u8 { - if id == 0 || id > self.size as IdType { - return 0; - } - - let id = (id - 1) as usize; - let bucket_start = id * 2; - let lcp = self.buckets[bucket_start] as usize; - let remaining = self.buckets[bucket_start + 1] as usize; - - completion.clear(); - completion.extend_from_slice(&self.headers[id..id + lcp]); - completion.extend_from_slice(&self.buckets[bucket_start + 2..bucket_start + 2 + remaining]); - - (lcp + remaining) as u8 + pub fn len(&self) -> usize { + self.strings.len() } - /// Get the number of strings in 
the dictionary - pub fn size(&self) -> usize { - self.size + pub fn is_empty(&self) -> bool { + self.strings.is_empty() } } \ No newline at end of file diff --git a/autocomplete-rs/src/graphql.rs b/autocomplete-rs/src/graphql.rs index 197c180..daf52ab 100644 --- a/autocomplete-rs/src/graphql.rs +++ b/autocomplete-rs/src/graphql.rs @@ -1,5 +1,7 @@ -use async_graphql::{Object, Schema, SimpleObject, InputObject}; +use async_graphql::{Object, Schema, SimpleObject, InputObject, EmptySubscription}; use crate::autocomplete::Autocomplete; +use std::sync::Arc; +use tokio::sync::Mutex; #[derive(SimpleObject)] struct Completion { @@ -31,13 +33,14 @@ struct StringScoreInput { } pub struct QueryRoot { - autocomplete: Autocomplete, + autocomplete: Arc>, } #[Object] impl QueryRoot { - async fn complete(&self, prefix: String, max_results: Option) -> CompleteResponse { - let completions = self.autocomplete.complete(&prefix); + async fn complete(&self, prefix: String, _max_results: Option) -> CompleteResponse { + let autocomplete = self.autocomplete.lock().await; + let completions = autocomplete.complete(&prefix); let completions = completions.into_iter() .map(|(text, score)| Completion { text, score }) .collect(); @@ -46,15 +49,16 @@ impl QueryRoot { } async fn stats(&self) -> Stats { + let autocomplete = self.autocomplete.lock().await; Stats { - num_terms: self.autocomplete.num_terms() as i32, - memory_bytes: self.autocomplete.bytes() as i64, + num_terms: autocomplete.num_terms() as i32, + memory_bytes: autocomplete.bytes() as i64, } } } pub struct MutationRoot { - autocomplete: Autocomplete, + autocomplete: Arc>, } #[Object] @@ -65,7 +69,8 @@ impl MutationRoot { .map(|s| (s.text, s.score)) .collect(); - match self.autocomplete.init(&strings) { + let mut autocomplete = self.autocomplete.lock().await; + match autocomplete.init(&strings) { Ok(_) => InitResponse { success: true, error: None, @@ -78,13 +83,13 @@ impl MutationRoot { } } -pub type AppSchema = Schema; +pub type AppSchema 
= Schema; -pub fn create_schema(autocomplete: Autocomplete) -> AppSchema { +pub fn create_schema(autocomplete: Arc>) -> AppSchema { Schema::build( QueryRoot { autocomplete: autocomplete.clone() }, MutationRoot { autocomplete }, - async_graphql::EmptySubscription, + EmptySubscription, ) .finish() } \ No newline at end of file diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs index 47d176e..2115e21 100644 --- a/autocomplete-rs/src/index.rs +++ b/autocomplete-rs/src/index.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; -use crate::types::{IdType, global}; +use crate::types::IdType; /// Block in the inverted index struct Block { @@ -32,8 +31,7 @@ impl Block { /// Blocked inverted index for efficient document retrieval pub struct BlockedInvertedIndex { - blocks: Vec, - term_to_block: HashMap, + blocks: Vec>, block_size: usize, } @@ -42,47 +40,21 @@ impl BlockedInvertedIndex { pub fn new(block_size: usize) -> Self { Self { blocks: Vec::new(), - term_to_block: HashMap::new(), block_size, } } /// Add a document to the index - pub fn add_doc(&mut self, term_id: IdType, doc_id: IdType) { - let block_idx = self.term_to_block.entry(term_id).or_insert_with(|| { - self.blocks.push(Block::new(term_id)); - self.blocks.len() - 1 - }); - - let block = &mut self.blocks[*block_idx]; - block.add_doc(doc_id); - - // If block is full, create a new one - if block.size() >= self.block_size { - self.blocks.push(Block::new(term_id)); - *block_idx = self.blocks.len() - 1; + pub fn insert(&mut self, id: IdType) { + if self.blocks.is_empty() || self.blocks.last().unwrap().len() >= self.block_size { + self.blocks.push(Vec::with_capacity(self.block_size)); } + self.blocks.last_mut().unwrap().push(id); } /// Get documents for a term - pub fn get_docs(&self, term_id: IdType) -> Vec { - let mut docs = Vec::new(); - - // Find all blocks for the term - let mut current_idx = self.term_to_block.get(&term_id).copied(); - while let Some(idx) = current_idx { - let block = 
&self.blocks[idx]; - docs.extend_from_slice(&block.docs); - - // Check if there's a next block for the same term - current_idx = if idx + 1 < self.blocks.len() && self.blocks[idx + 1].term_id == term_id { - Some(idx + 1) - } else { - None - }; - } - - docs + pub fn get(&self, block_id: usize) -> Option<&[IdType]> { + self.blocks.get(block_id).map(|v| v.as_slice()) } /// Get the number of blocks @@ -90,15 +62,9 @@ impl BlockedInvertedIndex { self.blocks.len() } - /// Get the total number of documents - pub fn num_docs(&self) -> usize { - self.blocks.iter().map(|b| b.size()).sum() - } - - /// Clear the index - pub fn clear(&mut self) { - self.blocks.clear(); - self.term_to_block.clear(); + /// Get the block size + pub fn block_size(&self) -> usize { + self.block_size } } diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index 4004de9..7c58280 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -7,6 +7,8 @@ pub mod trie; pub mod dictionary; pub mod index; pub mod autocomplete; +pub mod graphql; +pub mod server; pub use constants::*; pub use parameters::*; diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index ee606d7..4bc2099 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -7,7 +7,6 @@ mod server; mod string_pool; mod trie; mod types; -mod utils; /// Autocomplete service with gRPC and GraphQL support #[derive(Parser, Debug)] diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs index 733afef..d3eba26 100644 --- a/autocomplete-rs/src/server.rs +++ b/autocomplete-rs/src/server.rs @@ -1,15 +1,16 @@ -use std::net::SocketAddr; use tonic::{transport::Server as TonicServer, Request, Response, Status}; use axum::{ routing::{get, post}, Router, extract::State, response::IntoResponse, - Json, }; use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; -use crate::autocomplete::{Autocomplete, Autocomplete2}; +use crate::autocomplete::Autocomplete; use 
crate::graphql::{create_schema, AppSchema}; +use std::sync::Arc; +use tokio::sync::Mutex; +use hyper::Server; pub mod autocomplete_proto { tonic::include_proto!("autocomplete"); @@ -22,8 +23,9 @@ use autocomplete_proto::{ StatsRequest, StatsResponse, }; +#[derive(Clone)] pub struct AutocompleteServiceImpl { - autocomplete: Autocomplete, + autocomplete: Arc>, } #[tonic::async_trait] @@ -33,7 +35,8 @@ impl AutocompleteService for AutocompleteServiceImpl { request: Request, ) -> Result, Status> { let req = request.into_inner(); - let completions = self.autocomplete.complete(&req.prefix); + let autocomplete = self.autocomplete.lock().await; + let completions = autocomplete.complete(&req.prefix); let response = CompleteResponse { completions: completions.into_iter() @@ -57,7 +60,8 @@ impl AutocompleteService for AutocompleteServiceImpl { .map(|s| (s.text, s.score)) .collect(); - match self.autocomplete.init(&strings) { + let mut autocomplete = self.autocomplete.lock().await; + match autocomplete.init(&strings) { Ok(_) => Ok(Response::new(InitResponse { success: true, error: String::new(), @@ -73,9 +77,10 @@ impl AutocompleteService for AutocompleteServiceImpl { &self, _request: Request, ) -> Result, Status> { + let autocomplete = self.autocomplete.lock().await; let response = StatsResponse { - num_terms: self.autocomplete.num_terms() as i32, - memory_bytes: self.autocomplete.bytes() as i64, + num_terms: autocomplete.num_terms() as i32, + memory_bytes: autocomplete.bytes() as i64, }; Ok(Response::new(response)) @@ -96,7 +101,7 @@ async fn graphql_playground() -> impl IntoResponse { } pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box> { - let autocomplete = Autocomplete::new(); + let autocomplete = Arc::new(Mutex::new(Autocomplete::new())); let schema = create_schema(autocomplete.clone()); // Create gRPC service @@ -121,7 +126,7 @@ pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box, - offsets: Vec, data: Vec, + 
offsets: Vec, + scores: Vec, } impl ScoredStringPool { /// Create a new empty string pool pub fn new() -> Self { - let mut pool = Self { - scores: Vec::new(), - offsets: Vec::new(), + Self { data: Vec::new(), - }; - pool.init(); - pool + offsets: vec![0], + scores: Vec::new(), + } } /// Initialize the pool @@ -33,7 +31,7 @@ impl ScoredStringPool { /// Resize the pool pub fn resize(&mut self, num_bytes: usize, k: u32) { - self.scores.resize(k as usize, 0); + self.scores.resize(k as usize, 0.0); self.data.resize(num_bytes, 0); } @@ -50,7 +48,9 @@ impl ScoredStringPool { /// Get the total number of bytes used pub fn bytes(&self) -> usize { - self.offsets.last().copied().unwrap_or(0) + std::mem::size_of_val(&self.data) + + std::mem::size_of_val(&self.offsets) + + std::mem::size_of_val(&self.scores) } /// Get a mutable reference to the data @@ -64,25 +64,24 @@ impl ScoredStringPool { } /// Get a mutable reference to the scores - pub fn scores_mut(&mut self) -> &mut [IdType] { + pub fn scores_mut(&mut self) -> &mut [f32] { &mut self.scores } /// Get a reference to the scores - pub fn scores(&self) -> &[IdType] { + pub fn scores(&self) -> &[f32] { &self.scores } /// Get a scored byte range at the given index - pub fn get(&self, i: usize) -> ScoredByteRange { - assert!(i < self.size()); - ScoredByteRange { - string: ByteRange { - begin: unsafe { self.data.as_ptr().add(self.offsets[i]) }, - end: unsafe { self.data.as_ptr().add(self.offsets[i + 1]) }, - }, - score: self.scores[i], + pub fn get(&self, index: usize) -> ByteRange { + if index >= self.offsets.len() - 1 { + return ByteRange::new(0, 0); } + ByteRange::new( + self.offsets[index], + self.offsets[index + 1] + ) } /// Set the offsets vector @@ -91,7 +90,7 @@ impl ScoredStringPool { } /// Set the scores vector - pub fn set_scores(&mut self, scores: Vec) { + pub fn set_scores(&mut self, scores: Vec) { self.scores = scores; } @@ -99,6 +98,10 @@ impl ScoredStringPool { pub fn set_data(&mut self, data: Vec) { self.data = 
data; } + + pub fn get_score(&self, index: usize) -> f32 { + self.scores.get(index).copied().unwrap_or(0.0) + } } /// Iterator over scored strings in the pool @@ -134,7 +137,10 @@ impl<'a> Iterator for ScoredStringPoolIterator<'a> { fn next(&mut self) -> Option { if self.pos < self.pool.size() { - let item = self.pool.get(self.pos); + let item = ScoredByteRange { + string: self.pool.get(self.pos), + score: self.pool.get_score(self.pos) as IdType, + }; self.pos += 1; Some(item) } else { diff --git a/autocomplete-rs/src/trie.rs b/autocomplete-rs/src/trie.rs index 1b24c73..05f80e5 100644 --- a/autocomplete-rs/src/trie.rs +++ b/autocomplete-rs/src/trie.rs @@ -1,182 +1,147 @@ use std::collections::HashMap; -use crate::types::{IdType, CompletionType}; +use crate::types::IdType; -/// A node in the completion trie -pub struct TrieNode { - children: HashMap, - is_terminal: bool, - completion_ids: Vec, +#[derive(Default, Clone)] +struct TrieNode { + children: HashMap>, + id: Option, + score: f32, } impl TrieNode { - /// Create a new trie node - pub fn new() -> Self { + fn new() -> Self { Self { children: HashMap::new(), - is_terminal: false, - completion_ids: Vec::new(), + id: None, + score: 0.0, } } - /// Add a child node - pub fn add_child(&mut self, c: char) -> &mut TrieNode { - self.children.entry(c).or_insert_with(TrieNode::new) - } - - /// Get a child node - pub fn get_child(&self, c: char) -> Option<&TrieNode> { - self.children.get(&c) - } - - /// Check if this is a terminal node - pub fn is_terminal(&self) -> bool { - self.is_terminal - } - - /// Set this node as terminal - pub fn set_terminal(&mut self) { - self.is_terminal = true; - } - - /// Add a completion ID - pub fn add_completion_id(&mut self, id: IdType) { - self.completion_ids.push(id); - } - - /// Get completion IDs - pub fn completion_ids(&self) -> &[IdType] { - &self.completion_ids + fn is_terminal(&self) -> bool { + self.id.is_some() } } -/// A trie for prefix-based completion -pub struct CompletionTrie 
{ +#[derive(Clone)] +pub struct Trie { root: TrieNode, - num_nodes: usize, - num_completions: usize, } -impl CompletionTrie { - /// Create a new completion trie +impl Trie { pub fn new() -> Self { Self { root: TrieNode::new(), - num_nodes: 1, - num_completions: 0, - } - } - - /// Insert a completion string - pub fn insert(&mut self, completion: &str, id: IdType) { - let mut node = &mut self.root; - for c in completion.chars() { - node = node.add_child(c); - self.num_nodes += 1; - } - node.set_terminal(); - node.add_completion_id(id); - self.num_completions += 1; - } - - /// Find all completions for a prefix - pub fn complete(&self, prefix: &str) -> Vec { - let mut node = &self.root; - for c in prefix.chars() { - match node.get_child(c) { - Some(next) => node = next, - None => return Vec::new(), - } } - self.collect_completions(node) - } - - /// Collect all completion IDs from a node and its children - fn collect_completions(&self, node: &TrieNode) -> Vec { - let mut completions = Vec::new(); - self.collect_completions_recursive(node, &mut completions); - completions } - /// Recursive helper for collecting completions - fn collect_completions_recursive(&self, node: &TrieNode, completions: &mut Vec) { - if node.is_terminal() { - completions.extend_from_slice(node.completion_ids()); - } - for child in node.children.values() { - self.collect_completions_recursive(child, completions); + pub fn insert(&mut self, completion: &str, id: IdType, score: f32) { + let mut current = &mut self.root; + let chars: Vec = completion.chars().collect(); + + for &c in &chars { + current = current.children + .entry(c) + .or_insert_with(|| Box::new(TrieNode::new())); } + + current.id = Some(id); + current.score = score; } - /// Remove a completion string pub fn remove(&mut self, completion: &str) -> bool { - let mut chars: Vec = completion.chars().collect(); - if chars.is_empty() { - return false; - } - - // First, find if the completion exists and build the path let mut path = 
Vec::new(); - let mut current = &self.root; + let mut current = &mut self.root; - for &c in &chars { - match current.get_child(c) { - Some(next) => { - path.push(c); - current = next; - } - None => return false, + // First pass: find the path to the node + for c in completion.chars() { + if let Some(next) = current.children.get_mut(&c) { + path.push(c); + current = next; + } else { + return false; // String not found } } - + + // If the node is not a terminal, the string wasn't in the trie if !current.is_terminal() { return false; } - - // Now remove it by traversing the path again + + // Remove the terminal marker + current.id = None; + current.score = 0.0; + + // Second pass: remove empty nodes let mut current = &mut self.root; - let mut parent = None; + for &c in &path[..path.len()-1] { + current = current.children.get_mut(&c).unwrap(); + } - for &c in &path { - if let Some(next) = current.children.get_mut(&c) { - parent = Some((c, current)); - current = next; - } + // Remove the last node if it's empty + if current.children.is_empty() && !current.is_terminal() { + current.children.remove(&path[path.len()-1]); } + + true + } - // Remove the completion - current.completion_ids.clear(); - current.is_terminal = false; - self.num_completions -= 1; - - // Clean up empty nodes - while let Some((c, p)) = parent { - if current.children.is_empty() && !current.is_terminal() { - p.children.remove(&c); - self.num_nodes -= 1; - current = p; - parent = None; + pub fn complete(&self, prefix: &str) -> Vec<(IdType, f32)> { + let mut current = &self.root; + + // Navigate to the prefix node + for c in prefix.chars() { + if let Some(next) = current.children.get(&c) { + current = next; } else { - break; + return Vec::new(); // Prefix not found } } - - true + + // Collect all completions from this node + let mut results = Vec::new(); + self.collect_completions(current, &mut results); + results } - /// Clear the trie - pub fn clear(&mut self) { - self.root = TrieNode::new(); - 
self.num_nodes = 1; - self.num_completions = 0; + fn collect_completions(&self, node: &TrieNode, results: &mut Vec<(IdType, f32)>) { + if let Some(id) = node.id { + results.push((id, node.score)); + } + + for child in node.children.values() { + self.collect_completions(child, results); + } } +} - /// Get the number of nodes - pub fn num_nodes(&self) -> usize { - self.num_nodes +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_trie_insert_and_complete() { + let mut trie = Trie::new(); + trie.insert("hello", 1, 1.0); + trie.insert("help", 2, 0.8); + trie.insert("world", 3, 0.5); + + let completions = trie.complete("hel"); + assert_eq!(completions.len(), 2); + assert!(completions.contains(&(1, 1.0))); + assert!(completions.contains(&(2, 0.8))); } - /// Get the number of completions - pub fn num_completions(&self) -> usize { - self.num_completions + #[test] + fn test_trie_remove() { + let mut trie = Trie::new(); + trie.insert("hello", 1, 1.0); + trie.insert("help", 2, 0.8); + + assert!(trie.remove("hello")); + assert!(!trie.remove("hello")); // Already removed + assert!(trie.remove("help")); + + let completions = trie.complete("hel"); + assert_eq!(completions.len(), 0); } } \ No newline at end of file diff --git a/autocomplete-rs/src/types.rs b/autocomplete-rs/src/types.rs index 5490d59..cbd9316 100644 --- a/autocomplete-rs/src/types.rs +++ b/autocomplete-rs/src/types.rs @@ -1,11 +1,12 @@ -use std::ops::Range; - /// Type alias for document and term IDs pub type IdType = u32; /// Type alias for completion type (vector of term IDs) pub type CompletionType = Vec; +/// Type alias for score type +pub type ScoreType = f32; + /// Represents a range of values #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ValueRange { @@ -48,8 +49,18 @@ impl ScoredRange { /// Represents a byte range #[derive(Debug, Clone, Copy)] pub struct ByteRange { - pub begin: *const u8, - pub end: *const u8, + pub start: usize, + pub end: usize, +} + +impl ByteRange { + pub fn 
new(start: usize, end: usize) -> Self { + Self { start, end } + } + + pub fn len(&self) -> usize { + self.end - self.start + } } /// Represents a range of 32-bit integers @@ -78,9 +89,10 @@ pub mod global { /// Convert a string to a byte range pub fn string_to_byte_range(s: &str) -> ByteRange { - let begin = s.as_ptr(); - let end = unsafe { begin.add(s.len()) }; - ByteRange { begin, end } + ByteRange { + start: 0, + end: s.len(), + } } /// Convert a completion to a uint32 range From 0f1af5d8711a3cca879de497dbbec58091e736de Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 17:21:07 -0400 Subject: [PATCH 099/102] server partially running --- autocomplete-rs/Cargo.toml | 2 +- autocomplete-rs/src/autocomplete.rs | 2 +- autocomplete-rs/src/index.rs | 52 ++++++++++++++++++----------- autocomplete-rs/src/lib.rs | 16 +++++---- autocomplete-rs/src/main.rs | 12 ++----- autocomplete-rs/src/server.rs | 22 +++++++++++- 6 files changed, 67 insertions(+), 39 deletions(-) diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index b799bbe..276f817 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -12,7 +12,7 @@ async-graphql = "6.0" async-graphql-axum = "6.0" axum = { version = "0.6", features = ["macros"] } tower = "0.4" -tower-http = { version = "0.4", features = ["trace"] } +tower-http = { version = "0.4", features = ["trace", "cors"] } hyper = { version = "0.14", features = ["full"] } clap = { version = "4.4", features = ["derive"] } diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs index b910078..bb9ffa6 100644 --- a/autocomplete-rs/src/autocomplete.rs +++ b/autocomplete-rs/src/autocomplete.rs @@ -1,6 +1,6 @@ use crate::types::ScoreType; use crate::trie::Trie; -use crate::dictionary::Dictionary; +use super::dictionary::Dictionary; #[derive(Clone)] pub struct Autocomplete { diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs index 2115e21..29da316 100644 --- 
a/autocomplete-rs/src/index.rs +++ b/autocomplete-rs/src/index.rs @@ -1,31 +1,43 @@ -use crate::types::IdType; - -/// Block in the inverted index -struct Block { - term_id: IdType, - num_docs: usize, - docs: Vec, +use crate::types::{IdType, ScoreType}; +use crate::trie::Trie; +use crate::dictionary::Dictionary; + +#[derive(Clone)] +pub struct Index { + trie: Trie, + dictionary: Dictionary, } -impl Block { - /// Create a new block - fn new(term_id: IdType) -> Self { +impl Index { + pub fn new() -> Self { Self { - term_id, - num_docs: 0, - docs: Vec::new(), + trie: Trie::new(), + dictionary: Dictionary::new(), } } - /// Add a document to the block - fn add_doc(&mut self, doc_id: IdType) { - self.docs.push(doc_id); - self.num_docs += 1; + pub fn add_doc(&mut self, _doc_id: IdType, text: &str, score: ScoreType) { + let id = self.dictionary.insert(text.to_string()); + self.trie.insert(text, id, score); + } + + pub fn search(&self, prefix: &str) -> Vec<(IdType, ScoreType)> { + let completions = self.trie.complete(prefix); + completions + .into_iter() + .filter_map(|(id, score)| { + self.dictionary.get(id).map(|_| (id, score)) + }) + .collect() } - /// Get the number of documents in the block - fn size(&self) -> usize { - self.num_docs + pub fn num_terms(&self) -> usize { + self.dictionary.len() + } + + pub fn bytes(&self) -> usize { + // TODO: Implement actual memory usage calculation + 0 } } diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index 7c58280..7d1cb44 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -1,21 +1,23 @@ +pub mod dictionary; +pub mod types; +pub mod trie; pub mod constants; pub mod parameters; pub mod probe; -pub mod types; pub mod string_pool; -pub mod trie; -pub mod dictionary; pub mod index; pub mod autocomplete; pub mod graphql; pub mod server; +pub use dictionary::Dictionary; +pub use types::*; +pub use trie::*; pub use constants::*; pub use parameters::*; pub use probe::*; -pub use types::*; pub 
use string_pool::*; -pub use trie::*; -pub use dictionary::*; pub use index::*; -pub use autocomplete::*; \ No newline at end of file +pub use autocomplete::*; +pub use graphql::*; +pub use server::*; \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index 4bc2099..c5214c2 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -1,23 +1,17 @@ use std::error::Error; use clap::Parser; - -mod autocomplete; -mod graphql; -mod server; -mod string_pool; -mod trie; -mod types; +use autocomplete_rs::server; /// Autocomplete service with gRPC and GraphQL support #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { /// gRPC server address - #[arg(short, long, default_value = "[::1]:50051")] + #[arg(short = 'r', long, default_value = "[::1]:50051")] grpc_addr: String, /// GraphQL server address - #[arg(short, long, default_value = "[::1]:8000")] + #[arg(short = 'g', long, default_value = "[::1]:8000")] graphql_addr: String, } diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs index d3eba26..358bbb2 100644 --- a/autocomplete-rs/src/server.rs +++ b/autocomplete-rs/src/server.rs @@ -4,6 +4,7 @@ use axum::{ Router, extract::State, response::IntoResponse, + http::HeaderValue, }; use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; use crate::autocomplete::Autocomplete; @@ -11,6 +12,7 @@ use crate::graphql::{create_schema, AppSchema}; use std::sync::Arc; use tokio::sync::Mutex; use hyper::Server; +use tower_http::cors::{CorsLayer, Any}; pub mod autocomplete_proto { tonic::include_proto!("autocomplete"); @@ -97,6 +99,7 @@ async fn graphql_handler( async fn graphql_playground() -> impl IntoResponse { async_graphql::http::playground_source( async_graphql::http::GraphQLPlaygroundConfig::new("/graphql") + .subscription_endpoint("/graphql") ) } @@ -109,10 +112,18 @@ pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), 
Box().unwrap()) + .allow_methods(Any) + .allow_headers(Any); + // Create GraphQL router let app = Router::new() .route("/graphql", post(graphql_handler)) + .route("/", get(graphql_playground)) .route("/playground", get(graphql_playground)) + .layer(cors) .with_state(schema); // Start both servers @@ -121,13 +132,22 @@ pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box Date: Sun, 25 May 2025 12:19:33 -0400 Subject: [PATCH 100/102] removed server until function works --- autocomplete-rs/.gitignore | 18 + autocomplete-rs/Cargo.lock | 1488 +--------------------- autocomplete-rs/Cargo.toml | 8 - autocomplete-rs/build.rs | 5 +- autocomplete-rs/proto/autocomplete.proto | 58 - autocomplete-rs/src/graphql.rs | 95 -- autocomplete-rs/src/main.rs | 26 +- autocomplete-rs/src/server.rs | 153 --- 8 files changed, 75 insertions(+), 1776 deletions(-) create mode 100644 autocomplete-rs/.gitignore delete mode 100644 autocomplete-rs/proto/autocomplete.proto delete mode 100644 autocomplete-rs/src/graphql.rs delete mode 100644 autocomplete-rs/src/server.rs diff --git a/autocomplete-rs/.gitignore b/autocomplete-rs/.gitignore new file mode 100644 index 0000000..da95885 --- /dev/null +++ b/autocomplete-rs/.gitignore @@ -0,0 +1,18 @@ +# Cargo +target/ + +# IDEs +.vscode/ +.idea/ + +# OS +.DS_Store + +# Rust + +# Build +build.rs + +# Cargo.lock +Cargo.lock + diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index bd45602..6222344 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -2,16 +2,6 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" -dependencies = [ - "lazy_static", - "regex", -] - [[package]] name = "addr2line" version = "0.24.2" @@ -92,137 +82,6 @@ version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" -[[package]] -name = "ascii_utils" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71938f30533e4d95a6d17aa530939da3842c2ab6f4f84b9dae68447e4129f74a" - -[[package]] -name = "async-graphql" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298a5d587d6e6fdb271bf56af2dc325a80eb291fd0fc979146584b9a05494a8c" -dependencies = [ - "async-graphql-derive", - "async-graphql-parser", - "async-graphql-value", - "async-stream", - "async-trait", - "base64 0.13.1", - "bytes", - "fast_chemail", - "fnv", - "futures-util", - "handlebars", - "http", - "indexmap 2.9.0", - "mime", - "multer", - "num-traits", - "once_cell", - "pin-project-lite", - "regex", - "serde", - "serde_json", - "serde_urlencoded", - "static_assertions", - "tempfile", - "thiserror 1.0.69", -] - -[[package]] -name = "async-graphql-axum" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a1c20a2059bffbc95130715b23435a05168c518fba9709c81fa2a38eed990c" -dependencies = [ - "async-graphql", - "async-trait", - "axum", - "bytes", - "futures-util", - "serde_json", - "tokio", - "tokio-stream", - "tokio-util", - "tower-service", -] - -[[package]] -name = "async-graphql-derive" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7f329c7eb9b646a72f70c9c4b516c70867d356ec46cb00dcac8ad343fd006b0" -dependencies = [ - "Inflector", - "async-graphql-parser", 
- "darling", - "proc-macro-crate", - "proc-macro2", - "quote", - "strum", - "syn", - "thiserror 1.0.69", -] - -[[package]] -name = "async-graphql-parser" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6139181845757fd6a73fbb8839f3d036d7150b798db0e9bb3c6e83cdd65bd53b" -dependencies = [ - "async-graphql-value", - "pest", - "serde", - "serde_json", -] - -[[package]] -name = "async-graphql-value" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "323a5143f5bdd2030f45e3f2e0c821c9b1d36e79cf382129c64299c50a7f3750" -dependencies = [ - "bytes", - "indexmap 2.9.0", - "serde", - "serde_json", -] - -[[package]] -name = "async-stream" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-trait" -version = "0.1.88" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "autocfg" version = "1.4.0" @@ -233,85 +92,11 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "autocomplete-rs" version = "0.1.0" dependencies = [ - "async-graphql", - "async-graphql-axum", - "axum", "clap", "futures", - "hyper", - "prost", "tempfile", "tokio", - "tonic", "tonic-build", - "tower", - "tower-http", -] - -[[package]] -name = "axum" -version = "0.6.20" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" -dependencies = [ - "async-trait", - "axum-core", - "axum-macros", - "base64 0.21.7", - "bitflags 1.3.2", - "bytes", - "futures-util", - "headers", - "http", - "http-body", - "hyper", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sha1", - "sync_wrapper", - "tokio", - "tokio-tungstenite", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "rustversion", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-macros" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdca6a10ecad987bda04e95606ef85a5417dcaac1a78455242d72e031e2b6b62" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "syn", ] [[package]] @@ -329,53 +114,17 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "bytes" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" -dependencies = [ - "serde", -] [[package]] name = "cfg-if" @@ -411,7 +160,7 @@ version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn", @@ -429,102 +178,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "darling" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = 
"0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn", -] - -[[package]] -name = "darling_macro" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" -dependencies = [ - "darling_core", - "quote", - "syn", -] - -[[package]] -name = "data-encoding" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", -] - -[[package]] -name = "displaydoc" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" -[[package]] -name = "encoding_rs" -version = "0.8.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" -dependencies = [ - "cfg-if", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -541,15 +200,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "fast_chemail" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "495a39d30d624c2caabe6312bfead73e7717692b44e0b32df168c275a2e8e9e4" -dependencies = [ - 
"ascii_utils", -] - [[package]] name = "fastrand" version = "2.3.0" @@ -562,21 +212,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - [[package]] name = "futures" version = "0.3.31" @@ -666,27 +301,6 @@ dependencies = [ "slab", ] -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - [[package]] name = "getrandom" version = "0.3.3" @@ -705,45 +319,6 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" -[[package]] -name = "h2" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap 2.9.0", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "handlebars" -version = "4.5.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "faa67bab9ff362228eb3d00bd024a4965d8231bbb7921167f0cfa66c6626b225" -dependencies = [ - "log", - "pest", - "pest_derive", - "serde", - "serde_json", - "thiserror 1.0.69", -] - -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hashbrown" version = "0.15.3" @@ -751,296 +326,48 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" [[package]] -name = "headers" -version = "0.3.9" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06683b93020a07e3dbcf5f8c0f6d40080d725bea7936fc01ad345c01b97dc270" -dependencies = [ - "base64 0.21.7", - "bytes", - "headers-core", - "http", - "httpdate", - "mime", - "sha1", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "headers-core" -version = "0.2.0" +name = "indexmap" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ - "http", + "equivalent", + "hashbrown", ] [[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "heck" -version = "0.5.0" +name = "is_terminal_polyfill" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] -name = "http" 
-version = "0.2.12" +name = "itertools" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ - "bytes", - "fnv", - "itoa", + "either", ] [[package]] -name = "http-body" -version = "0.4.6" +name = "libc" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] -name = "http-range-header" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" - -[[package]] -name = "httparse" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" - -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - 
"pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "icu_collections" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" -dependencies = [ - "displaydoc", - "potential_utf", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locale_core" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" - -[[package]] -name = "icu_properties" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - "potential_utf", - "zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" - -[[package]] -name = "icu_provider" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" 
-dependencies = [ - "displaydoc", - "icu_locale_core", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - -[[package]] -name = "idna" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" -dependencies = [ - "idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown 0.12.3", -] - -[[package]] -name = "indexmap" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" -dependencies = [ - "equivalent", - "hashbrown 0.15.3", - "serde", -] - -[[package]] -name = "is_terminal_polyfill" -version = "1.70.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" - -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - -[[package]] -name = "libc" -version = "0.2.172" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" - -[[package]] -name = "linux-raw-sys" -version = "0.9.4" +name = "linux-raw-sys" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" -[[package]] -name = "litemap" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" - [[package]] name = "lock_api" version = "0.4.12" @@ -1057,24 +384,12 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" -[[package]] -name = "matchit" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" - [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - [[package]] name = "miniz_oxide" version = "0.8.8" @@ -1095,39 +410,12 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "multer" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"01acbdc23469fd8fe07ab135923371d5f5a422fbf9c522158677c8eb15bc51c2" -dependencies = [ - "bytes", - "encoding_rs", - "futures-util", - "http", - "httparse", - "log", - "memchr", - "mime", - "spin", - "version_check", -] - [[package]] name = "multimap" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", -] - [[package]] name = "object" version = "0.36.7" @@ -1172,57 +460,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pest" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" -dependencies = [ - "memchr", - "thiserror 2.0.12", - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d725d9cfd79e87dccc9341a2ef39d1b6f6353d68c4b33c177febbe1a402c97c5" -dependencies = [ - "pest", - "pest_generator", -] - -[[package]] -name = "pest_generator" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db7d01726be8ab66ab32f9df467ae8b1148906685bbe75c82d1e65d7f5b3f841" -dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pest_meta" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9f832470494906d1fca5329f8ab5791cc60beb230c74815dff541cbd2b5ca0" -dependencies = [ - "once_cell", - 
"pest", - "sha2", -] - [[package]] name = "petgraph" version = "0.6.5" @@ -1230,27 +467,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.9.0", -] - -[[package]] -name = "pin-project" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "indexmap", ] [[package]] @@ -1265,24 +482,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "potential_utf" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" -dependencies = [ - "zerovec", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "prettyplease" version = "0.2.32" @@ -1293,16 +492,6 @@ dependencies = [ "syn", ] -[[package]] -name = "proc-macro-crate" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" -dependencies = [ - "once_cell", - "toml_edit", -] - [[package]] name = "proc-macro2" version = "1.0.95" @@ -1329,7 +518,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck 0.5.0", + "heck", "itertools", "log", "multimap", @@ -1380,43 +569,13 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", -] - [[package]] name = "redox_syscall" version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" dependencies = [ - "bitflags 2.9.1", + "bitflags", ] [[package]] @@ -1460,25 +619,13 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags 2.9.1", + "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys 0.59.0", ] -[[package]] -name = "rustversion" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" - -[[package]] -name = "ryu" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" - [[package]] name = "scopeguard" version = "1.2.0" @@ -1486,161 +633,45 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "serde" -version = "1.0.219" +name = "signal-hook-registry" +version = "1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" dependencies = [ - "serde_derive", + "libc", ] [[package]] -name = "serde_derive" -version = "1.0.219" +name = "slab" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "proc-macro2", - "quote", - "syn", + "autocfg", ] [[package]] -name = "serde_json" -version = "1.0.140" +name = "smallvec" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", -] +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] -name = "serde_path_to_error" -version = "0.1.17" +name = "socket2" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" dependencies = [ - "itoa", - "serde", + "libc", + "windows-sys 0.52.0", ] [[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "sha1" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" -dependencies = [ - "libc", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "smallvec" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" - -[[package]] -name = "socket2" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = 
"static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "strsim" -version = "0.11.1" +name = "strsim" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" -dependencies = [ - "strum_macros", -] - -[[package]] -name = "strum_macros" -version = "0.25.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn", -] - [[package]] name = "syn" version = "2.0.101" @@ -1652,23 +683,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "tempfile" version = "3.20.0" @@ -1676,62 +690,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom", "once_cell", "rustix", "windows-sys 0.59.0", ] -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - -[[package]] -name = "thiserror" -version = "2.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" -dependencies = [ - "thiserror-impl 2.0.12", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "thiserror-impl" -version = "2.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tinystr" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" -dependencies = [ - "displaydoc", - "zerovec", -] - [[package]] name = "tokio" version = "1.45.0" @@ -1750,16 +714,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-macros" version = "2.5.0" @@ -1771,87 +725,6 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-stream" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.20.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "212d5dcb2a1ce06d81107c3d0ffa3121fe974b73f068c8282cb1c32328113b6c" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite", -] - -[[package]] -name = "tokio-util" -version = "0.7.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" -dependencies = [ - "bytes", - "futures-core", - "futures-io", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "toml_datetime" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" - -[[package]] -name = "toml_edit" -version = "0.19.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" -dependencies = [ - "indexmap 2.9.0", - "toml_datetime", - "winnow", -] - -[[package]] -name = "tonic" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64 0.21.7", - "bytes", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "tokio", - "tokio-stream", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "tonic-build" version = "0.10.2" @@ -1865,176 +738,18 @@ dependencies = [ "syn", ] -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - 
"tracing", -] - -[[package]] -name = "tower-http" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" -dependencies = [ - "bitflags 2.9.1", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - -[[package]] -name = "tower-service" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" - -[[package]] -name = "tracing" -version = "0.1.41" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" -dependencies = [ - "log", - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tracing-core" -version = "0.1.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" -dependencies = [ - "once_cell", -] - -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - -[[package]] -name = "tungstenite" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" -dependencies = [ - "byteorder", - "bytes", - "data-encoding", - "http", - "httparse", - "log", - "rand", - "sha1", - "thiserror 1.0.69", - "url", - "utf-8", -] - -[[package]] -name = "typenum" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" - -[[package]] -name = "ucd-trie" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" - [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" -[[package]] -name = "url" -version = "2.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2132,124 +847,11 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winnow" -version = "0.5.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" -dependencies = [ - "memchr", -] - [[package]] name = "wit-bindgen-rt" version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.1", -] - -[[package]] -name = "writeable" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" - -[[package]] -name = "yoke" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" -dependencies = [ - "serde", - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "zerocopy" -version = "0.8.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "zerofrom" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "zerotrie" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "bitflags", ] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index 276f817..68ed87f 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -4,16 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] -tonic = { version = "0.10", features = ["transport"] } -prost = "0.12" tokio = { version = "1.0", features = ["full"] } futures = "0.3" -async-graphql = "6.0" -async-graphql-axum = "6.0" -axum = { version = "0.6", features = ["macros"] } -tower = "0.4" -tower-http = { version = "0.4", features = ["trace", "cors"] } 
-hyper = { version = "0.14", features = ["full"] } clap = { version = "4.4", features = ["derive"] } [dev-dependencies] diff --git a/autocomplete-rs/build.rs b/autocomplete-rs/build.rs index 7d082f1..ed0ba48 100644 --- a/autocomplete-rs/build.rs +++ b/autocomplete-rs/build.rs @@ -1,4 +1,3 @@ -fn main() -> Result<(), Box> { - tonic_build::compile_protos("proto/autocomplete.proto")?; - Ok(()) +fn main() { + // No build-time code generation needed } \ No newline at end of file diff --git a/autocomplete-rs/proto/autocomplete.proto b/autocomplete-rs/proto/autocomplete.proto deleted file mode 100644 index 12c2e74..0000000 --- a/autocomplete-rs/proto/autocomplete.proto +++ /dev/null @@ -1,58 +0,0 @@ -syntax = "proto3"; - -package autocomplete; - -// The autocomplete service definition -service AutocompleteService { - // Get completions for a prefix - rpc Complete (CompleteRequest) returns (CompleteResponse) {} - - // Initialize the autocomplete system with strings and scores - rpc Init (InitRequest) returns (InitResponse) {} - - // Get system statistics - rpc GetStats (StatsRequest) returns (StatsResponse) {} -} - -// Request message for completion -message CompleteRequest { - string prefix = 1; - int32 max_results = 2; // Optional: limit number of results -} - -// Response message containing completions -message CompleteResponse { - repeated Completion completions = 1; -} - -// A single completion result -message Completion { - string text = 1; - float score = 2; -} - -// Request message for initialization -message InitRequest { - repeated StringScore strings = 1; -} - -// A string with its score -message StringScore { - string text = 1; - float score = 2; -} - -// Response message for initialization -message InitResponse { - bool success = 1; - string error = 2; // Empty if success is true -} - -// Request message for stats -message StatsRequest {} - -// Response message containing system statistics -message StatsResponse { - int32 num_terms = 1; - int64 memory_bytes = 
2; -} \ No newline at end of file diff --git a/autocomplete-rs/src/graphql.rs b/autocomplete-rs/src/graphql.rs deleted file mode 100644 index daf52ab..0000000 --- a/autocomplete-rs/src/graphql.rs +++ /dev/null @@ -1,95 +0,0 @@ -use async_graphql::{Object, Schema, SimpleObject, InputObject, EmptySubscription}; -use crate::autocomplete::Autocomplete; -use std::sync::Arc; -use tokio::sync::Mutex; - -#[derive(SimpleObject)] -struct Completion { - text: String, - score: f32, -} - -#[derive(SimpleObject)] -struct CompleteResponse { - completions: Vec, -} - -#[derive(SimpleObject)] -struct Stats { - num_terms: i32, - memory_bytes: i64, -} - -#[derive(SimpleObject)] -struct InitResponse { - success: bool, - error: Option, -} - -#[derive(InputObject)] -struct StringScoreInput { - text: String, - score: f32, -} - -pub struct QueryRoot { - autocomplete: Arc>, -} - -#[Object] -impl QueryRoot { - async fn complete(&self, prefix: String, _max_results: Option) -> CompleteResponse { - let autocomplete = self.autocomplete.lock().await; - let completions = autocomplete.complete(&prefix); - let completions = completions.into_iter() - .map(|(text, score)| Completion { text, score }) - .collect(); - - CompleteResponse { completions } - } - - async fn stats(&self) -> Stats { - let autocomplete = self.autocomplete.lock().await; - Stats { - num_terms: autocomplete.num_terms() as i32, - memory_bytes: autocomplete.bytes() as i64, - } - } -} - -pub struct MutationRoot { - autocomplete: Arc>, -} - -#[Object] -impl MutationRoot { - async fn init(&self, strings: Vec) -> InitResponse { - let strings: Vec<(String, f32)> = strings - .into_iter() - .map(|s| (s.text, s.score)) - .collect(); - - let mut autocomplete = self.autocomplete.lock().await; - match autocomplete.init(&strings) { - Ok(_) => InitResponse { - success: true, - error: None, - }, - Err(e) => InitResponse { - success: false, - error: Some(e.to_string()), - }, - } - } -} - -pub type AppSchema = Schema; - -pub fn 
create_schema(autocomplete: Arc>) -> AppSchema { - Schema::build( - QueryRoot { autocomplete: autocomplete.clone() }, - MutationRoot { autocomplete }, - EmptySubscription, - ) - .finish() -} \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index c5214c2..751189f 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -1,31 +1,25 @@ use std::error::Error; use clap::Parser; -use autocomplete_rs::server; -/// Autocomplete service with gRPC and GraphQL support +/// Autocomplete service #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { - /// gRPC server address - #[arg(short = 'r', long, default_value = "[::1]:50051")] - grpc_addr: String, - - /// GraphQL server address - #[arg(short = 'g', long, default_value = "[::1]:8000")] - graphql_addr: String, + /// Input file path + #[arg(short, long)] + input: Option, } #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); - println!("Starting Autocomplete Service..."); - println!("gRPC server will listen on: {}", args.grpc_addr); - println!("GraphQL server will listen on: {}", args.graphql_addr); - println!("GraphQL Playground available at: http://{}/playground", args.graphql_addr); - - // Start both servers - server::run_server(&args.grpc_addr, &args.graphql_addr).await?; + println!("Autocomplete Service"); + + if let Some(input) = args.input { + println!("Processing input file: {}", input); + // TODO: Implement file processing logic + } Ok(()) } diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs deleted file mode 100644 index 358bbb2..0000000 --- a/autocomplete-rs/src/server.rs +++ /dev/null @@ -1,153 +0,0 @@ -use tonic::{transport::Server as TonicServer, Request, Response, Status}; -use axum::{ - routing::{get, post}, - Router, - extract::State, - response::IntoResponse, - http::HeaderValue, -}; -use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; -use 
crate::autocomplete::Autocomplete; -use crate::graphql::{create_schema, AppSchema}; -use std::sync::Arc; -use tokio::sync::Mutex; -use hyper::Server; -use tower_http::cors::{CorsLayer, Any}; - -pub mod autocomplete_proto { - tonic::include_proto!("autocomplete"); -} - -use autocomplete_proto::{ - autocomplete_service_server::{AutocompleteService, AutocompleteServiceServer}, - CompleteRequest, CompleteResponse, Completion, - InitRequest, InitResponse, - StatsRequest, StatsResponse, -}; - -#[derive(Clone)] -pub struct AutocompleteServiceImpl { - autocomplete: Arc>, -} - -#[tonic::async_trait] -impl AutocompleteService for AutocompleteServiceImpl { - async fn complete( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let autocomplete = self.autocomplete.lock().await; - let completions = autocomplete.complete(&req.prefix); - - let response = CompleteResponse { - completions: completions.into_iter() - .map(|(text, score)| Completion { - text, - score, - }) - .collect(), - }; - - Ok(Response::new(response)) - } - - async fn init( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let strings: Vec<(String, f32)> = req.strings - .into_iter() - .map(|s| (s.text, s.score)) - .collect(); - - let mut autocomplete = self.autocomplete.lock().await; - match autocomplete.init(&strings) { - Ok(_) => Ok(Response::new(InitResponse { - success: true, - error: String::new(), - })), - Err(e) => Ok(Response::new(InitResponse { - success: false, - error: e.to_string(), - })), - } - } - - async fn get_stats( - &self, - _request: Request, - ) -> Result, Status> { - let autocomplete = self.autocomplete.lock().await; - let response = StatsResponse { - num_terms: autocomplete.num_terms() as i32, - memory_bytes: autocomplete.bytes() as i64, - }; - - Ok(Response::new(response)) - } -} - -async fn graphql_handler( - State(schema): State, - req: GraphQLRequest, -) -> GraphQLResponse { - 
schema.execute(req.into_inner()).await.into() -} - -async fn graphql_playground() -> impl IntoResponse { - async_graphql::http::playground_source( - async_graphql::http::GraphQLPlaygroundConfig::new("/graphql") - .subscription_endpoint("/graphql") - ) -} - -pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box> { - let autocomplete = Arc::new(Mutex::new(Autocomplete::new())); - let schema = create_schema(autocomplete.clone()); - - // Create gRPC service - let grpc_service = AutocompleteServiceImpl { - autocomplete: autocomplete.clone(), - }; - - // Configure CORS - let cors = CorsLayer::new() - .allow_origin("*".parse::().unwrap()) - .allow_methods(Any) - .allow_headers(Any); - - // Create GraphQL router - let app = Router::new() - .route("/graphql", post(graphql_handler)) - .route("/", get(graphql_playground)) - .route("/playground", get(graphql_playground)) - .layer(cors) - .with_state(schema); - - // Start both servers - let grpc_addr = grpc_addr.parse()?; - let graphql_addr = graphql_addr.parse()?; - - println!("gRPC server listening on {}", grpc_addr); - println!("GraphQL server listening on {}", graphql_addr); - println!("GraphQL Playground available at: http://localhost:8000/playground"); - - let (grpc_result, graphql_result) = tokio::join!( - TonicServer::builder() - .add_service(AutocompleteServiceServer::new(grpc_service)) - .serve(grpc_addr), - Server::bind(&graphql_addr).serve(app.into_make_service()) - ); - - // Handle any errors from the servers - if let Err(e) = grpc_result { - return Err(Box::new(e)); - } - if let Err(e) = graphql_result { - return Err(Box::new(e)); - } - - Ok(()) -} \ No newline at end of file From 5923ff306334e85bc3d5246df4d41eabe358134d Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Sun, 25 May 2025 12:28:13 -0400 Subject: [PATCH 101/102] add unit testing --- autocomplete-rs/src/lib.rs | 6 +- autocomplete-rs/tests/dictionary_tests.rs | 119 ++++++++++++++++++++++ 2 files changed, 120 insertions(+), 5 
deletions(-) create mode 100644 autocomplete-rs/tests/dictionary_tests.rs diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index 7d1cb44..70048c6 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -7,8 +7,6 @@ pub mod probe; pub mod string_pool; pub mod index; pub mod autocomplete; -pub mod graphql; -pub mod server; pub use dictionary::Dictionary; pub use types::*; @@ -18,6 +16,4 @@ pub use parameters::*; pub use probe::*; pub use string_pool::*; pub use index::*; -pub use autocomplete::*; -pub use graphql::*; -pub use server::*; \ No newline at end of file +pub use autocomplete::*; \ No newline at end of file diff --git a/autocomplete-rs/tests/dictionary_tests.rs b/autocomplete-rs/tests/dictionary_tests.rs new file mode 100644 index 0000000..1aab6d8 --- /dev/null +++ b/autocomplete-rs/tests/dictionary_tests.rs @@ -0,0 +1,119 @@ +use autocomplete_rs::dictionary::Dictionary; +use autocomplete_rs::types::IdType; + +#[test] +fn test_dictionary_new() { + let dict = Dictionary::new(); + assert!(dict.is_empty()); + assert_eq!(dict.len(), 0); +} + +#[test] +fn test_dictionary_insert() { + let mut dict = Dictionary::new(); + + // Test first insertion + let id1 = dict.insert("hello".to_string()); + assert_eq!(id1, 0); + assert_eq!(dict.len(), 1); + + // Test duplicate insertion + let id2 = dict.insert("hello".to_string()); + assert_eq!(id2, id1); + assert_eq!(dict.len(), 1); + + // Test new insertion + let id3 = dict.insert("world".to_string()); + assert_eq!(id3, 1); + assert_eq!(dict.len(), 2); +} + +#[test] +fn test_dictionary_get() { + let mut dict = Dictionary::new(); + + // Insert test data + let id1 = dict.insert("hello".to_string()); + let id2 = dict.insert("world".to_string()); + + // Test valid gets + assert_eq!(dict.get(id1), Some("hello")); + assert_eq!(dict.get(id2), Some("world")); + + // Test invalid id + assert_eq!(dict.get(999), None); +} + +#[test] +fn test_dictionary_get_id() { + let mut dict = 
Dictionary::new(); + + // Insert test data + let id1 = dict.insert("hello".to_string()); + let id2 = dict.insert("world".to_string()); + + // Test valid gets + assert_eq!(dict.get_id("hello"), Some(id1)); + assert_eq!(dict.get_id("world"), Some(id2)); + + // Test non-existent string + assert_eq!(dict.get_id("nonexistent"), None); +} + +#[test] +fn test_dictionary_len_and_empty() { + let mut dict = Dictionary::new(); + + // Test empty state + assert!(dict.is_empty()); + assert_eq!(dict.len(), 0); + + // Test after insertions + dict.insert("hello".to_string()); + assert!(!dict.is_empty()); + assert_eq!(dict.len(), 1); + + dict.insert("world".to_string()); + assert!(!dict.is_empty()); + assert_eq!(dict.len(), 2); + + // Test duplicate insertion doesn't change length + dict.insert("hello".to_string()); + assert_eq!(dict.len(), 2); +} + +#[test] +fn test_dictionary_id_sequence() { + let mut dict = Dictionary::new(); + + // Test that IDs are assigned sequentially + let id1 = dict.insert("first".to_string()); + let id2 = dict.insert("second".to_string()); + let id3 = dict.insert("third".to_string()); + + assert_eq!(id1, 0); + assert_eq!(id2, 1); + assert_eq!(id3, 2); +} + +#[test] +fn test_dictionary_large_insertions() { + let mut dict = Dictionary::new(); + let num_insertions = 1000; + + // Insert many strings + for i in 0..num_insertions { + let s = format!("string_{}", i); + let id = dict.insert(s); + assert_eq!(id, i as IdType); + } + + assert_eq!(dict.len(), num_insertions); + + // Verify all strings can be retrieved + for i in 0..num_insertions { + let s = format!("string_{}", i); + assert_eq!(dict.get(i as IdType), Some(s.as_str())); + assert_eq!(dict.get_id(&s), Some(i as IdType)); + } +} \ No newline at end of file From ea4b22238019f029e1988fc6bab458464c06af3d Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Sun, 25 May 2025 18:26:51 -0400 Subject: [PATCH 102/102] Add target directory to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff 
--git a/.gitignore b/.gitignore index 2d7573c..b884c82 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ Thumbs.db *.inverted *.forward *.bin +target/