From a9429ae0fbd617d005de18121b1992d7aba0f7f9 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Mon, 21 Oct 2019 12:00:18 +0200 Subject: [PATCH 001/102] Fix typo --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b1a0946..e6cc74c 100644 --- a/README.md +++ b/README.md @@ -105,12 +105,12 @@ Input data format The input file should list all completions in *lexicographical* order. -For example, see the the file `test_data/trec05_efficiency_queries/trec05_efficiency_queries.completions`. +For example, see the the file `test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`. The first column represent the ID of the completion; the other columns contain the tokens separated by white spaces. -(The IDs for the file `trec05_efficiency_queries.completions` are +(The IDs for the file `trec_05_efficiency_queries.completions` are fake, i.e., they do not take into account any particular assignment.) @@ -119,49 +119,49 @@ preparing the datasets for indexing: 1. The command - $ extract_dict.py trec05_efficiency_queries/trec05_efficiency_queries.completions + $ extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions extract the dictionary from a file listing all completions in textual form. 2. The command - $ python map_dataset.py trec05_efficiency_queries/trec05_efficiency_queries.completions + $ python map_dataset.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions maps strings to integer ids. 3. The command - $ python build_stats.py trec05_efficiency_queries/trec05_efficiency_queries.completions.mapped + $ python build_stats.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions.mapped calulcates the dataset statistics. 4. 
The command - $ python build_inverted_and_forward.py trec05_efficiency_queries/trec05_efficiency_queries.completions + $ python build_inverted_and_forward.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions builds the inverted and forward files. If you run the scripts in the reported order, you will get: -- `trec05_efficiency_queries.completions.dict`: lists all the distinct +- `trec_05_efficiency_queries.completions.dict`: lists all the distinct tokens in the completions sorted in lexicographical order. -- `trec05_efficiency_queries.completions.mapped`: lists all completions +- `trec_05_efficiency_queries.completions.mapped`: lists all completions whose tokens have been mapped to integer ids as assigned by a lexicographically-sorted string dictionary (that should be built from the -tokens listed in `trec05_efficiency_queries.completions.dict`). +tokens listed in `trec_05_efficiency_queries.completions.dict`). Each completion terminates with the id `0`. -- `trec05_efficiency_queries.completions.mapped.stats` contains some +- `trec_05_efficiency_queries.completions.mapped.stats` contains some statistics about the datasets, needed to build the data structures more efficiently. - `trec05_efficiency_queries.completions.inverted` is the inverted file. -- `trec05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec05_efficiency_queries.completions.mapped` but sorted in docID order. +- `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. Benchmarks ---------- @@ -174,4 +174,4 @@ Live demo ---------- Start the web server with the program `./web_server ` and access the demo at -`localhost:`. \ No newline at end of file +`localhost:`. 
From 5f97e36d6a196bbd0c9dd61a5e6a201f9a009612 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Mon, 21 Oct 2019 12:50:55 +0200 Subject: [PATCH 002/102] Add python command --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e6cc74c..4f3c123 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ preparing the datasets for indexing: 1. The command - $ extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + $ python extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions extract the dictionary from a file listing all completions in textual form. From 6a772e06d308b578b76bb0c6fce9653ebe4217b7 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 21 Oct 2019 13:42:34 +0200 Subject: [PATCH 003/102] script updated --- test_data/build_inverted_and_forward.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index c627699..743b491 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -21,6 +21,7 @@ with open(input_filename + ".mapped.stats") as f: num_terms = int(f.readline()) print num_terms + f.readline() # skip line containing max num. of query terms num_docs = int(f.readline()) print num_docs From b44ca7eeadd4a80f49a2f9a2b7f1a38385519b2f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 21 Oct 2019 14:44:03 +0200 Subject: [PATCH 004/102] more to README --- README.md | 42 +++++++++++++++++-- .../collect_results_by_varying_percentage.py | 3 +- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 4f3c123..fb803eb 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Query autocompletion in C++. 1. [Description](#descr) 2. [Compiling the code](#compiling) 3. [Input data format](#input) +4. [Building an index](#building) 4. [Benchmarks](#benchmarks) 5. 
[Live demo](#demo) @@ -91,7 +92,7 @@ Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs. For the best of performance, we recommend compiling with: - $ `cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On` + $ cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On For a testing environment, use the following instead: @@ -163,12 +164,47 @@ the data structures more efficiently. - `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. +Building an index +----------- + +After compiling the code, run the program `./build` to build an index. You can specify the type of the index and the name of the file +where the index will be written. + +For example, with + + $ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin + +we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`. + +Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`. + + Benchmarks ---------- -Run `benchmark/benchmark_prefix_topk` and `benchmark/benchmark_conjunctive_topk`. +To run the top-k benchmarks in the `/benchmark` directory, +we first need some query logs. + +You can use -See the directory `results` for the results on the AOL and MSN query log. + python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + +to partition the input completions by number of query terms. 
+ +Then the command + + ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 + +will execute 1000 top-10 queries with 3 terms, from which only 25% +of the prefix of the last token is retained. +(For no locality, it is suggested to shuffle the queries at random, for example using `gshuf` on Mac.) + +We automated the collection of results with the script `script/collected_results_by_varying_percentage.py`. +From within the `/build` directory, run + + $ python ../script/collect_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 + +You can also specify the option "--breakdown" to record timings breakdowns. Live demo ---------- diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index cc1b9a0..f520405 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -5,12 +5,11 @@ dataset_name = sys.argv[3] k = sys.argv[4] num_queries = sys.argv[5] -collect_breakdowns = int(sys.argv[6]) # 0 or 1 output_filename = dataset_name + "." 
+ type breakdown = "" -if collect_breakdowns != 0: +if len(sys.argv) > 6 and sys.argv[6] == "--breakdown": breakdown = "--breakdown" output_filename += ".breakdown" From fcc8165d41c4a702cfa9aa4d5cceda1f47a23306 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 21 Oct 2019 23:28:40 +0200 Subject: [PATCH 005/102] benchmark locate_prefix --- README.md | 10 +- benchmark/CMakeLists.txt | 3 +- benchmark/benchmark_locate_prefix.cpp | 98 +++++++++++++++++++ include/completion_trie.hpp | 8 +- ...te_prefix_results_by_varying_percentage.py | 15 +++ ...ect_topk_results_by_varying_percentage.py} | 0 6 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 benchmark/benchmark_locate_prefix.cpp create mode 100644 script/collect_locate_prefix_results_by_varying_percentage.py rename script/{collect_results_by_varying_percentage.py => collect_topk_results_by_varying_percentage.py} (100%) diff --git a/README.md b/README.md index fb803eb..6c97ea8 100644 --- a/README.md +++ b/README.md @@ -187,24 +187,24 @@ we first need some query logs. You can use - python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + $ python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions to partition the input completions by number of query terms. Then the command - ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 + $ ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 will execute 1000 top-10 queries with 3 terms, from which only 25% of the prefix of the last token is retained. (For no locality, it is suggested to shuffle the queries at random, for example using `gshuf` on Mac.) -We automated the collection of results with the script `script/collected_results_by_varying_percentage.py`. 
+We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run - $ python ../script/collect_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 + $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 -You can also specify the option "--breakdown" to record timings breakdowns. +You can also specify the option `--breakdown` to record timings breakdowns. Live demo ---------- diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index cf8359f..d7f9433 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -2,4 +2,5 @@ add_executable(benchmark_topk benchmark_topk.cpp) add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) -add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) \ No newline at end of file +add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) +add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) \ No newline at end of file diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp new file mode 100644 index 0000000..6e9a1ab --- /dev/null +++ b/benchmark/benchmark_locate_prefix.cpp @@ -0,0 +1,98 @@ +#include + +#include "types.hpp" +#include "statistics.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; + +template +void benchmark_locate_prefix(parameters const& params, + fc_dictionary_type const& dict, + uint32_t max_num_queries, float keep, + essentials::json_lines& result) { + Index index; + { + typename Index::builder builder(params); + builder.build(index); + } + + typedef std::pair query_type; + std::vector strings; + std::vector queries; + 
uint32_t num_queries = 0; + + { + num_queries = load_queries(strings, max_num_queries, keep, std::cin); + for (auto const& string : strings) { + completion_type prefix; + byte_range suffix; + parse(dict, string, prefix, suffix); + range suffix_lex_range = dict.locate_prefix(suffix); + queries.emplace_back(prefix, suffix_lex_range); + } + } + + auto musec_per_query = [&](double time) { + return time / (runs * num_queries); + }; + + essentials::timer_type timer; + timer.start(); + for (uint32_t run = 0; run != runs; ++run) { + for (auto& query : queries) { + auto r = index.locate_prefix(query.first, query.second); + essentials::do_not_optimize_away(r.end - r.begin); + } + } + timer.stop(); + result.add("musec_per_query", + std::to_string(musec_per_query(timer.elapsed()))); +} + +int main(int argc, char** argv) { + int mandatory = 5; + if (argc < mandatory + 1) { + std::cout << argv[0] + << " " + " < queries" + << std::endl; + std::cout << " is a float in [0,1] and specifies how much " + "we keep of the last token in a query " + << std::endl; + return 1; + } + + std::string type(argv[1]); + parameters params; + params.collection_basename = argv[2]; + params.load(); + + std::string num_terms_per_query(argv[3]); + uint32_t max_num_queries = std::atoi(argv[4]); + float keep = std::atof(argv[5]); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + essentials::json_lines result; + result.new_line(); + result.add("num_terms_per_query", num_terms_per_query); + result.add("percentage", std::to_string(keep)); + + if (type == "trie") { + benchmark_locate_prefix( + params, dict, max_num_queries, keep, result); + } else if (type == "fc") { + benchmark_locate_prefix( + params, dict, max_num_queries, keep, result); + } else { + return 1; + } + + result.print(); + return 0; +} \ No newline at end of file diff --git a/include/completion_trie.hpp b/include/completion_trie.hpp index 8ae9036..3d52ee5 100644 --- 
a/include/completion_trie.hpp +++ b/include/completion_trie.hpp @@ -166,16 +166,16 @@ struct completion_trie { completion_trie() {} // If the last token of the query is not completely specified, - // then we search for its lexicographic range among the children of c. + // then we search for its lexicographic range among the children of prefix. // Return [a,b) - range locate_prefix(completion_type const& c, + range locate_prefix(completion_type const& prefix, range suffix_lex_range) const { range r{global::not_found, global::not_found}; range pointer{0, m_nodes.front().size()}; uint32_t i = 0; - for (; i < c.size(); ++i) { - uint64_t pos = m_nodes[i].find(pointer, c[i]); + for (; i < prefix.size(); ++i) { + uint64_t pos = m_nodes[i].find(pointer, prefix[i]); if (pos == global::not_found) return global::invalid_range; pointer = m_pointers[i][pos]; } diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/script/collect_locate_prefix_results_by_varying_percentage.py new file mode 100644 index 0000000..889fa94 --- /dev/null +++ b/script/collect_locate_prefix_results_by_varying_percentage.py @@ -0,0 +1,15 @@ +import sys, os + +type = sys.argv[1] # 'trie' or 'fc' +collection_basename = sys.argv[2] +dataset_name = sys.argv[3] +num_queries = sys.argv[4] + +output_filename = dataset_name + "." 
+ type + ".locate_prefix.timings.json" + +percentages = ["0.0", "0.25", "0.50", "0.75"] + +for perc in percentages: + for terms in range(2,8): # (1,8) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_topk_results_by_varying_percentage.py similarity index 100% rename from script/collect_results_by_varying_percentage.py rename to script/collect_topk_results_by_varying_percentage.py From 3ce021ff7c35ffa1a338ff2a8207336bfc7a0a7e Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 10:29:18 +0200 Subject: [PATCH 006/102] script for benchmarking locate_prefix --- ...ate_prefix_results_by_varying_percentage.py | 2 +- script/collect_results.py | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 script/collect_results.py diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/script/collect_locate_prefix_results_by_varying_percentage.py index 889fa94..e9142d9 100644 --- a/script/collect_locate_prefix_results_by_varying_percentage.py +++ b/script/collect_locate_prefix_results_by_varying_percentage.py @@ -10,6 +10,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(2,8): # (1,8) + for terms in range(1,8): os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + 
".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) diff --git a/script/collect_results.py b/script/collect_results.py deleted file mode 100644 index 9d0dd22..0000000 --- a/script/collect_results.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys, os - -type = sys.argv[1] -exe = sys.argv[2] # prefix_top, conjunctive_topk, topk -dataset_name = sys.argv[3] -k = sys.argv[4] -num_queries = sys.argv[5] -collect_breakdowns = int(sys.argv[6]) # 0 or 1 - -breakdown = "" -if collect_breakdowns != 0: - breakdown = "--breakdown" - -output_filename = dataset_name + "." + exe + ".timings.json" - -for i in range(1, 8): - os.system("../build/benchmark_" + exe + " " + type + " " + k + " ../build/" + dataset_name + ".bin " + str(i) + " " + str(num_queries) + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(i) + ".shuffled 2>> " + output_filename) -os.system("../build/benchmark_" + exe + " " + type + " " + k + " ../build/" + dataset_name + ".bin 8+ " + str(num_queries) + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) From ebe971c09cb5c7026d3187292eda31ee4b4dc016 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 12:06:44 +0200 Subject: [PATCH 007/102] typo in README --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 6c97ea8..d222323 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ For a testing environment, use the following instead: $ cd debug_build $ cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On $ make - + Input data format ----------------- @@ -119,28 +119,28 @@ The scripts in the directory `test_data` help in preparing the datasets for indexing: 1. The command - + $ python extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - + extract the dictionary from a file listing all completions in textual form. 2. The command $ python map_dataset.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - + maps strings to integer ids. 3. The command $ python build_stats.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions.mapped - + calulcates the dataset statistics. 4. The command $ python build_inverted_and_forward.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - + builds the inverted and forward files. If you run the scripts in the reported order, you will get: @@ -165,7 +165,7 @@ the data structures more efficiently. - `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. Building an index ------------ +----------- After compiling the code, run the program `./build` to build an index. You can specify the type of the index and the name of the file where the index will be written. @@ -173,7 +173,7 @@ where the index will be written. For example, with $ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin - + we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`. Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`. @@ -202,8 +202,8 @@ of the prefix of the last token is retained. 
We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run - $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type3.bin trec_05_efficiency_queries 10 5000 - + $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type1.bin trec_05_efficiency_queries 10 5000 + You can also specify the option `--breakdown` to record timings breakdowns. Live demo From 3e072c6ee8bdfb5062d5b2ad7b009629dead9416 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Tue, 22 Oct 2019 16:45:21 +0200 Subject: [PATCH 008/102] Add driver for scripts --- test_data/preprocess.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100755 test_data/preprocess.sh diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh new file mode 100755 index 0000000..ab4dbeb --- /dev/null +++ b/test_data/preprocess.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +collections=`find . 
| grep "\\.completions$"` + +for collection in $collections; do + echo $collection + python extract_dict.py $collection + python map_dataset.py $collection + python build_stats.py $collection.mapped + python build_inverted_and_forward.py $collection +done From 74e6a3c8cc2b1e1861cf95331a193bfb124ec527 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Tue, 22 Oct 2019 18:20:33 +0200 Subject: [PATCH 009/102] Removing utf-8 encoding --- test_data/build_inverted_and_forward.py | 2 +- test_data/build_stats.py | 3 ++- test_data/extract_dict.py | 4 ++-- test_data/map_dataset.py | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 743b491..c47ea17 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -36,7 +36,7 @@ discard = False for i in range(1, len(x)): try: - term = x[i].encode('utf-8') + term = x[i] try: term_id = tokens[term] if term_id not in mapped: diff --git a/test_data/build_stats.py b/test_data/build_stats.py index f9923f0..5fdfdb7 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -35,4 +35,5 @@ output_file.write(str(len(nodes_per_level)) + "\n") for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") -output_file.close() \ No newline at end of file +output_file.close() + diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index 875f85b..0672351 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -21,5 +21,5 @@ dict_file = open(input_filename + ".dict", 'w') for key in sorted(tokens): - dict_file.write(key.encode('utf-8') + "\n") -dict_file.close() \ No newline at end of file + dict_file.write(key + "\n") +dict_file.close() diff --git a/test_data/map_dataset.py b/test_data/map_dataset.py index 86e6357..beb7155 100644 --- a/test_data/map_dataset.py +++ b/test_data/map_dataset.py @@ -24,7 +24,7 @@ 
string_len = 0; mapped = [x[0]] for i in range(1, len(x)): # x[0] stores the docID - t = x[i].encode('utf-8') + t = x[i] try: id = tokens[t] mapped.append(id) @@ -48,4 +48,4 @@ stats_file.write(str(len(tokens)) + "\n") stats_file.write(str(max_string_len) + "\n") -stats_file.close() \ No newline at end of file +stats_file.close() From 2c0b0debb906a108504060f9d7a79126a4191755 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 19:29:16 +0200 Subject: [PATCH 010/102] automated testing with doctest --- .gitmodules | 3 + CMakeLists.txt | 9 +- external/CMakeLists.txt | 5 +- external/doctest | 1 + include/integer_fc_dictionary.hpp | 32 +++- test/CMakeLists.txt | 9 +- test/test_common.hpp | 54 ++++++ test/test_completion_trie.cpp | 109 +++--------- test/test_fc_dictionary.cpp | 209 +++++++----------------- test/test_integer_fc_dictionary.cpp | 177 +++++--------------- test/test_locate_prefix.cpp | 131 +++++---------- test_data/build_inverted_and_forward.py | 2 +- test_data/extract_dict.py | 2 +- test_data/map_dataset.py | 2 +- 14 files changed, 273 insertions(+), 472 deletions(-) create mode 160000 external/doctest create mode 100644 test/test_common.hpp diff --git a/.gitmodules b/.gitmodules index 72f21cd..60c5af2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "external/mongoose"] path = external/mongoose url = https://github.com/cesanta/mongoose.git +[submodule "external/doctest"] + path = external/doctest + url = https://github.com/onqtam/doctest.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c90e49..1b2fa97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,4 +50,11 @@ include_directories(${AUTOCOMPLETE_SOURCE_DIR}/include) add_subdirectory(external) add_subdirectory(src) add_subdirectory(benchmark) -add_subdirectory(test) \ No newline at end of file + +enable_testing() +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp 
test/test_integer_fc_dictionary.cpp) +foreach(TEST_SRC ${TEST_SOURCES}) + get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension + add_executable(${TEST_SRC_NAME} ${TEST_SRC}) + add_test(${TEST_SRC_NAME} ${TEST_SRC_NAME}) +endforeach(TEST_SRC) diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index d4722aa..5d0ee92 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -1 +1,4 @@ -include_directories(essentials/include) \ No newline at end of file +include_directories(essentials/include) + +set(DOCTEST_INCLUDE_DIR ${AUTOCOMPLETE_SOURCE_DIR}/external/doctest) +include_directories(${DOCTEST_INCLUDE_DIR}) \ No newline at end of file diff --git a/external/doctest b/external/doctest new file mode 160000 index 0000000..7ac22cc --- /dev/null +++ b/external/doctest @@ -0,0 +1 @@ +Subproject commit 7ac22cc2190eb090ff66509015fb2d995bce957e diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index 218cacf..e0b228b 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -166,8 +166,9 @@ struct integer_fc_dictionary { prefix.push_back(global::invalid_term_id); } - locate_bucket(completion_to_uint32_range(prefix), h_end, bucket_id_end, - bucket_id_begin // hint + locate_right_bucket(completion_to_uint32_range(prefix), h_end, + bucket_id_end, + bucket_id_begin // hint ); uint32_t p_end = bucket_id_end * (BucketSize + 1); p_end += right_locate(completion_to_uint32_range(prefix), h_end, @@ -276,6 +277,33 @@ struct integer_fc_dictionary { return false; } + void locate_right_bucket(uint32_range t, uint32_range& h, + id_type& bucket_id, + int lower_bound_hint = 0) const { + int lo = lower_bound_hint, hi = buckets() - 1, mi = 0, cmp = 0; + size_t n = t.end - t.begin; + while (lo <= hi) { + mi = (lo + hi) / 2; + h = header(mi); + cmp = uint32_range_compare(h, t, n); + if (cmp > 0) { + hi = mi - 1; + } else if (cmp < 0) { + lo = mi + 1; + } else { + bucket_id = mi; + 
return; + } + } + + if (cmp < 0) { + bucket_id = mi; + } else { + bucket_id = mi - 1; + h = header(bucket_id); + } + } + #define INT_FC_DICT_LOCATE_INIT \ static uint32_t decoded[2 * constants::MAX_NUM_TERMS_PER_QUERY]; \ memcpy(decoded, h.begin, (h.end - h.begin) * sizeof(uint32_t)); \ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0687354..4d62c01 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,10 +1,11 @@ -add_executable(test_completion_trie test_completion_trie.cpp) -add_executable(test_fc_dictionary test_fc_dictionary.cpp) -add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp) +# add_executable(test_completion_trie test_completion_trie.cpp) +# add_executable(test_fc_dictionary test_fc_dictionary.cpp) +# add_executable(test_locate_prefix test_locate_prefix.cpp) +# add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp) + add_executable(test_cartesian_tree test_cartesian_tree.cpp) add_executable(test_inverted_index test_inverted_index.cpp) add_executable(test_forward_index test_forward_index.cpp) add_executable(test_unsorted_list test_unsorted_list.cpp) add_executable(test_autocomplete test_autocomplete.cpp) -add_executable(test_locate_prefix test_locate_prefix.cpp) add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_common.hpp b/test/test_common.hpp new file mode 100644 index 0000000..0bc701a --- /dev/null +++ b/test/test_common.hpp @@ -0,0 +1,54 @@ +#pragma once + +#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN +#include "../external/doctest/doctest/doctest.h" + +#include + +#include "types.hpp" +#include "../benchmark/benchmark_common.hpp" + +namespace autocomplete { +namespace testing { + +static std::string test_filename( + "../test_data/trec_05_efficiency_queries/" + "trec_05_efficiency_queries.completions"); + +static std::string tmp_filename("tmp.bin"); + +id_type locate(std::vector const& terms, std::string const& 
t) { + return std::distance(terms.begin(), + std::lower_bound(terms.begin(), terms.end(), t)) + + 1; +} + +range locate_prefix(std::vector const& strings, + std::string const& p) { + auto comp_l = [](std::string const& l, std::string const& r) { + if (l.size() < r.size()) { + return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; + } + return strcmp(l.c_str(), r.c_str()) < 0; + }; + + auto comp_r = [](std::string const& l, std::string const& r) { + if (l.size() < r.size()) { + return strncmp(l.c_str(), r.c_str(), l.size()) < 0; + } + return strcmp(l.c_str(), r.c_str()) < 0; + }; + + range r; + r.begin = std::distance( + strings.begin(), + std::lower_bound(strings.begin(), strings.end(), p, comp_l)); + r.end = std::distance( + strings.begin(), + std::upper_bound(strings.begin(), strings.end(), p, comp_r)); + + return r; +} + +} // namespace testing +} // namespace autocomplete \ No newline at end of file diff --git a/test/test_completion_trie.cpp b/test/test_completion_trie.cpp index 1aba989..c5155e1 100644 --- a/test/test_completion_trie.cpp +++ b/test/test_completion_trie.cpp @@ -1,106 +1,37 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" +#include "test_common.hpp" using namespace autocomplete; -struct completion_comparator { - bool operator()(completion_type const& lhs, - completion_type const& rhs) const { - size_t l = 0; // |lcp(lhs,rhs)| - while (l < lhs.size() - 1 and l < rhs.size() - 1 and lhs[l] == rhs[l]) { - ++l; - } - return lhs[l] < rhs[l]; - } -}; - -range locate_prefix(std::vector const& completions, - completion_type const& c) { - completion_comparator comp; - auto b = std::lower_bound(completions.begin(), completions.end(), c, comp); - uint64_t begin = std::distance(completions.begin(), b); - auto e = std::upper_bound(completions.begin() + begin, completions.end(), c, - comp); - uint64_t end = std::distance(completions.begin(), e); - return {begin, end}; -} - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < 
mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } +typedef ef_completion_trie completion_trie_type; +TEST_CASE("test completion_trie::is_member()") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); - // typedef uint64_completion_trie completion_trie_type; - typedef ef_completion_trie completion_trie_type; - { completion_trie_type::builder builder(params); completion_trie_type ct; builder.build(ct); - ct.print_stats(); - - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(ct, output_filename); - essentials::logger("DONE"); - } + REQUIRE(ct.size() == params.num_completions); + essentials::save(ct, output_filename); } { - if (output_filename) { - completion_trie_type ct; - essentials::logger("loading data structure from disk..."); - essentials::load(ct, output_filename); - essentials::logger("DONE"); - // essentials::print_size(ct); - std::cout << "using " << ct.bytes() << " bytes" << std::endl; - - std::vector completions; - completions.reserve(params.num_completions); - std::ifstream input(params.collection_basename + ".mapped", - std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not found"); - } - - completion_iterator it(params, input); - while (input) { - auto& record = *it; - completions.push_back(std::move(record.completion)); - ++it; - } - input.close(); - - // check all completions - essentials::logger("testing is_member()..."); - for (auto const& c : completions) { - if (!ct.is_member(c)) { - print_completion(c); - std::cout << " not found!" 
<< std::endl; - return 1; - } - } - essentials::logger("DONE..."); + completion_trie_type ct; + essentials::load(ct, output_filename); + REQUIRE(ct.size() == params.num_completions); + std::ifstream input(params.collection_basename + ".mapped", + std::ios_base::in); + INFO("testing is_member()"); + completion_iterator it(params, input); + while (input) { + auto& record = *it; + REQUIRE(ct.is_member(record.completion)); + ++it; } + input.close(); + std::remove(output_filename); } - - return 0; } diff --git a/test/test_fc_dictionary.cpp b/test/test_fc_dictionary.cpp index 3f79d1e..50d12b0 100644 --- a/test/test_fc_dictionary.cpp +++ b/test/test_fc_dictionary.cpp @@ -1,175 +1,86 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -id_type locate(std::vector const& terms, std::string const& t) { - return std::distance(terms.begin(), - std::lower_bound(terms.begin(), terms.end(), t)) + - 1; -} - -range locate_prefix(std::vector const& terms, - std::string const& p) { - auto comp_l = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - auto comp_r = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) < 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - range r; - r.begin = std::distance( - terms.begin(), std::lower_bound(terms.begin(), terms.end(), p, comp_l)); - r.end = - std::distance(terms.begin(), - std::upper_bound(terms.begin(), terms.end(), p, comp_r)) - - 1; - - return r; -} - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } 
- +TEST_CASE("test fc_dictionary") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); { - // build, print and write fc_dictionary_type::builder builder(params); fc_dictionary_type dict; builder.build(dict); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - if (output_filename) { - // essentials::print_size(dict); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename); - essentials::logger("DONE"); - } + essentials::save(dict, output_filename); } { - if (output_filename) { - fc_dictionary_type dict; - essentials::logger("loading data structure from disk..."); - essentials::load(dict, output_filename); - essentials::logger("DONE"); - // essentials::print_size(dict); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - // test locate() and extract for all strings - std::vector terms; - terms.reserve(params.num_terms); - std::ifstream input((params.collection_basename + ".dict").c_str(), - std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not found"); - } - std::string term; - term.reserve(256 + 1); + fc_dictionary_type dict; + essentials::load(dict, output_filename); + + // test locate() and extract for all strings + std::vector terms; + terms.reserve(params.num_terms); + std::ifstream input((params.collection_basename + ".dict").c_str(), + std::ios_base::in); + if (!input.good()) { + throw std::runtime_error("File not found"); + } + std::string term; + term.reserve(256 + 1); + input >> term; + while (input) { + terms.push_back(std::move(term)); input >> term; - while (input) { - terms.push_back(std::move(term)); - input >> term; - } - input.close(); - - std::cout << "terms.size() " << terms.size() << std::endl; - - std::vector decoded(2 * - constants::MAX_NUM_CHARS_PER_QUERY); - - for (auto const& t : terms) { - 
id_type expected = locate(terms, t); - id_type got = dict.locate(string_to_byte_range(t)); + } + input.close(); - std::cout << "locating term '" << t << "'" << std::endl; - if (got != expected) { - std::cout << "Error: expected id " << expected << "," - << " but got id " << got << std::endl; - return 1; - } + std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); - std::cout << "extracting term '" << t << "'" << std::endl; - uint8_t string_len = dict.extract(got, decoded.data()); + for (auto const& t : terms) { + id_type expected = testing::locate(terms, t); + id_type got = dict.locate(string_to_byte_range(t)); - if (string_len != t.size()) { - std::cout << "Error: expected size " << t.size() << "," - << " but got size " << string_len << std::endl; - return 1; - } + REQUIRE_MESSAGE(got == expected, "expected id " << expected + << ", but got id " + << got); - auto s = reinterpret_cast(decoded.data()); - for (uint8_t i = 0; i != string_len; ++i) { - if (t[i] != s[i]) { - std::cout << "Error: expected char " << t[i] - << " but got " << s[i] << std::endl; - return 1; - } - } + uint8_t string_len = dict.extract(got, decoded.data()); + REQUIRE_MESSAGE(string_len == t.size(), + "expected size " << t.size() << ", but got size " + << string_len); - std::cout << "lexicographic id of '" << t << "' is " << got - << std::endl; + auto s = reinterpret_cast(decoded.data()); + for (uint8_t i = 0; i != string_len; ++i) { + REQUIRE_MESSAGE(t[i] == s[i], "expected char " << t[i] + << " but got " + << s[i]); } + } - // test locate_prefix() for all strings - std::string prefix; - prefix.reserve(256 + 1); - for (auto const& t : terms) { - uint32_t n = t.size(); - for (uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) { - prefix.clear(); - for (uint32_t i = 0; i != prefix_len; ++i) { - prefix.push_back(t[i]); - } - - std::cout << "locating prefix '" << prefix << "'" - << std::endl; - range expected = locate_prefix(terms, prefix); - range got = - 
dict.locate_prefix(string_to_byte_range(prefix)); - - if ((got.begin != expected.begin) or - (got.end != expected.end)) { - std::cout << "Error for prefix '" << prefix - << "' : expected [" << expected.begin << "," - << expected.end << "] but got [" << got.begin - << "," << got.end << "]" << std::endl; - return 1; - } - - std::cout << "prefix range of '" << prefix << "' is [" - << got.begin << "," << got.end << "]" - << std::endl; + // test locate_prefix() for all strings + std::string prefix; + prefix.reserve(256 + 1); + for (auto const& t : terms) { + uint32_t n = t.size(); + for (uint32_t prefix_len = 1; prefix_len <= n; ++prefix_len) { + prefix.clear(); + for (uint32_t i = 0; i != prefix_len; ++i) { + prefix.push_back(t[i]); } + + range expected = testing::locate_prefix(terms, prefix); + range got = dict.locate_prefix(string_to_byte_range(prefix)); + REQUIRE_MESSAGE((got.begin == expected.begin and + got.end == expected.end - 1), + "Error for prefix '" + << prefix << "' : expected [" + << expected.begin << "," << expected.end - 1 + << "] but got [" << got.begin << "," + << got.end << "]"); } } + std::remove(output_filename); } - - return 0; } diff --git a/test/test_integer_fc_dictionary.cpp b/test/test_integer_fc_dictionary.cpp index 4f78052..b67879d 100644 --- a/test/test_integer_fc_dictionary.cpp +++ b/test/test_integer_fc_dictionary.cpp @@ -1,155 +1,62 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } - +TEST_CASE("test integer_fc_dictionary") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + 
params.collection_basename = testing::test_filename.c_str(); params.load(); { - // build, print and write integer_fc_dictionary_type::builder builder(params); integer_fc_dictionary_type dict; builder.build(dict); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - if (output_filename) { - // essentials::print_size(dict); - essentials::logger("saving data structure to disk..."); - essentials::save(dict, output_filename); - essentials::logger("DONE"); - } + essentials::save(dict, output_filename); } { - if (output_filename) { - integer_fc_dictionary_type dict; - essentials::logger("loading data structure from disk..."); - essentials::load(dict, output_filename); - essentials::logger("DONE"); - std::cout << "using " << dict.bytes() << " bytes" << std::endl; - - { - essentials::logger("testing extract() and locate()..."); - std::ifstream input( - (params.collection_basename + ".mapped").c_str(), - std::ios_base::in); - completion_iterator it(params, input); - - completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY); - for (id_type id = 0; id != params.num_completions; ++id, ++it) { - auto const& expected = (*it).completion; - assert(expected.size() > 0); - uint8_t size = dict.extract(id, decoded); - if (expected.size() - 1 != size) { - std::cout << "Error in decoding the " << id - << "-th string: expected size " - << expected.size() - 1 << "," - << " but got size " << int(size) << std::endl; - return 1; - } - - for (uint8_t i = 0; i != size; ++i) { - if (decoded[i] != expected[i]) { - std::cout - << "Error in decoding the " << id - << "-th string: expected " << expected[i] << "," - << " but got " << decoded[i] << " at position " - << int(i) << std::endl; - return 1; - } - } - - id_type got_id = - dict.locate({decoded.data(), decoded.data() + size}); - if (got_id != id) { - std::cout << "Error in locating the " << id - << "-th string: expected id " << id << "," - << " but got id " << got_id << std::endl; - return 1; - } + 
integer_fc_dictionary_type dict; + essentials::load(dict, output_filename); + + { + std::ifstream input( + (params.collection_basename + ".mapped").c_str(), + std::ios_base::in); + completion_iterator it(params, input); + + completion_type decoded(2 * constants::MAX_NUM_TERMS_PER_QUERY); + for (id_type id = 0; id != params.num_completions; ++id, ++it) { + auto const& expected = (*it).completion; + REQUIRE(expected.size() > 0); + uint8_t size = dict.extract(id, decoded); + + REQUIRE_MESSAGE(expected.size() - 1 == size, + "Error in decoding the " + << id << "-th string: expected size " + << expected.size() - 1 << "," + << " but got size " << int(size)); + + for (uint8_t i = 0; i != size; ++i) { + REQUIRE_MESSAGE(decoded[i] == expected[i], + "Error in decoding the " + << id << "-th string: expected " + << expected[i] << "," + << " but got " << decoded[i] + << " at position " << int(i)); } - input.close(); - essentials::logger("it's all good"); + id_type got_id = + dict.locate({decoded.data(), decoded.data() + size}); + REQUIRE_MESSAGE(got_id == id, "Error in locating the " + << id + << "-th string: expected id " + << id << "," + << " but got id " << got_id); } - // { - // uint64_completion_trie::builder builder(params); - // uint64_completion_trie ct; - // builder.build(ct); - // std::cout << "using " << ct.bytes() << " bytes" << std::endl; - - // essentials::logger("testing locate_prefix()..."); - - // std::ifstream input( - // (params.collection_basename + ".mapped").c_str(), - // std::ios_base::in); - // completion_iterator it(params, input); - - // uint32_t num_checks = - // std::min(params.num_completions, 30000); - - // completion_type prefix; - // for (uint32_t i = 0; i != num_checks; ++i, ++it) { - // auto const& expected = (*it).completion; - // assert(expected.size() > 0); - - // for (uint32_t prefix_len = 1; - // prefix_len <= expected.size() - 1; ++prefix_len) { - // prefix.clear(); - // for (uint32_t i = 0; i != prefix_len; ++i) { - // 
prefix.push_back(expected[i]); - // } - - // range expected = ct.locate_prefix(prefix); - // range got = dict.locate_prefix( - // completion_to_uint32_range(prefix)); - - // if ((got.begin != expected.begin) or - // (got.end != expected.end - 1)) { - // std::cout << "Error for prefix "; - // print_completion(prefix); - // std::cout << ": expected [" << expected.begin << - // "," - // << expected.end - 1 << "] but got [" - // << got.begin << "," << got.end << "]" - // << std::endl; - // return 1; - // } - - // // std::cout << "prefix range of "; - // // print_completion(prefix); - // // std::cout << " is [" << got.begin << "," << - // got.end - // // << "]" << std::endl; - // } - // } - - // input.close(); - // essentials::logger("it's all good"); - // } + input.close(); } + std::remove(output_filename); } - - return 0; } diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index fd3dcb4..8938965 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -1,81 +1,35 @@ -#include - -#include "types.hpp" -#include "../benchmark/benchmark_common.hpp" +#include "test_common.hpp" using namespace autocomplete; -range locate_prefix(std::vector const& strings, - std::string const& p) { - auto comp_l = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) <= 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - auto comp_r = [](std::string const& l, std::string const& r) { - if (l.size() < r.size()) { - return strncmp(l.c_str(), r.c_str(), l.size()) < 0; - } - return strcmp(l.c_str(), r.c_str()) < 0; - }; - - range r; - r.begin = std::distance( - strings.begin(), - std::lower_bound(strings.begin(), strings.end(), p, comp_l)); - r.end = std::distance( - strings.begin(), - std::upper_bound(strings.begin(), strings.end(), p, comp_r)); - - return r; -} +typedef ef_completion_trie completion_trie_type; template -int test_locate_prefix(Dictionary const& dict, Index 
const& index, - std::vector const& queries, - std::vector const& strings) { +void test_locate_prefix(Dictionary const& dict, Index const& index, + std::vector const& queries, + std::vector const& strings) { for (auto const& query : queries) { - std::string query_copy = query; - range expected = locate_prefix(strings, query); - - // std::cout << "query: '" << query << "'" << std::endl; + range expected = testing::locate_prefix(strings, query); completion_type prefix; byte_range suffix; - parse(dict, query_copy, prefix, suffix); - - // print_completion(prefix); - // std::cout << std::endl; - // print(suffix); - // std::cout << std::endl; + parse(dict, query, prefix, suffix); range suffix_lex_range = dict.locate_prefix(suffix); suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range got = index.locate_prefix(prefix, suffix_lex_range); - if ((got.begin != expected.begin) or (got.end != expected.end)) { - std::cout << "Error for query '" << query << "': "; - std::cout << "expected [" << expected.begin << "," << expected.end - << ") but got [" << got.begin << "," << got.end << ")" - << std::endl; - return 1; - } + REQUIRE_MESSAGE( + (got.begin == expected.begin and got.end == expected.end), + "Error for query '" << query << "': expected [" << expected.begin + << "," << expected.end << ") but got [" + << got.begin << "," << got.end << ")"); } - - return 0; } -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " " << std::endl; - return 1; - } - +TEST_CASE("test locate_prefix()") { parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); fc_dictionary_type dict; @@ -102,40 +56,41 @@ int main(int argc, char** argv) { " strings"); } - uint32_t max_num_queries = std::atoi(argv[2]); + constexpr uint32_t max_num_queries = 5000; std::vector queries; - essentials::logger("loading queries..."); - uint32_t num_queries = - 
load_queries(queries, max_num_queries, true, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; + static std::vector query_terms = {1, 2, 3, 4, 5, 6, 7}; - { - // typedef uint64_completion_trie completion_trie_type; - typedef ef_completion_trie completion_trie_type; + completion_trie_type ct_index; + integer_fc_dictionary_type fc_index; - completion_trie_type index; - { - completion_trie_type::builder builder(params); - builder.build(index); - } - essentials::logger("testing locate_prefix() for completion_trie..."); - int ret = test_locate_prefix(dict, index, queries, strings); - if (ret) return 1; - essentials::logger("it's all good"); + { + completion_trie_type::builder builder(params); + builder.build(ct_index); + REQUIRE(ct_index.size() == params.num_completions); } { - integer_fc_dictionary_type index; - { - integer_fc_dictionary_type::builder builder(params); - builder.build(index); - } - essentials::logger( - "testing locate_prefix() for integer_fc_dictionary..."); - int ret = test_locate_prefix(dict, index, queries, strings); - if (ret) return 1; - essentials::logger("it's all good"); + integer_fc_dictionary_type::builder builder(params); + builder.build(fc_index); + REQUIRE(fc_index.size() == params.num_completions); } - return 0; + for (auto perc : percentages) { + for (auto num_terms : query_terms) { + std::cout << "percentage " << perc * 100.0 << "%, num_terms " + << num_terms << std::endl; + { + queries.clear(); + std::ifstream querylog((params.collection_basename + + ".length=" + std::to_string(num_terms)) + .c_str()); + load_queries(queries, max_num_queries, perc, querylog); + querylog.close(); + } + + test_locate_prefix(dict, ct_index, queries, strings); + test_locate_prefix(dict, fc_index, queries, strings); + } + } } diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 743b491..c47ea17 100644 --- 
a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -36,7 +36,7 @@ discard = False for i in range(1, len(x)): try: - term = x[i].encode('utf-8') + term = x[i] try: term_id = tokens[term] if term_id not in mapped: diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index 875f85b..e3c05b5 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -21,5 +21,5 @@ dict_file = open(input_filename + ".dict", 'w') for key in sorted(tokens): - dict_file.write(key.encode('utf-8') + "\n") + dict_file.write(key + "\n") dict_file.close() \ No newline at end of file diff --git a/test_data/map_dataset.py b/test_data/map_dataset.py index 86e6357..1a8fd13 100644 --- a/test_data/map_dataset.py +++ b/test_data/map_dataset.py @@ -24,7 +24,7 @@ string_len = 0; mapped = [x[0]] for i in range(1, len(x)): # x[0] stores the docID - t = x[i].encode('utf-8') + t = x[i] try: id = tokens[t] mapped.append(id) From 5afbdf530d9c1ce767c777ad84cf875299ce8896 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 20:33:12 +0200 Subject: [PATCH 011/102] automated testing with doctest --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 7 --- test/test_cartesian_tree.cpp | 83 -------------------------- test/test_unsorted_list.cpp | 110 ++++++++++------------------------- 4 files changed, 32 insertions(+), 170 deletions(-) delete mode 100644 test/test_cartesian_tree.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2fa97..5b89fe7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp) +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp) 
foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} ${TEST_SRC}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4d62c01..c220919 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,11 +1,4 @@ -# add_executable(test_completion_trie test_completion_trie.cpp) -# add_executable(test_fc_dictionary test_fc_dictionary.cpp) -# add_executable(test_locate_prefix test_locate_prefix.cpp) -# add_executable(test_integer_fc_dictionary test_integer_fc_dictionary.cpp) - -add_executable(test_cartesian_tree test_cartesian_tree.cpp) add_executable(test_inverted_index test_inverted_index.cpp) add_executable(test_forward_index test_forward_index.cpp) -add_executable(test_unsorted_list test_unsorted_list.cpp) add_executable(test_autocomplete test_autocomplete.cpp) add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_cartesian_tree.cpp b/test/test_cartesian_tree.cpp deleted file mode 100644 index 0c4fd38..0000000 --- a/test/test_cartesian_tree.cpp +++ /dev/null @@ -1,83 +0,0 @@ -#include -#include - -#include "types.hpp" - -using namespace autocomplete; - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } - - parameters params; - params.collection_basename = argv[1]; - params.load(); - - { - // build and write - - // std::vector doc_ids = {23, 2, 4, 0, 88, 23, 2, 4, 55, 3, - // 7, 6, 90, 34, 2, 3, 1, 12, 23}; - - std::vector doc_ids; - doc_ids.reserve(params.num_completions); - std::ifstream input(params.collection_basename + ".mapped", - std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not 
found"); - } - completion_iterator it(params, input); - while (input) { - auto const& record = *it; - doc_ids.push_back(record.doc_id); - ++it; - } - input.close(); - - cartesian_tree rmq; - rmq.build(doc_ids, std::less()); - assert(rmq.size() == doc_ids.size()); - std::cout << "using " << rmq.bytes() << " bytes" << std::endl; - - if (output_filename) { - // essentials::print_size(rmq); - essentials::logger("saving data structure to disk..."); - essentials::save(rmq, output_filename); - essentials::logger("DONE"); - } - } - - { - // load and print - if (output_filename) { - cartesian_tree rmq; - essentials::logger("loading data structure from disk..."); - essentials::load(rmq, output_filename); - essentials::logger("DONE"); - - std::cout << "using " << rmq.bytes() << " bytes" << std::endl; - - for (size_t i = 0; i != rmq.size(); ++i) { - for (size_t j = i; j != rmq.size(); ++j) { - std::cout << "rmq[" << i << "," << j - << "] = " << rmq.rmq(i, j) << std::endl; - } - } - } - } - - return 0; -} diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 9b9b000..44abc5e 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -1,15 +1,7 @@ -#include -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -static const uint32_t max_k = 15; -static const uint32_t k = 10; -static_assert(k <= max_k, "k must be less than max allowed"); -static const uint32_t num_queries = 10000; - std::vector naive_topk(std::vector const& input, range r, uint32_t k) { uint32_t range_len = r.end - r.begin; @@ -41,37 +33,22 @@ std::vector gen_random_queries(uint32_t num_queries, return queries; } -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; 
- } - } - +TEST_CASE("test unsorted_list") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); + static const uint32_t k = 10; + static_assert(k <= constants::MAX_K, "k must be less than max allowed"); + static const uint32_t num_queries = 5000; + std::vector doc_ids; { - // build and write doc_ids.reserve(params.num_completions); std::ifstream input(params.collection_basename + ".mapped", std::ios_base::in); - if (!input.good()) { - throw std::runtime_error("File not found"); - } completion_iterator it(params, input); while (input) { auto const& record = *it; @@ -85,64 +62,39 @@ int main(int argc, char** argv) { std::vector tmp = doc_ids; std::sort(tmp.begin(), tmp.end()); for (id_type id = 0; id != doc_ids.size(); ++id) { - if (tmp[id] != id) { - std::cout << "Error: id " << id << " not found" - << std::endl; - return 1; - } + REQUIRE_MESSAGE(tmp[id] == id, + "Error: id " << id << " not found"); } } succinct_rmq list; list.build(doc_ids); - assert(list.size() == doc_ids.size()); - std::cout << "using " << list.bytes() << " bytes" << std::endl; + REQUIRE(list.size() == doc_ids.size()); - if (output_filename) { - // essentials::print_size(list); - essentials::logger("saving data structure to disk..."); - essentials::save(list, output_filename); - essentials::logger("DONE"); - } + essentials::save(list, output_filename); } { - if (output_filename) { - succinct_rmq list; - essentials::logger("loading data structure from disk..."); - essentials::load(list, output_filename); - essentials::logger("DONE"); - - std::cout << "using " << list.bytes() << " bytes" << std::endl; - - std::vector topk(max_k); - auto queries = gen_random_queries(num_queries, doc_ids.size()); - std::cout << "testing top-" << k << " " << num_queries - << " random queries..." 
<< std::endl; - - for (auto q : queries) { - auto expected = naive_topk(doc_ids, q, k); - uint32_t num_elements = list.topk(q, k, topk); - - if (expected.size() != num_elements) { - std::cout << "Error: expected " << expected.size() - << " topk elements but got " << num_elements - << std::endl; - return 1; - } - - for (uint32_t i = 0; i != num_elements; ++i) { - if (topk[i] != expected[i]) { - std::cout << "Error: expected " << expected[i] - << " but got " << topk[i] << std::endl; - return 1; - } - } + succinct_rmq list; + essentials::load(list, output_filename); + + std::vector topk(constants::MAX_K); + auto queries = gen_random_queries(num_queries, doc_ids.size()); + + for (auto q : queries) { + auto expected = naive_topk(doc_ids, q, k); + uint32_t results = list.topk(q, k, topk); + REQUIRE_MESSAGE(expected.size() == results, + "Error: expected " << expected.size() + << " topk elements but got " + << results); + for (uint32_t i = 0; i != results; ++i) { + REQUIRE_MESSAGE(topk[i] == expected[i], + "Error: expected " << expected[i] << " but got " + << topk[i]); } - - std::cout << "it's all good" << std::endl; } - } - return 0; + std::remove(output_filename); + } } From f0eee6dfc70f52f5405b9dc324f37f91a918e3f1 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 20:34:23 +0200 Subject: [PATCH 012/102] empty todo --- TODO.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/TODO.md b/TODO.md index 082ced9..e69de29 100644 --- a/TODO.md +++ b/TODO.md @@ -1,2 +0,0 @@ - -- Study the effect of compression. From 48fab02dfa10f3136159b2fa7163a82bf12bf8f4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 22 Oct 2019 20:46:54 +0200 Subject: [PATCH 013/102] README updated --- README.md | 52 +++++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index d222323..a117f62 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,10 @@ Query autocompletion in C++. 1. 
[Description](#descr) 2. [Compiling the code](#compiling) 3. [Input data format](#input) -4. [Building an index](#building) -4. [Benchmarks](#benchmarks) -5. [Live demo](#demo) +4. [Running the unit tests](#testing) +5. [Building an index](#building) +6. [Benchmarks](#benchmarks) +7. [Live demo](#demo) Description ----------- @@ -115,35 +116,14 @@ tokens separated by white spaces. fake, i.e., they do not take into account any particular assignment.) -The scripts in the directory `test_data` help in -preparing the datasets for indexing: +The script `preprocess.sh` in the directory `test_data` helps +in preparing the data for indexing. +Thus, from within the directory `test_data`, it is sufficient +to do: + + $ bash preprocess.sh -1. The command - - $ python extract_dict.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - - extract the dictionary -from a file listing all completions in textual form. - -2. The command - - $ python map_dataset.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - - maps strings to integer ids. - -3. The command - - $ python build_stats.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions.mapped - - calulcates the dataset statistics. - -4. The command - - $ python build_inverted_and_forward.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions - - builds the inverted and forward files. - -If you run the scripts in the reported order, you will get: +If you run the script, you will get: - `trec_05_efficiency_queries.completions.dict`: lists all the distinct tokens in the completions sorted in lexicographical @@ -164,6 +144,16 @@ the data structures more efficiently. - `trec_05_efficiency_queries.completions.forward` is the forward file. Note that each list is *not* sorted, thus the lists are the same as the ones contained in `trec_05_efficiency_queries.completions.mapped` but sorted in docID order. 
+Running the unit tests +----------- + +The unit tests are written using [doctest](https://github.com/onqtam/doctest). + +After compilation and preparation of the data for indexing (see Section [Input data format](#input)), it is advised +to run the unit tests with: + + $ make test + Building an index ----------- From 2f70613697170ee08192f5073ae0a4490063e616 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 12:08:07 +0200 Subject: [PATCH 014/102] better testing --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 1 - test/test_inverted_index.cpp | 187 +++++++++++++++++------------------ test/test_unsorted_list.cpp | 90 +++++++++++++++-- 4 files changed, 173 insertions(+), 107 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b89fe7..3fa9125 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp) +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp) foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} ${TEST_SRC}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c220919..a78df87 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,3 @@ -add_executable(test_inverted_index test_inverted_index.cpp) add_executable(test_forward_index test_forward_index.cpp) add_executable(test_autocomplete test_autocomplete.cpp) add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_inverted_index.cpp 
b/test/test_inverted_index.cpp index 81f913e..aefdaae 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -1,127 +1,124 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; +typedef ef_inverted_index inverted_index_type; +typedef std::vector term_ids; + +std::vector gen_random_queries(uint32_t num_queries, + uint32_t max_num_terms, + uint32_t max_range_len) { + assert(max_num_terms > 1); + std::vector queries; + queries.reserve(num_queries); + essentials::uniform_int_rng random_num_terms(2, max_num_terms); + essentials::uniform_int_rng random_term_id(1, max_range_len); + + for (uint32_t i = 0; i != num_queries; ++i) { + term_ids q; + uint32_t num_terms = random_num_terms.gen(); + q.reserve(num_terms); + uint32_t num_distinct_terms = 0; + while (true) { + q.clear(); + for (uint32_t i = 0; i != num_terms; ++i) { + auto t = random_term_id.gen(); + assert(t >= 1 and t <= max_range_len); + q.push_back(t); + } + std::sort(q.begin(), q.end()); + auto end = std::unique(q.begin(), q.end()); + num_distinct_terms = std::distance(q.begin(), end); + if (num_distinct_terms >= 2) break; } + q.resize(num_distinct_terms); + queries.push_back(q); } + return queries; +} + +TEST_CASE("test inverted_index::intersection_iterator") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); - typedef ef_inverted_index inverted_index_type; - { - // build, print and write inverted_index_type::builder builder(params); inverted_index_type index; 
builder.build(index); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); - essentials::logger("DONE"); - } + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); } { - if (output_filename) { - inverted_index_type index; - essentials::logger("loading data structure from disk..."); - essentials::load(index, output_filename); - essentials::logger("DONE"); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - - std::vector intersection(index.num_docs()); // at most - std::vector term_ids; - term_ids.reserve(2); - - // id_type i = 293; - // id_type j = 294; - // id_type i = 899; - // id_type j = 822; - id_type i = 2401599 - 1; - id_type j = 1752198 - 1; - term_ids.push_back(i + 1); - term_ids.push_back(j + 1); - // uint64_t size = index.intersect(term_ids, intersection); + inverted_index_type index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + + static const uint32_t num_queries = 1000000; + static const uint32_t max_num_terms = 5; + auto queries = + gen_random_queries(num_queries, max_num_terms, index.num_terms()); + + std::vector first(index.num_docs()); + std::vector second(index.num_docs()); + std::vector intersection(index.num_docs()); + + for (auto const& q : queries) { + uint32_t first_size = 0; + uint32_t second_size = 0; + assert(q.size() >= 2); { - std::cout << "intersection between " << i << " and " << j - << " is: "; - uint32_t i = 0; - auto intersec_it = 
index.intersection_iterator(term_ids); - while (intersec_it.has_next()) { - id_type doc_id = *intersec_it; - std::cout << doc_id << " "; - ++i; - ++intersec_it; + auto it = index.iterator(q[0] - 1); + first_size = it.size(); + for (uint32_t i = 0; i != first_size; ++i) { + first[i] = it.access(i); } - std::cout << std::endl; } - std::vector a; { - auto it = index.iterator(i); - a.resize(it.size()); - for (uint32_t i = 0; i != a.size(); ++i) { - a[i] = it.access(i); + auto it = index.iterator(q[1] - 1); + second_size = it.size(); + for (uint32_t i = 0; i != second_size; ++i) { + second[i] = it.access(i); } } - std::vector b; - { - auto it = index.iterator(j); - b.resize(it.size()); - for (uint32_t i = 0; i != b.size(); ++i) { - b[i] = it.access(i); + auto end = std::set_intersection( + first.begin(), first.begin() + first_size, second.begin(), + second.begin() + second_size, intersection.begin()); + first_size = std::distance(intersection.begin(), end); + first.swap(intersection); + + for (uint32_t i = 2; i != q.size(); ++i) { + auto it = index.iterator(q[i] - 1); + second_size = it.size(); + for (uint32_t i = 0; i != second_size; ++i) { + second[i] = it.access(i); } + end = std::set_intersection( + first.begin(), first.begin() + first_size, second.begin(), + second.begin() + second_size, intersection.begin()); + first_size = std::distance(intersection.begin(), end); + first.swap(intersection); } - auto it = std::set_intersection(a.begin(), a.end(), b.begin(), - b.end(), intersection.begin()); - intersection.resize(it - intersection.begin()); - std::cout << "intersection between " << i << " and " << j - << " is: "; - for (auto x : intersection) { - std::cout << x << " "; + auto it = index.intersection_iterator(q); + uint32_t n = 0; + for (; it.has_next(); ++n, ++it) { + auto doc_id = *it; + REQUIRE_MESSAGE( + doc_id == first[n], + "expected doc_id " << first[n] << " but got " << doc_id); } - std::cout << std::endl; - - // for (uint32_t i = 1; i != index.num_terms() 
+ 1; ++i) { - // for (uint32_t j = i; j != index.num_terms() + 1; ++j) { - // term_ids.clear(); - // term_ids.push_back(i); - // term_ids.push_back(j); - // uint64_t size = index.intersect(term_ids, intersection); - // std::cout << "size of intersection between " << i << " - // and " - // << j << " is " << size << std::endl; - // } - // } + REQUIRE_MESSAGE(n == first_size, "expected " << first_size + << " results, but got " + << n); } } - - return 0; } diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 44abc5e..8e791bb 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -2,16 +2,21 @@ using namespace autocomplete; -std::vector naive_topk(std::vector const& input, range r, - uint32_t k) { +uint32_t naive_topk(std::vector const& input, range r, uint32_t k, + std::vector& topk, bool unique = false) { uint32_t range_len = r.end - r.begin; - std::vector topk(range_len); for (uint32_t i = 0; i != range_len; ++i) { topk[i] = input[r.begin + i]; } std::sort(topk.begin(), topk.begin() + range_len); - topk.resize(std::min(k, range_len)); - return topk; + uint32_t results = 0; + if (unique) { + auto end = std::unique(topk.begin(), topk.begin() + range_len); + results = std::min(k, std::distance(topk.begin(), end)); + } else { + results = std::min(k, range_len); + } + return results; } std::vector gen_random_queries(uint32_t num_queries, @@ -33,7 +38,7 @@ std::vector gen_random_queries(uint32_t num_queries, return queries; } -TEST_CASE("test unsorted_list") { +TEST_CASE("test unsorted_list on doc_ids") { char const* output_filename = testing::tmp_filename.c_str(); parameters params; params.collection_basename = testing::test_filename.c_str(); @@ -70,7 +75,6 @@ TEST_CASE("test unsorted_list") { succinct_rmq list; list.build(doc_ids); REQUIRE(list.size() == doc_ids.size()); - essentials::save(list, output_filename); } @@ -80,12 +84,13 @@ TEST_CASE("test unsorted_list") { std::vector topk(constants::MAX_K); auto queries = 
gen_random_queries(num_queries, doc_ids.size()); + std::vector expected(params.num_completions); for (auto q : queries) { - auto expected = naive_topk(doc_ids, q, k); + uint32_t expected_results = naive_topk(doc_ids, q, k, expected); uint32_t results = list.topk(q, k, topk); - REQUIRE_MESSAGE(expected.size() == results, - "Error: expected " << expected.size() + REQUIRE_MESSAGE(expected_results == results, + "Error: expected " << expected_results << " topk elements but got " << results); for (uint32_t i = 0; i != results; ++i) { @@ -98,3 +103,68 @@ TEST_CASE("test unsorted_list") { std::remove(output_filename); } } + +TEST_CASE("test unsorted_list on minimal doc_ids") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + static const uint32_t k = 10; + static_assert(k <= constants::MAX_K, "k must be less than max allowed"); + static const uint32_t num_queries = 5000; + + std::vector doc_ids; + + { + doc_ids.reserve(params.num_terms); + std::ifstream input((params.collection_basename + ".inverted").c_str(), + std::ios_base::in); + id_type first; + for (uint64_t i = 0; i != params.num_terms; ++i) { + uint32_t n = 0; + input >> n; + input >> first; + doc_ids.push_back(first); + for (uint64_t k = 1; k != n; ++k) { + id_type x; + input >> x; + (void)x; // discard + } + } + input.close(); + REQUIRE(doc_ids.size() == params.num_terms); + + succinct_rmq list; + list.build(doc_ids); + REQUIRE(list.size() == doc_ids.size()); + essentials::save(list, output_filename); + } + + { + succinct_rmq list; + essentials::load(list, output_filename); + + std::vector topk(constants::MAX_K); + auto queries = gen_random_queries(num_queries, doc_ids.size()); + constexpr bool unique = true; + std::vector expected(params.num_terms); + + for (auto q : queries) { + uint32_t expected_results = + naive_topk(doc_ids, q, k, expected, unique); + uint32_t results = list.topk(q, k, topk, 
unique); + REQUIRE_MESSAGE(expected_results == results, + "Error: expected " << expected_results + << " topk elements but got " + << results); + for (uint32_t i = 0; i != results; ++i) { + REQUIRE_MESSAGE(topk[i] == expected[i], + "Error: expected " << expected[i] << " but got " + << topk[i]); + } + } + + std::remove(output_filename); + } +} \ No newline at end of file From 1562354f18307edb27f9190da9ccf93064b42393 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 13:59:57 +0200 Subject: [PATCH 015/102] more testing --- CMakeLists.txt | 2 +- include/delta_forward_index.hpp | 149 --------------------- include/forward_index.hpp | 201 ---------------------------- include/types.hpp | 13 +- test/CMakeLists.txt | 1 - test/test_compact_forward_index.cpp | 47 +++++++ test/test_forward_index.cpp | 58 -------- test/test_inverted_index.cpp | 45 +++++++ 8 files changed, 98 insertions(+), 418 deletions(-) delete mode 100644 include/delta_forward_index.hpp delete mode 100644 include/forward_index.hpp create mode 100644 test/test_compact_forward_index.cpp delete mode 100644 test/test_forward_index.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fa9125..bc8c298 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp) +file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp test/test_compact_forward_index.cpp) foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} ${TEST_SRC}) diff 
--git a/include/delta_forward_index.hpp b/include/delta_forward_index.hpp deleted file mode 100644 index 6a302ab..0000000 --- a/include/delta_forward_index.hpp +++ /dev/null @@ -1,149 +0,0 @@ -#pragma once - -#include "parameters.hpp" -#include "bit_vector.hpp" -#include "ef/ef_sequence.hpp" - -namespace autocomplete { - -struct delta_forward_index { - struct builder { - builder() {} - - builder(parameters const& params) - : m_num_integers(0) - , m_num_terms(params.num_terms) { - essentials::logger("building forward_index..."); - uint64_t num_completions = params.num_completions; - std::ifstream input( - (params.collection_basename + ".forward").c_str(), - std::ios_base::in); - m_pointers.push_back(0); - for (uint64_t i = 0; i != num_completions; ++i) { - uint32_t n = 0; - input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); - write_gamma_nonzero(m_data, n); - m_num_integers += n; - for (uint64_t k = 0; k != n; ++k) { - id_type x; - input >> x; - write_delta(m_data, x); - } - m_pointers.push_back(m_data.size()); - } - m_pointers.pop_back(); - input.close(); - essentials::logger("DONE"); - } - - void swap(delta_forward_index::builder& other) { - std::swap(other.m_num_integers, m_num_integers); - std::swap(other.m_num_terms, m_num_terms); - other.m_pointers.swap(m_pointers); - other.m_data.swap(m_data); - } - - void build(delta_forward_index& fi) { - fi.m_num_integers = m_num_integers; - fi.m_num_terms = m_num_terms; - fi.m_pointers.build(m_pointers); - fi.m_data.build(&m_data); - builder().swap(*this); - } - - private: - uint64_t m_num_integers; - uint64_t m_num_terms; - std::vector m_pointers; - bit_vector_builder m_data; - }; - - delta_forward_index() {} - - struct forward_list_iterator_type { - forward_list_iterator_type(bits_iterator const& it, - uint64_t n) - : m_it(it) - , m_n(n) - , m_i(0) {} - - uint64_t size() const { - return m_n; - } - - void operator++() { - m_i += 1; - } - - id_type operator*() { - return read_delta(m_it); - } - - 
bool intersects(const range r) { - for (uint64_t i = 0; i != size(); ++i) { - auto val = operator*(); - if (r.contains(val)) return true; - } - return false; - } - - private: - bits_iterator m_it; - uint64_t m_n; - uint64_t m_i; - }; - - forward_list_iterator_type iterator(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - return {it, n}; - } - - bool intersects(const id_type doc_id, const range r) { - return iterator(doc_id).intersects(r); - } - - uint64_t num_integers() const { - return m_num_integers; - } - - uint64_t num_terms() const { - return m_num_terms; - } - - uint64_t num_docs() const { - return m_pointers.size(); - } - - size_t data_bytes() const { - return m_data.bytes(); - } - - size_t pointer_bytes() const { - return m_pointers.bytes(); - } - - size_t bytes() const { - return essentials::pod_bytes(m_num_integers) + - essentials::pod_bytes(m_num_terms) + m_pointers.bytes() + - m_data.bytes(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_terms); - visitor.visit(m_pointers); - visitor.visit(m_data); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_terms; - ef::ef_sequence m_pointers; - bit_vector m_data; -}; - -} // namespace autocomplete \ No newline at end of file diff --git a/include/forward_index.hpp b/include/forward_index.hpp deleted file mode 100644 index 51c7c63..0000000 --- a/include/forward_index.hpp +++ /dev/null @@ -1,201 +0,0 @@ -#pragma once - -#include "parameters.hpp" -#include "integer_codes.hpp" -#include "building_util.hpp" -#include "ef/ef_sequence.hpp" - -namespace autocomplete { - -template -struct forward_index { - typedef ListType forward_list_type; - typedef typename forward_list_type::iterator forward_list_iterator_type; - typedef uncompressed_list permutation_list_type; - typedef - typename permutation_list_type::iterator permutation_list_iterator_type; - - struct 
builder { - builder() {} - - builder(parameters const& params) - : m_num_integers(0) - , m_num_terms(params.num_terms) { - essentials::logger("building forward_index..."); - - uint64_t num_completions = params.num_completions; - - std::ifstream input( - (params.collection_basename + ".forward").c_str(), - std::ios_base::in); - - std::vector list; - std::vector sorted_permutation; - std::vector permutation; - - m_pointers.push_back(0); - - for (uint64_t i = 0; i != num_completions; ++i) { - list.clear(); - sorted_permutation.clear(); - permutation.clear(); - - uint32_t n = 0; - input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); - m_num_integers += n; - list.reserve(n); - sorted_permutation.reserve(n); - - for (uint64_t k = 0; k != n; ++k) { - id_type x; - input >> x; - list.push_back(x); - sorted_permutation.push_back(k); - } - - write_gamma_nonzero(m_bvb, n); - if (ListType::is_byte_aligned) util::push_pad(m_bvb); - - std::sort( - sorted_permutation.begin(), sorted_permutation.end(), - [&](id_type l, id_type r) { return list[l] < list[r]; }); - - permutation.resize(n); - for (uint32_t i = 0; i != n; ++i) { - permutation[sorted_permutation[i]] = i; - } - - std::sort(list.begin(), list.end()); - forward_list_type::build(m_bvb, list.begin(), m_num_terms + 1, - n); - util::push_pad(m_bvb); - m_pointers.push_back(m_bvb.size()); - - permutation_list_type::build(m_bvb, permutation.begin(), n + 1, - n); - m_pointers.push_back(m_bvb.size()); - } - - m_pointers.pop_back(); - input.close(); - essentials::logger("DONE"); - } - - void swap(forward_index::builder& other) { - std::swap(other.m_num_integers, m_num_integers); - std::swap(other.m_num_terms, m_num_terms); - other.m_pointers.swap(m_pointers); - other.m_bvb.swap(m_bvb); - } - - void build(forward_index& fi) { - fi.m_num_integers = m_num_integers; - fi.m_num_terms = m_num_terms; - fi.m_pointers.build(m_pointers); - fi.m_data.build(&m_bvb); - builder().swap(*this); - } - - private: - uint64_t 
m_num_integers; - uint64_t m_num_terms; - std::vector m_pointers; - bit_vector_builder m_bvb; - }; - - forward_index() {} - - bool intersects(id_type doc_id, range r) { - return get(doc_id).intersects(r); - } - - struct permuting_iterator_type { - permuting_iterator_type(forward_list_iterator_type const& sorted, - permutation_list_iterator_type const& permuted) - : m_i(0) - , m_sorted(sorted) - , m_permuted(permuted) { - assert(sorted.size() == permuted.size()); - } - - uint32_t size() const { - return m_sorted.size(); - } - - id_type operator*() { - return m_sorted.access(m_permuted.access(m_i)); - } - - void operator++() { - ++m_i; - } - - private: - uint32_t m_i; - forward_list_iterator_type m_sorted; - permutation_list_iterator_type m_permuted; - }; - - permuting_iterator_type iterator(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id * 2); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); - forward_list_iterator_type it_sorted(m_data, it.position(), - m_num_terms + 1, n); - offset = m_pointers.access(doc_id * 2 + 1); - permutation_list_iterator_type it_permutation(m_data, offset, n + 1, n); - return permuting_iterator_type(it_sorted, it_permutation); - } - - uint64_t num_integers() const { - return m_num_integers; - } - - uint64_t num_terms() const { - return m_num_terms; - } - - uint64_t num_docs() const { - return m_pointers.size(); - } - - size_t data_bytes() const { - return m_data.bytes(); - } - - size_t pointer_bytes() const { - return m_pointers.bytes(); - } - - size_t bytes() const { - return essentials::pod_bytes(m_num_integers) + - essentials::pod_bytes(m_num_terms) + m_pointers.bytes() + - m_data.bytes(); - } - - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_terms); - visitor.visit(m_pointers); - visitor.visit(m_data); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_terms; - ef::ef_sequence 
m_pointers; - bit_vector m_data; - - forward_list_iterator_type get(id_type doc_id) { - uint64_t offset = m_pointers.access(doc_id * 2); - bits_iterator it(m_data, offset); - uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); - return {m_data, it.position(), m_num_terms + 1, n}; - } -}; - -} // namespace autocomplete \ No newline at end of file diff --git a/include/types.hpp b/include/types.hpp index 1083cfc..6481276 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -3,13 +3,12 @@ #include "completion_trie.hpp" #include "fc_dictionary.hpp" #include "integer_fc_dictionary.hpp" -#include "uint_vec.hpp" #include "unsorted_list.hpp" -#include "uncompressed_list.hpp" -#include "forward_index.hpp" +// #include "uint_vec.hpp" +// #include "uncompressed_list.hpp" + #include "compact_forward_index.hpp" -#include "delta_forward_index.hpp" #include "inverted_index.hpp" #include "blocked_inverted_index.hpp" @@ -40,14 +39,11 @@ typedef fc_dictionary<> fc_dictionary_type; typedef integer_fc_dictionary<> integer_fc_dictionary_type; typedef unsorted_list succinct_rmq; -typedef uncompressed_list uncompressed_list32_t; +// typedef uncompressed_list uncompressed_list32_t; // typedef inverted_index uncompressed_inverted_index; typedef inverted_index ef_inverted_index; -// typedef forward_index uncompressed_forward_index; -// typedef forward_index ef_forward_index; - // typedef blocked_inverted_index // uncompressed_blocked_inverted_index; typedef blocked_inverted_index ef_blocked_inverted_index; @@ -77,4 +73,5 @@ typedef autocomplete3 ef_autocomplete_type4; + } // namespace autocomplete \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a78df87..bc5f04f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,3 +1,2 @@ -add_executable(test_forward_index test_forward_index.cpp) add_executable(test_autocomplete test_autocomplete.cpp) add_executable(test_blocked_inverted_index 
test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_compact_forward_index.cpp b/test/test_compact_forward_index.cpp new file mode 100644 index 0000000..aa09403 --- /dev/null +++ b/test/test_compact_forward_index.cpp @@ -0,0 +1,47 @@ +#include "test_common.hpp" + +using namespace autocomplete; + +TEST_CASE("test compact_forward_index::iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + compact_forward_index::builder builder(params); + compact_forward_index index; + builder.build(index); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + compact_forward_index index; + essentials::load(index, output_filename); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + + std::ifstream input((params.collection_basename + ".forward").c_str(), + std::ios_base::in); + for (uint64_t i = 0; i != index.num_terms(); ++i) { + auto it = index.iterator(i); + uint32_t n = 0; + input >> n; + REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size() + << " instead of " + << n); + for (uint64_t k = 0; k != n; ++k, ++it) { + id_type expected; + input >> expected; + auto got = *it; + REQUIRE_MESSAGE(got == expected, + "got " << got << " but expected " << expected); + } + } + input.close(); + + std::remove(output_filename); + } +}; diff --git a/test/test_forward_index.cpp b/test/test_forward_index.cpp deleted file mode 100644 index 576215d..0000000 --- a/test/test_forward_index.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include - -#include "types.hpp" - -using namespace autocomplete; - -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* 
output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } - - parameters params; - params.collection_basename = argv[1]; - params.load(); - - typedef compact_forward_index forward_index_type; - - { - forward_index_type::builder builder(params); - forward_index_type index; - builder.build(index); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); - essentials::logger("DONE"); - } - } - - { - if (output_filename) { - forward_index_type index; - essentials::logger("loading data structure from disk..."); - essentials::load(index, output_filename); - essentials::logger("DONE"); - std::cout << "using " << index.bytes() << " bytes" << std::endl; - std::cout << "num docs " << index.num_docs() << std::endl; - std::cout << "num terms " << index.num_terms() << std::endl; - } - } - - return 0; -} diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp index aefdaae..ec93363 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -38,6 +38,50 @@ std::vector gen_random_queries(uint32_t num_queries, return queries; } +TEST_CASE("test inverted_index::iterator") { + char const* output_filename = testing::tmp_filename.c_str(); + parameters params; + params.collection_basename = testing::test_filename.c_str(); + params.load(); + + { + inverted_index_type::builder builder(params); + inverted_index_type index; + builder.build(index); + REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + essentials::save(index, output_filename); + } + + { + inverted_index_type index; + essentials::load(index, output_filename); + 
REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_terms() == params.num_terms); + + std::ifstream input((params.collection_basename + ".inverted").c_str(), + std::ios_base::in); + for (uint64_t i = 0; i != index.num_terms(); ++i) { + auto it = index.iterator(i); + uint32_t n = 0; + input >> n; + REQUIRE_MESSAGE(n == it.size(), "list has size " << it.size() + << " instead of " + << n); + for (uint64_t k = 0; k != n; ++k, ++it) { + id_type expected; + input >> expected; + auto got = *it; + REQUIRE_MESSAGE(got == expected, + "got " << got << " but expected " << expected); + } + } + input.close(); + + std::remove(output_filename); + } +}; + TEST_CASE("test inverted_index::intersection_iterator") { char const* output_filename = testing::tmp_filename.c_str(); parameters params; @@ -120,5 +164,6 @@ TEST_CASE("test inverted_index::intersection_iterator") { << " results, but got " << n); } + std::remove(output_filename); } } From 90aa2e7f1bd2817c6ad7395e6808744eb7ebcc67 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 14:11:21 +0200 Subject: [PATCH 016/102] install.sh script --- README.md | 22 ++++++++++++++++------ install.sh | 11 +++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 install.sh diff --git a/README.md b/README.md index a117f62..209aafb 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,13 @@ Query autocompletion in C++. ##### Table of contents 1. [Description](#descr) -2. [Compiling the code](#compiling) -3. [Input data format](#input) -4. [Running the unit tests](#testing) -5. [Building an index](#building) -6. [Benchmarks](#benchmarks) -7. [Live demo](#demo) +2. [Installation](#install) +3. [Compiling the code](#compiling) +4. [Input data format](#input) +5. [Running the unit tests](#testing) +6. [Building an index](#building) +7. [Benchmarks](#benchmarks) +8. 
[Live demo](#demo) Description ----------- @@ -66,6 +67,15 @@ A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r The final string extraction step is identical to that of the prefix search. +Installation +------------------ + +Just run + + $ ./install.sh + +from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. + Compiling the code ------------------ diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..9e8da9e --- /dev/null +++ b/install.sh @@ -0,0 +1,11 @@ +git submodule init +git submodule update +mkdir -p build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On +make +cd ../test_data +./preprocess.sh +cd ../build +make test +cd .. From 164df361295e49bea91a428030a3b3dd8280e1e6 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 15:22:38 +0200 Subject: [PATCH 017/102] more testing --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 2 - test/test_autocomplete.cpp | 147 +++++++++++---------------- test/test_blocked_inverted_index.cpp | 81 +++++++++------ test/test_common.hpp | 35 +++++++ test/test_inverted_index.cpp | 38 +------ 6 files changed, 149 insertions(+), 156 deletions(-) delete mode 100644 test/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index bc8c298..181c024 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,7 +52,7 @@ add_subdirectory(src) add_subdirectory(benchmark) enable_testing() -file(GLOB TEST_SOURCES test/test_completion_trie.cpp test/test_locate_prefix.cpp test/test_fc_dictionary.cpp test/test_integer_fc_dictionary.cpp test/test_unsorted_list.cpp test/test_cartesian_tree.cpp test/test_inverted_index.cpp test/test_compact_forward_index.cpp) +file(GLOB TEST_SOURCES test/test_*.cpp) foreach(TEST_SRC ${TEST_SOURCES}) get_filename_component (TEST_SRC_NAME ${TEST_SRC} NAME_WE) # without extension add_executable(${TEST_SRC_NAME} 
${TEST_SRC}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt deleted file mode 100644 index bc5f04f..0000000 --- a/test/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(test_autocomplete test_autocomplete.cpp) -add_executable(test_blocked_inverted_index test_blocked_inverted_index.cpp) \ No newline at end of file diff --git a/test/test_autocomplete.cpp b/test/test_autocomplete.cpp index d4fcefa..964a451 100644 --- a/test/test_autocomplete.cpp +++ b/test/test_autocomplete.cpp @@ -1,110 +1,81 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } +typedef ef_autocomplete_type1 index_type; +TEST_CASE("test autocomplete topk functions") { + char const* output_filename = testing::tmp_filename.c_str(); parameters params; - params.collection_basename = argv[1]; + params.collection_basename = testing::test_filename.c_str(); params.load(); - // typedef uncompressed_autocomplete_type index_type; - // typedef ef_autocomplete_type index_type; - typedef ef_autocomplete_type2 index_type; - { index_type index(params); - if (output_filename) { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); - essentials::logger("DONE"); - } + essentials::save(index, output_filename); } { - if (output_filename) { - index_type index; - essentials::logger("loading data structure from disk..."); - essentials::load(index, output_filename); - essentials::logger("DONE"); - index.print_stats(); - - { - essentials::logger("testing prefix_topk()..."); - uint32_t k = 7; - std::vector queries = { - "a", "10", "african", - "air", "commercial", 
"internet", - "paris", "somerset", "the", - "the new", "the perfect", "the starting line", - "yu gi oh", "for sale", "dave mat", - "florence", "florida be", "for s", - "for sa", "for sal", "for sale", - "ford a", "ford au", "ford m", - "ford mu", "for", "fo", - "f", "matt", "fl", - "florir", "fly", "the starting l", - "floridaaa"}; - - for (auto& query : queries) { - auto it = index.prefix_topk(query, k); - std::cout << "top-" << it.size() << " completions for '" - << query << "':\n"; - for (uint32_t i = 0; i != it.size(); ++i, ++it) { - auto completion = *it; - std::cout << "(" << completion.score << ", '"; - print(completion.string); - std::cout << "')" << std::endl; - } + index_type index; + essentials::load(index, output_filename); + + { + essentials::logger("testing prefix_topk()..."); + uint32_t k = 7; + std::vector queries = { + "a", "10", "african", + "air", "commercial", "internet", + "paris", "somerset", "the", + "the new", "the perfect", "the starting line", + "yu gi oh", "for sale", "dave mat", + "florence", "florida be", "for s", + "for sa", "for sal", "for sale", + "ford a", "ford au", "ford m", + "ford mu", "for", "fo", + "f", "matt", "fl", + "florir", "fly", "the starting l", + "floridaaa"}; + + for (auto& query : queries) { + auto it = index.prefix_topk(query, k); + std::cout << "top-" << it.size() << " completions for '" + << query << "':\n"; + for (uint32_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << "(" << completion.score << ", '"; + print(completion.string); + std::cout << "')" << std::endl; } - - essentials::logger("DONE"); } - { - essentials::logger("testing conjunctive_topk()..."); - uint32_t k = 7; - std::vector queries = { - "dave mat", "florence", "florida be", "for s", - "for sa", "for sal", "for sale", "ford a", - "ford au", "ford m", "ford mu", "for", - "fo", "f", "matt", "fl", - "flor", "fly", "the starting l"}; + essentials::logger("DONE"); + } - for (auto& query : queries) { - auto it = 
index.conjunctive_topk(query, k); - std::cout << "top-" << it.size() << " completions for '" - << query << "':\n"; - for (uint32_t i = 0; i != it.size(); ++i, ++it) { - auto completion = *it; - std::cout << "(" << completion.score << ", '"; - print(completion.string); - std::cout << "')" << std::endl; - } + { + essentials::logger("testing conjunctive_topk()..."); + uint32_t k = 7; + std::vector queries = { + "dave mat", "florence", "florida be", "for s", + "for sa", "for sal", "for sale", "ford a", + "ford au", "ford m", "ford mu", "for", + "fo", "f", "matt", "fl", + "flor", "fly", "the starting l"}; + + for (auto& query : queries) { + auto it = index.conjunctive_topk(query, k); + std::cout << "top-" << it.size() << " completions for '" + << query << "':\n"; + for (uint32_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << "(" << completion.score << ", '"; + print(completion.string); + std::cout << "')" << std::endl; } - - essentials::logger("DONE"); } + + essentials::logger("DONE"); } } - return 0; + std::remove(output_filename); } diff --git a/test/test_blocked_inverted_index.cpp b/test/test_blocked_inverted_index.cpp index 94fc274..80a9bc1 100644 --- a/test/test_blocked_inverted_index.cpp +++ b/test/test_blocked_inverted_index.cpp @@ -1,40 +1,63 @@ -#include - -#include "types.hpp" +#include "test_common.hpp" using namespace autocomplete; -int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory) { - std::cout << argv[0] << " [-o output_filename]" - << std::endl; - return 1; - } - - char const* output_filename = nullptr; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } - } +typedef ef_blocked_inverted_index blocked_inverted_index_type; +typedef ef_inverted_index inverted_index_type; +TEST_CASE("test blocked_inverted_index::intersection_iterator") { parameters params; - params.collection_basename = argv[1]; + params.collection_basename = 
testing::test_filename.c_str(); params.load(); - const float c = 0.01; + + inverted_index_type ii; { - // build, print and write - ef_blocked_inverted_index::builder builder(params, c); - ef_blocked_inverted_index bii; - builder.build(bii); - std::cout << "using " << bii.bytes() << " bytes" << std::endl; - std::cout << "num docs " << bii.num_docs() << std::endl; - std::cout << "num terms " << bii.num_terms() << std::endl; + inverted_index_type::builder ii_builder(params); + ii_builder.build(ii); + REQUIRE(ii.num_docs() == params.num_completions); + REQUIRE(ii.num_terms() == params.num_terms); } - return 0; + { + static const uint32_t num_queries = 10000; + static const uint32_t max_num_terms = 3; + auto queries = testing::gen_random_queries(num_queries, max_num_terms, + params.num_terms); + + static const std::vector C = {0.0125, 0.025, 0.05, 0.1}; + blocked_inverted_index_type blocked_ii; + uint64_t total; + + for (auto c : C) { + total = 0; + { + blocked_inverted_index_type::builder blocked_ii_builder(params, + c); + blocked_ii_builder.build(blocked_ii); + } + + REQUIRE(blocked_ii.num_docs() == params.num_completions); + REQUIRE(blocked_ii.num_terms() == params.num_terms); + + for (auto& q : queries) { + auto ii_it = ii.intersection_iterator(q); + auto blocked_ii_it = + blocked_ii.intersection_iterator(q, {0, 0}); + + uint32_t n = 0; + for (; ii_it.has_next(); ++n, ++ii_it, ++blocked_ii_it) { + auto got = *blocked_ii_it; + auto expected = *ii_it; + REQUIRE_MESSAGE(got == expected, "expected doc_id " + << expected + << " but got " << got); + } + if (n) total += n; + REQUIRE(blocked_ii_it.has_next() == false); + } + + std::cout << total << std::endl; + } + } } diff --git a/test/test_common.hpp b/test/test_common.hpp index 0bc701a..580a07e 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -50,5 +50,40 @@ range locate_prefix(std::vector const& strings, return r; } +typedef std::vector term_ids; + +std::vector gen_random_queries(uint32_t num_queries, + 
uint32_t max_num_terms, + uint32_t max_range_len) { + assert(max_num_terms > 1); + std::vector queries; + queries.reserve(num_queries); + essentials::uniform_int_rng random_num_terms(2, max_num_terms); + essentials::uniform_int_rng random_term_id(1, max_range_len); + + for (uint32_t i = 0; i != num_queries; ++i) { + term_ids q; + uint32_t num_terms = random_num_terms.gen(); + q.reserve(num_terms); + uint32_t num_distinct_terms = 0; + while (true) { + q.clear(); + for (uint32_t i = 0; i != num_terms; ++i) { + auto t = random_term_id.gen(); + assert(t >= 1 and t <= max_range_len); + q.push_back(t); + } + std::sort(q.begin(), q.end()); + auto end = std::unique(q.begin(), q.end()); + num_distinct_terms = std::distance(q.begin(), end); + if (num_distinct_terms >= 2) break; + } + q.resize(num_distinct_terms); + queries.push_back(q); + } + + return queries; +} + } // namespace testing } // namespace autocomplete \ No newline at end of file diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp index ec93363..b96b708 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -3,40 +3,6 @@ using namespace autocomplete; typedef ef_inverted_index inverted_index_type; -typedef std::vector term_ids; - -std::vector gen_random_queries(uint32_t num_queries, - uint32_t max_num_terms, - uint32_t max_range_len) { - assert(max_num_terms > 1); - std::vector queries; - queries.reserve(num_queries); - essentials::uniform_int_rng random_num_terms(2, max_num_terms); - essentials::uniform_int_rng random_term_id(1, max_range_len); - - for (uint32_t i = 0; i != num_queries; ++i) { - term_ids q; - uint32_t num_terms = random_num_terms.gen(); - q.reserve(num_terms); - uint32_t num_distinct_terms = 0; - while (true) { - q.clear(); - for (uint32_t i = 0; i != num_terms; ++i) { - auto t = random_term_id.gen(); - assert(t >= 1 and t <= max_range_len); - q.push_back(t); - } - std::sort(q.begin(), q.end()); - auto end = std::unique(q.begin(), q.end()); - 
num_distinct_terms = std::distance(q.begin(), end); - if (num_distinct_terms >= 2) break; - } - q.resize(num_distinct_terms); - queries.push_back(q); - } - - return queries; -} TEST_CASE("test inverted_index::iterator") { char const* output_filename = testing::tmp_filename.c_str(); @@ -105,8 +71,8 @@ TEST_CASE("test inverted_index::intersection_iterator") { static const uint32_t num_queries = 1000000; static const uint32_t max_num_terms = 5; - auto queries = - gen_random_queries(num_queries, max_num_terms, index.num_terms()); + auto queries = testing::gen_random_queries(num_queries, max_num_terms, + index.num_terms()); std::vector first(index.num_docs()); std::vector second(index.num_docs()); From a6941ef198fdec754211cb0216c1ef5e681385a0 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 23 Oct 2019 15:42:13 +0200 Subject: [PATCH 018/102] example.sh --- README.md | 12 +++++++++--- example.sh | 3 +++ src/web_server.cpp | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 example.sh diff --git a/README.md b/README.md index 209aafb..60911a4 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Query autocompletion in C++. ##### Table of contents 1. [Description](#descr) -2. [Installation](#install) +2. [Installation and quick start](#install) 3. [Compiling the code](#compiling) 4. [Input data format](#input) 5. [Running the unit tests](#testing) @@ -67,15 +67,21 @@ A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r The final string extraction step is identical to that of the prefix search. -Installation +Installation and quick start ------------------ Just run - $ ./install.sh + $ bash ./install.sh from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. +For having a minimal running example, just run + + $ bash ./example.sh + +and then access the service [here](http://127.0.0.1:8000). 
+ Compiling the code ------------------ diff --git a/example.sh b/example.sh new file mode 100644 index 0000000..4ac00bf --- /dev/null +++ b/example.sh @@ -0,0 +1,3 @@ +cd build +./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin +./web_server 8000 trec_05.ef_type1.bin \ No newline at end of file diff --git a/src/web_server.cpp b/src/web_server.cpp index 94a259b..7a0a61c 100644 --- a/src/web_server.cpp +++ b/src/web_server.cpp @@ -26,7 +26,7 @@ std::string escape_json(std::string const& s) { using namespace autocomplete; -typedef ef_autocomplete_type3 topk_index_type; +typedef ef_autocomplete_type1 topk_index_type; static std::string s_http_port("8000"); static struct mg_serve_http_opts s_http_server_opts; From 496960930651c5fae42267b46892d722ce41e0f4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 10:18:06 +0200 Subject: [PATCH 019/102] minor --- CMakeLists.txt | 2 +- include/autocomplete3.hpp | 2 +- include/blocked_inverted_index.hpp | 2 +- include/compact_vector.hpp | 2 +- include/completion_trie.hpp | 5 +++-- include/ef/ef_sequence.hpp | 6 +++--- include/fc_dictionary.hpp | 5 ++++- include/inverted_index.hpp | 4 ++-- include/parameters.hpp | 7 +++++-- include/uint_vec.hpp | 10 +++++----- include/util_types.hpp | 5 +++++ test/test_common.hpp | 4 +--- 12 files changed, 32 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 181c024..2908d2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ endif () if(UNIX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 550aac5..ab0abb1 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ 
-321,7 +321,7 @@ struct autocomplete3 { template uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { - assert(!r.is_invalid()); + assert(r.is_valid()); auto& topk_scores = m_pool.scores(); min_priority_queue_type q; diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 79319fe..dfd452d 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -263,7 +263,7 @@ struct blocked_inverted_index { : m_i(0) , m_num_docs(ii->num_docs()) , m_suffix(r) { - assert(!r.is_invalid()); + assert(r.is_valid()); if (!term_ids.empty()) { m_iterators.reserve(term_ids.size()); // at most diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index f0cd1bd..eb3f9b0 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -277,7 +277,7 @@ struct compact_vector { } uint64_t find(const range r, uint64_t id) { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); return util::find(*this, id, r.begin, r.end - 1); } diff --git a/include/completion_trie.hpp b/include/completion_trie.hpp index 3d52ee5..2bc68ea 100644 --- a/include/completion_trie.hpp +++ b/include/completion_trie.hpp @@ -170,7 +170,7 @@ struct completion_trie { // Return [a,b) range locate_prefix(completion_type const& prefix, range suffix_lex_range) const { - range r{global::not_found, global::not_found}; + range r = global::invalid_range; range pointer{0, m_nodes.front().size()}; uint32_t i = 0; @@ -195,10 +195,11 @@ struct completion_trie { r.end += size; } - assert(r.end > r.begin); + assert(r.is_valid()); return r; } + // NOTE: not used bool is_member(completion_type const& c) const { assert(c.size() > 0); range pointer{0, m_nodes.front().size()}; diff --git a/include/ef/ef_sequence.hpp b/include/ef/ef_sequence.hpp index 10970d6..0d1f436 100644 --- a/include/ef/ef_sequence.hpp +++ b/include/ef/ef_sequence.hpp @@ -142,14 +142,14 @@ struct ef_sequence { } uint64_t find(const 
range r, uint64_t id) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); uint64_t prev_upper = previous_range_upperbound(r); return util::find(*this, id + prev_upper, r.begin, r.end - 1); } range find(const range r, const range lex) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); @@ -251,7 +251,7 @@ struct ef_sequence { } uint64_t previous_range_upperbound(const range r) const { - assert(!r.is_invalid()); + assert(r.is_valid()); return r.begin ? access(r.begin - 1) : 0; } }; diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index 271f970..1b5aa9b 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -307,10 +307,13 @@ struct fc_dictionary { // NOTE 1: excluding null terminators, allow us to use memcpy here // because we know exactly how many bytes to copy: this is much faster - // than looping until we hit '\0'. NOTE 2: always copying a fixed amount + // than looping until we hit '\0'. + + // NOTE 2: always copying a fixed amount // of bytes (constants::MAX_NUM_CHARS_PER_QUERY) is much faster than // copying an exact amount, e.g., suffix_len (althoung it could be // less), so do not do: memcpy(out+ l, in, suffix_len). 
+ memcpy(out + l, in, constants::MAX_NUM_CHARS_PER_QUERY); return l + suffix_len; diff --git a/include/inverted_index.hpp b/include/inverted_index.hpp index 7c84bd7..cd4ad29 100644 --- a/include/inverted_index.hpp +++ b/include/inverted_index.hpp @@ -41,7 +41,7 @@ struct inverted_index { } m_minimal_doc_ids.push_back(list.front()); write_gamma_nonzero(m_bvb, n); - if (ListType::is_byte_aligned) util::push_pad(m_bvb); + if constexpr (ListType::is_byte_aligned) util::push_pad(m_bvb); ListType::build(m_bvb, list.begin(), m_num_docs, list.size()); m_pointers.push_back(m_bvb.size()); } @@ -86,7 +86,7 @@ struct inverted_index { uint64_t offset = m_pointers.access(term_id); bits_iterator it(m_data, offset); uint64_t n = read_gamma_nonzero(it); - if (ListType::is_byte_aligned) util::eat_pad(it); + if constexpr (ListType::is_byte_aligned) util::eat_pad(it); return {m_data, it.position(), m_num_docs, n}; } diff --git a/include/parameters.hpp b/include/parameters.hpp index db44d71..9d03783 100644 --- a/include/parameters.hpp +++ b/include/parameters.hpp @@ -41,8 +41,11 @@ struct parameters { } nodes_per_level.resize(num_levels, 0); - for (uint32_t i = 0; i != num_levels; ++i) { - input >> nodes_per_level[i]; + uint32_t i = 0; + for (; i != num_levels and input; ++i) input >> nodes_per_level[i]; + if (i != num_levels) { + throw std::runtime_error( + "File with statistics may be truncated or malformed"); } } diff --git a/include/uint_vec.hpp b/include/uint_vec.hpp index 86d60c4..adeaa8c 100644 --- a/include/uint_vec.hpp +++ b/include/uint_vec.hpp @@ -74,14 +74,14 @@ struct uint_vec { } uint64_t find(const range r, UintType id) const { - assert(!r.is_invalid()); + assert(r.is_valid()); assert(r.end <= size()); - UintType prev_upper = previous_range_upperbound(r); + auto prev_upper = previous_range_upperbound(r); return util::find(*this, id + prev_upper, r.begin, r.end - 1); } range find(const range r, const range lex) const { - assert(!r.is_invalid()); + assert(r.is_valid()); 
assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); @@ -131,9 +131,9 @@ struct uint_vec { std::vector m_data; UintType previous_range_upperbound(const range r) const { - assert(!r.is_invalid()); + assert(r.is_valid()); return r.begin ? access(r.begin - 1) : 0; } -}; // namespace autocomplete +}; } // namespace autocomplete \ No newline at end of file diff --git a/include/util_types.hpp b/include/util_types.hpp index 7405378..e056bb6 100644 --- a/include/util_types.hpp +++ b/include/util_types.hpp @@ -36,6 +36,7 @@ struct range { uint64_t begin; uint64_t end; bool is_invalid() const; + bool is_valid() const; bool contains(uint64_t val) const; }; @@ -48,6 +49,10 @@ bool range::is_invalid() const { end == global::invalid_range.end or begin > end; } +bool range::is_valid() const { + return !is_invalid(); +} + bool range::contains(uint64_t val) const { if (val >= begin and val <= end) return true; return false; diff --git a/test/test_common.hpp b/test/test_common.hpp index 580a07e..24f4540 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -69,9 +69,7 @@ std::vector gen_random_queries(uint32_t num_queries, while (true) { q.clear(); for (uint32_t i = 0; i != num_terms; ++i) { - auto t = random_term_id.gen(); - assert(t >= 1 and t <= max_range_len); - q.push_back(t); + q.push_back(random_term_id.gen()); } std::sort(q.begin(), q.end()); auto end = std::unique(q.begin(), q.end()); From a642eefce34bc8e4a11a125e5da5a01646fb676b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 11:09:35 +0200 Subject: [PATCH 020/102] benchmark fc_dictionary::locate_prefix --- benchmark/benchmark_fc_dictionary.cpp | 26 +++++++++++++++++++++++++- include/fc_dictionary.hpp | 3 ++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index f566edd..d8a53e5 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ 
-8,7 +8,7 @@ using namespace autocomplete; template void perf_test(Dictionary const& dict, std::vector const& queries) { - std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + static std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; for (uint32_t i = 0; i != runs; ++i) { @@ -43,6 +43,30 @@ void perf_test(Dictionary const& dict, std::cout << "extract: " << (timer.average() * 1000.0) / ids.size() << " [ns/string]" << std::endl; + + static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; + // static std::vector percentages = {0.1, 0.2, 0.3, 0.4, 0.5, + // 0.6, 0.7, 0.8, 0.9, 1.0}; + for (auto p : percentages) { + timer.reset(); + for (uint32_t i = 0; i != runs; ++i) { + timer.start(); + for (auto const& query : queries) { + size_t size = query.size(); + size_t n = size * p; + if (n == 0) n += 1; // at least one char + uint8_t const* addr = + reinterpret_cast(query.data()); + range r = dict.locate_prefix({addr, addr + n}); + essentials::do_not_optimize_away(r.end - r.begin); + } + timer.stop(); + } + + std::cout << "locate_prefix-" << p * 100.0 + << "%: " << (timer.average() * 1000.0) / queries.size() + << " [ns/string]" << std::endl; + } } #define exe(BUCKET_SIZE) \ diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index 1b5aa9b..bde263e 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -223,7 +223,8 @@ struct fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + assert(cmp > 0); + bucket_id = hi; h = header(bucket_id); } From 8753a0a9e89c639382693396c993bc2a018d0456 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 11:17:39 +0200 Subject: [PATCH 021/102] updated fc_dictionary results --- benchmark/benchmark_fc_dictionary.cpp | 2 +- results/fc_dictionary.md | 95 ++++++++++++++++++--------- 2 files changed, 66 insertions(+), 31 deletions(-) diff --git a/benchmark/benchmark_fc_dictionary.cpp 
b/benchmark/benchmark_fc_dictionary.cpp index d8a53e5..1d94c8e 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -63,7 +63,7 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "locate_prefix-" << p * 100.0 + std::cout << "\tlocate_prefix-" << p * 100.0 << "%: " << (timer.average() * 1000.0) / queries.size() << " [ns/string]" << std::endl; } diff --git a/results/fc_dictionary.md b/results/fc_dictionary.md index 39e64b7..37ff080 100644 --- a/results/fc_dictionary.md +++ b/results/fc_dictionary.md @@ -1,40 +1,75 @@ #### Results on the AOL querylog. pibiri@rubino:~/autocomplete/build$ ./benchmark_fc_dictionary ../test_data/aol/aol.completions 1000000 < ../test_data/aol/aol.completions.dict_queries.1M.shuffled - 2019-10-14 14:54:24: loading queries... - 2019-10-14 14:54:24: loaded 1000000 queries - 2019-10-14 14:54:24: building fc_dictionary with bucket size 4... - 2019-10-14 14:54:25: DONE + 2019-10-24 11:11:49: loading queries... + 2019-10-24 11:11:49: loaded 1000000 queries + 2019-10-24 11:11:49: building fc_dictionary with bucket size 4... + 2019-10-24 11:11:50: DONE using 42938890 bytes - locate: 559.666 [ns/string] - extract: 165.846 [ns/string] - 2019-10-14 14:54:32: building fc_dictionary with bucket size 8... - 2019-10-14 14:54:33: DONE + locate: 557.091 [ns/string] + extract: 168.772 [ns/string] + locate_prefix-0%: 213.453 [ns/string] + locate_prefix-25%: 794.612 [ns/string] + locate_prefix-50%: 1064.44 [ns/string] + locate_prefix-75%: 912.04 [ns/string] + locate_prefix-100%: 702.745 [ns/string] + 2019-10-24 11:12:12: building fc_dictionary with bucket size 8... + 2019-10-24 11:12:12: DONE using 38111527 bytes - locate: 515.359 [ns/string] - extract: 151.121 [ns/string] - 2019-10-14 14:54:40: building fc_dictionary with bucket size 16... 
- 2019-10-14 14:54:40: DONE + locate: 511.503 [ns/string] + extract: 152.331 [ns/string] + locate_prefix-0%: 223.374 [ns/string] + locate_prefix-25%: 686.093 [ns/string] + locate_prefix-50%: 873.161 [ns/string] + locate_prefix-75%: 758.029 [ns/string] + locate_prefix-100%: 638.576 [ns/string] + 2019-10-24 11:12:32: building fc_dictionary with bucket size 16... + 2019-10-24 11:12:32: DONE using 35270205 bytes - locate: 474.319 [ns/string] - extract: 138.07 [ns/string] - 2019-10-14 14:54:47: building fc_dictionary with bucket size 32... - 2019-10-14 14:54:47: DONE + locate: 478.592 [ns/string] + extract: 139.109 [ns/string] + locate_prefix-0%: 228.416 [ns/string] + locate_prefix-25%: 662.483 [ns/string] + locate_prefix-50%: 769.227 [ns/string] + locate_prefix-75%: 685.358 [ns/string] + locate_prefix-100%: 615.757 [ns/string] + 2019-10-24 11:12:51: building fc_dictionary with bucket size 32... + 2019-10-24 11:12:51: DONE using 33722303 bytes - locate: 490 [ns/string] - extract: 150.671 [ns/string] - 2019-10-14 14:54:54: building fc_dictionary with bucket size 64... - 2019-10-14 14:54:54: DONE + locate: 484.72 [ns/string] + extract: 150.21 [ns/string] + locate_prefix-0%: 273.595 [ns/string] + locate_prefix-25%: 717.559 [ns/string] + locate_prefix-50%: 790.342 [ns/string] + locate_prefix-75%: 728.409 [ns/string] + locate_prefix-100%: 681.921 [ns/string] + 2019-10-24 11:13:11: building fc_dictionary with bucket size 64... + 2019-10-24 11:13:11: DONE using 32910194 bytes - locate: 585.408 [ns/string] - extract: 197.131 [ns/string] - 2019-10-14 14:55:03: building fc_dictionary with bucket size 128... - 2019-10-14 14:55:03: DONE + locate: 585.835 [ns/string] + extract: 194.183 [ns/string] + locate_prefix-0%: 667.159 [ns/string] + locate_prefix-25%: 962.096 [ns/string] + locate_prefix-50%: 1056.04 [ns/string] + locate_prefix-75%: 1014.63 [ns/string] + locate_prefix-100%: 978.718 [ns/string] + 2019-10-24 11:13:39: building fc_dictionary with bucket size 128... 
+ 2019-10-24 11:13:39: DONE using 32496375 bytes - locate: 812.441 [ns/string] - extract: 293.022 [ns/string] - 2019-10-14 14:55:15: building fc_dictionary with bucket size 256... - 2019-10-14 14:55:15: DONE + locate: 810.282 [ns/string] + extract: 286.967 [ns/string] + locate_prefix-0%: 574.352 [ns/string] + locate_prefix-25%: 1248.92 [ns/string] + locate_prefix-50%: 1435.28 [ns/string] + locate_prefix-75%: 1419.18 [ns/string] + locate_prefix-100%: 1398.48 [ns/string] + 2019-10-24 11:14:16: building fc_dictionary with bucket size 256... + 2019-10-24 11:14:16: DONE using 32286042 bytes - locate: 1283.83 [ns/string] - extract: 485.985 [ns/string] \ No newline at end of file + locate: 1281.09 [ns/string] + extract: 470.922 [ns/string] + locate_prefix-0%: 1065.07 [ns/string] + locate_prefix-25%: 2099.35 [ns/string] + locate_prefix-50%: 2387.39 [ns/string] + locate_prefix-75%: 2407.04 [ns/string] + locate_prefix-100%: 2403.04 [ns/string] \ No newline at end of file From f45fa9d7f04e66f58fa350e22996ac9d7dd39367 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 11:43:14 +0200 Subject: [PATCH 022/102] updated partition_queries script --- README.md | 12 +++++++----- test_data/partition_queries_by_length.py | 25 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 60911a4..31c1649 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,10 @@ Installation and quick start Just run $ bash ./install.sh - + from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. -For having a minimal running example, just run +After that, for having a minimal running example, just run $ bash ./example.sh @@ -136,7 +136,7 @@ The script `preprocess.sh` in the directory `test_data` helps in preparing the data for indexing. 
Thus, from within the directory `test_data`, it is sufficient to do: - + $ bash preprocess.sh If you run the script, you will get: @@ -195,7 +195,10 @@ You can use $ python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions -to partition the input completions by number of query terms. +to partition the input completions by number of query terms. Each partition +of queries is shuffled at random to avoid locality of access. +(By default, 8 shards will be created: the ones having [1,7] query terms and +the one collecting all completions with >= 8 query terms). Then the command @@ -203,7 +206,6 @@ Then the command will execute 1000 top-10 queries with 3 terms, from which only 25% of the prefix of the last token is retained. -(For no locality, it is suggested to shuffle the queries at random, for example using `gshuf` on Mac.) We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index f9cb561..7f14b42 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -1,11 +1,15 @@ import sys import numpy as np +import random input_filename = sys.argv[1] num_shards = 7 -files = [open(input_filename + ".length=" + str(i), "w") for i in range(1,num_shards + 1)] -all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+", "w") +files = [open(input_filename + ".length=" + str(i) + ".shuffled", "w") for i in range(1,num_shards + 1)] +all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+.shuffled", "w") + +strings = [[] for i in range(0, num_shards)] +all_others_strings = [] lines = 0 with open(input_filename, 'r') as f: @@ -14,14 +18,23 @@ l = len(x) - 1 if l > num_shards: - all_others.write(line) + all_others_strings.append(line) else: - files[l - 1].write(line) + 
strings[l - 1].append(line) lines += 1 if lines % 1000000 == 0: print("processed " + str(lines) + " lines") -for f in files: - f.close() + +for i in range(0, num_shards): + random.shuffle(strings[i]) + for s in strings[i]: + files[i].write(s) + files[i].close() + +random.shuffle(all_others_strings) +for s in all_others_strings: + all_others.write(s) all_others.close() + From b54375d2090025ff017ed2c3a3bc4d4629ae2886 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 12:15:09 +0200 Subject: [PATCH 023/102] minor --- include/autocomplete.hpp | 4 ++-- include/autocomplete2.hpp | 4 ++-- include/autocomplete3.hpp | 4 ++-- include/autocomplete4.hpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index 9f01ed0..a9a281c 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -102,7 +102,7 @@ struct autocomplete { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -148,7 +148,7 @@ struct autocomplete { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 3003c02..c1c3e76 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -124,7 +124,7 @@ struct autocomplete2 { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -171,7 +171,7 @@ struct autocomplete2 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); 
uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index ab0abb1..db7353f 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -123,7 +123,7 @@ struct autocomplete3 { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -163,7 +163,7 @@ struct autocomplete3 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index 8b3d882..88018f7 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -108,7 +108,7 @@ struct autocomplete4 { range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } @@ -138,7 +138,7 @@ struct autocomplete4 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); uint32_t num_completions = 0; - if (!r.is_invalid()) { + if (r.is_valid()) { num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); } timers[1].stop(); From 6d9bdae1be07e525d6539882110a2e7272a84fce Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 14:34:17 +0200 Subject: [PATCH 024/102] deduplication of query terms --- include/autocomplete.hpp | 45 +++++++-------------- include/autocomplete2.hpp | 43 +++++++------------- include/autocomplete3.hpp | 64 +++++++++++++----------------- include/autocomplete4.hpp | 2 + 
include/autocomplete_common.hpp | 6 +++ include/blocked_inverted_index.hpp | 1 - src/CMakeLists.txt | 1 + src/check_topk.cpp | 64 ++++++++++++++++++++++++++++++ 8 files changed, 130 insertions(+), 96 deletions(-) create mode 100644 src/check_topk.cpp diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index a9a281c..47b4472 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -73,13 +73,7 @@ struct autocomplete { true // must return unique results ); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } return extract_strings(num_completions); @@ -114,13 +108,7 @@ struct autocomplete { true // must return unique results ); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } @@ -163,13 +151,7 @@ struct autocomplete { true // must return unique results ); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } timers[2].stop(); @@ -243,21 +225,13 @@ struct autocomplete { // step 2 timers[2].start(); if (num_terms 
== 1) { // special case - suffix_lex_range.end += 1; num_completions = m_unsorted_minimal_docs_list.topk( suffix_lex_range, k, m_pool.scores(), true // must return unique results ); - } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } timers[2].stop(); @@ -303,6 +277,17 @@ struct autocomplete { assert(m_pool.size() == 0); } + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + uint32_t const k) { + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix, k); + } + template uint32_t conjunctive_topk(Iterator& it, const range r, uint32_t const k) { auto& topk_scores = m_pool.scores(); diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index c1c3e76..ece6d2e 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -96,13 +96,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } return extract_strings(num_completions); @@ -137,13 +131,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // 
we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } else { extract_completions(num_completions); @@ -186,13 +174,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } } else { extract_completions(num_completions); @@ -275,13 +257,7 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } timers[2].stop(); @@ -345,6 +321,17 @@ struct autocomplete2 { } } + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, + uint32_t const k) { + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix, k); + } + template uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { auto& topk_scores = 
m_pool.scores(); diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index db7353f..44c1bf4 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -89,20 +89,15 @@ struct autocomplete3 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + assert(num_terms > 0); uint32_t num_completions = 0; range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } - + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); extract_completions(num_completions); return extract_strings(num_completions); } @@ -128,16 +123,8 @@ struct autocomplete3 { } if (num_completions < k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); } extract_completions(num_completions); @@ -170,16 +157,8 @@ struct autocomplete3 { timers[2].start(); if (num_completions < k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } 
else if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); } timers[2].stop(); @@ -238,7 +217,8 @@ struct autocomplete3 { init(); completion_type prefix; byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); + uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + assert(num_terms > 0); timers[0].stop(); uint32_t num_completions = 0; @@ -251,13 +231,8 @@ struct autocomplete3 { // step 2 timers[2].start(); - if (prefix.size() == 1) { // we've got nothing to intersect - auto it = m_inverted_index.iterator(prefix.front() - 1); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } else { - auto it = m_inverted_index.intersection_iterator(prefix); - num_completions = conjunctive_topk(it, suffix_lex_range, k); - } + num_completions = + conjunctive_topk(num_terms, prefix, suffix_lex_range, k); timers[2].stop(); // step 3 @@ -319,6 +294,21 @@ struct autocomplete3 { } } + uint32_t conjunctive_topk(uint32_t num_terms, completion_type& prefix, + const range suffix_lex_range, const uint32_t k) { + if (num_terms == 1) { // we've got nothing to intersect + iterator it(0, m_inverted_index.num_docs()); + return conjunctive_topk(it, suffix_lex_range, k); + } + deduplicate(prefix); + if (prefix.size() == 1) { // we've got nothing to intersect + auto it = m_inverted_index.iterator(prefix.front() - 1); + return conjunctive_topk(it, suffix_lex_range, k); + } + auto it = m_inverted_index.intersection_iterator(prefix); + return conjunctive_topk(it, suffix_lex_range, k); + } + template uint32_t conjunctive_topk(Iterator& it, const range r, const uint32_t k) { assert(r.is_valid()); diff --git 
a/include/autocomplete4.hpp b/include/autocomplete4.hpp index 88018f7..d0f3304 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -283,6 +283,7 @@ struct autocomplete4 { uint32_t conjunctive_topk(completion_type& prefix, const range suffix, const uint32_t k) { auto& topk_scores = m_pool.scores(); + deduplicate(prefix); auto it = m_inverted_index.intersection_iterator(prefix, suffix); uint32_t results = 0; for (; it.has_next(); ++it) { @@ -319,4 +320,5 @@ struct autocomplete4 { return m_pool.begin(); } }; + } // namespace autocomplete \ No newline at end of file diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index c04f8b6..362a706 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -19,4 +19,10 @@ uint32_t parse(Dictionary const& dict, std::string const& query, return num_terms; } +void deduplicate(completion_type& c) { + std::sort(c.begin(), c.end()); + auto end = std::unique(c.begin(), c.end()); + c.resize(std::distance(c.begin(), end)); +} + } // namespace autocomplete \ No newline at end of file diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index dfd452d..0d3d4ed 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -267,7 +267,6 @@ struct blocked_inverted_index { if (!term_ids.empty()) { m_iterators.reserve(term_ids.size()); // at most - std::sort(term_ids.begin(), term_ids.end()); uint32_t current_block_id = ii->block_id(term_ids.front()); uint32_t i = 0; uint32_t prev_i = 0; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b000b1..a9e4661 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,3 +2,4 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) +add_executable(check_topk check_topk.cpp) \ No newline at end of file 
diff --git a/src/check_topk.cpp b/src/check_topk.cpp new file mode 100644 index 0000000..cb466a1 --- /dev/null +++ b/src/check_topk.cpp @@ -0,0 +1,64 @@ +#include + +#include "types.hpp" +#include "../benchmark/benchmark_common.hpp" + +using namespace autocomplete; + +template +void check_topk(char const* binary_filename1, char const* binary_filename2, + uint32_t k, uint32_t max_num_queries, float keep) { + Index index1; + ef_autocomplete_type1 index2; + essentials::load(index1, binary_filename1); + essentials::load(index2, binary_filename2); + std::vector queries; + load_queries(queries, max_num_queries, keep, std::cin); + for (auto const& query : queries) { + size_t n1 = index1.topk(query, k).size(); + size_t n2 = index2.topk(query, k).size(); + if (n1 != n2) { + std::cout << query << std::endl; + } + } +} + +int main(int argc, char** argv) { + int mandatory = 6; + if (argc < mandatory + 1) { + std::cout << argv[0] + << " " + " " + " < queries" + << std::endl; + std::cout << " is a float in [0,1] and specifies how much " + "we keep of the last token in a query " + << std::endl; + return 1; + } + + std::string type(argv[1]); + uint32_t k = std::atoi(argv[2]); + char const* binary_filename1 = argv[3]; + char const* binary_filename2 = argv[4]; + uint32_t max_num_queries = std::atoi(argv[5]); + float keep = std::atof(argv[6]); + + if (type == "ef_type1") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type2") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type3") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else if (type == "ef_type4") { + check_topk(binary_filename1, binary_filename2, k, + max_num_queries, keep); + } else { + return 1; + } + + return 0; +} \ No newline at end of file From 9212c98b9f3892c194250363575f6e6dfcbfd12c Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 14:47:38 
+0200 Subject: [PATCH 025/102] assert --- include/blocked_inverted_index.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 0d3d4ed..ec6c4b6 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -266,6 +266,10 @@ struct blocked_inverted_index { assert(r.is_valid()); if (!term_ids.empty()) { + assert(std::is_sorted(term_ids.begin(), term_ids.end())); + assert(std::unique(term_ids.begin(), term_ids.end()) == + term_ids.end()); + m_iterators.reserve(term_ids.size()); // at most uint32_t current_block_id = ii->block_id(term_ids.front()); uint32_t i = 0; From 7194cba247f13638facd0ca9c4407e926ad2af8b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 24 Oct 2019 21:36:12 +0200 Subject: [PATCH 026/102] minor --- benchmark/benchmark_common.hpp | 22 ++++------ include/blocked_inverted_index.hpp | 65 ++++++++++++++---------------- 2 files changed, 39 insertions(+), 48 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 0fdae98..0fbcc26 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -11,21 +11,15 @@ size_t load_queries(std::vector& queries, uint32_t max_num_queries, queries.reserve(max_num_queries); for (uint32_t i = 0; i != max_num_queries; ++i) { if (!std::getline(is, line)) break; - auto query = line.substr(line.find(' ') + 1, line.size()); - int32_t size = query.size() - 1; - while (size >= 0 and query[size] != ' ') --size; - auto last_token = query.substr(size + 1, query.size() - size); - uint32_t num_chars = - last_token.size() - std::ceil(last_token.size() * percentage); - char first = last_token.front(); - for (uint32_t i = 0; i != num_chars; ++i) last_token.pop_back(); - - // retain at least one char - if (last_token.empty()) last_token.push_back(first); - assert(last_token.size() > 0); - - queries.push_back(query.substr(0, size + 1) + last_token); + 
assert(query.size() > 0); + size_t size = query.size() - 1; + while (size > 0 and query[size] != ' ') --size; + size_t last_token_size = query.size() - size; + size_t end = size + std::ceil(last_token_size * percentage) + 1 + + 1; // retain at least one char + for (size = query.size(); size > end; --size) query.pop_back(); + queries.push_back(query); } return queries.size(); } diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index ec6c4b6..9a21d0c 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -270,7 +270,7 @@ struct blocked_inverted_index { assert(std::unique(term_ids.begin(), term_ids.end()) == term_ids.end()); - m_iterators.reserve(term_ids.size()); // at most + m_blocks.reserve(term_ids.size()); // at most uint32_t current_block_id = ii->block_id(term_ids.front()); uint32_t i = 0; uint32_t prev_i = 0; @@ -284,7 +284,7 @@ struct blocked_inverted_index { for (; prev_i != i; ++prev_i) { block.term_ids.push_back(term_ids[prev_i]); } - m_iterators.push_back(std::move(block)); + m_blocks.push_back(std::move(block)); } current_block_id = b; } @@ -294,16 +294,15 @@ struct blocked_inverted_index { for (; prev_i != i; ++prev_i) { block.term_ids.push_back(term_ids[prev_i]); } - m_iterators.push_back(std::move(block)); + m_blocks.push_back(std::move(block)); - assert(m_iterators.size() > 0); - std::sort(m_iterators.begin(), m_iterators.end(), + std::sort(m_blocks.begin(), m_blocks.end(), [](auto const& l, auto const& r) { return l.docs_iterator.size() < r.docs_iterator.size(); }); - m_candidate = m_iterators[0].docs_iterator.access(0); + m_candidate = m_blocks[0].docs_iterator.access(0); } else { m_candidate = 0; } @@ -334,10 +333,10 @@ struct blocked_inverted_index { } void operator++() { - assert(m_i == m_iterators.size()); - if (!m_iterators.empty()) { - if (m_iterators.size() > 1) { - m_candidate = m_iterators[0].docs_iterator.next(); + assert(m_i == m_blocks.size()); + if 
(!m_blocks.empty()) { + if (m_blocks.size() > 1) { + m_candidate = m_blocks[0].docs_iterator.next(); } } else { m_candidate += 1; @@ -347,17 +346,16 @@ struct blocked_inverted_index { } bool intersects() { - for (auto& block : m_range) { - uint64_t val = block.docs_iterator.next_geq(m_candidate); + for (auto& b : m_range) { + uint64_t val = b.docs_iterator.next_geq(m_candidate); if (val == m_candidate) { - uint64_t pos = block.docs_iterator.position(); - assert(block.docs_iterator.access(pos) == m_candidate); - uint64_t begin = block.offsets_iterator.access(pos); - uint64_t end = block.offsets_iterator.access(pos + 1); + uint64_t pos = b.docs_iterator.position(); + assert(b.docs_iterator.access(pos) == m_candidate); + uint64_t begin = b.offsets_iterator.access(pos); + uint64_t end = b.offsets_iterator.access(pos + 1); assert(end > begin); - uint32_t lower_bound = block.lower_bound; for (uint64_t i = begin; i != end; ++i) { - auto t = block.terms_iterator.access(i) + lower_bound; + auto t = b.terms_iterator.access(i) + b.lower_bound; if (t > m_suffix.end) break; if (m_suffix.contains(t)) return true; } @@ -370,26 +368,25 @@ struct blocked_inverted_index { id_type m_candidate; size_t m_i; uint64_t m_num_docs; - std::vector m_iterators; + std::vector m_blocks; std::vector m_range; range m_suffix; bool in() { // is candidate doc in intersection? 
- uint64_t pos = m_iterators[m_i].docs_iterator.position(); - if (pos == m_iterators[m_i].docs_iterator.size()) return false; - uint64_t begin = m_iterators[m_i].offsets_iterator.access(pos); - uint64_t end = m_iterators[m_i].offsets_iterator.access(pos + 1); + auto& b = m_blocks[m_i]; + uint64_t pos = b.docs_iterator.position(); + if (pos == b.docs_iterator.size()) return false; + uint64_t begin = b.offsets_iterator.access(pos); + uint64_t end = b.offsets_iterator.access(pos + 1); assert(end > begin); - if (end - begin < m_iterators[m_i].term_ids.size()) return false; + if (end - begin < b.term_ids.size()) return false; uint64_t i = begin; - uint32_t lower_bound = m_iterators[m_i].lower_bound; - for (auto x : m_iterators[m_i].term_ids) { + for (auto x : b.term_ids) { bool found = false; for (; i != end; ++i) { - auto t = - m_iterators[m_i].terms_iterator.access(i) + lower_bound; + auto t = b.terms_iterator.access(i) + b.lower_bound; if (t == x) { found = true; break; @@ -402,18 +399,18 @@ struct blocked_inverted_index { } void next() { - if (m_iterators.empty()) return; - if (m_iterators.size() == 1) { - while (m_candidate < m_num_docs and m_i != m_iterators.size()) { + if (m_blocks.empty()) return; + if (m_blocks.size() == 1) { + while (m_candidate < m_num_docs and m_i != m_blocks.size()) { assert(m_i == 0); - m_candidate = m_iterators[m_i].docs_iterator.next(); + m_candidate = m_blocks[m_i].docs_iterator.next(); if (in()) ++m_i; } } else { - while (m_candidate < m_num_docs and m_i != m_iterators.size()) { + while (m_candidate < m_num_docs and m_i != m_blocks.size()) { // NOTE: since we work with unions of posting lists, // next_geq by scan runs faster - auto val = m_iterators[m_i].docs_iterator.next_geq_by_scan( + auto val = m_blocks[m_i].docs_iterator.next_geq_by_scan( m_candidate); bool is_in = in(); if (val == m_candidate and is_in) { From efeb99f091daae4c547321f33e0e28bd423bf56a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 26 Oct 2019 
12:28:55 +0200 Subject: [PATCH 027/102] using cmd_line_parser --- .gitmodules | 3 ++ external/cmd_line_parser | 1 + external/mongoose | 2 +- include/blocked_inverted_index.hpp | 5 ++- src/build.cpp | 54 ++++++++++++------------------ 5 files changed, 30 insertions(+), 35 deletions(-) create mode 160000 external/cmd_line_parser diff --git a/.gitmodules b/.gitmodules index 60c5af2..5b9dc7e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "external/doctest"] path = external/doctest url = https://github.com/onqtam/doctest.git +[submodule "external/cmd_line_parser"] + path = external/cmd_line_parser + url = https://github.com/jermp/cmd_line_parser.git diff --git a/external/cmd_line_parser b/external/cmd_line_parser new file mode 160000 index 0000000..de6d870 --- /dev/null +++ b/external/cmd_line_parser @@ -0,0 +1 @@ +Subproject commit de6d870f8f01076f671a4eed6bbe55f3b9217d05 diff --git a/external/mongoose b/external/mongoose index c41a221..dce60c6 160000 --- a/external/mongoose +++ b/external/mongoose @@ -1 +1 @@ -Subproject commit c41a22195ceabc02ffd0379f0e71d6c3575337aa +Subproject commit dce60c6dbb096f3b96e1a45cbfdfd55e18b38bb6 diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 9a21d0c..8425e4e 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -23,7 +23,10 @@ struct blocked_inverted_index { : m_num_integers(0) , m_num_docs(params.num_completions) , m_num_terms(params.num_terms) { - assert(c > 0.0); + if (!(c > 0.0 and c <= 1.0)) { + throw std::runtime_error("c must be in (0,1]"); + } + essentials::logger("building blocked_inverted_index with c = " + std::to_string(c) + "..."); diff --git a/src/build.cpp b/src/build.cpp index 732318f..ba73954 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -2,57 +2,48 @@ #include "types.hpp" #include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" using namespace autocomplete; template -void 
build(parameters const& params, char const* output_filename) { +void build(parameters const& params, std::string const& output_filename) { Index index(params); index.print_stats(); - if (output_filename) { + if (output_filename != "") { essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); + essentials::save(index, output_filename.c_str()); essentials::logger("DONE"); } } void build_type4(parameters const& params, const float c, - char const* output_filename) { + std::string const& output_filename) { ef_autocomplete_type4 index(params, c); index.print_stats(); - if (output_filename) { + if (output_filename != "") { essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename); + essentials::save(index, output_filename.c_str()); essentials::logger("DONE"); } } int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " [-o output_filename] [-c c]" - << std::endl; - return 1; - } - - std::string type(argv[1]); + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("output_filename", "Output filename.", "-o", false); + parser.add( + "c", + "Value for Bast and Weber's technique: c must be a float in (0,1].", + "-c", false); + if (!parser.parse()) return 1; + + auto type = parser.get("type"); parameters params; - params.collection_basename = argv[2]; + params.collection_basename = parser.get("collection_basename"); params.load(); - - char const* output_filename = nullptr; - float c = 0.0; - - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "-o") { - ++i; - output_filename = argv[i]; - } else if (std::string(argv[i]) == "-c") { - ++i; - c = std::stof(argv[i]); - } - } + auto output_filename = parser.get("output_filename"); if (type == "ef_type1") { build(params, output_filename); @@ -61,10 +52,7 @@ int main(int 
argc, char** argv) { } else if (type == "ef_type3") { build(params, output_filename); } else if (type == "ef_type4") { - if (c == 0.0) { - std::cerr << "c must be greater than 0.0" << std::endl; - return 1; - } + auto c = parser.get("c"); build_type4(params, c, output_filename); } else { return 1; From 1465feccf13a08e58c387d38910e347f4f5c78c9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 26 Oct 2019 13:49:38 +0200 Subject: [PATCH 028/102] using cmd_line_parser --- benchmark/benchmark_common.hpp | 14 +++ benchmark/benchmark_conjunctive_topk.cpp | 110 ++++++++---------- benchmark/benchmark_integer_fc_dictionary.cpp | 2 +- benchmark/benchmark_locate_prefix.cpp | 48 ++++---- benchmark/benchmark_prefix_topk.cpp | 106 ++++++++--------- benchmark/benchmark_topk.cpp | 66 ++++------- src/statistics.cpp | 18 +-- 7 files changed, 162 insertions(+), 202 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 0fbcc26..135992d 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -1,5 +1,7 @@ #pragma once +#include "../external/cmd_line_parser/include/parser.hpp" + namespace autocomplete { static const uint32_t runs = 5; @@ -24,4 +26,16 @@ size_t load_queries(std::vector& queries, uint32_t max_num_queries, return queries.size(); } +void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { + parser.add("type", "Index type."); + parser.add("k", "top-k value."); + parser.add("index_filename", "Index filename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query."); + parser.add("breakdown", "Collect timings breakdown.", "--breakdown"); +} + } // namespace autocomplete \ No newline at end of file diff --git a/benchmark/benchmark_conjunctive_topk.cpp 
b/benchmark/benchmark_conjunctive_topk.cpp index 2a04c4c..83e2c99 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -1,110 +1,92 @@ #include #include "types.hpp" -#include "statistics.hpp" #include "benchmark_common.hpp" using namespace autocomplete; template -void benchmark_conjunctive_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, - essentials::json_lines& breakdowns, - bool breakdown) { - Index autocomp; - essentials::logger("loading data structure from disk..."); - essentials::load(autocomp, binary_filename); - essentials::logger("DONE"); - autocomp.print_stats(); +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& breakdowns, bool breakdown) { + Index index; + essentials::load(index, index_filename.c_str()); std::vector queries; - essentials::logger("loading queries..."); uint32_t num_queries = - load_queries(queries, max_num_queries, 0.25, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + load_queries(queries, max_num_queries, keep, std::cin); - auto ns_x_query = [&](double time) { - return uint64_t(time / (runs * num_queries) * 1000); + uint64_t reported_strings = 0; + auto musec_per_query = [&](double time) { + return time / (runs * num_queries); }; - essentials::logger("benchmarking conjunctive_topk queries..."); - uint64_t reported_strings = 0; + breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { std::vector timers(4); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.conjunctive_topk(query, k, timers); + auto it = index.prefix_topk(query, k, timers); reported_strings += it.size(); } } - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("parsing_ns_per_query", - 
std::to_string(ns_x_query(timers[0].elapsed()))); - breakdowns.add("dictionary_search_ns_per_query", - std::to_string(ns_x_query(timers[1].elapsed()))); - breakdowns.add("conjunctive_search_ns_per_query", - std::to_string(ns_x_query(timers[2].elapsed()))); - breakdowns.add("reporting_ns_per_query", - std::to_string(ns_x_query(timers[3].elapsed()))); + breakdowns.add("parsing_musec_per_query", + std::to_string(musec_per_query(timers[0].elapsed()))); + breakdowns.add("dictionary_search_musec_per_query", + std::to_string(musec_per_query(timers[1].elapsed()))); + breakdowns.add("conjunctive_search_musec_per_query", + std::to_string(musec_per_query(timers[2].elapsed()))); + breakdowns.add("reporting_musec_per_query", + std::to_string(musec_per_query(timers[3].elapsed()))); } else { essentials::timer_type timer; timer.start(); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.conjunctive_topk(query, k); + auto it = index.prefix_topk(query, k); reported_strings += it.size(); } } timer.stop(); - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("ns_per_query", - std::to_string(ns_x_query(timer.elapsed()))); + breakdowns.add("musec_per_query", + std::to_string(musec_per_query(timer.elapsed()))); } } int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " --breakdown < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; - std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - - bool breakdown = false; - for (int i = mandatory; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; 
- } - } + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto breakdown = parser.get("breakdown"); essentials::json_lines breakdowns; breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); + breakdowns.add("num_terms_per_query", + parser.get("num_terms_per_query")); + breakdowns.add("percentage", std::to_string(keep)); - if (type == "type1") { - benchmark_conjunctive_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type2") { - benchmark_conjunctive_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type3") { - benchmark_conjunctive_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else { - std::cout << "error: unknown type '" << type << "'" << std::endl; return 1; } diff --git a/benchmark/benchmark_integer_fc_dictionary.cpp b/benchmark/benchmark_integer_fc_dictionary.cpp index f1e35d9..3a752eb 100644 --- a/benchmark/benchmark_integer_fc_dictionary.cpp +++ b/benchmark/benchmark_integer_fc_dictionary.cpp @@ -8,7 +8,7 @@ using namespace autocomplete; template void perf_test(Dictionary const& dict, std::vector const& queries) { - completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); + static completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; for (uint32_t i = 0; i != runs; ++i) 
{ diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index 6e9a1ab..8d37357 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -7,10 +7,9 @@ using namespace autocomplete; template -void benchmark_locate_prefix(parameters const& params, - fc_dictionary_type const& dict, - uint32_t max_num_queries, float keep, - essentials::json_lines& result) { +void benchmark(parameters const& params, fc_dictionary_type const& dict, + uint32_t max_num_queries, float keep, + essentials::json_lines& result) { Index index; { typename Index::builder builder(params); @@ -24,6 +23,7 @@ void benchmark_locate_prefix(parameters const& params, { num_queries = load_queries(strings, max_num_queries, keep, std::cin); + result.add("num_queries", std::to_string(num_queries)); for (auto const& string : strings) { completion_type prefix; byte_range suffix; @@ -51,26 +51,23 @@ void benchmark_locate_prefix(parameters const& params, } int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " < queries" - << std::endl; - std::cout << " is a float in [0,1] and specifies how much " - "we keep of the last token in a query " - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query."); + if (!parser.parse()) return 1; - std::string type(argv[1]); parameters params; - params.collection_basename = argv[2]; + params.collection_basename = parser.get("collection_basename"); params.load(); - std::string num_terms_per_query(argv[3]); - uint32_t max_num_queries = std::atoi(argv[4]); - float keep = 
std::atof(argv[5]); + auto type = parser.get("type"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); fc_dictionary_type dict; { @@ -80,15 +77,16 @@ int main(int argc, char** argv) { essentials::json_lines result; result.new_line(); - result.add("num_terms_per_query", num_terms_per_query); + result.add("num_terms_per_query", + parser.get("num_terms_per_query")); result.add("percentage", std::to_string(keep)); if (type == "trie") { - benchmark_locate_prefix( - params, dict, max_num_queries, keep, result); + benchmark(params, dict, max_num_queries, keep, + result); } else if (type == "fc") { - benchmark_locate_prefix( - params, dict, max_num_queries, keep, result); + benchmark(params, dict, max_num_queries, + keep, result); } else { return 1; } diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index 2149e03..28046a2 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -1,106 +1,92 @@ #include #include "types.hpp" -#include "statistics.hpp" #include "benchmark_common.hpp" using namespace autocomplete; template -void benchmark_prefix_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, - essentials::json_lines& breakdowns, bool breakdown) { - Index autocomp; - essentials::logger("loading data structure from disk..."); - essentials::load(autocomp, binary_filename); - essentials::logger("DONE"); - autocomp.print_stats(); +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& breakdowns, bool breakdown) { + Index index; + essentials::load(index, index_filename.c_str()); std::vector queries; - essentials::logger("loading queries..."); uint32_t num_queries = - load_queries(queries, max_num_queries, 0.25, std::cin); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + load_queries(queries, max_num_queries, keep, std::cin); - auto 
ns_x_query = [&](double time) { - return uint64_t(time / (runs * num_queries) * 1000); + uint64_t reported_strings = 0; + auto musec_per_query = [&](double time) { + return time / (runs * num_queries); }; - essentials::logger("benchmarking prefix_topk queries..."); - uint64_t reported_strings = 0; + breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { std::vector timers(4); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.prefix_topk(query, k, timers); + auto it = index.prefix_topk(query, k, timers); reported_strings += it.size(); } } - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("parsing_ns_per_query", - std::to_string(ns_x_query(timers[0].elapsed()))); - breakdowns.add("completions_search_ns_per_query", - std::to_string(ns_x_query(timers[1].elapsed()))); - breakdowns.add("topk_rmq_ns_per_query", - std::to_string(ns_x_query(timers[2].elapsed()))); - breakdowns.add("reporting_ns_per_query", - std::to_string(ns_x_query(timers[3].elapsed()))); + breakdowns.add("parsing_musec_per_query", + std::to_string(musec_per_query(timers[0].elapsed()))); + breakdowns.add("completions_search_musec_per_query", + std::to_string(musec_per_query(timers[1].elapsed()))); + breakdowns.add("topk_rmq_musec_per_query", + std::to_string(musec_per_query(timers[2].elapsed()))); + breakdowns.add("reporting_musec_per_query", + std::to_string(musec_per_query(timers[3].elapsed()))); } else { essentials::timer_type timer; timer.start(); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = autocomp.prefix_topk(query, k); + auto it = index.prefix_topk(query, k); reported_strings += it.size(); } } timer.stop(); - essentials::logger("DONE"); std::cout << reported_strings << std::endl; - breakdowns.add("num_queries", std::to_string(num_queries)); - breakdowns.add("ns_per_query", - 
std::to_string(ns_x_query(timer.elapsed()))); + breakdowns.add("musec_per_query", + std::to_string(musec_per_query(timer.elapsed()))); } } int main(int argc, char** argv) { - int mandatory = 5; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " --breakdown < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; - std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - - bool breakdown = false; - for (int i = mandatory + 1; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; - } - } + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto breakdown = parser.get("breakdown"); essentials::json_lines breakdowns; breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); + breakdowns.add("num_terms_per_query", + parser.get("num_terms_per_query")); + breakdowns.add("percentage", std::to_string(keep)); - if (type == "type1") { - benchmark_prefix_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); - } else if (type == "type2") { - benchmark_prefix_topk( - binary_filename, k, max_num_queries, breakdowns, breakdown); + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else { - 
std::cout << "error: unknown type '" << type << "'" << std::endl; return 1; } diff --git a/benchmark/benchmark_topk.cpp b/benchmark/benchmark_topk.cpp index a294afe..0ea1e97 100644 --- a/benchmark/benchmark_topk.cpp +++ b/benchmark/benchmark_topk.cpp @@ -1,17 +1,16 @@ #include #include "types.hpp" -#include "statistics.hpp" #include "benchmark_common.hpp" using namespace autocomplete; template -void benchmark_topk(char const* binary_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& breakdowns, bool breakdown) { Index index; - essentials::load(index, binary_filename); + essentials::load(index, index_filename.c_str()); std::vector queries; uint32_t num_queries = @@ -32,9 +31,7 @@ void benchmark_topk(char const* binary_filename, uint32_t k, reported_strings += it.size(); } } - std::cout << reported_strings << std::endl; - breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(timers[0].elapsed()))); breakdowns.add("prefix_search_musec_per_query", @@ -43,7 +40,6 @@ void benchmark_topk(char const* binary_filename, uint32_t k, std::to_string(musec_per_query(timers[2].elapsed()))); breakdowns.add("reporting_musec_per_query", std::to_string(musec_per_query(timers[3].elapsed()))); - } else { essentials::timer_type timer; timer.start(); @@ -54,58 +50,42 @@ void benchmark_topk(char const* binary_filename, uint32_t k, } } timer.stop(); - std::cout << reported_strings << std::endl; - breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); } } int main(int argc, char** argv) { - int mandatory = 6; - if (argc < mandatory + 1) { - std::cout << argv[0] - << " " - " [--breakdown] < queries" - << std::endl; - std::cout << " is a float in [0,1] and specifies how much " - "we keep of the last token in a query " - << std::endl; - return 1; - } - - 
std::string type(argv[1]); - uint32_t k = std::atoi(argv[2]); - char const* binary_filename = argv[3]; - std::string num_terms_per_query(argv[4]); - uint32_t max_num_queries = std::atoi(argv[5]); - float keep = std::atof(argv[6]); + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; - bool breakdown = false; - for (int i = mandatory + 1; i != argc; ++i) { - if (std::string(argv[i]) == "--breakdown") { - breakdown = true; - } - } + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto breakdown = parser.get("breakdown"); essentials::json_lines breakdowns; breakdowns.new_line(); - breakdowns.add("num_terms_per_query", num_terms_per_query); + breakdowns.add("num_terms_per_query", + parser.get("num_terms_per_query")); breakdowns.add("percentage", std::to_string(keep)); if (type == "ef_type1") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else if (type == "ef_type2") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else if (type == "ef_type3") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else if (type == "ef_type4") { - benchmark_topk( - binary_filename, k, max_num_queries, keep, breakdowns, breakdown); + benchmark(index_filename, k, max_num_queries, + keep, breakdowns, breakdown); } else { return 1; } diff --git a/src/statistics.cpp b/src/statistics.cpp index 5b2148f..9dbf689 100644 --- a/src/statistics.cpp +++ b/src/statistics.cpp @@ -2,25 +2,25 @@ #include "types.hpp" 
#include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" using namespace autocomplete; template -void print_stats(char const* index_filename) { +void print_stats(std::string const& index_filename) { Index index; - essentials::load(index, index_filename); + essentials::load(index, index_filename.c_str()); index.print_stats(); } int main(int argc, char** argv) { - int mandatory = 2; - if (argc < mandatory + 1) { - std::cout << argv[0] << " " << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("index_filename", "Index filename."); + if (!parser.parse()) return 1; - std::string type(argv[1]); - char const* index_filename = argv[2]; + auto type = parser.get("type"); + auto index_filename = parser.get("index_filename"); if (type == "ef_type1") { print_stats(index_filename); From 44dab2bbeb59b7466abf2ec2ead7af383855a260 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 27 Oct 2019 13:09:00 +0100 Subject: [PATCH 029/102] added license --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..35abc20 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright 2019 Giulio Ermanno Pibiri + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file From 8d46c1ce9591771162c88b5cca7af4410386b25f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 27 Oct 2019 13:46:09 +0100 Subject: [PATCH 030/102] dependencies updated --- external/cmd_line_parser | 2 +- external/essentials | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/external/cmd_line_parser b/external/cmd_line_parser index de6d870..70b779f 160000 --- a/external/cmd_line_parser +++ b/external/cmd_line_parser @@ -1 +1 @@ -Subproject commit de6d870f8f01076f671a4eed6bbe55f3b9217d05 +Subproject commit 70b779fbb1c5e1bbdb5949044a6b8824a3044855 diff --git a/external/essentials b/external/essentials index 3721ea2..07db05a 160000 --- a/external/essentials +++ b/external/essentials @@ -1 +1 @@ -Subproject commit 3721ea2b02c24005088cb9efeb89b4090753bbf2 +Subproject commit 07db05abd0c058ee310ff5078eb4ec27d2b3cdcb From 4327f39f0574f49e12386be81343cbefdf55f121 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 6 Nov 2019 10:04:44 +0100 Subject: [PATCH 031/102] queries are just strings, without any id --- benchmark/benchmark_common.hpp | 7 +++---- test_data/partition_queries_by_length.py | 9 +++------ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 135992d..4309912 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -9,11 +9,10 @@ static const uint32_t runs = 5; size_t load_queries(std::vector& queries, uint32_t 
max_num_queries, float percentage, std::istream& is = std::cin) { assert(percentage >= 0.0 and percentage <= 1.0); - std::string line; + std::string query; queries.reserve(max_num_queries); for (uint32_t i = 0; i != max_num_queries; ++i) { - if (!std::getline(is, line)) break; - auto query = line.substr(line.find(' ') + 1, line.size()); + if (!std::getline(is, query)) break; assert(query.size() > 0); size_t size = query.size() - 1; while (size > 0 and query[size] != ' ') --size; @@ -34,7 +33,7 @@ void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { parser.add("max_num_queries", "Maximum number of queries to execute."); parser.add("percentage", "A float in [0,1] specifying how much we keep of the last token " - "in a query."); + "in a query: n x 100 <=> n%, for n in [0,1]."); parser.add("breakdown", "Collect timings breakdown.", "--breakdown"); } diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index 7f14b42..c2397de 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -16,17 +16,15 @@ for line in f: x = line.rstrip('\n').split() l = len(x) - 1 - + string = ' '.join(x[1:l+1]) + '\n' if l > num_shards: - all_others_strings.append(line) + all_others_strings.append(string) else: - strings[l - 1].append(line) - + strings[l - 1].append(string) lines += 1 if lines % 1000000 == 0: print("processed " + str(lines) + " lines") - for i in range(0, num_shards): random.shuffle(strings[i]) for s in strings[i]: @@ -37,4 +35,3 @@ for s in all_others_strings: all_others.write(s) all_others.close() - From d50e9445cb3aa761d92f870a225fb7f4b3b3f8fe Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 8 Nov 2019 14:12:08 +0100 Subject: [PATCH 032/102] removed comment --- include/autocomplete2.hpp | 5 ----- include/autocomplete3.hpp | 5 ----- include/autocomplete4.hpp | 5 ----- 3 files changed, 15 deletions(-) diff --git a/include/autocomplete2.hpp 
b/include/autocomplete2.hpp index ece6d2e..7216379 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -304,11 +304,6 @@ struct autocomplete2 { assert(m_pool.size() == 0); } - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. void extract_completions(const uint32_t num_completions) { auto const& topk_scores = m_pool.scores(); auto& completions = m_topk_completion_set.completions(); diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 44c1bf4..c015583 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -277,11 +277,6 @@ struct autocomplete3 { assert(m_pool.size() == 0); } - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. void extract_completions(const uint32_t num_completions) { auto const& topk_scores = m_pool.scores(); auto& completions = m_topk_completion_set.completions(); diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index d0f3304..ec88ec3 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -263,11 +263,6 @@ struct autocomplete4 { assert(m_pool.size() == 0); } - // NOTE: this can be done more efficienctly exploiting - // the fact that the strings to be extracted share a common - // prefix, thus this task should be delegated to the - // integer_fc_dictionary... (enchance the locality of the operation) - // NOTE: this only work when used during the prefix_topk step. 
void extract_completions(const uint32_t num_completions) { auto const& topk_scores = m_pool.scores(); auto& completions = m_topk_completion_set.completions(); From 6c79eba738dd5b1f3d67436292cd7b05e9eb4c15 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 9 Nov 2019 15:55:52 +0100 Subject: [PATCH 033/102] removed unused import --- test_data/build_inverted_and_forward.py | 1 - test_data/partition_queries_by_length.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index c47ea17..0634d82 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -1,5 +1,4 @@ import sys -import numpy as np input_filename = sys.argv[1] diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index c2397de..7dfbed6 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -1,6 +1,4 @@ -import sys -import numpy as np -import random +import sys, random input_filename = sys.argv[1] From b2cc9a5cb7ad2a2b5fb04d74121d3681bc03efd1 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 11 Nov 2019 12:13:20 +0100 Subject: [PATCH 034/102] fix --- include/integer_fc_dictionary.hpp | 9 +++------ test/test_integer_fc_dictionary.cpp | 1 + test/test_locate_prefix.cpp | 24 +++++++++++++++--------- test_data/preprocess.sh | 1 + 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index e0b228b..443cc8f 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -270,7 +270,7 @@ struct integer_fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + bucket_id = hi == -1 ? 
0 : hi; h = header(bucket_id); } @@ -288,18 +288,15 @@ struct integer_fc_dictionary { cmp = uint32_range_compare(h, t, n); if (cmp > 0) { hi = mi - 1; - } else if (cmp < 0) { + } else if (cmp <= 0) { lo = mi + 1; - } else { - bucket_id = mi; - return; } } if (cmp < 0) { bucket_id = mi; } else { - bucket_id = mi - 1; + bucket_id = hi == -1 ? 0 : hi; h = header(bucket_id); } } diff --git a/test/test_integer_fc_dictionary.cpp b/test/test_integer_fc_dictionary.cpp index b67879d..d36db82 100644 --- a/test/test_integer_fc_dictionary.cpp +++ b/test/test_integer_fc_dictionary.cpp @@ -48,6 +48,7 @@ TEST_CASE("test integer_fc_dictionary") { id_type got_id = dict.locate({decoded.data(), decoded.data() + size}); + REQUIRE(got_id != global::invalid_term_id); REQUIRE_MESSAGE(got_id == id, "Error in locating the " << id << "-th string: expected id " diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index 8938965..7924899 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -19,11 +19,11 @@ void test_locate_prefix(Dictionary const& dict, Index const& index, suffix_lex_range.end += 1; range got = index.locate_prefix(prefix, suffix_lex_range); - REQUIRE_MESSAGE( - (got.begin == expected.begin and got.end == expected.end), - "Error for query '" << query << "': expected [" << expected.begin - << "," << expected.end << ") but got [" - << got.begin << "," << got.end << ")"); + CHECK_MESSAGE((got.begin == expected.begin and got.end == expected.end), + "Error for query '" + << query << "': expected [" << expected.begin << "," + << expected.end << ") but got [" << got.begin << "," + << got.end << ")"); } } @@ -82,14 +82,20 @@ TEST_CASE("test locate_prefix()") { << num_terms << std::endl; { queries.clear(); - std::ifstream querylog((params.collection_basename + - ".length=" + std::to_string(num_terms)) - .c_str()); + std::string filename = params.collection_basename + + ".length=" + std::to_string(num_terms) + + ".shuffled"; + std::ifstream 
querylog(filename.c_str()); + if (!querylog.is_open()) { + std::cerr << "cannot open file '" << filename << "'" + << std::endl; + return; + } load_queries(queries, max_num_queries, perc, querylog); querylog.close(); } - test_locate_prefix(dict, ct_index, queries, strings); + // test_locate_prefix(dict, ct_index, queries, strings); test_locate_prefix(dict, fc_index, queries, strings); } } diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh index ab4dbeb..24c9488 100755 --- a/test_data/preprocess.sh +++ b/test_data/preprocess.sh @@ -8,4 +8,5 @@ for collection in $collections; do python map_dataset.py $collection python build_stats.py $collection.mapped python build_inverted_and_forward.py $collection + python partition_queries_by_length.py $collection done From cd83f927f0b605075475486569e415f2a47f824c Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 13 Nov 2019 10:40:24 +0100 Subject: [PATCH 035/102] check for terms out of vocabulary --- include/autocomplete_common.hpp | 8 +++++--- include/fc_dictionary.hpp | 6 ++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index 362a706..17b38b4 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -7,14 +7,16 @@ namespace autocomplete { template uint32_t parse(Dictionary const& dict, std::string const& query, completion_type& prefix, byte_range& suffix) { - uint32_t num_terms = 1; + uint32_t num_terms = 1; // for suffix byte_range_iterator it(string_to_byte_range(query)); while (true) { suffix = it.next(); if (!it.has_next()) break; auto term_id = dict.locate(suffix); - prefix.push_back(term_id); - ++num_terms; + if (term_id != global::invalid_term_id) { + prefix.push_back(term_id); + ++num_terms; + } } return num_terms; } diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index bde263e..ed09026 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ 
-223,8 +223,7 @@ struct fc_dictionary { if (cmp < 0) { bucket_id = mi; } else { - assert(cmp > 0); - bucket_id = hi; + bucket_id = hi == -1 ? 0 : hi; h = header(bucket_id); } @@ -344,8 +343,7 @@ struct fc_dictionary { if (cmp < 0) return global::invalid_term_id; curr += l - lcp_len + 2; } - assert(false); - __builtin_unreachable(); + return global::invalid_term_id; // term does not exist in dictionary } id_type left_locate(byte_range p, byte_range h, id_type bucket_id) const { From c5086dc7abca7e7ac2b538c24881d5ad998da5f4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 17 Nov 2019 11:07:52 +0100 Subject: [PATCH 036/102] print avg. number of terms x completion --- include/statistics.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/statistics.hpp b/include/statistics.hpp index a863814..f93444f 100644 --- a/include/statistics.hpp +++ b/include/statistics.hpp @@ -74,6 +74,10 @@ void autocomplete(m_forward_index.num_integers()) / + m_completions.size() + << std::endl; print_bpi("data", m_forward_index.data_bytes(), m_forward_index.num_integers()); print_bpi("pointers", m_forward_index.pointer_bytes(), From 95720d8f23274b51046ae92c820707835344c3e8 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 20 Nov 2019 22:32:38 +0100 Subject: [PATCH 037/102] minor changes --- benchmark/benchmark_locate_prefix.cpp | 82 ++++++++++++++++----------- external/cmd_line_parser | 2 +- external/essentials | 2 +- include/fc_dictionary.hpp | 6 ++ include/statistics.hpp | 19 ++++--- 5 files changed, 68 insertions(+), 43 deletions(-) diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index 8d37357..998d8c7 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -6,36 +6,28 @@ using namespace autocomplete; +typedef std::pair query_type; + template -void benchmark(parameters const& params, fc_dictionary_type const& dict, - uint32_t max_num_queries, float keep, - 
essentials::json_lines& result) { +void benchmark(parameters const& params, std::vector& queries, + uint32_t num_queries, uint32_t num_terms_per_query, float keep) { + essentials::json_lines result; + result.new_line(); + result.add("num_terms_per_query", std::to_string(num_terms_per_query)); + result.add("percentage", std::to_string(keep)); + result.add("num_queries", std::to_string(num_queries)); + Index index; { typename Index::builder builder(params); builder.build(index); } - typedef std::pair query_type; - std::vector strings; - std::vector queries; - uint32_t num_queries = 0; - - { - num_queries = load_queries(strings, max_num_queries, keep, std::cin); - result.add("num_queries", std::to_string(num_queries)); - for (auto const& string : strings) { - completion_type prefix; - byte_range suffix; - parse(dict, string, prefix, suffix); - range suffix_lex_range = dict.locate_prefix(suffix); - queries.emplace_back(prefix, suffix_lex_range); - } - } - - auto musec_per_query = [&](double time) { - return time / (runs * num_queries); - }; + result.add("MiB", std::to_string(static_cast(index.bytes()) / + essentials::MiB)); + result.add( + "bytes_per_completion", + std::to_string(static_cast(index.bytes()) / index.size())); essentials::timer_type timer; timer.start(); @@ -47,7 +39,8 @@ void benchmark(parameters const& params, fc_dictionary_type const& dict, } timer.stop(); result.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); + std::to_string(timer.elapsed() / (runs * num_queries))); + result.print(); } int main(int argc, char** argv) { @@ -67,6 +60,7 @@ int main(int argc, char** argv) { auto type = parser.get("type"); auto max_num_queries = parser.get("max_num_queries"); + auto num_terms_per_query = parser.get("num_terms_per_query"); auto keep = parser.get("percentage"); fc_dictionary_type dict; @@ -75,22 +69,42 @@ int main(int argc, char** argv) { builder.build(dict); } - essentials::json_lines result; - result.new_line(); - 
result.add("num_terms_per_query", - parser.get("num_terms_per_query")); - result.add("percentage", std::to_string(keep)); + std::vector strings; + std::vector queries; + uint32_t num_queries = 0; + + { + num_queries = load_queries(strings, max_num_queries, keep, std::cin); + for (auto const& string : strings) { + completion_type prefix; + byte_range suffix; + parse(dict, string, prefix, suffix); + range suffix_lex_range = dict.locate_prefix(suffix); + queries.emplace_back(prefix, suffix_lex_range); + } + } if (type == "trie") { - benchmark(params, dict, max_num_queries, keep, - result); + benchmark(params, queries, num_queries, + num_terms_per_query, keep); } else if (type == "fc") { - benchmark(params, dict, max_num_queries, - keep, result); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); + benchmark>(params, queries, num_queries, + num_terms_per_query, keep); } else { return 1; } - result.print(); return 0; } \ No newline at end of file diff --git a/external/cmd_line_parser b/external/cmd_line_parser index 70b779f..1776808 160000 --- a/external/cmd_line_parser +++ b/external/cmd_line_parser @@ -1 +1 @@ -Subproject commit 70b779fbb1c5e1bbdb5949044a6b8824a3044855 +Subproject commit 1776808718445425dcad42ba2d1b6adf2cb5e496 diff --git a/external/essentials b/external/essentials index 07db05a..da66810 160000 --- a/external/essentials +++ b/external/essentials @@ -1 +1 @@ -Subproject commit 07db05abd0c058ee310ff5078eb4ec27d2b3cdcb +Subproject commit da6681019cbad6bef62804927801dd09832e512e diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index ed09026..1b223be 100644 
--- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -37,14 +37,17 @@ struct fc_dictionary { std::string curr; std::string header; + uint64_t total_characters = 0; for (uint32_t b = 0; b != buckets; ++b) { input >> header; + total_characters += header.size(); write_header(header); m_pointers_to_headers.push_back(m_headers.size()); prev.swap(header); uint32_t size = b != buckets - 1 ? BucketSize : tail; for (uint32_t i = 0; i != size; ++i) { input >> curr; + total_characters += curr.size(); uint32_t l = 0; // |lcp(curr,prev)| while (l != curr.size() and l != prev.size() and curr[l] == prev[l]) { @@ -61,6 +64,9 @@ struct fc_dictionary { m_buckets.push_back(0); } + std::cout << static_cast(total_characters) / m_size + << " characters per string" << std::endl; + input.close(); essentials::logger("DONE"); } diff --git a/include/statistics.hpp b/include/statistics.hpp index f93444f..aa1fbe0 100644 --- a/include/statistics.hpp +++ b/include/statistics.hpp @@ -10,7 +10,8 @@ namespace autocomplete { void print(std::string const& what, size_t bytes, size_t total_bytes, uint64_t num_completions) { - std::cout << " " << what << ": " << convert(bytes, essentials::MiB) + std::cout << " " << what << ": " + << essentials::convert(bytes, essentials::MiB) << " [MiB]: " << static_cast(bytes) / num_completions << " [bytes per completion] "; std::cout << "(" << (bytes * 100.0) / total_bytes << "%)" << std::endl; @@ -31,8 +32,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]" - << std::endl; + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]" << std::endl; print_bps("nodes", nodes_bytes(), size()); print_bps("pointers", pointers_bytes(), size()); print_bps("left extremes", left_extremes_bytes(), size()); @@ -44,7 +45,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, 
essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -89,7 +91,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -124,7 +127,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; @@ -149,7 +153,8 @@ template ::print_stats() const { size_t total_bytes = bytes(); - std::cout << "using " << convert(total_bytes, essentials::MiB) << " [MiB]: " + std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) + << " [MiB]: " << static_cast(total_bytes) / m_completions.size() << " [bytes per completion] " << std::endl; From d807990fbddb201c8a364c4f5eb79cd3055034b2 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 21 Nov 2019 15:21:08 +0100 Subject: [PATCH 038/102] map queries --- benchmark/benchmark_locate_prefix.cpp | 24 ++++++------ include/integer_fc_dictionary.hpp | 4 +- include/inverted_index.hpp | 14 +++++++ src/CMakeLists.txt | 3 +- src/map_queries.cpp | 54 +++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 15 deletions(-) create mode 100644 src/map_queries.cpp diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index 998d8c7..f9e6282 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -88,20 +88,20 @@ int main(int argc, 
char** argv) { benchmark(params, queries, num_queries, num_terms_per_query, keep); } else if (type == "fc") { - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); benchmark>(params, queries, num_queries, num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); - benchmark>(params, queries, num_queries, - num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); + // benchmark>(params, queries, num_queries, + // num_terms_per_query, keep); } else { return 1; } diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index 443cc8f..39e547f 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -174,12 +174,12 @@ struct integer_fc_dictionary { p_end += right_locate(completion_to_uint32_range(prefix), h_end, bucket_id_end); + prefix.pop_back(); + if (p_end < p_begin) { - prefix.pop_back(); return global::invalid_range; } - prefix.pop_back(); if (suffix_lex_range.begin == suffix_lex_range.end) { prefix.pop_back(); } diff --git a/include/inverted_index.hpp b/include/inverted_index.hpp index cd4ad29..0bef228 100644 --- a/include/inverted_index.hpp +++ b/include/inverted_index.hpp @@ -28,10 +28,18 @@ struct inverted_index { std::vector list; m_pointers.push_back(0); + + uint32_t max_list_size = 0; + uint32_t min_list_size = uint32_t(-1); + for (uint64_t i = 0; i != num_terms; 
++i) { list.clear(); uint32_t n = 0; input >> n; + + if (n > max_list_size) max_list_size = n; + if (n < min_list_size) min_list_size = n; + list.reserve(n); m_num_integers += n; for (uint64_t k = 0; k != n; ++k) { @@ -46,6 +54,12 @@ struct inverted_index { m_pointers.push_back(m_bvb.size()); } + std::cout << "avg. list size = " + << static_cast(m_num_integers) / num_terms + << std::endl; + std::cout << "max_list_size = " << max_list_size << std::endl; + std::cout << "min_list_size = " << min_list_size << std::endl; + m_pointers.pop_back(); input.close(); essentials::logger("DONE"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a9e4661..576f34b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,4 +2,5 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) -add_executable(check_topk check_topk.cpp) \ No newline at end of file +add_executable(check_topk check_topk.cpp) +add_executable(map_queries map_queries.cpp) \ No newline at end of file diff --git a/src/map_queries.cpp b/src/map_queries.cpp new file mode 100644 index 0000000..f607d3d --- /dev/null +++ b/src/map_queries.cpp @@ -0,0 +1,54 @@ +#include + +#include "types.hpp" + +using namespace autocomplete; + +template +completion_type parse(Dictionary const& dict, std::string const& query) { + completion_type completion; + byte_range_iterator it(string_to_byte_range(query)); + while (true) { + byte_range term = it.next(); + if (!it.has_next()) break; + auto term_id = dict.locate(term); + assert(term_id > 0); + assert(term_id != global::invalid_term_id); + completion.push_back(term_id - 1); + } + return completion; +} + +int main(int argc, char** argv) { + int mandatory = 2 + 1; + if (argc < mandatory) { + std::cout << argv[0] << " < queries" + << std::endl; + return 1; + } + + parameters params; + params.collection_basename = argv[1]; + 
params.load(); + + uint32_t num_queries = std::atoi(argv[2]); + + fc_dictionary_type dict; + { + fc_dictionary_type::builder builder(params); + builder.build(dict); + } + + std::string query; + for (uint32_t i = 0; i != num_queries; ++i) { + if (!std::getline(std::cin, query)) break; + auto completion = parse(dict, query); + std::cout << completion.front(); + for (size_t i = 1; i != completion.size(); ++i) { + std::cout << "\t" << completion[i]; + } + std::cout << "\n"; + } + + return 0; +} \ No newline at end of file From 7309a0aa3b0dd5e86c934ccb1b3367ae8e666787 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 21 Nov 2019 15:23:36 +0100 Subject: [PATCH 039/102] map queries --- src/map_queries.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/map_queries.cpp b/src/map_queries.cpp index f607d3d..17a460d 100644 --- a/src/map_queries.cpp +++ b/src/map_queries.cpp @@ -8,9 +8,8 @@ template completion_type parse(Dictionary const& dict, std::string const& query) { completion_type completion; byte_range_iterator it(string_to_byte_range(query)); - while (true) { + while (it.has_next()) { byte_range term = it.next(); - if (!it.has_next()) break; auto term_id = dict.locate(term); assert(term_id > 0); assert(term_id != global::invalid_term_id); From a22a83db1d3a69d37589a748fadacff1de518adf Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 21 Nov 2019 15:25:15 +0100 Subject: [PATCH 040/102] map queries --- src/map_queries.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/map_queries.cpp b/src/map_queries.cpp index 17a460d..de43df1 100644 --- a/src/map_queries.cpp +++ b/src/map_queries.cpp @@ -42,11 +42,11 @@ int main(int argc, char** argv) { for (uint32_t i = 0; i != num_queries; ++i) { if (!std::getline(std::cin, query)) break; auto completion = parse(dict, query); - std::cout << completion.front(); + std::cerr << completion.front(); for (size_t i = 1; i != completion.size(); ++i) { - 
std::cout << "\t" << completion[i]; + std::cerr << "\t" << completion[i]; } - std::cout << "\n"; + std::cerr << "\n"; } return 0; From 350df1bd9c345dbcf04cfbfffb1bea84404a4b9a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 22 Nov 2019 11:26:02 +0100 Subject: [PATCH 041/102] fix benchmark_conjunctive_topk --- benchmark/benchmark_conjunctive_topk.cpp | 4 ++-- .../collect_results_by_varying_percentage.py | 24 +++++++++++++++++++ ...lect_topk_results_by_varying_percentage.py | 23 ------------------ 3 files changed, 26 insertions(+), 25 deletions(-) create mode 100644 script/collect_results_by_varying_percentage.py delete mode 100644 script/collect_topk_results_by_varying_percentage.py diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index 83e2c99..ad10ec6 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -27,7 +27,7 @@ void benchmark(std::string const& index_filename, uint32_t k, std::vector timers(4); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, timers); + auto it = index.conjunctive_topk(query, k, timers); reported_strings += it.size(); } } @@ -45,7 +45,7 @@ void benchmark(std::string const& index_filename, uint32_t k, timer.start(); for (uint32_t run = 0; run != runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k); + auto it = index.conjunctive_topk(query, k); reported_strings += it.size(); } } diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py new file mode 100644 index 0000000..baeeb85 --- /dev/null +++ b/script/collect_results_by_varying_percentage.py @@ -0,0 +1,24 @@ +import sys, os + +index_type = sys.argv[1] +query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk +index_filename = sys.argv[3] +dataset_name = sys.argv[4] +k = sys.argv[5] +num_queries = sys.argv[6] + 
+output_filename = dataset_name + "." + index_type + +breakdown = "" +if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": + breakdown = "--breakdown" + output_filename += ".breakdown" + +output_filename += "." + query_mode + ".timings.json" + +percentages = ["0.0", "0.25", "0.50", "0.75"] + +for perc in percentages: + for terms in range(2,8): # (1,8) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) diff --git a/script/collect_topk_results_by_varying_percentage.py b/script/collect_topk_results_by_varying_percentage.py deleted file mode 100644 index f520405..0000000 --- a/script/collect_topk_results_by_varying_percentage.py +++ /dev/null @@ -1,23 +0,0 @@ -import sys, os - -type = sys.argv[1] -index_filename = sys.argv[2] -dataset_name = sys.argv[3] -k = sys.argv[4] -num_queries = sys.argv[5] - -output_filename = dataset_name + "." 
+ type - -breakdown = "" -if len(sys.argv) > 6 and sys.argv[6] == "--breakdown": - breakdown = "--breakdown" - output_filename += ".breakdown" - -output_filename += ".topk.timings.json" - -percentages = ["0.0", "0.25", "0.50", "0.75"] - -for perc in percentages: - for terms in range(2,8): # (1,8) - os.system("../build/benchmark_topk " + type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) - os.system("../build/benchmark_topk " + type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) From 4aca38391216cfc37ac825bff6486d745cf13f84 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 22 Nov 2019 11:30:11 +0100 Subject: [PATCH 042/102] fix benchmark_conjunctive_topk --- include/autocomplete4.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index ec88ec3..d884912 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -88,7 +88,7 @@ struct autocomplete4 { if (suffix_lex_range.is_invalid()) return m_pool.begin(); uint32_t num_completions = - conjunctive_topk(prefix, suffix_lex_range, k, m_pool.scores()); + conjunctive_topk(prefix, suffix_lex_range, k); extract_completions(num_completions); return extract_strings(num_completions); } @@ -217,8 +217,7 @@ struct autocomplete4 { // step 2 timers[2].start(); - num_completions = - conjunctive_topk(prefix, suffix_lex_range, k, m_pool.scores()); + num_completions = conjunctive_topk(prefix, suffix_lex_range, k); timers[2].stop(); // step 3 From cca6b637674b89a1b07e841241e02d53c861d2b9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Nov 2019 
10:45:47 +0100 Subject: [PATCH 043/102] small optimization for block_inv_idx --- include/blocked_inverted_index.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 8425e4e..c9c3bf1 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -251,6 +251,11 @@ struct blocked_inverted_index { return id; } + uint32_t block_boundary(uint32_t block_id) const { + assert(block_id < m_blocks.size()); + return m_blocks[block_id]; + } + struct block_type { docs_iterator_type docs_iterator; offsets_iterator_type offsets_iterator; @@ -312,14 +317,16 @@ struct blocked_inverted_index { { uint32_t current_block_id = ii->block_id(r.begin); - uint32_t i = r.begin; - for (; i != r.end; ++i) { + uint32_t current_block_boundary = + ii->block_boundary(current_block_id); + for (uint32_t i = r.begin; i != r.end; ++i) { assert(i > 0); - uint32_t b = ii->block_id(i); - if (b > current_block_id) { + if (i > current_block_boundary) { m_range.push_back(ii->block(current_block_id)); + current_block_id += 1; + current_block_boundary = + ii->block_boundary(current_block_id); } - current_block_id = b; } m_range.push_back(ii->block(current_block_id)); } From 9cebb3dc20301a87a07532ab0508858f4036b675 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Nov 2019 19:09:57 +0100 Subject: [PATCH 044/102] minor fix: ensure bit width --- include/bit_vector.hpp | 1 + include/blocked_inverted_index.hpp | 1 + include/compact_vector.hpp | 16 +++++++++------- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/bit_vector.hpp b/include/bit_vector.hpp index 676c112..747faef 100644 --- a/include/bit_vector.hpp +++ b/include/bit_vector.hpp @@ -412,6 +412,7 @@ struct bits_getter { , m_base(offset) , m_width(width) , m_mask(-(width == 64) | ((uint64_t(1) << width) - 1)) { + assert(width > 0); util::prefetch(m_data + m_base / 64); } diff 
--git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index c9c3bf1..cf6307e 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -118,6 +118,7 @@ struct blocked_inverted_index { auto max = *std::max_element(term_list.begin(), term_list.end()); uint64_t width = util::ceil_log2(max + 1); + if (width == 0) width = 1; // std::cout << "using " << width << " [bpi]" << std::endl; m_terms.append_bits(width, 6); for (auto t : term_list) m_terms.append_bits(t, width); diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index eb3f9b0..da99182 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -73,24 +73,26 @@ struct compact_vector { }; struct builder { - builder(uint64_t n = 0, uint64_t w = 0) + builder() {} + + builder(uint64_t n, uint64_t w) : m_size(n) - , m_width(!w ? w + 1 : w) + , m_width(w) , m_mask(-(w == 64) | ((1ULL << w) - 1)) , m_back(0) , m_cur_block(0) , m_cur_shift(0) , m_bits(essentials::words_for(m_size * m_width), 0) { - if (m_width > 64) { - throw std::runtime_error("width must be <= 64"); + if (m_width == 0 or m_width > 64) { + throw std::runtime_error("width must be > 0 and <= 64"); } } void resize(size_t n, uint64_t w) { m_size = n; - m_width = !w ? 
w + 1 : w; - if (m_width > 64) { - throw std::runtime_error("width must be <= 64"); + m_width = w; + if (m_width == 0 or m_width > 64) { + throw std::runtime_error("width must be > 0 and <= 64"); } m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); m_bits.resize(essentials::words_for(m_size * m_width), 0); From a52d05d52f2e89691f40e66170a23dcfbe6c4575 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 26 Nov 2019 16:04:53 +0100 Subject: [PATCH 045/102] optimized bast and weber --- benchmark/benchmark_conjunctive_topk.cpp | 8 +++ benchmark/benchmark_prefix_topk.cpp | 18 +++-- include/autocomplete2.hpp | 10 +-- include/autocomplete3.hpp | 16 ++--- include/autocomplete4.hpp | 64 +++++++++++++++-- include/blocked_inverted_index.hpp | 91 ++++++++---------------- 6 files changed, 122 insertions(+), 85 deletions(-) diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index ad10ec6..23f9bba 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -53,6 +53,14 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); + + // for (auto const& query : queries) { + // auto it = index.conjunctive_topk(query, k); + // reported_strings += it.size(); + // } + // breakdowns.add("avg_results_per_query", + // std::to_string(static_cast(reported_strings) / + // queries.size())); } } diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index 28046a2..2c31c68 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -34,12 +34,14 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("completions_search_musec_per_query", 
+ // breakdowns.add("completions_search_musec_per_query", + // std::to_string(musec_per_query(timers[1].elapsed()))); + // breakdowns.add("topk_rmq_musec_per_query", + // std::to_string(musec_per_query(timers[2].elapsed()))); + breakdowns.add("prefix_search_musec_per_query", std::to_string(musec_per_query(timers[1].elapsed()))); - breakdowns.add("topk_rmq_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); + std::to_string(musec_per_query(timers[2].elapsed()))); } else { essentials::timer_type timer; timer.start(); @@ -53,6 +55,14 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); + + // for (auto const& query : queries) { + // auto it = index.prefix_topk(query, k); + // reported_strings += it.size(); + // } + // breakdowns.add("avg_results_per_query", + // std::to_string(static_cast(reported_strings) / + // queries.size())); } } diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 7216379..52b7273 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -209,19 +209,19 @@ struct autocomplete2 { suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); + // timers[1].stop(); // step 2 - timers[2].start(); + // timers[2].start(); uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); + timers[1].stop(); // step 3 - timers[3].start(); + timers[2].start(); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + timers[2].stop(); return it; } diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index c015583..4faf5a6 100644 --- a/include/autocomplete3.hpp +++ 
b/include/autocomplete3.hpp @@ -320,9 +320,7 @@ struct autocomplete3 { uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; - - bool found = false; - while (!q.empty() and !found) { + while (!q.empty()) { auto& z = q.top(); auto val = *z; if (val > doc_id) break; @@ -334,12 +332,12 @@ struct autocomplete3 { q.heapify(); } } - if (val == doc_id) found = true; - } - - if (found) { - topk_scores[results++] = doc_id; - if (results == k) break; + if (val == doc_id) { // NOTE: putting else here seems to slow + // down the code! + topk_scores[results++] = doc_id; + if (results == k) return results; + break; + } } } diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index d884912..ecab539 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -274,19 +274,75 @@ struct autocomplete4 { } } + typedef typename BlockedInvertedIndex::block_type block_t; + + struct block_type_comparator { + bool operator()(block_t& l, block_t& r) { + return l.docs_iterator.operator*() > r.docs_iterator.operator*(); + } + }; + + typedef min_heap min_priority_queue_type; + uint32_t conjunctive_topk(completion_type& prefix, const range suffix, const uint32_t k) { auto& topk_scores = m_pool.scores(); deduplicate(prefix); + + min_priority_queue_type q; + uint32_t current_block_id = m_inverted_index.block_id(suffix.begin); + uint32_t current_block_boundary = + m_inverted_index.block_boundary(current_block_id); + for (uint32_t i = suffix.begin; i != suffix.end; ++i) { + assert(i > 0); + if (i > current_block_boundary) { + q.push_back(m_inverted_index.block(current_block_id)); + current_block_id += 1; + current_block_boundary = + m_inverted_index.block_boundary(current_block_id); + } + } + q.push_back(m_inverted_index.block(current_block_id)); + q.make_heap(); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); uint32_t results = 0; - for (; it.has_next(); ++it) { + for (; it.has_next() and !q.empty(); ++it) { auto doc_id 
= *it; - if (it.intersects()) { - topk_scores[results++] = doc_id; - if (results == k) break; + + while (!q.empty()) { + auto& z = q.top(); + auto val = z.docs_iterator.operator*(); + if (val > doc_id) break; + if (val < doc_id) { + val = z.docs_iterator.next_geq(doc_id); + if (!z.docs_iterator.has_next()) { + q.pop(); + } else { + q.heapify(); + } + } else { + if (val == doc_id) { + uint64_t pos = z.docs_iterator.position(); + assert(z.docs_iterator.access(pos) == doc_id); + uint64_t begin = z.offsets_iterator.access(pos); + uint64_t end = z.offsets_iterator.access(pos + 1); + assert(end > begin); + for (uint64_t i = begin; i != end; ++i) { + auto t = z.terms_iterator.access(i) + z.lower_bound; + if (t > suffix.end) break; + if (suffix.contains(t)) { + topk_scores[results++] = doc_id; + if (results == k) return results; + break; + } + } + } + break; + } } } + return results; } diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index cf6307e..e87aa32 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -316,22 +316,6 @@ struct blocked_inverted_index { m_candidate = 0; } - { - uint32_t current_block_id = ii->block_id(r.begin); - uint32_t current_block_boundary = - ii->block_boundary(current_block_id); - for (uint32_t i = r.begin; i != r.end; ++i) { - assert(i > 0); - if (i > current_block_boundary) { - m_range.push_back(ii->block(current_block_id)); - current_block_id += 1; - current_block_boundary = - ii->block_boundary(current_block_id); - } - } - m_range.push_back(ii->block(current_block_id)); - } - next(); } @@ -356,25 +340,6 @@ struct blocked_inverted_index { next(); } - bool intersects() { - for (auto& b : m_range) { - uint64_t val = b.docs_iterator.next_geq(m_candidate); - if (val == m_candidate) { - uint64_t pos = b.docs_iterator.position(); - assert(b.docs_iterator.access(pos) == m_candidate); - uint64_t begin = b.offsets_iterator.access(pos); - uint64_t end = 
b.offsets_iterator.access(pos + 1); - assert(end > begin); - for (uint64_t i = begin; i != end; ++i) { - auto t = b.terms_iterator.access(i) + b.lower_bound; - if (t > m_suffix.end) break; - if (m_suffix.contains(t)) return true; - } - } - } - return false; - } - private: id_type m_candidate; size_t m_i; @@ -440,34 +405,6 @@ struct blocked_inverted_index { return intersection_iterator_type(this, term_ids, r); } - template - void visit(Visitor& visitor) { - visitor.visit(m_num_integers); - visitor.visit(m_num_docs); - visitor.visit(m_num_terms); - visitor.visit(m_blocks); - visitor.visit(m_pointers_to_lists); - visitor.visit(m_lists); - visitor.visit(m_pointers_to_offsets); - visitor.visit(m_offsets); - visitor.visit(m_pointers_to_terms); - visitor.visit(m_terms); - } - -private: - uint64_t m_num_integers; - uint64_t m_num_docs; - uint64_t m_num_terms; - - std::vector m_blocks; - - ef::ef_sequence m_pointers_to_lists; - bit_vector m_lists; - ef::ef_sequence m_pointers_to_offsets; - bit_vector m_offsets; - ef::ef_sequence m_pointers_to_terms; - bit_vector m_terms; - block_type block(uint32_t block_id) const { assert(block_id < num_blocks()); block_type b; @@ -496,6 +433,34 @@ struct blocked_inverted_index { return b; } + + template + void visit(Visitor& visitor) { + visitor.visit(m_num_integers); + visitor.visit(m_num_docs); + visitor.visit(m_num_terms); + visitor.visit(m_blocks); + visitor.visit(m_pointers_to_lists); + visitor.visit(m_lists); + visitor.visit(m_pointers_to_offsets); + visitor.visit(m_offsets); + visitor.visit(m_pointers_to_terms); + visitor.visit(m_terms); + } + +private: + uint64_t m_num_integers; + uint64_t m_num_docs; + uint64_t m_num_terms; + + std::vector m_blocks; + + ef::ef_sequence m_pointers_to_lists; + bit_vector m_lists; + ef::ef_sequence m_pointers_to_offsets; + bit_vector m_offsets; + ef::ef_sequence m_pointers_to_terms; + bit_vector m_terms; }; } // namespace autocomplete \ No newline at end of file From 
802ef16b303284ae09541475efd17f89a370553a Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 28 Nov 2019 12:06:34 +0100 Subject: [PATCH 046/102] refactoring --- README.md | 133 ++++-------- TODO.md | 0 benchmark/benchmark_common.hpp | 6 + benchmark/benchmark_conjunctive_topk.cpp | 17 +- include/autocomplete.hpp | 14 +- include/autocomplete2.hpp | 51 ++--- include/autocomplete3.hpp | 35 ++- include/autocomplete4.hpp | 34 ++- include/blocked_inverted_index.hpp | 2 +- include/building_util.hpp | 47 ++-- include/compact_forward_index.hpp | 11 +- include/compact_vector.hpp | 19 +- include/integer_fc_dictionary.hpp | 15 +- include/inverted_index.hpp | 2 +- include/parameters.hpp | 3 + install.sh | 2 +- results/README.md | 22 -- results/conjunctive_topk.md | 107 ---------- results/fc_dictionary.md | 75 ------- results/integer_fc_dictionary.md | 31 --- results/inverted_index_space.md | 19 -- results/prefix_topk.md | 94 -------- results/space.md | 159 -------------- results/topk.md | 201 ------------------ .../collect_results_by_varying_percentage.py | 12 +- src/output_ds2i_format.cpp | 2 +- test/test_blocked_inverted_index.cpp | 4 +- test/test_compact_forward_index.cpp | 4 +- test/test_inverted_index.cpp | 8 +- test/test_locate_prefix.cpp | 8 +- test/test_unsorted_list.cpp | 20 +- test_data/build_inverted_and_forward.py | 7 +- test_data/build_stats.py | 10 + test_data/filter_and_preprocess.sh | 14 ++ test_data/filter_dataset.py | 32 +++ test_data/partition_queries_by_length.py | 23 +- test_data/preprocess.sh | 17 +- 37 files changed, 281 insertions(+), 979 deletions(-) delete mode 100644 TODO.md delete mode 100644 results/README.md delete mode 100644 results/conjunctive_topk.md delete mode 100644 results/fc_dictionary.md delete mode 100644 results/integer_fc_dictionary.md delete mode 100644 results/inverted_index_space.md delete mode 100644 results/prefix_topk.md delete mode 100644 results/space.md delete mode 100644 results/topk.md create mode 100644 
test_data/filter_and_preprocess.sh create mode 100644 test_data/filter_dataset.py diff --git a/README.md b/README.md index 31c1649..f19bd7b 100644 --- a/README.md +++ b/README.md @@ -4,119 +4,65 @@ Autocomplete Query autocompletion in C++. ##### Table of contents -1. [Description](#descr) -2. [Installation and quick start](#install) -3. [Compiling the code](#compiling) -4. [Input data format](#input) -5. [Running the unit tests](#testing) -6. [Building an index](#building) -7. [Benchmarks](#benchmarks) -8. [Live demo](#demo) - -Description ------------ - -We designed two solutions (`autocomplete.hpp` and `autocomplete2.hpp`). -The second solution avoids storing the forward index of the first solution. - -Both solution build on two steps: (1) a prefix search (`prefix_topk`) and (2) a conjunctive search (`conjunctive_topk`). - -Recall that each completion has an associated integer identifier (henceforth, called docID), assigned in *decreasing* score order. - -#### 1. Prefix search - -This step returns the top-k completions that are prefixed by the terms in the query. -For this purposes, we build a dictionary storing all completions seen as (multi-) sets of termIDs. -Solution 1 uses an integer trie data structure (`completion_trie.hpp`); -Solution 2 uses Front Coding (`integer_fc_dictionary.hpp`). -We also materialize the list L of docIDs sorted by the lexicographical order of the completions (`unsorted_list.hpp`). - -During a search, we first map the query terms to their lexicographic IDs by using a string dictionary (implemented as a 2-level index with Front Coding -- `fc_dictionary.hpp`). Then, we search the mapped query, say Q, into the completion trie to obtain the lexicographic range [l,r] of all completions that are children of Q. Then we need to identify the top-k docIDs from L[l,r]. Since the range [l,r] can be very large, we use a RMQ data structure built on L. - -Having retrieved a list of (at most) k docIDs, we then: - -1. 
Solution 1: use a forward index (`forward_index.hpp`) to materialize the identified completions into a string pool (`scored_string_pool.hpp`). -The forward index stores the sorted (multi-) set of the termIDs of each completion, plus also the permutation of such termIDs in order to restore the original completion. The sets are stored in increasing-docID order. -Specifically, we use the forward index to obtain the (permuted) set -of termIDs and the string dictionary to extract the strings. - -2. Solution 2: use a map from docIDs to lexicographic IDs. For every top-k docID, we extract the corresponding completion from the FC-based dictionary. - -#### 2. Conjunctive search - -This step returns the top-k completions using an inverted index (`inverted_index.hpp`). -For this purpose, let us consider a query Q[1,m] as tokenized into m terms (the last one possibly not completed). -In this case we want to return the top-k (smallest) docIDs belonging -to the intersection between the posting lists of the first m-1 terms -and the union between all the postings lists of the terms that are -prefixed by Q[m]. - -To do so, we could trivially materialize the union and then proceed -with the intersection. -The clear problem with this approach is that the number of terms that are prefixed by Q[m] can be very large. Therefore iterating over the union can be overkilling. - -To solve this problem, we first obtain the lexicographic range of Q[m] by the string dictionary, say [l,r]. -We then iterate over the intersection of the first m-1 terms' posting lists and for each docID x we check whether the range [l,r] intersect the forward list of x. This check is done with the forward index. -If the check succeeds, then x is among the top-k documents. -We keep iterating over the intersection and checking the forward lists until we have k completions or we touch every docID in the intersection. - -There is a special case for the case m = 1. 
In this case, we have no term before the last (only) one, thus we would check *all* forward lists for the range [l,r]. This is too expensive. -Therefore, we use another RMQ data structure, built on the list, say M, of all the first (i.e., *minimal*) docIDs of the posting lists (think of it as the "first" column of the inverted index). -A recursive heap-based algorithm is used to produce the smallest docIDs in M[l,r] using the RMQ data structure. - -The final string extraction step is identical to that of the -prefix search. +1. [Installation and quick start](#install) +2. [Compiling the code](#compiling) +3. [Input data format](#input) +4. [Running the unit tests](#testing) +5. [Building an index](#building) +6. [Benchmarks](#benchmarks) +7. [Live demo](#demo) Installation and quick start ------------------ Just run - $ bash ./install.sh + bash ./install.sh -from the parent directory. The script builds the code; prepare the test data in the folder `test_data` for indexing; executes the unit tests. +from the parent directory. The script builds the code; prepare the test data in the folder `test_data/trec_05_efficiency_queries` for indexing; executes the unit tests. After that, for having a minimal running example, just run - $ bash ./example.sh + bash ./example.sh and then access the service [here](http://127.0.0.1:8000). Compiling the code ------------------ -The code is tested on Linux with `gcc` 7.4.0 and on Mac 10.14 with `clang` 10.0.0. +The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, 9.0.0 and on Mac 10.14 with `clang` 10.0.0. To build the code, [`CMake`](https://cmake.org/) is required. 
Clone the repository with - $ git clone --recursive https://github.com/jermp/autocomplete.git + git clone --recursive https://github.com/jermp/autocomplete.git If you have cloned the repository without `--recursive`, you will need to perform the following commands before compiling: - $ git submodule init - $ git submodule update + git submodule init + git submodule update To compile the code for a release environment (see file `CMakeLists.txt` for the used compilation flags), it is sufficient to do the following: - $ mkdir build - $ cd build - $ cmake .. - $ make + mkdir build + cd build + cmake .. + make -Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs. +Hint: Use `make -j` to compile the library in parallel using all +available threads. For the best of performance, we recommend compiling with: - $ cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On + cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On For a testing environment, use the following instead: - $ mkdir debug_build - $ cd debug_build - $ cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On - $ make + mkdir debug_build + cd debug_build + cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On + make Input data format ----------------- @@ -137,7 +83,11 @@ in preparing the data for indexing. Thus, from within the directory `test_data`, it is sufficient to do: - $ bash preprocess.sh + bash preprocess.sh 300 + +The second argument in the example, i.e., 300, represents the +number of completions (per completion size) that are drawn at +random and could be used to query the indexes. If you run the script, you will get: @@ -168,7 +118,7 @@ The unit tests are written using [doctest](https://github.com/onqtam/doctest). 
After compilation and preparation of the data for indexing (see Section [Input data format](#input)), it is advised to run the unit tests with: - $ make test + make test Building an index ----------- @@ -178,31 +128,36 @@ where the index will be written. For example, with - $ ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin + ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec05.ef_type1.bin we can build an index of type `ef_type1` from the test file `../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions`, that will be serialized to the file `trec05.ef_type1.bin`. Possible types are `ef_type1`, `ef_type2`, `ef_type3` and `ef_type4`. +Note: the type `ef_type4` requires an extra parameter +to be specified, `c`. Use for example: `-c 0.0001`. Benchmarks ---------- To run the top-k benchmarks in the `/benchmark` directory, we first need some query logs. +They should have been created already if you have run the +script `preprocess.sh`, otherwise +you can use -You can use - - $ python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions + python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300 -to partition the input completions by number of query terms. Each partition -of queries is shuffled at random to avoid locality of access. +to partition the input completions by number of query terms +and retain 300 queries at random. +Query files are placed in the output directory +`trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries`. (By default, 8 shards will be created: the ones having [1,7] query terms and the one collecting all completions with >= 8 query terms). 
Then the command - $ ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 1000 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.length=3 + ./benchmark_topk ef_type1 10 trec05.ef_type1.bin 3 300 0.25 < ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries/queries.length=3.shuffled will execute 1000 top-10 queries with 3 terms, from which only 25% of the prefix of the last token is retained. @@ -210,7 +165,7 @@ of the prefix of the last token is retained. We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. From within the `/build` directory, run - $ python ../script/collect_topk_results_by_varying_percentage.py ef_type1 trec05.ef_type1.bin trec_05_efficiency_queries 10 5000 + python ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 You can also specify the option `--breakdown` to record timings breakdowns. 
diff --git a/TODO.md b/TODO.md deleted file mode 100644 index e69de29..0000000 diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 4309912..2f12c8a 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -6,6 +6,11 @@ namespace autocomplete { static const uint32_t runs = 5; +// void tolower(std::string& str) { +// std::transform(str.begin(), str.end(), str.begin(), +// [](unsigned char c) { return std::tolower(c); }); +// } + size_t load_queries(std::vector& queries, uint32_t max_num_queries, float percentage, std::istream& is = std::cin) { assert(percentage >= 0.0 and percentage <= 1.0); @@ -20,6 +25,7 @@ size_t load_queries(std::vector& queries, uint32_t max_num_queries, size_t end = size + std::ceil(last_token_size * percentage) + 1 + 1; // retain at least one char for (size = query.size(); size > end; --size) query.pop_back(); + // tolower(query); queries.push_back(query); } return queries.size(); diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index 23f9bba..7d8a7d3 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -16,22 +16,35 @@ void benchmark(std::string const& index_filename, uint32_t k, uint32_t num_queries = load_queries(queries, max_num_queries, keep, std::cin); + uint32_t R = runs; // runs + uint64_t reported_strings = 0; auto musec_per_query = [&](double time) { - return time / (runs * num_queries); + return time / (R * num_queries); }; breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != R; ++run) { for (auto const& query : queries) { auto it = index.conjunctive_topk(query, k, timers); reported_strings += it.size(); } } std::cout << reported_strings << std::endl; + + // breakdowns.add("checked_docids", + // std::to_string(index.checked_docids)); 
breakdowns.add("heap_size", + // std::to_string(index.heap_size)); + + // auto perc_skipped_searches = + // (static_cast(index.skipped_searches) * 100.0) / + // queries.size(); + // breakdowns.add("skipped_searches", + // std::to_string(perc_skipped_searches)); + breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(timers[0].elapsed()))); breakdowns.add("dictionary_search_musec_per_query", diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index 47b4472..616b13f 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -13,6 +13,9 @@ struct autocomplete { typedef scored_string_pool::iterator iterator_type; autocomplete() { + // heap_size = 0; + // checked_docids = 0; + // skipped_searches = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); } @@ -218,7 +221,11 @@ struct autocomplete { // step 1 timers[1].start(); range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); + if (suffix_lex_range.is_invalid()) { + // ++skipped_searches; + // std::cout << "'" << query << "'\n"; + return m_pool.begin(); + } timers[1].stop(); @@ -261,6 +268,10 @@ struct autocomplete { visitor.visit(m_forward_index); } + // uint64_t heap_size; + // uint64_t checked_docids; + // uint64_t skipped_searches; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -294,6 +305,7 @@ struct autocomplete { uint32_t results = 0; for (; it.has_next(); ++it) { auto doc_id = *it; + // ++checked_docids; if (m_forward_index.intersects(doc_id, r)) { topk_scores[results++] = doc_id; if (results == k) break; diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 52b7273..9d05226 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -15,6 +15,8 @@ struct autocomplete2 { typedef scored_string_pool::iterator iterator_type; autocomplete2() { + // heap_size = 0; + // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, 
constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -26,29 +28,13 @@ struct autocomplete2 { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); @@ -287,6 +273,9 @@ struct autocomplete2 { visitor.visit(m_docid_to_lexid); } + // uint64_t heap_size; + // uint64_t checked_docids; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -336,19 +325,17 @@ struct autocomplete2 { for (; it.has_next(); ++it) { auto doc_id = *it; + // ++checked_docids; auto lex_id = m_docid_to_lexid[doc_id]; uint32_t size = m_completions.extract(lex_id, completions[i]); - - bool found = false; - for (uint32_t j = 0; j != size and !found; ++j) { - if (r.contains(completions[i][j])) found = true; - } - - if (found) { - topk_scores[i] = doc_id; - sizes[i] = size; - ++i; - if (i == k) break; + for (uint32_t j = 0; j != size; ++j) { + if (r.contains(completions[i][j])) { + 
topk_scores[i] = doc_id; + sizes[i] = size; + ++i; + if (i == k) return k; + break; + } } } diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 4faf5a6..6165e19 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -25,6 +25,8 @@ struct autocomplete3 { min_priority_queue_type; autocomplete3() { + // heap_size = 0; + // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -36,27 +38,11 @@ struct autocomplete3 { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); cm_builder.build(m_completions); di_builder.build(m_dictionary); @@ -261,6 +247,9 @@ struct autocomplete3 { visitor.visit(m_docid_to_lexid); } + // uint64_t heap_size; + // uint64_t checked_docids; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -317,9 +306,13 @@ struct autocomplete3 { } q.make_heap(); + // heap_size += q.size(); + uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; + // ++checked_docids; + while 
(!q.empty()) { auto& z = q.top(); auto val = *z; diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index ecab539..cd44706 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -18,6 +18,8 @@ struct autocomplete4 { typedef scored_string_pool::iterator iterator_type; autocomplete4() { + // heap_size = 0; + // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -29,27 +31,11 @@ struct autocomplete4 { typename Dictionary::builder di_builder(params); typename BlockedInvertedIndex::builder ii_builder(params, c); - auto const& doc_ids = cm_builder.doc_ids(); - m_unsorted_docs_list.build(doc_ids); - - { - essentials::logger("building map from doc_id to lex_id..."); - uint64_t n = doc_ids.size(); - typedef std::vector> id_map_type; - id_map_type ids; - ids.reserve(n); - for (id_type lex_id = 0; lex_id != n; ++lex_id) { - ids.emplace_back(lex_id, doc_ids[lex_id]); - } - std::sort(ids.begin(), ids.end(), [](auto const& l, auto const& r) { - return l.second < r.second; - }); - m_docid_to_lexid.build( - util::first_iterator( - ids.begin()), - ids.size()); - essentials::logger("DONE"); - } + auto const& docid_to_lexid = cm_builder.docid_to_lexid(); + m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), + util::ceil_log2(params.num_completions + 1)); + m_unsorted_docs_list.build( + util::invert(docid_to_lexid, params.num_completions)); cm_builder.build(m_completions); di_builder.build(m_dictionary); @@ -246,6 +232,9 @@ struct autocomplete4 { visitor.visit(m_docid_to_lexid); } + // uint64_t heap_size; + // uint64_t checked_docids; + private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -305,10 +294,13 @@ struct autocomplete4 { q.push_back(m_inverted_index.block(current_block_id)); q.make_heap(); + // heap_size += q.size(); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); 
uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; + // ++checked_docids; while (!q.empty()) { auto& z = q.top(); diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index e87aa32..519a0bf 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -21,7 +21,7 @@ struct blocked_inverted_index { builder(parameters const& params, float c) : m_num_integers(0) - , m_num_docs(params.num_completions) + , m_num_docs(params.universe) , m_num_terms(params.num_terms) { if (!(c > 0.0 and c <= 1.0)) { throw std::runtime_error("c must be in (0,1]"); diff --git a/include/building_util.hpp b/include/building_util.hpp index 17427b6..0398879 100644 --- a/include/building_util.hpp +++ b/include/building_util.hpp @@ -1,10 +1,22 @@ #pragma once +#include "util.hpp" #include "bit_vector.hpp" namespace autocomplete { namespace util { +std::vector invert(std::vector const& docid_to_lexid, + uint64_t size) { + std::vector lexid_to_docid(size); + for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { + if (docid_to_lexid[doc_id] < size) { + lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; + } + } + return lexid_to_docid; +} + void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { uint64_t mod = bvb.size() % alignment; if (mod) { @@ -23,40 +35,5 @@ void eat_pad(bits_iterator& it, uint64_t alignment = 8) { } } -template -struct first_iterator - : std::iterator { - first_iterator(Iterator it, uint64_t state = 0) - : m_it(it) - , m_state(state) {} - - typename Iterator::value_type::first_type operator*() { - return (*m_it).first; - } - - first_iterator& operator++() { - m_it += 1; - m_state += 1; - return *this; - } - - first_iterator operator+(uint64_t n) { - return {m_it + n, m_state + n}; - } - - bool operator==(first_iterator const& other) const { - return m_state == other.m_state; - } - - bool operator!=(first_iterator const& other) const { - return !(*this == 
other); - } - -private: - Iterator m_it; - uint64_t m_state; -}; - } // namespace util } // namespace autocomplete \ No newline at end of file diff --git a/include/compact_forward_index.hpp b/include/compact_forward_index.hpp index 74ad769..bde4b71 100644 --- a/include/compact_forward_index.hpp +++ b/include/compact_forward_index.hpp @@ -14,20 +14,19 @@ struct compact_forward_index { : m_num_integers(0) , m_num_terms(params.num_terms) { essentials::logger("building forward_index..."); - uint64_t num_completions = params.num_completions; + uint64_t universe = params.universe; std::ifstream input( (params.collection_basename + ".forward").c_str(), std::ios_base::in); - - std::vector terms; - terms.reserve(params.num_completions * + std::vector terms; + terms.reserve(universe * constants::MAX_NUM_TERMS_PER_QUERY); // at most uint64_t size = 0; m_pointers.push_back(0); - for (uint64_t i = 0; i != num_completions; ++i) { + for (uint64_t i = 0; i != universe; ++i) { uint32_t n = 0; input >> n; - assert(n > 0 and n < constants::MAX_NUM_TERMS_PER_QUERY); + assert(n < constants::MAX_NUM_TERMS_PER_QUERY); m_num_integers += n; size += n; for (uint64_t k = 0; k != n; ++k) { diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index da99182..ac8e275 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -73,7 +73,10 @@ struct compact_vector { }; struct builder { - builder() {} + builder() + : m_back(0) + , m_cur_block(0) + , m_cur_shift(0) {} builder(uint64_t n, uint64_t w) : m_size(n) @@ -95,6 +98,8 @@ struct compact_vector { throw std::runtime_error("width must be > 0 and <= 64"); } m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); + std::cout << "using " << essentials::words_for(m_size * m_width) + << " words" << std::endl; m_bits.resize(essentials::words_for(m_size * m_width), 0); } @@ -110,7 +115,7 @@ struct compact_vector { throw std::runtime_error("width must be greater than 0"); } - for (uint64_t i = 0; i < n; ++i, ++begin) { + for 
(uint64_t i = 0; i != n; ++i, ++begin) { push_back(*begin); } } @@ -222,8 +227,13 @@ struct compact_vector { void build(Iterator begin, uint64_t n) { uint64_t max = *std::max_element(begin, begin + n); uint64_t width = util::ceil_log2(max + 1); - std::cout << "\tusing " << width << " [bpi]" << std::endl; - compact_vector::builder builder(begin, n, width); + build(begin, n, width); + } + + template + void build(Iterator begin, uint64_t n, uint64_t w) { + std::cout << "\tusing " << w << " [bpi]" << std::endl; + compact_vector::builder builder(begin, n, w); builder.build(*this); } @@ -314,4 +324,5 @@ struct compact_vector { uint64_t m_mask; std::vector m_bits; }; + } // namespace autocomplete diff --git a/include/integer_fc_dictionary.hpp b/include/integer_fc_dictionary.hpp index 39e547f..29d8743 100644 --- a/include/integer_fc_dictionary.hpp +++ b/include/integer_fc_dictionary.hpp @@ -19,7 +19,7 @@ struct integer_fc_dictionary { essentials::logger( "building integer_fc_dictionary with bucket size " + std::to_string(BucketSize) + "..."); - m_doc_ids.reserve(params.num_completions); + m_docid_to_lexid.resize(params.universe, id_type(-1)); uint32_t buckets = std::ceil(double(m_size) / (BucketSize + 1)); m_pointers_to_buckets.reserve(buckets + 1); @@ -35,9 +35,10 @@ struct integer_fc_dictionary { std::ios_base::in); completion_iterator it(params, input); + id_type lex_id = 0; for (uint32_t b = 0; b != buckets; ++b) { auto& header = *it; - m_doc_ids.push_back(header.doc_id); + m_docid_to_lexid[header.doc_id] = lex_id++; write_header(header.completion); m_pointers_to_headers.push_back(m_headers.size()); completion_type prev; @@ -47,7 +48,7 @@ struct integer_fc_dictionary { for (uint32_t i = 0; i != size; ++i, ++it) { auto& record = *it; auto& curr = record.completion; - m_doc_ids.push_back(record.doc_id); + m_docid_to_lexid[record.doc_id] = lex_id++; uint32_t l = 0; // |lcp(curr,prev)| while (l != curr.size() and l != prev.size() and curr[l] == prev[l]) { @@ -76,7 +77,7 @@ 
struct integer_fc_dictionary { other.m_pointers_to_buckets.swap(m_pointers_to_buckets); other.m_headers.swap(m_headers); other.m_buckets.swap(m_buckets); - other.m_doc_ids.swap(m_doc_ids); + other.m_docid_to_lexid.swap(m_docid_to_lexid); } void build(integer_fc_dictionary& d) { @@ -88,8 +89,8 @@ struct integer_fc_dictionary { builder().swap(*this); } - std::vector& doc_ids() { - return m_doc_ids; + std::vector& docid_to_lexid() { + return m_docid_to_lexid; } private: @@ -98,7 +99,7 @@ struct integer_fc_dictionary { std::vector m_pointers_to_buckets; std::vector m_headers; std::vector m_buckets; - std::vector m_doc_ids; + std::vector m_docid_to_lexid; void write_header(completion_type const& c) { assert(c.size() > 0 and diff --git a/include/inverted_index.hpp b/include/inverted_index.hpp index 0bef228..900fd96 100644 --- a/include/inverted_index.hpp +++ b/include/inverted_index.hpp @@ -16,7 +16,7 @@ struct inverted_index { builder(parameters const& params) : m_num_integers(0) - , m_num_docs(params.num_completions) { + , m_num_docs(params.universe) { essentials::logger("building inverted_index..."); uint64_t num_terms = params.num_terms; diff --git a/include/parameters.hpp b/include/parameters.hpp index 9d03783..d628d25 100644 --- a/include/parameters.hpp +++ b/include/parameters.hpp @@ -24,10 +24,12 @@ struct parameters { input >> num_terms; input >> max_string_length; input >> num_completions; + input >> universe; input >> num_levels; assert(num_terms > 0); assert(max_string_length > 0); assert(num_completions > 0); + assert(universe >= num_completions); assert(num_levels > 0); if (max_string_length > constants::MAX_NUM_CHARS_PER_QUERY) { @@ -52,6 +54,7 @@ struct parameters { uint32_t num_terms; uint32_t max_string_length; uint32_t num_completions; + uint32_t universe; uint32_t num_levels; std::vector nodes_per_level; std::string collection_basename; diff --git a/install.sh b/install.sh index 9e8da9e..7714147 100644 --- a/install.sh +++ b/install.sh @@ -5,7 +5,7 @@ 
cd build cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=Off -DUSE_INTRINSICS=On -DUSE_PDEP=On make cd ../test_data -./preprocess.sh +bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 cd ../build make test cd .. diff --git a/results/README.md b/results/README.md deleted file mode 100644 index 7e6ba77..0000000 --- a/results/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Test machine ------------- - -4 Intel i7-7700 cores (@3.6 GHz); 64 GB of RAM DDR3 (@2.133 GHz); running Linux 4.4.0 (64 bits); 32K for both instruction and data L1 cache; 256K for L2 cache; 8192K for L3 cache. - -Compiler --------- - -gcc 7.4.0 - -`cmake .. -DCMAKE_BUILD_TYPE=Release -DUSE_SANITIZERS=OFF -DUSE_INTRINSICS=ON -DUSE_PDEP=ON` - - -Experiments ------------ - -- The file `space.md` reports the space breakdowns. -- The file `prefix_topk.md` reports the timing breakdowns for the prefix_topk step by varying the number of query terms. -- The file `conjunctive_topk.md` reports the timing breakdowns for the conjunctive_topk step by varying the number of query terms. -- The file `topk.md` reports the total time of the `topk` operation (combining the two steps, `prefix_topk` and `conjunctive_topk`) by varying the number of query terms. -- The file `fc_dictionary.md` reports on the `fc_dictionary` benchmark. -- The file `integer_fc_dictionary.md` reports on the `integer_fc_dictionary` benchmark. \ No newline at end of file diff --git a/results/conjunctive_topk.md b/results/conjunctive_topk.md deleted file mode 100644 index 3d9747b..0000000 --- a/results/conjunctive_topk.md +++ /dev/null @@ -1,107 +0,0 @@ -Conjunctive top-k ------------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. - -From the last token of the query, we only retain the first character. 
This means that we spend less in obtaining the lexicographic range of the character (string comparisons are -very fast), but we spend more on the RMQ phase, because the -range obtained from the completion trie can be very large. - -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "3", "conjunctive_search_ns_per_query": "2896", "reporting_ns_per_query": "352"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "52", "dictionary_search_ns_per_query": "10", "conjunctive_search_ns_per_query": "2273", "reporting_ns_per_query": "2333"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "919", "dictionary_search_ns_per_query": "39", "conjunctive_search_ns_per_query": "20478", "reporting_ns_per_query": "1772"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1298", "dictionary_search_ns_per_query": "49", "conjunctive_search_ns_per_query": "27363", "reporting_ns_per_query": "974"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1857", "dictionary_search_ns_per_query": "42", "conjunctive_search_ns_per_query": "25484", "reporting_ns_per_query": "556"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2239", "dictionary_search_ns_per_query": "34", "conjunctive_search_ns_per_query": "22070", "reporting_ns_per_query": "438"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2871", "dictionary_search_ns_per_query": "32", "conjunctive_search_ns_per_query": "18657", "reporting_ns_per_query": "465"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3774", "dictionary_search_ns_per_query": "30", "conjunctive_search_ns_per_query": "13967", "reporting_ns_per_query": "844"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4463"} - {"num_terms_per_query": "2", 
"num_queries": "50000", "ns_per_query": "6677"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "25503"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "31536"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "29973"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "27148"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "23630"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "20511"} - -If we do not check the forward index (thus erronously reporting the first k docids of the intersection), we have: - - {"num_terms_per_query": "3", "num_queries": "50000", "conjunctive_search_ns_per_query": "10362"} - {"num_terms_per_query": "4", "num_queries": "50000", "conjunctive_search_ns_per_query": "21327"} - {"num_terms_per_query": "5", "num_queries": "50000", "conjunctive_search_ns_per_query": "23187"} - {"num_terms_per_query": "6", "num_queries": "50000", "conjunctive_search_ns_per_query": "21259"} - {"num_terms_per_query": "7", "num_queries": "50000", "conjunctive_search_ns_per_query": "18234"} - {"num_terms_per_query": "8+", "num_queries": "50000", "conjunctive_search_ns_per_query": "13912"} - -We can see that the time for the `conjunctive_search` remains the same, except for the case with 3 terms. -This suggests that the time needed to check the forward index is negligible compared to the one -needed to produce the intersection. This can also be observed considering that the time for the case with 2 terms is very small: in this case we check the forward index for each doc in the inverted list of the first term. 
- -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3275", "reporting_ns_per_query": "330"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "109", "dictionary_search_ns_per_query": "36", "conjunctive_search_ns_per_query": "15770", "reporting_ns_per_query": "2485"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "932", "dictionary_search_ns_per_query": "52", "conjunctive_search_ns_per_query": "24290", "reporting_ns_per_query": "1780"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1388", "dictionary_search_ns_per_query": "55", "conjunctive_search_ns_per_query": "29056", "reporting_ns_per_query": "953"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1880", "dictionary_search_ns_per_query": "41", "conjunctive_search_ns_per_query": "26675", "reporting_ns_per_query": "541"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2277", "dictionary_search_ns_per_query": "43", "conjunctive_search_ns_per_query": "22955", "reporting_ns_per_query": "421"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2762", "dictionary_search_ns_per_query": "37", "conjunctive_search_ns_per_query": "19437", "reporting_ns_per_query": "443"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3878", "dictionary_search_ns_per_query": "40", "conjunctive_search_ns_per_query": "14657", "reporting_ns_per_query": "814"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "4917"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "20361"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "28619"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "33140"} - {"num_terms_per_query": 
"5", "num_queries": "50000", "ns_per_query": "30410"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "27477"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "24357"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "21042"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3021", "reporting_ns_per_query": "576"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "39", "dictionary_search_ns_per_query": "7", "conjunctive_search_ns_per_query": "2279", "reporting_ns_per_query": "1926"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "810", "dictionary_search_ns_per_query": "15", "conjunctive_search_ns_per_query": "12382", "reporting_ns_per_query": "1078"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1104", "dictionary_search_ns_per_query": "15", "conjunctive_search_ns_per_query": "13534", "reporting_ns_per_query": "526"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1737", "dictionary_search_ns_per_query": "11", "conjunctive_search_ns_per_query": "11424", "reporting_ns_per_query": "305"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2049", "dictionary_search_ns_per_query": "10", "conjunctive_search_ns_per_query": "9565", "reporting_ns_per_query": "252"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2396", "dictionary_search_ns_per_query": "9", "conjunctive_search_ns_per_query": "8020", "reporting_ns_per_query": "324"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3431", "dictionary_search_ns_per_query": "9", "conjunctive_search_ns_per_query": "6199", "reporting_ns_per_query": "738"} - - {"num_terms_per_query": "1", "num_queries": 
"50000", "ns_per_query": "4982"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6176"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "16236"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "17306"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "15591"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "13961"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "12980"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "12311"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "dictionary_search_ns_per_query": "6", "conjunctive_search_ns_per_query": "3722", "reporting_ns_per_query": "511"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "56", "dictionary_search_ns_per_query": "20", "conjunctive_search_ns_per_query": "15134", "reporting_ns_per_query": "2043"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "835", "dictionary_search_ns_per_query": "20", "conjunctive_search_ns_per_query": "15310", "reporting_ns_per_query": "1072"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1117", "dictionary_search_ns_per_query": "19", "conjunctive_search_ns_per_query": "14672", "reporting_ns_per_query": "517"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1704", "dictionary_search_ns_per_query": "14", "conjunctive_search_ns_per_query": "12384", "reporting_ns_per_query": "300"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "2164", "dictionary_search_ns_per_query": "13", "conjunctive_search_ns_per_query": "10222", "reporting_ns_per_query": "246"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2567", "dictionary_search_ns_per_query": "12", "conjunctive_search_ns_per_query": 
"8579", "reporting_ns_per_query": "305"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3670", "dictionary_search_ns_per_query": "12", "conjunctive_search_ns_per_query": "6644", "reporting_ns_per_query": "714"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5667"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "19144"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "18886"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "18109"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "16030"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "14423"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "13418"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "12779"} \ No newline at end of file diff --git a/results/fc_dictionary.md b/results/fc_dictionary.md deleted file mode 100644 index 37ff080..0000000 --- a/results/fc_dictionary.md +++ /dev/null @@ -1,75 +0,0 @@ -#### Results on the AOL querylog. - - pibiri@rubino:~/autocomplete/build$ ./benchmark_fc_dictionary ../test_data/aol/aol.completions 1000000 < ../test_data/aol/aol.completions.dict_queries.1M.shuffled - 2019-10-24 11:11:49: loading queries... - 2019-10-24 11:11:49: loaded 1000000 queries - 2019-10-24 11:11:49: building fc_dictionary with bucket size 4... - 2019-10-24 11:11:50: DONE - using 42938890 bytes - locate: 557.091 [ns/string] - extract: 168.772 [ns/string] - locate_prefix-0%: 213.453 [ns/string] - locate_prefix-25%: 794.612 [ns/string] - locate_prefix-50%: 1064.44 [ns/string] - locate_prefix-75%: 912.04 [ns/string] - locate_prefix-100%: 702.745 [ns/string] - 2019-10-24 11:12:12: building fc_dictionary with bucket size 8... 
- 2019-10-24 11:12:12: DONE - using 38111527 bytes - locate: 511.503 [ns/string] - extract: 152.331 [ns/string] - locate_prefix-0%: 223.374 [ns/string] - locate_prefix-25%: 686.093 [ns/string] - locate_prefix-50%: 873.161 [ns/string] - locate_prefix-75%: 758.029 [ns/string] - locate_prefix-100%: 638.576 [ns/string] - 2019-10-24 11:12:32: building fc_dictionary with bucket size 16... - 2019-10-24 11:12:32: DONE - using 35270205 bytes - locate: 478.592 [ns/string] - extract: 139.109 [ns/string] - locate_prefix-0%: 228.416 [ns/string] - locate_prefix-25%: 662.483 [ns/string] - locate_prefix-50%: 769.227 [ns/string] - locate_prefix-75%: 685.358 [ns/string] - locate_prefix-100%: 615.757 [ns/string] - 2019-10-24 11:12:51: building fc_dictionary with bucket size 32... - 2019-10-24 11:12:51: DONE - using 33722303 bytes - locate: 484.72 [ns/string] - extract: 150.21 [ns/string] - locate_prefix-0%: 273.595 [ns/string] - locate_prefix-25%: 717.559 [ns/string] - locate_prefix-50%: 790.342 [ns/string] - locate_prefix-75%: 728.409 [ns/string] - locate_prefix-100%: 681.921 [ns/string] - 2019-10-24 11:13:11: building fc_dictionary with bucket size 64... - 2019-10-24 11:13:11: DONE - using 32910194 bytes - locate: 585.835 [ns/string] - extract: 194.183 [ns/string] - locate_prefix-0%: 667.159 [ns/string] - locate_prefix-25%: 962.096 [ns/string] - locate_prefix-50%: 1056.04 [ns/string] - locate_prefix-75%: 1014.63 [ns/string] - locate_prefix-100%: 978.718 [ns/string] - 2019-10-24 11:13:39: building fc_dictionary with bucket size 128... - 2019-10-24 11:13:39: DONE - using 32496375 bytes - locate: 810.282 [ns/string] - extract: 286.967 [ns/string] - locate_prefix-0%: 574.352 [ns/string] - locate_prefix-25%: 1248.92 [ns/string] - locate_prefix-50%: 1435.28 [ns/string] - locate_prefix-75%: 1419.18 [ns/string] - locate_prefix-100%: 1398.48 [ns/string] - 2019-10-24 11:14:16: building fc_dictionary with bucket size 256... 
- 2019-10-24 11:14:16: DONE - using 32286042 bytes - locate: 1281.09 [ns/string] - extract: 470.922 [ns/string] - locate_prefix-0%: 1065.07 [ns/string] - locate_prefix-25%: 2099.35 [ns/string] - locate_prefix-50%: 2387.39 [ns/string] - locate_prefix-75%: 2407.04 [ns/string] - locate_prefix-100%: 2403.04 [ns/string] \ No newline at end of file diff --git a/results/integer_fc_dictionary.md b/results/integer_fc_dictionary.md deleted file mode 100644 index 955afe0..0000000 --- a/results/integer_fc_dictionary.md +++ /dev/null @@ -1,31 +0,0 @@ -#### Results on the AOL querylog. - - pibiri@rubino:~/autocomplete/build$ ./benchmark_integer_fc_dictionary ../test_data/aol/aol.completions 1000000 - 2019-10-14 15:28:12: building integer_fc_dictionary with bucket size 4... - 2019-10-14 15:28:14: DONE - using 129855836 bytes - extract: 102.787 [ns/string] - 2019-10-14 15:28:15: building integer_fc_dictionary with bucket size 8... - 2019-10-14 15:28:18: DONE - using 112779868 bytes - extract: 98.9981 [ns/string] - 2019-10-14 15:28:19: building integer_fc_dictionary with bucket size 16... - 2019-10-14 15:28:21: DONE - using 102740006 bytes - extract: 103.745 [ns/string] - 2019-10-14 15:28:22: building integer_fc_dictionary with bucket size 32... - 2019-10-14 15:28:24: DONE - using 97266766 bytes - extract: 136.042 [ns/string] - 2019-10-14 15:28:26: building integer_fc_dictionary with bucket size 64... - 2019-10-14 15:28:28: DONE - using 94397632 bytes - extract: 207.699 [ns/string] - 2019-10-14 15:28:30: building integer_fc_dictionary with bucket size 128... - 2019-10-14 15:28:32: DONE - using 92933198 bytes - extract: 354.622 [ns/string] - 2019-10-14 15:28:36: building integer_fc_dictionary with bucket size 256... 
- 2019-10-14 15:28:38: DONE - using 92192244 bytes - extract: 651.357 [ns/string] \ No newline at end of file diff --git a/results/inverted_index_space.md b/results/inverted_index_space.md deleted file mode 100644 index f3acd81..0000000 --- a/results/inverted_index_space.md +++ /dev/null @@ -1,19 +0,0 @@ -Inverted index compression ----- - -#### AOL - - EF -- 17.1495 bits per element - PEF uniform -- 16.5788 bits per element - PEF opt -- 15.0967 bits per element - PFOR -- 15.2661 bits per element - BIC -- 14.1396 bits per element - Simple9 -- 21.8895 bits per element - Simple16 -- 21.7385 bits per element - VByte -- 20.9531 bits per element - Varint -- 21.996 bits per element - Gamma -- 23.6305 bits per element - Delta -- 19.2088 bits per element - Rice -- 19.4145 bits per element - DINT single -- 15.4204 bits per element - DINT multi -- 15.084 bits per element \ No newline at end of file diff --git a/results/prefix_topk.md b/results/prefix_topk.md deleted file mode 100644 index 6404bc4..0000000 --- a/results/prefix_topk.md +++ /dev/null @@ -1,94 +0,0 @@ -Prefix top-k ------------- - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. - -From the last token of the query, we only retain the first character. This means that we spend less in obtaining the lexicographic range of the character (string comparisons are -very fast), but we spend more on the RMQ phase, because the -range obtained from the completion trie can be very large. 
- -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "279", "topk_rmq_ns_per_query": "2887", "reporting_ns_per_query": "317"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "47", "completions_search_ns_per_query": "853", "topk_rmq_ns_per_query": "576", "reporting_ns_per_query": "1851"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "706", "completions_search_ns_per_query": "945", "topk_rmq_ns_per_query": "95", "reporting_ns_per_query": "717"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1055", "completions_search_ns_per_query": "1057", "topk_rmq_ns_per_query": "22", "reporting_ns_per_query": "332"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1495", "completions_search_ns_per_query": "1215", "topk_rmq_ns_per_query": "9", "reporting_ns_per_query": "325"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1957", "completions_search_ns_per_query": "1434", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "425"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2410", "completions_search_ns_per_query": "1581", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "611"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3360", "completions_search_ns_per_query": "1888", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "913"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5027"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4974"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3984"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4137"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4660"} - {"num_terms_per_query": 
"6", "num_queries": "50000", "ns_per_query": "5335"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5785"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "7394"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "493", "topk_rmq_ns_per_query": "3072", "reporting_ns_per_query": "628"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "60", "completions_search_ns_per_query": "1078", "topk_rmq_ns_per_query": "589", "reporting_ns_per_query": "1897"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "675", "completions_search_ns_per_query": "1053", "topk_rmq_ns_per_query": "96", "reporting_ns_per_query": "730"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1047", "completions_search_ns_per_query": "1081", "topk_rmq_ns_per_query": "21", "reporting_ns_per_query": "320"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1367", "completions_search_ns_per_query": "1112", "topk_rmq_ns_per_query": "8", "reporting_ns_per_query": "244"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1886", "completions_search_ns_per_query": "1139", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "300"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2242", "completions_search_ns_per_query": "1166", "topk_rmq_ns_per_query": "3", "reporting_ns_per_query": "455"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3229", "completions_search_ns_per_query": "1205", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "809"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5768"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "5625"} - {"num_terms_per_query": "3", "num_queries": "50000", 
"ns_per_query": "4389"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4421"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4830"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5336"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5963"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "7104"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "403", "topk_rmq_ns_per_query": "3211", "reporting_ns_per_query": "509"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "33", "completions_search_ns_per_query": "784", "topk_rmq_ns_per_query": "312", "reporting_ns_per_query": "1287"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "596", "completions_search_ns_per_query": "906", "topk_rmq_ns_per_query": "49", "reporting_ns_per_query": "423"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1026", "completions_search_ns_per_query": "1015", "topk_rmq_ns_per_query": "11", "reporting_ns_per_query": "206"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1434", "completions_search_ns_per_query": "1114", "topk_rmq_ns_per_query": "5", "reporting_ns_per_query": "217"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1938", "completions_search_ns_per_query": "1273", "topk_rmq_ns_per_query": "2", "reporting_ns_per_query": "330"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2362", "completions_search_ns_per_query": "1437", "topk_rmq_ns_per_query": "0", "reporting_ns_per_query": "545"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3186", "completions_search_ns_per_query": "1737", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "873"} - - 
{"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5804"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4006"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3456"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "3873"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4587"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5030"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5617"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "6957"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "parsing_ns_per_query": "0", "completions_search_ns_per_query": "697", "topk_rmq_ns_per_query": "3495", "reporting_ns_per_query": "1114"} - {"num_terms_per_query": "2", "num_queries": "50000", "parsing_ns_per_query": "32", "completions_search_ns_per_query": "1038", "topk_rmq_ns_per_query": "321", "reporting_ns_per_query": "1384"} - {"num_terms_per_query": "3", "num_queries": "50000", "parsing_ns_per_query": "547", "completions_search_ns_per_query": "1029", "topk_rmq_ns_per_query": "51", "reporting_ns_per_query": "455"} - {"num_terms_per_query": "4", "num_queries": "50000", "parsing_ns_per_query": "1012", "completions_search_ns_per_query": "1038", "topk_rmq_ns_per_query": "11", "reporting_ns_per_query": "210"} - {"num_terms_per_query": "5", "num_queries": "50000", "parsing_ns_per_query": "1318", "completions_search_ns_per_query": "1066", "topk_rmq_ns_per_query": "5", "reporting_ns_per_query": "172"} - {"num_terms_per_query": "6", "num_queries": "50000", "parsing_ns_per_query": "1922", "completions_search_ns_per_query": "1077", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": "242"} - {"num_terms_per_query": "7", "num_queries": "50000", "parsing_ns_per_query": "2213", "completions_search_ns_per_query": "1099", "topk_rmq_ns_per_query": "1", "reporting_ns_per_query": 
"425"} - {"num_terms_per_query": "8+", "num_queries": "50000", "parsing_ns_per_query": "3228", "completions_search_ns_per_query": "1124", "topk_rmq_ns_per_query": "0", "reporting_ns_per_query": "799"} - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "6772"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "4646"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "3831"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "4108"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "4594"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "5080"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "5621"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "6775"} \ No newline at end of file diff --git a/results/space.md b/results/space.md deleted file mode 100644 index 64ac1a2..0000000 --- a/results/space.md +++ /dev/null @@ -1,159 +0,0 @@ -AOL 2006 query log ------------------- - -10,142,395 distinct queries, whose ids have been assigned -in decreasing frequency order (ties broken lexicographically). 
- -#### Solution 1 - - using 1.05555 [GiB] - completions: 0.520278 [GiB] (49.2899%) - unsorted docs list: 0.0409812 [GiB] (3.88246%) - unsorted minimal docs list: 0.0154568 [GiB] (1.46434%) - dictionary: 0.0328479 [GiB] (3.11194%) - inverted index: 0.144273 [GiB] (13.6681%) - data: 33.0401 [bpi] - pointers: 8.13526 [bpi] - forward index: 0.30171 [GiB] (28.5833%) - data: 42.6801 [bpi] - pointers: 42.8379 [bpi] - - - + Elias-Fano - using 0.370675 [GiB] - completions: 0.0867222 [GiB] (23.3958%) - unsorted docs list: 0.0409812 [GiB] (11.0558%) - unsorted minimal docs list: 0.0154568 [GiB] (4.1699%) - dictionary: 0.0328479 [GiB] (8.86166%) - inverted index: 0.0595939 [GiB] (16.0771%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.135073 [GiB] (36.4397%) - data: 32.866 [bpi] - pointers: 5.41964 [bpi] - - + Elias-Fano and compact_forward_index - using 0.318008 [GiB] - completions: 0.0867222 [GiB] (27.2704%) - unsorted docs list: 0.0409812 [GiB] (12.8868%) - unsorted minimal docs list: 0.0154568 [GiB] (4.86049%) - dictionary: 0.0328479 [GiB] (10.3293%) - inverted index: 0.0595939 [GiB] (18.7397%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.0824065 [GiB] (25.9133%) - data: 22 [bpi] - pointers: 1.35762 [bpi] - - + Elias-Fano and delta_forward_index - using 0.350595 [GiB] - completions: 0.086722 [GiB] (24.7356%) - unsorted docs list: 0.0409812 [GiB] (11.689%) - unsorted minimal docs list: 0.0154568 [GiB] (4.40872%) - dictionary: 0.0328479 [GiB] (9.36919%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (16.9979%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.114994 [GiB] (32.7995%) - data: 29.6008 [bpi] - pointers: 2.99348 [bpi] - - + Elias-Fano + compact_forward_index + compact_unsorted_lists - using 0.304999 [GiB] - completions: 0.086722 [GiB] (28.4335%) - unsorted docs list: 0.0315353 [GiB] (10.3395%) - unsorted minimal docs list: 0.0118937 [GiB] (3.89958%) - dictionary: 
0.0328479 [GiB] (10.7698%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (19.539%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - forward index: 0.0824065 [GiB] (27.0186%) - data: 22 [bpi] - pointers: 1.35762 [bpi] - -#### Solution 2 - - using 0.377843 [GiB] - completions: 0.0956838 [GiB] (25.3237%) - unsorted docs list: 0.0409812 [GiB] (10.8461%) - unsorted minimal docs list: 0.0154568 [GiB] (4.09079%) - dictionary: 0.0330574 [GiB] (8.74898%) - inverted index: 0.154881 [GiB] (40.9907%) - map from docid to lexid: 0.0377834 [GiB] (9.99975%) - - - + Elias-Fano - using 0.259893 [GiB] - completions: 0.0956841 [GiB] (36.8168%) - data: 73.5086 [bps] - pointers: 7.52944 [bps] - unsorted docs list: 0.0315353 [GiB] (12.134%) - unsorted minimal docs list: 0.0118937 [GiB] (4.57639%) - dictionary: 0.0328479 [GiB] (12.639%) - data: 69.9866 [bps] - pointers: 3.76476 [bps] - inverted index: 0.0595939 [GiB] (22.9302%) - data: 15.7999 [bpi] - pointers: 1.20819 [bpi] - map from docid to lexid: 0.0283376 [GiB] (10.9036%) - - -MSN 2006 query log ------------------- - -7,083,363 distinct queries, whose ids have been assigned -in decreasing frequency order (ties broken lexicographically). 
- -#### Solution 1 - - using 0.769592 [GiB] - completion trie: 0.370163 [GiB] (48.0986%) - unsorted docs list: 0.0286179 [GiB] (3.71858%) - unsorted minimal docs list: 0.0104689 [GiB] (1.36031%) - dictionary: 0.0220881 [GiB] (2.87011%) - inverted index: 0.107578 [GiB] (13.9785%) - forward index: 0.230677 [GiB] (29.9739%) - - + compression - using 0.213269 [GiB] - completions: 0.0617906 [GiB] (28.973%) - unsorted docs list: 0.0211964 [GiB] (9.9388%) - unsorted minimal docs list: 0.00775427 [GiB] (3.6359%) - dictionary: 0.0219463 [GiB] (10.2904%) - data: 68.9954 [bps] - pointers: 3.7648 [bps] - inverted index: 0.0429281 [GiB] (20.1286%) - data: 16.2938 [bpi] - pointers: 1.1785 [bpi] - forward index: 0.0576538 [GiB] (27.0333%) - data: 22 [bpi] - pointers: 1.35605 [bpi] - -#### Solution 2 - - using 0.263256 [GiB] - completions: 0.0681158 [GiB] (25.8744%) - unsorted docs list: 0.0286179 [GiB] (10.8708%) - unsorted minimal docs list: 0.0104689 [GiB] (3.97669%) - dictionary: 0.0220881 [GiB] (8.39036%) - inverted index: 0.107578 [GiB] (40.8643%) - map from docid to lexid: 0.0263876 [GiB] (10.0236%) - - + compression - using 0.180907 [GiB] - completions: 0.0681161 [GiB] (37.6525%) - data: 75.0743 [bps] - pointers: 7.52946 [bps] - unsorted docs list: 0.0211964 [GiB] (11.7167%) - unsorted minimal docs list: 0.00775427 [GiB] (4.28633%) - dictionary: 0.0219463 [GiB] (12.1312%) - data: 68.9954 [bps] - pointers: 3.7648 [bps] - inverted index: 0.0429281 [GiB] (23.7293%) - data: 16.2938 [bpi] - pointers: 1.1785 [bpi] - map from docid to lexid: 0.0189661 [GiB] (10.4839%) \ No newline at end of file diff --git a/results/topk.md b/results/topk.md deleted file mode 100644 index b101b43..0000000 --- a/results/topk.md +++ /dev/null @@ -1,201 +0,0 @@ -Top-k ------------------ - -Executing queries shuffled at random, for k = 7. - -Average among 10 runs. 
- -### AOL - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5062"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6725"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "24960"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "32761"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "31450"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "28812"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "25978"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "22785"} - - + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5614"} - {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "9767"} - {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "26999"} - {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "35428"} - {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "36073"} - {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "31718"} - {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "29992"} - {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "27313"} - - + Elias-Fano and forward_index2 - {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5336"} - {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "7573"} - {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "26278"} - {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "35664"} - {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "35189"} - {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "32033"} - {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "29950"} - {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": 
"27332"} - -#### Solution 2 - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5812"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "12703"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "27307"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "33476"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "31403"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "28718"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "25728"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "22419"} - - + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "10000", "ns_per_query": "5609"} - {"num_terms_per_query": "2", "num_queries": "10000", "ns_per_query": "10894"} - {"num_terms_per_query": "3", "num_queries": "10000", "ns_per_query": "27311"} - {"num_terms_per_query": "4", "num_queries": "10000", "ns_per_query": "34780"} - {"num_terms_per_query": "5", "num_queries": "10000", "ns_per_query": "33849"} - {"num_terms_per_query": "6", "num_queries": "10000", "ns_per_query": "30319"} - {"num_terms_per_query": "7", "num_queries": "10000", "ns_per_query": "28181"} - {"num_terms_per_query": "8+", "num_queries": "10000", "ns_per_query": "24757"} - -#### Solution 3 - - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "5899"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "12282007"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "18393403"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "15212918"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "11852012"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "7781194"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "7939661"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "6980226"} - 
- + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "6024"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "20553345"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "32495295"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "30929833"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "27103519"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "19912460"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "20956205"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "19643570"} - -#### Solution 4 - - c = 0.005 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6593"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "756944"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2188766"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "1920720"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2398355"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "1711205"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2195672"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2115028"} - - c = 0.01 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6610"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "739838"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2147339"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "1988980"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2440435"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "1858965"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2304761"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2254481"} - - c = 
0.01, + Elias-Fano - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "5879"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "1754176"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "3435481"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "4442784"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "4946228"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "4818169"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "5157776"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "5431935"} - - c = 0.025 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6528"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "828082"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "2422803"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "2482018"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "2970064"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "2542134"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "2972710"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "2924603"} - - c = 0.05 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6508"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1059938"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "3046716"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "3528723"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "4037290"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "3850329"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "4371489"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "4648349"} - - c = 0.1 - 
{"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6584"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1600869"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "4501125"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "5562030"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "6634491"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "6768321"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "7124462"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "7733525"} - - c = 0.2 - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "6589"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "2831409"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "7641806"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "9881857"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "11138148"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "11643908"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "11966417"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "12460833"} - -### MSN - -#### Solution 1 - - {"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "5823"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "6251"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "16502"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "18380"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "17044"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "15622"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "14709"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "14323"} - -#### Solution 2 - - 
{"num_terms_per_query": "1", "num_queries": "50000", "ns_per_query": "6837"} - {"num_terms_per_query": "2", "num_queries": "50000", "ns_per_query": "14469"} - {"num_terms_per_query": "3", "num_queries": "50000", "ns_per_query": "18670"} - {"num_terms_per_query": "4", "num_queries": "50000", "ns_per_query": "19144"} - {"num_terms_per_query": "5", "num_queries": "50000", "ns_per_query": "17109"} - {"num_terms_per_query": "6", "num_queries": "50000", "ns_per_query": "15738"} - {"num_terms_per_query": "7", "num_queries": "50000", "ns_per_query": "14810"} - {"num_terms_per_query": "8+", "num_queries": "50000", "ns_per_query": "14260"} - - -#### Solution 3 - - - {"num_terms_per_query": "1", "num_queries": "1000", "ns_per_query": "6666"} - {"num_terms_per_query": "2", "num_queries": "1000", "ns_per_query": "6635754"} - {"num_terms_per_query": "3", "num_queries": "1000", "ns_per_query": "8612266"} - {"num_terms_per_query": "4", "num_queries": "1000", "ns_per_query": "5290905"} - {"num_terms_per_query": "5", "num_queries": "1000", "ns_per_query": "3939319"} - {"num_terms_per_query": "6", "num_queries": "1000", "ns_per_query": "3035556"} - {"num_terms_per_query": "7", "num_queries": "1000", "ns_per_query": "3106875"} - {"num_terms_per_query": "8+", "num_queries": "1000", "ns_per_query": "3089917"} - -#### Solution 4 with c = 0.1 - - {"num_terms_per_query": "1", "num_queries": "100", "ns_per_query": "7496"} - {"num_terms_per_query": "2", "num_queries": "100", "ns_per_query": "1280652"} - {"num_terms_per_query": "3", "num_queries": "100", "ns_per_query": "3181191"} - {"num_terms_per_query": "4", "num_queries": "100", "ns_per_query": "3722226"} - {"num_terms_per_query": "5", "num_queries": "100", "ns_per_query": "4056810"} - {"num_terms_per_query": "6", "num_queries": "100", "ns_per_query": "4130288"} - {"num_terms_per_query": "7", "num_queries": "100", "ns_per_query": "4282750"} - {"num_terms_per_query": "8+", "num_queries": "100", "ns_per_query": "4205507"} \ No newline at 
end of file diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index baeeb85..b474d7a 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -3,22 +3,22 @@ index_type = sys.argv[1] query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk index_filename = sys.argv[3] -dataset_name = sys.argv[4] +dataset_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered k = sys.argv[5] num_queries = sys.argv[6] -output_filename = dataset_name + "." + index_type +output_filename = dataset_basename + "." + index_type breakdown = "" if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": breakdown = "--breakdown" output_filename += ".breakdown" -output_filename += "." + query_mode + ".timings.json" +output_filename += "." + query_mode + ".json" +query_filename_prefix = dataset_basename + ".queries/queries." percentages = ["0.0", "0.25", "0.50", "0.75"] - for perc in percentages: for terms in range(2,8): # (1,8) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type 
+ " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/src/output_ds2i_format.cpp b/src/output_ds2i_format.cpp index cc139c4..eb92509 100644 --- a/src/output_ds2i_format.cpp +++ b/src/output_ds2i_format.cpp @@ -27,7 +27,7 @@ int main(int argc, char** argv) { { // write ds2i header uint32_t n = 1; - uint32_t universe = params.num_completions; + uint32_t universe = params.universe; docs.write(reinterpret_cast(&n), sizeof(uint32_t)); docs.write(reinterpret_cast(&universe), sizeof(uint32_t)); } diff --git a/test/test_blocked_inverted_index.cpp b/test/test_blocked_inverted_index.cpp index 80a9bc1..a2ede74 100644 --- a/test/test_blocked_inverted_index.cpp +++ b/test/test_blocked_inverted_index.cpp @@ -15,7 +15,7 @@ TEST_CASE("test blocked_inverted_index::intersection_iterator") { { inverted_index_type::builder ii_builder(params); ii_builder.build(ii); - REQUIRE(ii.num_docs() == params.num_completions); + REQUIRE(ii.num_docs() == params.universe); REQUIRE(ii.num_terms() == params.num_terms); } @@ -37,7 +37,7 @@ TEST_CASE("test blocked_inverted_index::intersection_iterator") { blocked_ii_builder.build(blocked_ii); } - REQUIRE(blocked_ii.num_docs() == params.num_completions); + REQUIRE(blocked_ii.num_docs() == params.universe); REQUIRE(blocked_ii.num_terms() == params.num_terms); for (auto& q : queries) { diff --git a/test/test_compact_forward_index.cpp b/test/test_compact_forward_index.cpp index aa09403..dc78c07 100644 --- a/test/test_compact_forward_index.cpp +++ b/test/test_compact_forward_index.cpp @@ -12,7 +12,7 @@ TEST_CASE("test compact_forward_index::iterator") { compact_forward_index::builder builder(params); compact_forward_index index; builder.build(index); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); essentials::save(index, 
output_filename); } @@ -20,7 +20,7 @@ TEST_CASE("test compact_forward_index::iterator") { { compact_forward_index index; essentials::load(index, output_filename); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); std::ifstream input((params.collection_basename + ".forward").c_str(), diff --git a/test/test_inverted_index.cpp b/test/test_inverted_index.cpp index b96b708..5faa823 100644 --- a/test/test_inverted_index.cpp +++ b/test/test_inverted_index.cpp @@ -14,7 +14,7 @@ TEST_CASE("test inverted_index::iterator") { inverted_index_type::builder builder(params); inverted_index_type index; builder.build(index); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); essentials::save(index, output_filename); } @@ -22,7 +22,7 @@ TEST_CASE("test inverted_index::iterator") { { inverted_index_type index; essentials::load(index, output_filename); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); std::ifstream input((params.collection_basename + ".inverted").c_str(), @@ -58,7 +58,7 @@ TEST_CASE("test inverted_index::intersection_iterator") { inverted_index_type::builder builder(params); inverted_index_type index; builder.build(index); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); essentials::save(index, output_filename); } @@ -66,7 +66,7 @@ TEST_CASE("test inverted_index::intersection_iterator") { { inverted_index_type index; essentials::load(index, output_filename); - REQUIRE(index.num_docs() == params.num_completions); + REQUIRE(index.num_docs() == params.universe); REQUIRE(index.num_terms() == params.num_terms); static const uint32_t num_queries = 1000000; diff --git 
a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index 7924899..ae99a6b 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -82,9 +82,9 @@ TEST_CASE("test locate_prefix()") { << num_terms << std::endl; { queries.clear(); - std::string filename = params.collection_basename + - ".length=" + std::to_string(num_terms) + - ".shuffled"; + std::string filename = + params.collection_basename + + ".queries/queries.length=" + std::to_string(num_terms); std::ifstream querylog(filename.c_str()); if (!querylog.is_open()) { std::cerr << "cannot open file '" << filename << "'" @@ -95,7 +95,7 @@ TEST_CASE("test locate_prefix()") { querylog.close(); } - // test_locate_prefix(dict, ct_index, queries, strings); + test_locate_prefix(dict, ct_index, queries, strings); test_locate_prefix(dict, fc_index, queries, strings); } } diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 8e791bb..8b1ce0f 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -62,15 +62,17 @@ TEST_CASE("test unsorted_list on doc_ids") { } input.close(); - { - // must have all ids from 0 to doc_ids.size() - 1 - std::vector tmp = doc_ids; - std::sort(tmp.begin(), tmp.end()); - for (id_type id = 0; id != doc_ids.size(); ++id) { - REQUIRE_MESSAGE(tmp[id] == id, - "Error: id " << id << " not found"); - } - } + // { + // // must have all ids from 0 to doc_ids.size() - 1 + // // NOTE: not true if we filter out some strings to be used as + // // queries + // std::vector tmp = doc_ids; + // std::sort(tmp.begin(), tmp.end()); + // for (id_type id = 0; id != doc_ids.size(); ++id) { + // REQUIRE_MESSAGE(tmp[id] == id, + // "Error: id " << id << " not found"); + // } + // } succinct_rmq list; list.build(doc_ids); diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index 0634d82..acf4b8e 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -19,10 +19,11 
@@ num_docs = 0 with open(input_filename + ".mapped.stats") as f: num_terms = int(f.readline()) - print num_terms - f.readline() # skip line containing max num. of query terms + print("terms: " + str(num_terms)) + f.readline() # skip line: max num. of query terms + f.readline() # skip line: num. of completions num_docs = int(f.readline()) - print num_docs + print("universe: " + str(num_docs)) inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned forward_index = [[] for i in range(num_docs)] diff --git a/test_data/build_stats.py b/test_data/build_stats.py index 5fdfdb7..8e60a39 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -8,10 +8,17 @@ output_file = open(input_filename + ".stats", 'a') prev = [] +universe = 0; with open(input_filename, 'r') as f: for line in f: x = line.rstrip('\n').split() + docid = int(x[0]) + + if docid > universe: + universe = docid + q = x[1:len(x)] + level_id = 0 while level_id < len(q) and level_id < len(prev) and q[level_id] == prev[level_id]: level_id += 1 @@ -31,7 +38,10 @@ # number of completions # number of levels in the trie # number of nodes for each level +print("universe: " + str(universe + 1)) +print("completions: " + str(lines)) output_file.write(str(lines) + "\n") +output_file.write(str(universe + 1) + "\n") output_file.write(str(len(nodes_per_level)) + "\n") for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") diff --git a/test_data/filter_and_preprocess.sh b/test_data/filter_and_preprocess.sh new file mode 100644 index 0000000..38425d7 --- /dev/null +++ b/test_data/filter_and_preprocess.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +echo $1 # input filename + +# number of completions to exclude per completion size, +# e.g., if it is 100, then at most 8 x 100 completions are filtered out +echo $2 + +python partition_queries_by_length.py $1 $1.filtered.queries $2 +python filter_dataset.py $1 $1.filtered.queries +python 
extract_dict.py $1.filtered +python map_dataset.py $1.filtered +python build_stats.py $1.filtered.mapped +python build_inverted_and_forward.py $1.filtered diff --git a/test_data/filter_dataset.py b/test_data/filter_dataset.py new file mode 100644 index 0000000..4481cbe --- /dev/null +++ b/test_data/filter_dataset.py @@ -0,0 +1,32 @@ +import sys +from sets import Set + +input_filename = sys.argv[1] +queries_directory = sys.argv[2] + +to_filter = Set({}) +print("loading strings to filter...") +for i in range(1,8): + with open(queries_directory + "/queries.length=" + str(i)) as f: + for line in f: + s = line.rstrip('\n') + to_filter.add(s) +with open(queries_directory + "/queries.length=8+") as f: + for line in f: + s = line.rstrip('\n') + to_filter.add(s) + +lines = 0 +print("filtering dataset...") + +output_file = open(input_filename + ".filtered", 'w') +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + string = ' '.join(x[1:len(x)]) + if string not in to_filter: + output_file.write(line) + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") +output_file.close() \ No newline at end of file diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index 7dfbed6..eb9b95d 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -1,12 +1,17 @@ -import sys, random +import sys, os, random input_filename = sys.argv[1] +output_directory = sys.argv[2] +n = int(sys.argv[3]) + +if not os.path.exists(output_directory): + os.makedirs(output_directory) num_shards = 7 -files = [open(input_filename + ".length=" + str(i) + ".shuffled", "w") for i in range(1,num_shards + 1)] -all_others = open(input_filename + ".length=" + str(num_shards + 1) + "+.shuffled", "w") +files = [open(output_directory + "/queries.length=" + str(i), "w") for i in range(1,num_shards + 1)] +all_others = open(output_directory + "/queries.length=" + 
str(num_shards + 1) + "+", "w") -strings = [[] for i in range(0, num_shards)] +strings = [[] for i in range(num_shards)] all_others_strings = [] lines = 0 @@ -23,13 +28,13 @@ if lines % 1000000 == 0: print("processed " + str(lines) + " lines") -for i in range(0, num_shards): +for i in range(num_shards): random.shuffle(strings[i]) - for s in strings[i]: - files[i].write(s) + for k in range(min(n, len(strings[i]))): + files[i].write(strings[i][k]) files[i].close() random.shuffle(all_others_strings) -for s in all_others_strings: - all_others.write(s) +for k in range(min(n, len(all_others_strings))): + all_others.write(all_others_strings[k]) all_others.close() diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh index 24c9488..e3d96f7 100755 --- a/test_data/preprocess.sh +++ b/test_data/preprocess.sh @@ -1,12 +1,9 @@ #!/bin/bash -collections=`find . | grep "\\.completions$"` - -for collection in $collections; do - echo $collection - python extract_dict.py $collection - python map_dataset.py $collection - python build_stats.py $collection.mapped - python build_inverted_and_forward.py $collection - python partition_queries_by_length.py $collection -done +echo $1 # input filename +echo $2 # number of queries for each size +python extract_dict.py $1 +python map_dataset.py $1 +python build_stats.py $1.mapped +python build_inverted_and_forward.py $1 +python partition_queries_by_length.py $1 $1.queries $2 From 78f27ed61b143ad1a927fd30a4f2c227726f61aa Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 28 Nov 2019 14:32:50 +0100 Subject: [PATCH 047/102] script to automate benchmarking of dictionaries --- benchmark/benchmark_fc_dictionary.cpp | 33 +++++++++---------- script/benchmark_dictionaries.sh | 7 ++++ ...te_prefix_results_by_varying_percentage.py | 11 +++---- .../collect_results_by_varying_percentage.py | 6 ++-- 4 files changed, 30 insertions(+), 27 deletions(-) create mode 100644 script/benchmark_dictionaries.sh diff --git 
a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index 1d94c8e..36882c9 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -41,12 +41,10 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "extract: " << (timer.average() * 1000.0) / ids.size() - << " [ns/string]" << std::endl; + std::cout << "extract: " << timer.average() / ids.size() + << " [musec/string]" << std::endl; static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; - // static std::vector percentages = {0.1, 0.2, 0.3, 0.4, 0.5, - // 0.6, 0.7, 0.8, 0.9, 1.0}; for (auto p : percentages) { timer.reset(); for (uint32_t i = 0; i != runs; ++i) { @@ -64,8 +62,8 @@ void perf_test(Dictionary const& dict, } std::cout << "\tlocate_prefix-" << p * 100.0 - << "%: " << (timer.average() * 1000.0) / queries.size() - << " [ns/string]" << std::endl; + << "%: " << timer.average() / queries.size() + << " [musec/string]" << std::endl; } } @@ -81,30 +79,29 @@ void perf_test(Dictionary const& dict, } int main(int argc, char** argv) { - int mandatory = 2 + 1; - if (argc < mandatory) { - std::cout << argv[0] << " < queries" - << std::endl; - return 1; - } + cmd_line_parser::parser parser(argc, argv); + parser.add("collection_basename", "Collection basename."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + if (!parser.parse()) return 1; parameters params; - params.collection_basename = argv[1]; + params.collection_basename = parser.get("collection_basename"); params.load(); - uint32_t num_queries = std::atoi(argv[2]); + auto max_num_queries = parser.get("max_num_queries"); essentials::logger("loading queries..."); std::vector queries; - queries.reserve(num_queries); + queries.reserve(max_num_queries); std::string query; query.reserve(2 * constants::MAX_NUM_CHARS_PER_QUERY); - for (uint32_t i = 0; i != num_queries; ++i) { + for (uint32_t i = 0; i != max_num_queries; ++i) { if 
(!std::getline(std::cin, query)) break; queries.push_back(std::move(query)); } - num_queries = queries.size(); - essentials::logger("loaded " + std::to_string(num_queries) + " queries"); + max_num_queries = queries.size(); + essentials::logger("loaded " + std::to_string(max_num_queries) + + " queries"); exe(4) exe(8) exe(16) exe(32) exe(64) exe(128) exe(256) return 0; } \ No newline at end of file diff --git a/script/benchmark_dictionaries.sh b/script/benchmark_dictionaries.sh new file mode 100644 index 0000000..88c0254 --- /dev/null +++ b/script/benchmark_dictionaries.sh @@ -0,0 +1,7 @@ +cd ../test_data +bash preprocess.sh aol/aol.completions 100000 +cd ../build +python ../script/collect_locate_prefix_results_by_varying_percentage.py fc ../test_data/aol/aol.completions 100000 +python ../script/collect_locate_prefix_results_by_varying_percentage.py trie ../test_data/aol/aol.completions 100000 +./benchmark_fc_dictionary ../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 +cd ../script \ No newline at end of file diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/script/collect_locate_prefix_results_by_varying_percentage.py index e9142d9..305fafa 100644 --- a/script/collect_locate_prefix_results_by_varying_percentage.py +++ b/script/collect_locate_prefix_results_by_varying_percentage.py @@ -2,14 +2,13 @@ type = sys.argv[1] # 'trie' or 'fc' collection_basename = sys.argv[2] -dataset_name = sys.argv[3] -num_queries = sys.argv[4] +num_queries = sys.argv[3] -output_filename = dataset_name + "." + type + ".locate_prefix.timings.json" +output_filename = collection_basename + "." + type + ".locate_prefix.json" +query_filename_prefix = collection_basename + ".queries/queries." 
percentages = ["0.0", "0.25", "0.50", "0.75"] - for perc in percentages: for terms in range(1,8): - os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=" + str(terms) + ".shuffled 2>> " + output_filename) - os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < ../test_data/" + dataset_name + "/" + dataset_name + ".completions.length=8+.shuffled 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_locate_prefix " + type + " " + collection_basename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index b474d7a..48a7dd1 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -3,11 +3,11 @@ index_type = sys.argv[1] query_mode = sys.argv[2] # topk, prefix_topk, conjunctive_topk index_filename = sys.argv[3] -dataset_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered +collection_basename = sys.argv[4] # e.g., aol/aol.completions or aol/aol.completions.filtered k = sys.argv[5] num_queries = sys.argv[6] -output_filename = dataset_basename + "." + index_type +output_filename = collection_basename + "." + index_type breakdown = "" if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": @@ -15,7 +15,7 @@ output_filename += ".breakdown" output_filename += "." + query_mode + ".json" -query_filename_prefix = dataset_basename + ".queries/queries." 
+query_filename_prefix = collection_basename + ".queries/queries." percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: From 4e37b944b75c9eb4d753bffc2566b60015259116 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 28 Nov 2019 14:50:44 +0100 Subject: [PATCH 048/102] script to automate benchmarking of dictionaries --- README.md | 5 +++++ benchmark/benchmark_fc_dictionary.cpp | 4 ++-- script/benchmark_dictionaries.sh | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f19bd7b..50e111f 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,11 @@ From within the `/build` directory, run You can also specify the option `--breakdown` to record timings breakdowns. +To benchmark the dictionaries (Front-Coding and trie), just run the following script from within +the `script` directory: + + bash benchmark_dictionaries.sh + Live demo ---------- diff --git a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index 36882c9..ce71f67 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -20,8 +20,8 @@ void perf_test(Dictionary const& dict, timer.stop(); } - std::cout << "locate: " << (timer.average() * 1000.0) / queries.size() - << " [ns/string]" << std::endl; + std::cout << "locate: " << timer.average() / queries.size() + << " [musec/string]" << std::endl; std::vector ids; ids.reserve(queries.size()); diff --git a/script/benchmark_dictionaries.sh b/script/benchmark_dictionaries.sh index 88c0254..29c9a84 100644 --- a/script/benchmark_dictionaries.sh +++ b/script/benchmark_dictionaries.sh @@ -3,5 +3,5 @@ bash preprocess.sh aol/aol.completions 100000 cd ../build python ../script/collect_locate_prefix_results_by_varying_percentage.py fc ../test_data/aol/aol.completions 100000 python ../script/collect_locate_prefix_results_by_varying_percentage.py trie ../test_data/aol/aol.completions 100000 -./benchmark_fc_dictionary 
../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 +./benchmark_fc_dictionary ../test_data/aol/aol.completions 100000 < ../test_data/aol/aol.completions.queries/queries.length=1 > ../test_data/aol/aol.completions.dictionary_benchmark.txt cd ../script \ No newline at end of file From e4fb185dcf131df1f1df788c533d9b9e52cc3291 Mon Sep 17 00:00:00 2001 From: Simon Gog Date: Tue, 10 Dec 2019 00:19:58 -0800 Subject: [PATCH 049/102] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 50e111f..ce69cb7 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ in preparing the data for indexing. Thus, from within the directory `test_data`, it is sufficient to do: - bash preprocess.sh 300 + bash preprocess.sh 300 The second argument in the example, i.e., 300, represents the number of completions (per completion size) that are drawn at From fae328f7cd6dbbf503b0368a302ea2000285016d Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 29 Feb 2020 19:52:54 +0100 Subject: [PATCH 050/102] effectiveness benchmark --- benchmark/CMakeLists.txt | 3 +- benchmark/effectiveness.cpp | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 benchmark/effectiveness.cpp diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index d7f9433..6275079 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -3,4 +3,5 @@ add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) add_executable(benchmark_integer_fc_dictionary benchmark_integer_fc_dictionary.cpp) -add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) \ No newline at end of file +add_executable(benchmark_locate_prefix benchmark_locate_prefix.cpp) 
+add_executable(effectiveness effectiveness.cpp) \ No newline at end of file diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp new file mode 100644 index 0000000..3fae9a8 --- /dev/null +++ b/benchmark/effectiveness.cpp @@ -0,0 +1,104 @@ +#include + +#include "types.hpp" +#include "benchmark_common.hpp" + +using namespace autocomplete; + +template +void benchmark(std::string const& index_filename, uint32_t k, + uint32_t max_num_queries, float keep, + essentials::json_lines& stats, bool verbose) { + Index index1, index2; + essentials::load(index1, index_filename.c_str()); + essentials::load(index2, index_filename.c_str()); + + std::vector queries; + uint32_t num_queries = + load_queries(queries, max_num_queries, keep, std::cin); + uint64_t strings_reported_by_prefix_search = 0; + uint64_t better_scored_strings_reported_by_conjunctive_search = 0; + + stats.add("num_queries", std::to_string(num_queries)); + + for (auto const& query : queries) { + auto it1 = index1.prefix_topk(query, k); + auto it2 = index2.conjunctive_topk(query, k); + strings_reported_by_prefix_search += it1.size(); + + uint64_t more = 0; + if (it2.size() >= it1.size()) { + more = it2.size() - it1.size(); + } + + if (verbose) { + { + auto it = it1; + std::cout << "prefix search scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; + } + { + auto it = it2; + std::cout << "conjunctive search scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; + } + std::cout << "more: " << more << std::endl; + } + + better_scored_strings_reported_by_conjunctive_search += more; + } + + stats.add("strings_reported_by_prefix_search", + std::to_string(strings_reported_by_prefix_search)); + stats.add( + "better_scored_strings_reported_by_conjunctive_search", + std::to_string(better_scored_strings_reported_by_conjunctive_search)); + 
stats.add( + "better_scored_strings_reported_by_conjunctive_search_in_percentage", + std::to_string(better_scored_strings_reported_by_conjunctive_search * + 100.0 / strings_reported_by_prefix_search)); +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + configure_parser_for_benchmarking(parser); + if (!parser.parse()) return 1; + + auto type = parser.get("type"); + auto k = parser.get("k"); + auto index_filename = parser.get("index_filename"); + auto max_num_queries = parser.get("max_num_queries"); + auto keep = parser.get("percentage"); + auto verbose = parser.get("verbose"); + + essentials::json_lines stats; + stats.new_line(); + stats.add("num_terms_per_query", + parser.get("num_terms_per_query")); + stats.add("percentage", std::to_string(keep)); + + if (type == "ef_type1") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type2") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type3") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else if (type == "ef_type4") { + benchmark(index_filename, k, max_num_queries, + keep, stats, verbose); + } else { + return 1; + } + + stats.print(); + return 0; +} \ No newline at end of file From 87ac6f8f44e12062e4a5eec942ba83309d2c41ab Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 29 Feb 2020 19:57:48 +0100 Subject: [PATCH 051/102] effectiveness benchmark --- benchmark/effectiveness.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index 3fae9a8..49969a1 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -67,7 +67,15 @@ void benchmark(std::string const& index_filename, uint32_t k, int main(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); + parser.add("type", "Index 
type."); + parser.add("k", "top-k value."); + parser.add("index_filename", "Index filename."); + parser.add("num_terms_per_query", "Number of terms per query."); + parser.add("max_num_queries", "Maximum number of queries to execute."); + parser.add("percentage", + "A float in [0,1] specifying how much we keep of the last token " + "in a query: n x 100 <=> n%, for n in [0,1]."); + parser.add("verbose", "Verbose output.", "--verbose"); if (!parser.parse()) return 1; auto type = parser.get("type"); From a5dbf289f239c51493a24fd49b3572023f4fa9e4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 2 Mar 2020 12:28:25 +0100 Subject: [PATCH 052/102] effectiveness --- benchmark/effectiveness.cpp | 54 ++++++++++++------- include/scored_string_pool.hpp | 8 +++ ...ctiveness_results_by_varying_percentage.py | 18 +++++++ 3 files changed, 61 insertions(+), 19 deletions(-) create mode 100644 script/collect_effectiveness_results_by_varying_percentage.py diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index 49969a1..7abb179 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -21,6 +21,9 @@ void benchmark(std::string const& index_filename, uint32_t k, stats.add("num_queries", std::to_string(num_queries)); + std::vector difference; + difference.reserve(k); + for (auto const& query : queries) { auto it1 = index1.prefix_topk(query, k); auto it2 = index2.conjunctive_topk(query, k); @@ -28,30 +31,43 @@ void benchmark(std::string const& index_filename, uint32_t k, uint64_t more = 0; if (it2.size() >= it1.size()) { - more = it2.size() - it1.size(); - } + auto const& prefix_search_scores = it1.pool()->const_scores(); + auto const& conjunctive_search_scores = it2.pool()->const_scores(); + assert(std::is_sorted(prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size())); + assert( + std::is_sorted(conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size())); - if (verbose) { - { - auto it = 
it1; - std::cout << "prefix search scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; + if (verbose) { + { + auto it = it1; + std::cout << "prefix_search_scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; } - std::cout << std::endl; - } - { - auto it = it2; - std::cout << "conjunctive search scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; + { + auto it = it2; + std::cout << "conjunctive_search_scores: " << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + std::cout << (*it).score << " "; + } + std::cout << std::endl; } - std::cout << std::endl; } - std::cout << "more: " << more << std::endl; - } - better_scored_strings_reported_by_conjunctive_search += more; + difference.clear(); + auto it = std::set_difference( + conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size(), + prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size(), difference.begin()); + more = std::distance(difference.begin(), it); + if (verbose) std::cout << "more: " << more << std::endl; + better_scored_strings_reported_by_conjunctive_search += more; + } } stats.add("strings_reported_by_prefix_search", diff --git a/include/scored_string_pool.hpp b/include/scored_string_pool.hpp index f834453..c679aeb 100644 --- a/include/scored_string_pool.hpp +++ b/include/scored_string_pool.hpp @@ -39,6 +39,10 @@ struct scored_string_pool { return m_scores; } + std::vector const& const_scores() const { + return m_scores; + } + scored_byte_range operator[](size_t i) const { assert(i < size()); scored_byte_range sbr; @@ -69,6 +73,10 @@ struct scored_string_pool { return m_pool->operator[](m_pos); } + scored_string_pool const* pool() const { + return m_pool; + } + private: scored_string_pool const* m_pool; size_t m_pos; diff --git 
a/script/collect_effectiveness_results_by_varying_percentage.py b/script/collect_effectiveness_results_by_varying_percentage.py new file mode 100644 index 0000000..4fc7683 --- /dev/null +++ b/script/collect_effectiveness_results_by_varying_percentage.py @@ -0,0 +1,18 @@ +import sys, os + +index_type = sys.argv[1] +index_filename = sys.argv[2] +collection_basename = sys.argv[3] # e.g., aol/aol.completions or aol/aol.completions.filtered +k = sys.argv[4] +num_queries = sys.argv[5] + +output_filename = collection_basename + "." + index_type + +output_filename += ".effectiveness.json" +query_filename_prefix = collection_basename + ".queries/queries." + +percentages = ["0.0", "0.25", "0.50", "0.75"] +for perc in percentages: + for terms in range(2,8): # (1,8) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) From 7f34276714316eae0729c1f29120705466e92468 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 3 Mar 2020 13:21:34 +0100 Subject: [PATCH 053/102] minor fix --- benchmark/effectiveness.cpp | 76 +++++++++++++++++-------------- include/autocomplete.hpp | 6 ++- include/autocomplete2.hpp | 6 ++- include/autocomplete3.hpp | 6 ++- include/autocomplete4.hpp | 6 ++- include/autocomplete_common.hpp | 5 +- include/compact_forward_index.hpp | 2 + include/ef/ef_sequence.hpp | 20 +++----- include/fc_dictionary.hpp | 1 + 9 files changed, 75 insertions(+), 53 deletions(-) diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index 7abb179..e7eb7b7 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -30,44 +30,52 @@ void benchmark(std::string 
const& index_filename, uint32_t k, strings_reported_by_prefix_search += it1.size(); uint64_t more = 0; - if (it2.size() >= it1.size()) { - auto const& prefix_search_scores = it1.pool()->const_scores(); - auto const& conjunctive_search_scores = it2.pool()->const_scores(); - assert(std::is_sorted(prefix_search_scores.begin(), - prefix_search_scores.begin() + it1.size())); - assert( - std::is_sorted(conjunctive_search_scores.begin(), - conjunctive_search_scores.begin() + it2.size())); - - if (verbose) { - { - auto it = it1; - std::cout << "prefix_search_scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; - } - std::cout << std::endl; + assert(it2.size() >= it1.size()); + + auto const& prefix_search_scores = it1.pool()->const_scores(); + auto const& conjunctive_search_scores = it2.pool()->const_scores(); + assert(std::is_sorted(prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size())); + assert(std::is_sorted(conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size())); + + if (verbose) { + std::cout << "query: '" << query << "'" << std::endl; + { + auto it = it1; + std::cout << "prefix_search results: " << it.size() + << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << completion.score << ": " + << std::string(completion.string.begin, + completion.string.end) + << std::endl; } - { - auto it = it2; - std::cout << "conjunctive_search_scores: " << std::endl; - for (uint64_t i = 0; i != it.size(); ++i, ++it) { - std::cout << (*it).score << " "; - } - std::cout << std::endl; + } + { + auto it = it2; + std::cout << "conjunctive_search results: " << it.size() + << std::endl; + for (uint64_t i = 0; i != it.size(); ++i, ++it) { + auto completion = *it; + std::cout << completion.score << ": " + << std::string(completion.string.begin, + completion.string.end) + << std::endl; } } - - difference.clear(); - auto it = 
std::set_difference( - conjunctive_search_scores.begin(), - conjunctive_search_scores.begin() + it2.size(), - prefix_search_scores.begin(), - prefix_search_scores.begin() + it1.size(), difference.begin()); - more = std::distance(difference.begin(), it); - if (verbose) std::cout << "more: " << more << std::endl; - better_scored_strings_reported_by_conjunctive_search += more; } + + difference.clear(); + auto it = std::set_difference( + conjunctive_search_scores.begin(), + conjunctive_search_scores.begin() + it2.size(), + prefix_search_scores.begin(), + prefix_search_scores.begin() + it1.size(), difference.begin()); + more = std::distance(difference.begin(), it); + if (verbose) std::cout << "more: " << more << std::endl; + better_scored_strings_reported_by_conjunctive_search += more; } stats.add("strings_reported_by_prefix_search", diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index 616b13f..dd085fa 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -40,7 +40,9 @@ struct autocomplete { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -76,6 +78,8 @@ struct autocomplete { true // must return unique results ); } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index 9d05226..cd6f411 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -45,7 +45,9 @@ struct autocomplete2 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = 
m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -82,6 +84,8 @@ struct autocomplete2 { ); extract_completions(num_completions); } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index 6165e19..a166d9f 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -54,7 +54,9 @@ struct autocomplete3 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -82,6 +84,8 @@ struct autocomplete3 { range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(num_terms, prefix, suffix_lex_range, k); extract_completions(num_completions); diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index cd44706..3006592 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -47,7 +47,9 @@ struct autocomplete4 { init(); completion_type prefix; byte_range suffix; - parse(m_dictionary, query, prefix, suffix); + if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + return m_pool.begin(); + } range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); @@ -73,6 +75,8 @@ struct autocomplete4 { range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; uint32_t num_completions = conjunctive_topk(prefix, suffix_lex_range, k); extract_completions(num_completions); 
diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index 17b38b4..bd49934 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -6,7 +6,8 @@ namespace autocomplete { template uint32_t parse(Dictionary const& dict, std::string const& query, - completion_type& prefix, byte_range& suffix) { + completion_type& prefix, byte_range& suffix, + bool must_find_prefix = false) { uint32_t num_terms = 1; // for suffix byte_range_iterator it(string_to_byte_range(query)); while (true) { @@ -16,6 +17,8 @@ uint32_t parse(Dictionary const& dict, std::string const& query, if (term_id != global::invalid_term_id) { prefix.push_back(term_id); ++num_terms; + } else { + if (must_find_prefix) return 0; } } return num_terms; diff --git a/include/compact_forward_index.hpp b/include/compact_forward_index.hpp index bde4b71..21aaa7c 100644 --- a/include/compact_forward_index.hpp +++ b/include/compact_forward_index.hpp @@ -32,6 +32,7 @@ struct compact_forward_index { for (uint64_t k = 0; k != n; ++k) { id_type x; input >> x; + assert(x > 0); terms.push_back(x); } m_pointers.push_back(size); @@ -89,6 +90,7 @@ struct compact_forward_index { bool intersects(const range r) const { for (uint64_t i = 0; i != size(); ++i) { auto val = m_cv[m_base + i]; + assert(val > 0); if (r.contains(val)) return true; } return false; diff --git a/include/ef/ef_sequence.hpp b/include/ef/ef_sequence.hpp index 0d1f436..0632f83 100644 --- a/include/ef/ef_sequence.hpp +++ b/include/ef/ef_sequence.hpp @@ -152,23 +152,15 @@ struct ef_sequence { assert(r.is_valid()); assert(r.end <= size()); auto prev_upper = previous_range_upperbound(r); - - uint64_t begin = - util::next_geq(*this, lex.begin + prev_upper, r.begin, r.end - 1); - if (begin == global::not_found) { + uint64_t id_begin = lex.begin + prev_upper; + uint64_t id_end = lex.end + prev_upper; + uint64_t begin = util::next_geq(*this, id_begin, r.begin, r.end - 1); + if (begin == global::not_found or 
access(begin) > id_end) { return {r.end, r.end}; } - - if (lex.begin == lex.end) { - return {begin, begin + 1}; - } - - uint64_t id_end = lex.end + prev_upper; + if (lex.begin == lex.end) return {begin, begin + 1}; uint64_t end = util::next_geq(*this, id_end, begin, r.end - 1); - if (end == global::not_found) { - return {begin, r.end}; - } - + if (end == global::not_found) return {begin, r.end}; return {begin, access(end) != id_end ? end : end + 1}; } diff --git a/include/fc_dictionary.hpp b/include/fc_dictionary.hpp index 1b223be..52e3971 100644 --- a/include/fc_dictionary.hpp +++ b/include/fc_dictionary.hpp @@ -115,6 +115,7 @@ struct fc_dictionary { fc_dictionary() {} // NOTE: return inclusive ranges, i.e., [a,b] + // 0-based ids range locate_prefix(byte_range p) const { if (p.end - p.begin == 0) return {0, size() - 1}; auto bucket_id = locate_buckets(p); From f04f127d7a7df1558347775d6754afcc5491fee9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 4 Mar 2020 16:40:15 +0100 Subject: [PATCH 054/102] changed css style --- web/styles.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/styles.css b/web/styles.css index 5db5234..b540533 100644 --- a/web/styles.css +++ b/web/styles.css @@ -9,4 +9,4 @@ .autocomplete-group { padding: 2px 5px; } .autocomplete-group strong { font-weight: bold; font-size: 16px; color: #000; display: block; border-bottom: 1px solid #000; } -input { font-size: 28px; padding: 10px; border: 1px solid #CCC; display: block; margin: 20px 0; } +input { font-size: 18px; padding: 10px; border: 1px solid #CCC; display: block; margin: 20px 0; } From ad7e5845370803e200e0faee0a0d639d4da2b972 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 10 Mar 2020 23:31:20 +0100 Subject: [PATCH 055/102] single-token queries with few completions switch to heap-based algorithm; less code --- .gitignore | 2 +- benchmark/CMakeLists.txt | 2 +- benchmark/benchmark_common.hpp | 5 +- benchmark/benchmark_conjunctive_topk.cpp 
| 46 +-- benchmark/benchmark_fc_dictionary.cpp | 6 +- benchmark/benchmark_integer_fc_dictionary.cpp | 2 +- benchmark/benchmark_locate_prefix.cpp | 9 +- benchmark/benchmark_prefix_topk.cpp | 33 +-- benchmark/effectiveness.cpp | 5 +- include/autocomplete.hpp | 261 +++++------------ include/autocomplete2.hpp | 260 +++++------------ include/autocomplete3.hpp | 211 ++++---------- include/autocomplete4.hpp | 268 ++++++------------ include/autocomplete_common.hpp | 44 ++- include/blocked_inverted_index.hpp | 79 +++--- include/probe.hpp | 36 +++ include/util_types.hpp | 21 -- src/CMakeLists.txt | 2 +- src/web_server.cpp | 8 +- test/test_autocomplete.cpp | 6 +- test/test_common.hpp | 1 + test/test_locate_prefix.cpp | 2 +- 22 files changed, 456 insertions(+), 853 deletions(-) create mode 100644 include/probe.hpp diff --git a/.gitignore b/.gitignore index 3094469..51855af 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ .DS_Store -build +build* diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 6275079..8f2c632 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(benchmark_topk benchmark_topk.cpp) +# add_executable(benchmark_topk benchmark_topk.cpp) add_executable(benchmark_prefix_topk benchmark_prefix_topk.cpp) add_executable(benchmark_conjunctive_topk benchmark_conjunctive_topk.cpp) add_executable(benchmark_fc_dictionary benchmark_fc_dictionary.cpp) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 2f12c8a..e7f9160 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -1,10 +1,13 @@ #pragma once #include "../external/cmd_line_parser/include/parser.hpp" +#include "probe.hpp" namespace autocomplete { -static const uint32_t runs = 5; +namespace benchmarking { +static const uint32_t runs = 1; +} // void tolower(std::string& str) { // std::transform(str.begin(), str.end(), str.begin(), diff --git a/benchmark/benchmark_conjunctive_topk.cpp 
b/benchmark/benchmark_conjunctive_topk.cpp index 7d8a7d3..ae73512 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -16,64 +16,42 @@ void benchmark(std::string const& index_filename, uint32_t k, uint32_t num_queries = load_queries(queries, max_num_queries, keep, std::cin); - uint32_t R = runs; // runs - uint64_t reported_strings = 0; auto musec_per_query = [&](double time) { - return time / (R * num_queries); + return time / (benchmarking::runs * num_queries); }; breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != R; ++run) { + timer_probe probe(3); + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k, timers); + auto it = index.conjunctive_topk(query, k, probe); reported_strings += it.size(); } } - std::cout << reported_strings << std::endl; - - // breakdowns.add("checked_docids", - // std::to_string(index.checked_docids)); breakdowns.add("heap_size", - // std::to_string(index.heap_size)); - - // auto perc_skipped_searches = - // (static_cast(index.skipped_searches) * 100.0) / - // queries.size(); - // breakdowns.add("skipped_searches", - // std::to_string(perc_skipped_searches)); - + std::cout << "#ignore: " << reported_strings << std::endl; breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("dictionary_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); + std::to_string(musec_per_query(probe.get(0).elapsed()))); breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); + std::to_string(musec_per_query(probe.get(1).elapsed()))); breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); + std::to_string(musec_per_query(probe.get(2).elapsed()))); } else { 
essentials::timer_type timer; + nop_probe probe; timer.start(); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k); + auto it = index.conjunctive_topk(query, k, probe); reported_strings += it.size(); } } timer.stop(); - std::cout << reported_strings << std::endl; + std::cout << "#ignore: " << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); - - // for (auto const& query : queries) { - // auto it = index.conjunctive_topk(query, k); - // reported_strings += it.size(); - // } - // breakdowns.add("avg_results_per_query", - // std::to_string(static_cast(reported_strings) / - // queries.size())); } } diff --git a/benchmark/benchmark_fc_dictionary.cpp b/benchmark/benchmark_fc_dictionary.cpp index ce71f67..d3e66b5 100644 --- a/benchmark/benchmark_fc_dictionary.cpp +++ b/benchmark/benchmark_fc_dictionary.cpp @@ -11,7 +11,7 @@ void perf_test(Dictionary const& dict, static std::vector decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& query : queries) { id_type id = dict.locate(string_to_byte_range(query)); @@ -32,7 +32,7 @@ void perf_test(Dictionary const& dict, timer.reset(); - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& id : ids) { uint8_t string_len = dict.extract(id, decoded.data()); @@ -47,7 +47,7 @@ void perf_test(Dictionary const& dict, static std::vector percentages = {0.0, 0.25, 0.50, 0.75, 1.0}; for (auto p : percentages) { timer.reset(); - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& query : queries) { size_t size = query.size(); diff --git 
a/benchmark/benchmark_integer_fc_dictionary.cpp b/benchmark/benchmark_integer_fc_dictionary.cpp index 3a752eb..8cb2b32 100644 --- a/benchmark/benchmark_integer_fc_dictionary.cpp +++ b/benchmark/benchmark_integer_fc_dictionary.cpp @@ -11,7 +11,7 @@ void perf_test(Dictionary const& dict, std::vector const& queries) { static completion_type decoded(2 * constants::MAX_NUM_CHARS_PER_QUERY); essentials::timer_type timer; - for (uint32_t i = 0; i != runs; ++i) { + for (uint32_t i = 0; i != benchmarking::runs; ++i) { timer.start(); for (auto const& id : queries) { uint8_t string_len = dict.extract(id, decoded); diff --git a/benchmark/benchmark_locate_prefix.cpp b/benchmark/benchmark_locate_prefix.cpp index f9e6282..a9e374a 100644 --- a/benchmark/benchmark_locate_prefix.cpp +++ b/benchmark/benchmark_locate_prefix.cpp @@ -31,15 +31,16 @@ void benchmark(parameters const& params, std::vector& queries, essentials::timer_type timer; timer.start(); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto& query : queries) { auto r = index.locate_prefix(query.first, query.second); essentials::do_not_optimize_away(r.end - r.begin); } } timer.stop(); - result.add("musec_per_query", - std::to_string(timer.elapsed() / (runs * num_queries))); + result.add( + "musec_per_query", + std::to_string(timer.elapsed() / (benchmarking::runs * num_queries))); result.print(); } @@ -78,7 +79,7 @@ int main(int argc, char** argv) { for (auto const& string : strings) { completion_type prefix; byte_range suffix; - parse(dict, string, prefix, suffix); + parse(dict, string, prefix, suffix, true); range suffix_lex_range = dict.locate_prefix(suffix); queries.emplace_back(prefix, suffix_lex_range); } diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index 2c31c68..f09d3dc 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -18,36 +18,33 @@ void benchmark(std::string 
const& index_filename, uint32_t k, uint64_t reported_strings = 0; auto musec_per_query = [&](double time) { - return time / (runs * num_queries); + return time / (benchmarking::runs * num_queries); }; breakdowns.add("num_queries", std::to_string(num_queries)); if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { + timer_probe probe(3); + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, timers); + auto it = index.prefix_topk(query, k, probe); reported_strings += it.size(); } } - std::cout << reported_strings << std::endl; + std::cout << "#ignore: " << reported_strings << std::endl; breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - // breakdowns.add("completions_search_musec_per_query", - // std::to_string(musec_per_query(timers[1].elapsed()))); - // breakdowns.add("topk_rmq_musec_per_query", - // std::to_string(musec_per_query(timers[2].elapsed()))); + std::to_string(musec_per_query(probe.get(0).elapsed()))); breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); + std::to_string(musec_per_query(probe.get(1).elapsed()))); breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); + std::to_string(musec_per_query(probe.get(2).elapsed()))); } else { essentials::timer_type timer; + nop_probe probe; timer.start(); - for (uint32_t run = 0; run != runs; ++run) { + for (uint32_t run = 0; run != benchmarking::runs; ++run) { for (auto const& query : queries) { - auto it = index.prefix_topk(query, k); + auto it = index.prefix_topk(query, k, probe); reported_strings += it.size(); } } @@ -55,14 +52,6 @@ void benchmark(std::string const& index_filename, uint32_t k, std::cout << reported_strings << std::endl; breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); - - // for (auto const& 
query : queries) { - // auto it = index.prefix_topk(query, k); - // reported_strings += it.size(); - // } - // breakdowns.add("avg_results_per_query", - // std::to_string(static_cast(reported_strings) / - // queries.size())); } } diff --git a/benchmark/effectiveness.cpp b/benchmark/effectiveness.cpp index e7eb7b7..e9c6590 100644 --- a/benchmark/effectiveness.cpp +++ b/benchmark/effectiveness.cpp @@ -23,10 +23,11 @@ void benchmark(std::string const& index_filename, uint32_t k, std::vector difference; difference.reserve(k); + nop_probe probe; for (auto const& query : queries) { - auto it1 = index1.prefix_topk(query, k); - auto it2 = index2.conjunctive_topk(query, k); + auto it1 = index1.prefix_topk(query, k, probe); + auto it2 = index2.conjunctive_topk(query, k, probe); strings_reported_by_prefix_search += it1.size(); uint64_t more = 0; diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index dd085fa..f55b9e5 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -13,9 +13,6 @@ struct autocomplete { typedef scored_string_pool::iterator iterator_type; autocomplete() { - // heap_size = 0; - // checked_docids = 0; - // skipped_searches = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); } @@ -25,235 +22,126 @@ struct autocomplete { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); typename ForwardIndex::builder fi_builder(params); - m_unsorted_docs_list.build(cm_builder.doc_ids()); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); fi_builder.build(m_forward_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, 
query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - // NOTE: because the completion_trie works with 1-based ids - // (id 0 is reserved for null terminator) suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - uint32_t num_completions = 0; - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - if (num_terms == 1) { // special case - suffix_lex_range.end += 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = 
m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } - - return extract_strings(num_completions); - } + probe.stop(1); - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - // step 1: parsing - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - // step 2: prefix search - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - // step 3: conjunctive search - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } - timers[2].stop(); - - // step 4: reporting - timers[3].start(); + probe.start(2); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type prefix_topk(std::string const& query, 
uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); - // step 1 - timers[1].start(); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) { - // ++skipped_searches; - // std::cout << "'" << query << "'\n"; - return m_pool.begin(); - } - - timers[1].stop(); - - // step 2 - timers[2].start(); - if (num_terms == 1) { // special case + if (prefix.size() == 0) { suffix_lex_range.end += 1; + constexpr bool must_return_unique_results = true; num_completions = 
m_unsorted_minimal_docs_list.topk( suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); + must_return_unique_results); + if (num_completions < k) { + suffix_lex_range.begin += 1; + num_completions = heap_topk(m_inverted_index, suffix_lex_range, + k, m_pool.scores()); + } } else { + suffix_lex_range.begin += 1; + suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } - timers[2].stop(); + probe.stop(1); - // step 3 - timers[3].start(); + probe.start(2); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // if (num_terms == 1) { // special case + // suffix_lex_range.begin -= 1; + // num_completions = m_unsorted_minimal_docs_list.topk( + // suffix_lex_range, k, m_pool.scores(), + // true // must return unique results + // ); + // } else { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, + // k); + // } + // } + + // return extract_strings(num_completions); + // } + size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + m_unsorted_minimal_docs_list.bytes() + m_dictionary.bytes() + @@ -272,10 +160,6 @@ struct autocomplete { visitor.visit(m_forward_index); } - // uint64_t heap_size; - // uint64_t 
checked_docids; - // uint64_t skipped_searches; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -309,7 +193,6 @@ struct autocomplete { uint32_t results = 0; for (; it.has_next(); ++it) { auto doc_id = *it; - // ++checked_docids; if (m_forward_index.intersects(doc_id, r)) { topk_scores[results++] = doc_id; if (results == k) break; diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index cd6f411..f713043 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -15,8 +15,6 @@ struct autocomplete2 { typedef scored_string_pool::iterator iterator_type; autocomplete2() { - // heap_size = 0; - // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -27,237 +25,133 @@ struct autocomplete2 { typename Completions::builder cm_builder(params); typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& docid_to_lexid = cm_builder.docid_to_lexid(); m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), util::ceil_log2(params.num_completions + 1)); m_unsorted_docs_list.build( util::invert(docid_to_lexid, params.num_completions)); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = 
m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); + probe.stop(1); + + probe.start(2); extract_completions(num_completions); - return extract_strings(num_completions); + auto it = extract_strings(num_completions); + probe.stop(2); + + return it; } - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { + template + iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - uint32_t num_completions = 0; - - if (num_terms == 1) { // special case + if (prefix.size() == 0) { suffix_lex_range.end += 1; + constexpr bool must_return_unique_results = true; num_completions = m_unsorted_minimal_docs_list.topk( suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); + must_return_unique_results); + if (num_completions < k) { + suffix_lex_range.begin += 1; + num_completions = heap_topk(m_inverted_index, suffix_lex_range, + k, m_pool.scores()); + } extract_completions(num_completions); } else { suffix_lex_range.begin += 1; suffix_lex_range.end += 1; num_completions = conjunctive_topk(prefix, suffix_lex_range, k); } + probe.stop(1); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= 
constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } else { - extract_completions(num_completions); - } - - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - if (num_terms == 1) { // special case - suffix_lex_range.begin -= 1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return 
unique results - ); - extract_completions(num_completions); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - } else { - extract_completions(num_completions); - } - timers[2].stop(); - - timers[3].start(); + probe.start(2); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - // timers[1].stop(); - - // step 2 - // timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[1].stop(); - - // step 3 - timers[2].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[2].stop(); - - return it; - } - - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); - - uint32_t num_completions = 0; - - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - if (num_terms == 1) { // special case - suffix_lex_range.end += 
1; - num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - true // must return unique results - ); - extract_completions(num_completions); - } else { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - timers[2].stop(); - - // step 3 - timers[3].start(); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); + + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); + + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); + + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } + + // if (num_completions < k) { + // if (num_terms == 1) { // special case + // suffix_lex_range.begin -= 1; + // num_completions = m_unsorted_minimal_docs_list.topk( + // suffix_lex_range, k, m_pool.scores(), + // true // must return unique results + // ); + // extract_completions(num_completions); + // } else { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, + // k); + // } + // } else { + // extract_completions(num_completions); + // } + + // return extract_strings(num_completions); + // } size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + @@ -277,9 +171,6 @@ struct autocomplete2 { visitor.visit(m_docid_to_lexid); } - // uint64_t heap_size; - // uint64_t checked_docids; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -329,7 +220,6 @@ struct autocomplete2 { for (; it.has_next(); ++it) { auto doc_id = *it; - // 
++checked_docids; auto lex_id = m_docid_to_lexid[doc_id]; uint32_t size = m_completions.extract(lex_id, completions[i]); for (uint32_t j = 0; j != size; ++j) { diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index a166d9f..b6b76b4 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -5,7 +5,6 @@ #include "compact_vector.hpp" #include "autocomplete_common.hpp" #include "scored_string_pool.hpp" -#include "min_heap.hpp" #include "constants.hpp" namespace autocomplete { @@ -25,8 +24,6 @@ struct autocomplete3 { min_priority_queue_type; autocomplete3() { - // heap_size = 0; - // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -37,202 +34,109 @@ struct autocomplete3 { typename Completions::builder cm_builder(params); typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); - auto const& docid_to_lexid = cm_builder.docid_to_lexid(); m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), util::ceil_log2(params.num_completions + 1)); m_unsorted_docs_list.build( util::invert(docid_to_lexid, params.num_completions)); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin 
+= 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - uint32_t num_completions = 0; - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); + probe.stop(1); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); + probe.start(2); extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); - } + auto it = extract_strings(num_completions); + probe.stop(2); - extract_completions(num_completions); - return extract_strings(num_completions); + return it; } - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { + template 
+ iterator_type conjunctive_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); - timers[0].start(); + probe.start(0); init(); completion_type prefix; byte_range suffix; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); + probe.start(1); uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); - } - timers[2].stop(); - - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); - - return it; - } - - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); - - // step 1 - timers[1].start(); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); - uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); + 
num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + probe.stop(1); - // step 3 - timers[3].start(); + probe.start(2); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); - assert(num_terms > 0); - timers[0].stop(); + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // uint32_t num_terms = parse(m_dictionary, query, prefix, suffix); + // assert(num_terms > 0); - uint32_t num_completions = 0; + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); + // suffix_lex_range.begin += 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); - // step 2 - timers[2].start(); - num_completions = - conjunctive_topk(num_terms, prefix, suffix_lex_range, k); - timers[2].stop(); + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); + // if (num_completions < k) { + // num_completions = + // conjunctive_topk(num_terms, prefix, suffix_lex_range, k); + // } - return it; - } + // extract_completions(num_completions); + // return 
extract_strings(num_completions); + // } size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + @@ -251,9 +155,6 @@ struct autocomplete3 { visitor.visit(m_docid_to_lexid); } - // uint64_t heap_size; - // uint64_t checked_docids; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -282,11 +183,11 @@ struct autocomplete3 { } } - uint32_t conjunctive_topk(uint32_t num_terms, completion_type& prefix, + uint32_t conjunctive_topk(completion_type& prefix, const range suffix_lex_range, const uint32_t k) { - if (num_terms == 1) { // we've got nothing to intersect - iterator it(0, m_inverted_index.num_docs()); - return conjunctive_topk(it, suffix_lex_range, k); + if (prefix.size() == 0) { // we've got nothing to intersect + return heap_topk(m_inverted_index, suffix_lex_range, k, + m_pool.scores()); } deduplicate(prefix); if (prefix.size() == 1) { // we've got nothing to intersect @@ -310,13 +211,9 @@ struct autocomplete3 { } q.make_heap(); - // heap_size += q.size(); - uint32_t results = 0; for (; it.has_next() and !q.empty(); ++it) { auto doc_id = *it; - // ++checked_docids; - while (!q.empty()) { auto& z = q.top(); auto val = *z; diff --git a/include/autocomplete4.hpp b/include/autocomplete4.hpp index 3006592..d478683 100644 --- a/include/autocomplete4.hpp +++ b/include/autocomplete4.hpp @@ -5,7 +5,6 @@ #include "compact_vector.hpp" #include "autocomplete_common.hpp" #include "scored_string_pool.hpp" -#include "min_heap.hpp" #include "constants.hpp" namespace autocomplete { @@ -18,8 +17,6 @@ struct autocomplete4 { typedef scored_string_pool::iterator iterator_type; autocomplete4() { - // heap_size = 0; - // checked_docids = 0; m_pool.resize(constants::POOL_SIZE, constants::MAX_K); m_topk_completion_set.resize(constants::MAX_K, 2 * constants::MAX_NUM_TERMS_PER_QUERY); @@ -30,194 +27,107 @@ struct autocomplete4 { typename Completions::builder cm_builder(params); typename Dictionary::builder di_builder(params); 
typename BlockedInvertedIndex::builder ii_builder(params, c); - auto const& docid_to_lexid = cm_builder.docid_to_lexid(); m_docid_to_lexid.build(docid_to_lexid.begin(), docid_to_lexid.size(), util::ceil_log2(params.num_completions + 1)); m_unsorted_docs_list.build( util::invert(docid_to_lexid, params.num_completions)); - cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); } - iterator_type prefix_topk(std::string const& query, const uint32_t k) { + template + iterator_type prefix_topk(std::string const& query, const uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; byte_range suffix; - if (parse(m_dictionary, query, prefix, suffix, true) == 0) { + constexpr bool must_find_prefix = true; + if (!parse(m_dictionary, query, prefix, suffix, must_find_prefix)) { return m_pool.begin(); } + probe.stop(0); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; range r = m_completions.locate_prefix(prefix, suffix_lex_range); if (r.is_invalid()) return m_pool.begin(); - uint32_t num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type conjunctive_topk(std::string const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - uint32_t num_completions = - conjunctive_topk(prefix, suffix_lex_range, k); - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string 
const& query, const uint32_t k) { - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - - if (num_completions < k) { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - - extract_completions(num_completions); - return extract_strings(num_completions); - } - - iterator_type topk(std::string const& query, const uint32_t k, - std::vector& timers) { - assert(k <= constants::MAX_K); - - timers[0].start(); - init(); - completion_type prefix; - byte_range suffix; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + probe.stop(1); - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; - suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - uint32_t num_completions = 0; - if (r.is_valid()) { - num_completions = m_unsorted_docs_list.topk(r, k, m_pool.scores()); - } - timers[1].stop(); - - timers[2].start(); - if (num_completions < k) { - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - } - timers[2].stop(); - - timers[3].start(); + probe.start(2); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type prefix_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); + template + iterator_type conjunctive_topk(std::string const& query, const 
uint32_t k, + Probe& probe) { assert(k <= constants::MAX_K); + + probe.start(0); init(); completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + byte_range suffix; + constexpr bool must_find_prefix = false; + parse(m_dictionary, query, prefix, suffix, must_find_prefix); + probe.stop(0); - // step 1 - timers[1].start(); + probe.start(1); range suffix_lex_range = m_dictionary.locate_prefix(suffix); if (suffix_lex_range.is_invalid()) return m_pool.begin(); - suffix_lex_range.begin += 1; suffix_lex_range.end += 1; - range r = m_completions.locate_prefix(prefix, suffix_lex_range); - if (r.is_invalid()) return m_pool.begin(); - timers[1].stop(); - - // step 2 - timers[2].start(); uint32_t num_completions = - m_unsorted_docs_list.topk(r, k, m_pool.scores()); - timers[2].stop(); + conjunctive_topk(prefix, suffix_lex_range, k); + probe.stop(1); - // step 3 - timers[3].start(); + probe.start(2); extract_completions(num_completions); auto it = extract_strings(num_completions); - timers[3].stop(); + probe.stop(2); return it; } - // for benchmarking - iterator_type conjunctive_topk(std::string const& query, uint32_t const k, - std::vector& timers) { - // step 0 - timers[0].start(); - assert(k <= constants::MAX_K); - init(); - completion_type prefix; - byte_range suffix{0, 0}; - parse(m_dictionary, query, prefix, suffix); - timers[0].stop(); + // iterator_type topk(std::string const& query, const uint32_t k) { + // assert(k <= constants::MAX_K); + // init(); + // completion_type prefix; + // byte_range suffix; + // parse(m_dictionary, query, prefix, suffix); - uint32_t num_completions = 0; + // range suffix_lex_range = m_dictionary.locate_prefix(suffix); + // if (suffix_lex_range.is_invalid()) return m_pool.begin(); - // step 1 - timers[1].start(); - range suffix_lex_range = m_dictionary.locate_prefix(suffix); - if (suffix_lex_range.is_invalid()) return m_pool.begin(); - timers[1].stop(); + // suffix_lex_range.begin 
+= 1; + // suffix_lex_range.end += 1; + // range r = m_completions.locate_prefix(prefix, suffix_lex_range); - // step 2 - timers[2].start(); - num_completions = conjunctive_topk(prefix, suffix_lex_range, k); - timers[2].stop(); + // uint32_t num_completions = 0; + // if (r.is_valid()) { + // num_completions = m_unsorted_docs_list.topk(r, k, + // m_pool.scores()); + // } - // step 3 - timers[3].start(); - extract_completions(num_completions); - auto it = extract_strings(num_completions); - timers[3].stop(); + // if (num_completions < k) { + // num_completions = conjunctive_topk(prefix, suffix_lex_range, k); + // } - return it; - } + // extract_completions(num_completions); + // return extract_strings(num_completions); + // } size_t bytes() const { return m_completions.bytes() + m_unsorted_docs_list.bytes() + @@ -236,9 +146,6 @@ struct autocomplete4 { visitor.visit(m_docid_to_lexid); } - // uint64_t heap_size; - // uint64_t checked_docids; - private: Completions m_completions; UnsortedDocsList m_unsorted_docs_list; @@ -275,13 +182,12 @@ struct autocomplete4 { } }; - typedef min_heap min_priority_queue_type; - uint32_t conjunctive_topk(completion_type& prefix, const range suffix, const uint32_t k) { auto& topk_scores = m_pool.scores(); - deduplicate(prefix); + typedef min_heap + min_priority_queue_type; min_priority_queue_type q; uint32_t current_block_id = m_inverted_index.block_id(suffix.begin); uint32_t current_block_boundary = @@ -298,43 +204,57 @@ struct autocomplete4 { q.push_back(m_inverted_index.block(current_block_id)); q.make_heap(); - // heap_size += q.size(); - - auto it = m_inverted_index.intersection_iterator(prefix, suffix); uint32_t results = 0; - for (; it.has_next() and !q.empty(); ++it) { - auto doc_id = *it; - // ++checked_docids; + auto check = [&](block_t& block, id_type doc_id) { + uint64_t pos = block.docs_iterator.position(); + assert(block.docs_iterator.access(pos) == doc_id); + uint64_t begin = block.offsets_iterator.access(pos); + uint64_t 
end = block.offsets_iterator.access(pos + 1); + assert(end > begin); + for (uint64_t i = begin; i != end; ++i) { + auto t = block.terms_iterator.access(i) + block.lower_bound; + if (t > suffix.end) break; + if (suffix.contains(t)) { + topk_scores[results++] = doc_id; + break; + } + } + }; + + if (prefix.size() == 0) { while (!q.empty()) { auto& z = q.top(); - auto val = z.docs_iterator.operator*(); - if (val > doc_id) break; - if (val < doc_id) { - val = z.docs_iterator.next_geq(doc_id); - if (!z.docs_iterator.has_next()) { - q.pop(); + auto doc_id = z.docs_iterator.operator*(); + check(z, doc_id); + if (results == k) return results; + z.docs_iterator.next(); + if (!z.docs_iterator.has_next()) q.pop(); + q.heapify(); + } + } else { + deduplicate(prefix); + auto it = m_inverted_index.intersection_iterator(prefix, suffix); + for (; it.has_next() and !q.empty(); ++it) { + auto doc_id = *it; + while (!q.empty()) { + auto& z = q.top(); + auto val = z.docs_iterator.operator*(); + if (val > doc_id) break; + if (val < doc_id) { + val = z.docs_iterator.next_geq(doc_id); + if (!z.docs_iterator.has_next()) { + q.pop(); + } else { + q.heapify(); + } } else { - q.heapify(); - } - } else { - if (val == doc_id) { - uint64_t pos = z.docs_iterator.position(); - assert(z.docs_iterator.access(pos) == doc_id); - uint64_t begin = z.offsets_iterator.access(pos); - uint64_t end = z.offsets_iterator.access(pos + 1); - assert(end > begin); - for (uint64_t i = begin; i != end; ++i) { - auto t = z.terms_iterator.access(i) + z.lower_bound; - if (t > suffix.end) break; - if (suffix.contains(t)) { - topk_scores[results++] = doc_id; - if (results == k) return results; - break; - } + if (val == doc_id) { + check(z, doc_id); + if (results == k) return results; } + break; } - break; } } } diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index bd49934..f655d3f 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -1,14 +1,13 @@ #pragma 
once #include "util_types.hpp" +#include "min_heap.hpp" namespace autocomplete { template -uint32_t parse(Dictionary const& dict, std::string const& query, - completion_type& prefix, byte_range& suffix, - bool must_find_prefix = false) { - uint32_t num_terms = 1; // for suffix +bool parse(Dictionary const& dict, std::string const& query, + completion_type& prefix, byte_range& suffix, bool must_find_prefix) { byte_range_iterator it(string_to_byte_range(query)); while (true) { suffix = it.next(); @@ -16,12 +15,11 @@ uint32_t parse(Dictionary const& dict, std::string const& query, auto term_id = dict.locate(suffix); if (term_id != global::invalid_term_id) { prefix.push_back(term_id); - ++num_terms; } else { - if (must_find_prefix) return 0; + if (must_find_prefix) return false; } } - return num_terms; + return true; } void deduplicate(completion_type& c) { @@ -30,4 +28,36 @@ void deduplicate(completion_type& c) { c.resize(std::distance(c.begin(), end)); } +template +uint32_t heap_topk(InvertedIndex const& index, const range r, const uint32_t k, + std::vector& topk_scores) { + assert(r.is_valid()); + + typedef min_heap> + min_priority_queue_type; + + min_priority_queue_type q; + q.reserve(r.end - r.begin + 1); // inclusive range + assert(r.begin > 0); + for (uint64_t term_id = r.begin; term_id <= r.end; ++term_id) { + q.push_back(index.iterator(term_id - 1)); + } + q.make_heap(); + + uint32_t results = 0; + + while (!q.empty()) { + auto& z = q.top(); + auto doc_id = *z; + topk_scores[results++] = doc_id; + if (results == k) return results; + z.next(); + if (!z.has_next()) q.pop(); + q.heapify(); + } + + return results; +} + } // namespace autocomplete \ No newline at end of file diff --git a/include/blocked_inverted_index.hpp b/include/blocked_inverted_index.hpp index 519a0bf..2f1af3a 100644 --- a/include/blocked_inverted_index.hpp +++ b/include/blocked_inverted_index.hpp @@ -273,48 +273,44 @@ struct blocked_inverted_index { , m_num_docs(ii->num_docs()) , m_suffix(r) { 
assert(r.is_valid()); - - if (!term_ids.empty()) { - assert(std::is_sorted(term_ids.begin(), term_ids.end())); - assert(std::unique(term_ids.begin(), term_ids.end()) == - term_ids.end()); - - m_blocks.reserve(term_ids.size()); // at most - uint32_t current_block_id = ii->block_id(term_ids.front()); - uint32_t i = 0; - uint32_t prev_i = 0; - for (; i != term_ids.size(); ++i) { - auto term_id = term_ids[i]; - assert(term_id > 0); - uint32_t b = ii->block_id(term_id); - if (b > current_block_id) { - auto block = ii->block(current_block_id); - block.term_ids.reserve(term_ids.size()); // at most - for (; prev_i != i; ++prev_i) { - block.term_ids.push_back(term_ids[prev_i]); - } - m_blocks.push_back(std::move(block)); + assert(!term_ids.empty()); + assert(std::is_sorted(term_ids.begin(), term_ids.end())); + assert(std::unique(term_ids.begin(), term_ids.end()) == + term_ids.end()); + + m_blocks.reserve(term_ids.size()); // at most + uint32_t current_block_id = ii->block_id(term_ids.front()); + uint32_t i = 0; + uint32_t prev_i = 0; + for (; i != term_ids.size(); ++i) { + auto term_id = term_ids[i]; + assert(term_id > 0); + uint32_t b = ii->block_id(term_id); + if (b > current_block_id) { + auto block = ii->block(current_block_id); + block.term_ids.reserve(term_ids.size()); // at most + for (; prev_i != i; ++prev_i) { + block.term_ids.push_back(term_ids[prev_i]); } - current_block_id = b; + m_blocks.push_back(std::move(block)); } + current_block_id = b; + } - auto block = ii->block(current_block_id); - block.term_ids.reserve(term_ids.size()); // at most - for (; prev_i != i; ++prev_i) { - block.term_ids.push_back(term_ids[prev_i]); - } - m_blocks.push_back(std::move(block)); + auto block = ii->block(current_block_id); + block.term_ids.reserve(term_ids.size()); // at most + for (; prev_i != i; ++prev_i) { + block.term_ids.push_back(term_ids[prev_i]); + } + m_blocks.push_back(std::move(block)); - std::sort(m_blocks.begin(), m_blocks.end(), - [](auto const& l, auto const& r) 
{ - return l.docs_iterator.size() < - r.docs_iterator.size(); - }); + std::sort(m_blocks.begin(), m_blocks.end(), + [](auto const& l, auto const& r) { + return l.docs_iterator.size() < + r.docs_iterator.size(); + }); - m_candidate = m_blocks[0].docs_iterator.access(0); - } else { - m_candidate = 0; - } + m_candidate = m_blocks[0].docs_iterator.access(0); next(); } @@ -329,12 +325,8 @@ struct blocked_inverted_index { void operator++() { assert(m_i == m_blocks.size()); - if (!m_blocks.empty()) { - if (m_blocks.size() > 1) { - m_candidate = m_blocks[0].docs_iterator.next(); - } - } else { - m_candidate += 1; + if (m_blocks.size() > 1) { + m_candidate = m_blocks[0].docs_iterator.next(); } m_i = 0; next(); @@ -375,7 +367,6 @@ struct blocked_inverted_index { } void next() { - if (m_blocks.empty()) return; if (m_blocks.size() == 1) { while (m_candidate < m_num_docs and m_i != m_blocks.size()) { assert(m_i == 0); diff --git a/include/probe.hpp b/include/probe.hpp new file mode 100644 index 0000000..955a939 --- /dev/null +++ b/include/probe.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include +#include "util_types.hpp" + +namespace autocomplete { + +struct nop_probe { + inline void start(uint64_t) {} + inline void stop(uint64_t) {} +}; + +struct timer_probe { + timer_probe(uint64_t n) + : m_timers(n) {} + + inline void start(uint64_t i) { + assert(i < m_timers.size()); + m_timers[i].start(); + } + + inline void stop(uint64_t i) { + assert(i < m_timers.size()); + m_timers[i].stop(); + } + + timer_type const& get(uint64_t i) { + assert(i < m_timers.size()); + return m_timers[i]; + } + +private: + std::vector m_timers; +}; + +} // namespace autocomplete diff --git a/include/util_types.hpp b/include/util_types.hpp index e056bb6..531e65d 100644 --- a/include/util_types.hpp +++ b/include/util_types.hpp @@ -242,25 +242,4 @@ struct timer { typedef timer timer_type; -struct iterator { - iterator(id_type begin, id_type end) - : m_begin(begin) - , m_end(end) {} - - bool has_next() const { - 
return m_begin < m_end; - } - - id_type operator*() const { - return m_begin; - } - - void operator++() { - ++m_begin; - } - -private: - id_type m_begin, m_end; -}; - } // namespace autocomplete diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 576f34b..1c5a82d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,5 +2,5 @@ add_executable(build build.cpp) add_executable(web_server web_server.cpp ../external/mongoose/mongoose.c) add_executable(output_ds2i_format output_ds2i_format.cpp) add_executable(statistics statistics.cpp) -add_executable(check_topk check_topk.cpp) +# add_executable(check_topk check_topk.cpp) add_executable(map_queries map_queries.cpp) \ No newline at end of file diff --git a/src/web_server.cpp b/src/web_server.cpp index 7a0a61c..db317fa 100644 --- a/src/web_server.cpp +++ b/src/web_server.cpp @@ -5,6 +5,7 @@ #include "constants.hpp" #include "types.hpp" +#include "probe.hpp" #include "../external/mongoose/mongoose.h" @@ -53,9 +54,10 @@ static void ev_handler(struct mg_connection* nc, int ev, void* p) { } std::string data; - auto it = topk_index.topk(query, k); - // auto it = topk_index.prefix_topk(query, k); - // auto it = topk_index.conjunctive_topk(query, k); + nop_probe probe; + // auto it = topk_index.topk(query, k probe); + // auto it = topk_index.prefix_topk(query, k, probe); + auto it = topk_index.conjunctive_topk(query, k, probe); if (it.empty()) { data = "{\"suggestions\":[\"value\":\"\",\"data\":\"\"]}\n"; } else { diff --git a/test/test_autocomplete.cpp b/test/test_autocomplete.cpp index 964a451..8fe49cc 100644 --- a/test/test_autocomplete.cpp +++ b/test/test_autocomplete.cpp @@ -36,8 +36,9 @@ TEST_CASE("test autocomplete topk functions") { "florir", "fly", "the starting l", "floridaaa"}; + nop_probe probe; for (auto& query : queries) { - auto it = index.prefix_topk(query, k); + auto it = index.prefix_topk(query, k, probe); std::cout << "top-" << it.size() << " completions for '" << query << "':\n"; for (uint32_t i 
= 0; i != it.size(); ++i, ++it) { @@ -61,8 +62,9 @@ TEST_CASE("test autocomplete topk functions") { "fo", "f", "matt", "fl", "flor", "fly", "the starting l"}; + nop_probe probe; for (auto& query : queries) { - auto it = index.conjunctive_topk(query, k); + auto it = index.conjunctive_topk(query, k, probe); std::cout << "top-" << it.size() << " completions for '" << query << "':\n"; for (uint32_t i = 0; i != it.size(); ++i, ++it) { diff --git a/test/test_common.hpp b/test/test_common.hpp index 24f4540..c17283f 100644 --- a/test/test_common.hpp +++ b/test/test_common.hpp @@ -6,6 +6,7 @@ #include #include "types.hpp" +#include "probe.hpp" #include "../benchmark/benchmark_common.hpp" namespace autocomplete { diff --git a/test/test_locate_prefix.cpp b/test/test_locate_prefix.cpp index ae99a6b..1a81693 100644 --- a/test/test_locate_prefix.cpp +++ b/test/test_locate_prefix.cpp @@ -12,7 +12,7 @@ void test_locate_prefix(Dictionary const& dict, Index const& index, range expected = testing::locate_prefix(strings, query); completion_type prefix; byte_range suffix; - parse(dict, query, prefix, suffix); + parse(dict, query, prefix, suffix, true); range suffix_lex_range = dict.locate_prefix(suffix); suffix_lex_range.begin += 1; From 9adfac2a91b668f5022b0dfad5780b55d532fc79 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 11:13:02 +0100 Subject: [PATCH 056/102] up --- benchmark/benchmark_common.hpp | 2 +- benchmark/benchmark_conjunctive_topk.cpp | 4 ++++ script/collect_results_by_varying_percentage.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index e7f9160..14dca8a 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -6,7 +6,7 @@ namespace autocomplete { namespace benchmarking { -static const uint32_t runs = 1; +static const uint32_t runs = 5; } // void tolower(std::string& str) { diff --git a/benchmark/benchmark_conjunctive_topk.cpp 
b/benchmark/benchmark_conjunctive_topk.cpp index ae73512..5ab37b7 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -32,6 +32,8 @@ void benchmark(std::string const& index_filename, uint32_t k, } } std::cout << "#ignore: " << reported_strings << std::endl; + breakdowns.add("reported_strings", + std::to_string(reported_strings / benchmarking::runs)); breakdowns.add("parsing_musec_per_query", std::to_string(musec_per_query(probe.get(0).elapsed()))); breakdowns.add("conjunctive_search_musec_per_query", @@ -50,6 +52,8 @@ void benchmark(std::string const& index_filename, uint32_t k, } timer.stop(); std::cout << "#ignore: " << reported_strings << std::endl; + breakdowns.add("reported_strings", + std::to_string(reported_strings / benchmarking::runs)); breakdowns.add("musec_per_query", std::to_string(musec_per_query(timer.elapsed()))); } diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index 48a7dd1..f268443 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -19,6 +19,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(2,8): # (1,8) + for terms in range(1,8): os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) From 2732f639f253dbd4c27d9cf06d30bdbdc42b36c2 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 12:27:39 +0100 Subject: [PATCH 057/102] refactored benchmarking suite --- 
README.md | 6 +- benchmark/benchmark_common.hpp | 84 ++++++++++++++++- benchmark/benchmark_conjunctive_topk.cpp | 93 +------------------ benchmark/benchmark_prefix_topk.cpp | 89 +----------------- benchmark/benchmark_topk.cpp | 90 +----------------- ...ctiveness_results_by_varying_percentage.py | 3 +- .../collect_results_by_varying_percentage.py | 10 +- 7 files changed, 94 insertions(+), 281 deletions(-) diff --git a/README.md b/README.md index ce69cb7..12b6328 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,11 @@ in preparing the data for indexing. Thus, from within the directory `test_data`, it is sufficient to do: - bash preprocess.sh 300 + bash preprocess.sh + +Therefore, for our example with `trec_05_efficiency_queries`, it would be: + + bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 The second argument in the example, i.e., 300, represents the number of completions (per completion size) that are drawn at diff --git a/benchmark/benchmark_common.hpp b/benchmark/benchmark_common.hpp index 14dca8a..1a96333 100644 --- a/benchmark/benchmark_common.hpp +++ b/benchmark/benchmark_common.hpp @@ -43,7 +43,89 @@ void configure_parser_for_benchmarking(cmd_line_parser::parser& parser) { parser.add("percentage", "A float in [0,1] specifying how much we keep of the last token " "in a query: n x 100 <=> n%, for n in [0,1]."); - parser.add("breakdown", "Collect timings breakdown.", "--breakdown"); } +#define BENCHMARK(what) \ + template \ + void benchmark(std::string const& index_filename, uint32_t k, \ + uint32_t max_num_queries, float keep, \ + essentials::json_lines& breakdowns) { \ + Index index; \ + essentials::load(index, index_filename.c_str()); \ + \ + std::vector queries; \ + uint32_t num_queries = \ + load_queries(queries, max_num_queries, keep, std::cin); \ + \ + uint64_t reported_strings = 0; \ + auto musec_per_query = [&](double time) { \ + return time / (benchmarking::runs * num_queries); \ + }; \ + \ + 
breakdowns.add("num_queries", std::to_string(num_queries)); \ + \ + timer_probe probe(3); \ + for (uint32_t run = 0; run != benchmarking::runs; ++run) { \ + for (auto const& query : queries) { \ + auto it = index.what##topk(query, k, probe); \ + reported_strings += it.size(); \ + } \ + } \ + std::cout << "#ignore: " << reported_strings << std::endl; \ + \ + breakdowns.add("reported_strings", \ + std::to_string(reported_strings / benchmarking::runs)); \ + breakdowns.add( \ + "parsing_musec_per_query", \ + std::to_string(musec_per_query(probe.get(0).elapsed()))); \ + breakdowns.add( \ + std::string(#what) + "search_musec_per_query", \ + std::to_string(musec_per_query(probe.get(1).elapsed()))); \ + breakdowns.add( \ + "reporting_musec_per_query", \ + std::to_string(musec_per_query(probe.get(2).elapsed()))); \ + breakdowns.add( \ + "total_musec_per_query", \ + std::to_string(musec_per_query(probe.get(0).elapsed()) + \ + musec_per_query(probe.get(1).elapsed()) + \ + musec_per_query(probe.get(2).elapsed()))); \ + } \ + \ + int main(int argc, char** argv) { \ + cmd_line_parser::parser parser(argc, argv); \ + configure_parser_for_benchmarking(parser); \ + if (!parser.parse()) return 1; \ + \ + auto type = parser.get("type"); \ + auto k = parser.get("k"); \ + auto index_filename = parser.get("index_filename"); \ + auto max_num_queries = parser.get("max_num_queries"); \ + auto keep = parser.get("percentage"); \ + \ + essentials::json_lines breakdowns; \ + breakdowns.new_line(); \ + breakdowns.add("num_terms_per_query", \ + parser.get("num_terms_per_query")); \ + breakdowns.add("percentage", std::to_string(keep)); \ + \ + if (type == "ef_type1") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type2") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type3") { \ + benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else if (type == "ef_type4") { \ 
+ benchmark( \ + index_filename, k, max_num_queries, keep, breakdowns); \ + } else { \ + return 1; \ + } \ + \ + breakdowns.print(); \ + return 0; \ + } + } // namespace autocomplete \ No newline at end of file diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/benchmark/benchmark_conjunctive_topk.cpp index 5ab37b7..df14c84 100644 --- a/benchmark/benchmark_conjunctive_topk.cpp +++ b/benchmark/benchmark_conjunctive_topk.cpp @@ -4,95 +4,4 @@ #include "benchmark_common.hpp" using namespace autocomplete; - -template -void benchmark(std::string const& index_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; - essentials::load(index, index_filename.c_str()); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (benchmarking::runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - timer_probe probe(3); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k, probe); - reported_strings += it.size(); - } - } - std::cout << "#ignore: " << reported_strings << std::endl; - breakdowns.add("reported_strings", - std::to_string(reported_strings / benchmarking::runs)); - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(probe.get(0).elapsed()))); - breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(probe.get(1).elapsed()))); - breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(probe.get(2).elapsed()))); - } else { - essentials::timer_type timer; - nop_probe probe; - timer.start(); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.conjunctive_topk(query, k, 
probe); - reported_strings += it.size(); - } - } - timer.stop(); - std::cout << "#ignore: " << reported_strings << std::endl; - breakdowns.add("reported_strings", - std::to_string(reported_strings / benchmarking::runs)); - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - auto k = parser.get("k"); - auto index_filename = parser.get("index_filename"); - auto max_num_queries = parser.get("max_num_queries"); - auto keep = parser.get("percentage"); - auto breakdown = parser.get("breakdown"); - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", - parser.get("num_terms_per_query")); - breakdowns.add("percentage", std::to_string(keep)); - - if (type == "ef_type1") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file +BENCHMARK(conjunctive_) \ No newline at end of file diff --git a/benchmark/benchmark_prefix_topk.cpp b/benchmark/benchmark_prefix_topk.cpp index f09d3dc..69a0bc1 100644 --- a/benchmark/benchmark_prefix_topk.cpp +++ b/benchmark/benchmark_prefix_topk.cpp @@ -4,91 +4,4 @@ #include "benchmark_common.hpp" using namespace autocomplete; - -template -void benchmark(std::string const& index_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; 
- essentials::load(index, index_filename.c_str()); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (benchmarking::runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - timer_probe probe(3); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, probe); - reported_strings += it.size(); - } - } - std::cout << "#ignore: " << reported_strings << std::endl; - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(probe.get(0).elapsed()))); - breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(probe.get(1).elapsed()))); - breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(probe.get(2).elapsed()))); - } else { - essentials::timer_type timer; - nop_probe probe; - timer.start(); - for (uint32_t run = 0; run != benchmarking::runs; ++run) { - for (auto const& query : queries) { - auto it = index.prefix_topk(query, k, probe); - reported_strings += it.size(); - } - } - timer.stop(); - std::cout << reported_strings << std::endl; - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - auto k = parser.get("k"); - auto index_filename = parser.get("index_filename"); - auto max_num_queries = parser.get("max_num_queries"); - auto keep = parser.get("percentage"); - auto breakdown = parser.get("breakdown"); - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", - parser.get("num_terms_per_query")); - breakdowns.add("percentage", 
std::to_string(keep)); - - if (type == "ef_type1") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file +BENCHMARK(prefix_) \ No newline at end of file diff --git a/benchmark/benchmark_topk.cpp b/benchmark/benchmark_topk.cpp index 0ea1e97..98d208c 100644 --- a/benchmark/benchmark_topk.cpp +++ b/benchmark/benchmark_topk.cpp @@ -4,92 +4,4 @@ #include "benchmark_common.hpp" using namespace autocomplete; - -template -void benchmark(std::string const& index_filename, uint32_t k, - uint32_t max_num_queries, float keep, - essentials::json_lines& breakdowns, bool breakdown) { - Index index; - essentials::load(index, index_filename.c_str()); - - std::vector queries; - uint32_t num_queries = - load_queries(queries, max_num_queries, keep, std::cin); - - uint64_t reported_strings = 0; - auto musec_per_query = [&](double time) { - return time / (runs * num_queries); - }; - - breakdowns.add("num_queries", std::to_string(num_queries)); - - if (breakdown) { - std::vector timers(4); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = index.topk(query, k, timers); - reported_strings += it.size(); - } - } - std::cout << reported_strings << std::endl; - breakdowns.add("parsing_musec_per_query", - std::to_string(musec_per_query(timers[0].elapsed()))); - breakdowns.add("prefix_search_musec_per_query", - std::to_string(musec_per_query(timers[1].elapsed()))); - breakdowns.add("conjunctive_search_musec_per_query", - std::to_string(musec_per_query(timers[2].elapsed()))); - 
breakdowns.add("reporting_musec_per_query", - std::to_string(musec_per_query(timers[3].elapsed()))); - } else { - essentials::timer_type timer; - timer.start(); - for (uint32_t run = 0; run != runs; ++run) { - for (auto const& query : queries) { - auto it = index.topk(query, k); - reported_strings += it.size(); - } - } - timer.stop(); - std::cout << reported_strings << std::endl; - breakdowns.add("musec_per_query", - std::to_string(musec_per_query(timer.elapsed()))); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - configure_parser_for_benchmarking(parser); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - auto k = parser.get("k"); - auto index_filename = parser.get("index_filename"); - auto max_num_queries = parser.get("max_num_queries"); - auto keep = parser.get("percentage"); - auto breakdown = parser.get("breakdown"); - - essentials::json_lines breakdowns; - breakdowns.new_line(); - breakdowns.add("num_terms_per_query", - parser.get("num_terms_per_query")); - breakdowns.add("percentage", std::to_string(keep)); - - if (type == "ef_type1") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type2") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type3") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else if (type == "ef_type4") { - benchmark(index_filename, k, max_num_queries, - keep, breakdowns, breakdown); - } else { - return 1; - } - - breakdowns.print(); - return 0; -} \ No newline at end of file +BENCHMARK("") \ No newline at end of file diff --git a/script/collect_effectiveness_results_by_varying_percentage.py b/script/collect_effectiveness_results_by_varying_percentage.py index 4fc7683..b1cfe40 100644 --- a/script/collect_effectiveness_results_by_varying_percentage.py +++ b/script/collect_effectiveness_results_by_varying_percentage.py @@ -7,12 
+7,11 @@ num_queries = sys.argv[5] output_filename = collection_basename + "." + index_type - output_filename += ".effectiveness.json" query_filename_prefix = collection_basename + ".queries/queries." percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(2,8): # (1,8) + for terms in range(1,8): os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index f268443..d565689 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -8,17 +8,11 @@ num_queries = sys.argv[6] output_filename = collection_basename + "." + index_type - -breakdown = "" -if len(sys.argv) > 7 and sys.argv[7] == "--breakdown": - breakdown = "--breakdown" - output_filename += ".breakdown" - output_filename += "." + query_mode + ".json" query_filename_prefix = collection_basename + ".queries/queries." 
percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: for terms in range(1,8): - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " " + breakdown + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) From 58f6cb187f3b037e797e9b65abcd168b6c6729b3 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 15:13:35 +0100 Subject: [PATCH 058/102] scripts updated --- script/collect_effectiveness_results_by_varying_percentage.py | 4 ++-- script/collect_results_by_varying_percentage.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/script/collect_effectiveness_results_by_varying_percentage.py b/script/collect_effectiveness_results_by_varying_percentage.py index b1cfe40..2693e70 100644 --- a/script/collect_effectiveness_results_by_varying_percentage.py +++ b/script/collect_effectiveness_results_by_varying_percentage.py @@ -12,6 +12,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(1,8): + for terms in range(1,7): os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + 
str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) - os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) + os.system("../build/effectiveness " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename) diff --git a/script/collect_results_by_varying_percentage.py b/script/collect_results_by_varying_percentage.py index d565689..c639032 100644 --- a/script/collect_results_by_varying_percentage.py +++ b/script/collect_results_by_varying_percentage.py @@ -13,6 +13,6 @@ percentages = ["0.0", "0.25", "0.50", "0.75"] for perc in percentages: - for terms in range(1,8): + for terms in range(1,7): os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " " + str(terms) + " " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=" + str(terms) + " 2>> " + output_filename) - os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 8+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=8+ 2>> " + output_filename) + os.system("../build/benchmark_" + query_mode + " " + index_type + " " + k + " ../build/" + index_filename + " 7+ " + str(num_queries) + " " + perc + " < " + query_filename_prefix + "length=7+ 2>> " + output_filename) From 05298907a0342eb0dd35b9ab8df853fa7049633b Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 15:15:47 +0100 Subject: [PATCH 059/102] scripts updated --- test_data/filter_and_preprocess.sh | 2 +- test_data/filter_dataset.py | 4 ++-- test_data/partition_queries_by_length.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/test_data/filter_and_preprocess.sh b/test_data/filter_and_preprocess.sh index 38425d7..9a5d787 100644 --- a/test_data/filter_and_preprocess.sh +++ b/test_data/filter_and_preprocess.sh @@ -3,7 +3,7 @@ echo $1 # input filename # number of completions to exclude per completion size, -# e.g., if it is 100, then at most 8 x 100 completions are filtered out +# e.g., if it is 100, then at most 7 x 100 completions are filtered out echo $2 python partition_queries_by_length.py $1 $1.filtered.queries $2 diff --git a/test_data/filter_dataset.py b/test_data/filter_dataset.py index 4481cbe..dc68a28 100644 --- a/test_data/filter_dataset.py +++ b/test_data/filter_dataset.py @@ -6,12 +6,12 @@ to_filter = Set({}) print("loading strings to filter...") -for i in range(1,8): +for i in range(1,7): with open(queries_directory + "/queries.length=" + str(i)) as f: for line in f: s = line.rstrip('\n') to_filter.add(s) -with open(queries_directory + "/queries.length=8+") as f: +with open(queries_directory + "/queries.length=7+") as f: for line in f: s = line.rstrip('\n') to_filter.add(s) diff --git a/test_data/partition_queries_by_length.py b/test_data/partition_queries_by_length.py index eb9b95d..3d3823b 100644 --- a/test_data/partition_queries_by_length.py +++ b/test_data/partition_queries_by_length.py @@ -7,7 +7,7 @@ if not os.path.exists(output_directory): os.makedirs(output_directory) -num_shards = 7 +num_shards = 6 files = [open(output_directory + "/queries.length=" + str(i), "w") for i in range(1,num_shards + 1)] all_others = open(output_directory + "/queries.length=" + str(num_shards + 1) + "+", "w") From bac98f457efff320e881bf31defa6a163de790b9 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 11 Mar 2020 15:17:02 +0100 Subject: [PATCH 060/102] readme updated --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 12b6328..1247d50 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,8 @@ to partition the 
input completions by number of query terms and retain 300 queries at random. Query files are placed in the output directory `trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries`. -(By default, 8 shards will be created: the ones having [1,7] query terms and -the one collecting all completions with >= 8 query terms). +(By default, 7 shards will be created: the ones having [1,6] query terms and +the one collecting all completions with *at least* 7 query terms). Then the command @@ -171,8 +171,6 @@ From within the `/build` directory, run python ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 -You can also specify the option `--breakdown` to record timings breakdowns. - To benchmark the dictionaries (Front-Coding and trie), just run the following script from within the `script` directory: From 13c04333110ef2eac4f948d86d434f9fa9bb0f8e Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Mon, 16 Mar 2020 17:59:21 +0100 Subject: [PATCH 061/102] minimal_docids --- include/autocomplete.hpp | 20 ++--- include/autocomplete2.hpp | 17 ++-- include/autocomplete3.hpp | 5 +- include/autocomplete4.hpp | 4 +- include/autocomplete_common.hpp | 13 ++- include/compact_forward_index.hpp | 1 + include/minimal_docids.hpp | 131 ++++++++++++++++++++++++++++++ include/scored_string_pool.hpp | 5 ++ include/statistics.hpp | 26 +++--- include/types.hpp | 46 ++--------- include/unsorted_list.hpp | 74 ++++++++--------- include/util_types.hpp | 49 +++++++++++ test/test_unsorted_list.cpp | 12 +-- 13 files changed, 275 insertions(+), 128 deletions(-) create mode 100644 include/minimal_docids.hpp diff --git a/include/autocomplete.hpp b/include/autocomplete.hpp index f55b9e5..78e54ad 100644 --- a/include/autocomplete.hpp +++ b/include/autocomplete.hpp @@ -7,8 +7,8 @@ namespace autocomplete { -template +template struct autocomplete { typedef 
scored_string_pool::iterator iterator_type; @@ -22,8 +22,10 @@ struct autocomplete { typename Dictionary::builder di_builder(params); typename InvertedIndex::builder ii_builder(params); typename ForwardIndex::builder fi_builder(params); + m_unsorted_docs_list.build(cm_builder.doc_ids()); m_unsorted_minimal_docs_list.build(ii_builder.minimal_doc_ids()); + cm_builder.build(m_completions); di_builder.build(m_dictionary); ii_builder.build(m_inverted_index); @@ -82,15 +84,8 @@ struct autocomplete { uint32_t num_completions = 0; if (prefix.size() == 0) { suffix_lex_range.end += 1; - constexpr bool must_return_unique_results = true; num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - must_return_unique_results); - if (num_completions < k) { - suffix_lex_range.begin += 1; - num_completions = heap_topk(m_inverted_index, suffix_lex_range, - k, m_pool.scores()); - } + m_inverted_index, suffix_lex_range, k, m_pool.scores()); } else { suffix_lex_range.begin += 1; suffix_lex_range.end += 1; @@ -162,8 +157,9 @@ struct autocomplete { private: Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - UnsortedDocsList m_unsorted_minimal_docs_list; + unsorted_list_type m_unsorted_docs_list; + typedef minimal_docids minimal_docids_type; + minimal_docids_type m_unsorted_minimal_docs_list; Dictionary m_dictionary; InvertedIndex m_inverted_index; ForwardIndex m_forward_index; diff --git a/include/autocomplete2.hpp b/include/autocomplete2.hpp index f713043..eb3f994 100644 --- a/include/autocomplete2.hpp +++ b/include/autocomplete2.hpp @@ -9,8 +9,7 @@ namespace autocomplete { -template +template struct autocomplete2 { typedef scored_string_pool::iterator iterator_type; @@ -89,15 +88,8 @@ struct autocomplete2 { uint32_t num_completions = 0; if (prefix.size() == 0) { suffix_lex_range.end += 1; - constexpr bool must_return_unique_results = true; num_completions = m_unsorted_minimal_docs_list.topk( - suffix_lex_range, k, m_pool.scores(), - 
must_return_unique_results); - if (num_completions < k) { - suffix_lex_range.begin += 1; - num_completions = heap_topk(m_inverted_index, suffix_lex_range, - k, m_pool.scores()); - } + m_inverted_index, suffix_lex_range, k, m_pool.scores()); extract_completions(num_completions); } else { suffix_lex_range.begin += 1; @@ -173,8 +165,9 @@ struct autocomplete2 { private: Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; - UnsortedDocsList m_unsorted_minimal_docs_list; + unsorted_list_type m_unsorted_docs_list; + typedef minimal_docids minimal_docids_type; + minimal_docids_type m_unsorted_minimal_docs_list; Dictionary m_dictionary; InvertedIndex m_inverted_index; compact_vector m_docid_to_lexid; diff --git a/include/autocomplete3.hpp b/include/autocomplete3.hpp index b6b76b4..6765ad6 100644 --- a/include/autocomplete3.hpp +++ b/include/autocomplete3.hpp @@ -15,8 +15,7 @@ one iterator for each termID in the lexicographic range of the last token of the query. */ -template +template struct autocomplete3 { typedef scored_string_pool::iterator iterator_type; typedef min_heap struct autocomplete4 { typedef scored_string_pool::iterator iterator_type; @@ -148,7 +148,7 @@ struct autocomplete4 { private: Completions m_completions; - UnsortedDocsList m_unsorted_docs_list; + unsorted_list_type m_unsorted_docs_list; Dictionary m_dictionary; BlockedInvertedIndex m_inverted_index; compact_vector m_docid_to_lexid; diff --git a/include/autocomplete_common.hpp b/include/autocomplete_common.hpp index f655d3f..21d952b 100644 --- a/include/autocomplete_common.hpp +++ b/include/autocomplete_common.hpp @@ -2,9 +2,14 @@ #include "util_types.hpp" #include "min_heap.hpp" +#include "unsorted_list.hpp" +#include "minimal_docids.hpp" +#include "succinct_rmq/cartesian_tree.hpp" namespace autocomplete { +typedef unsorted_list unsorted_list_type; + template bool parse(Dictionary const& dict, std::string const& query, completion_type& prefix, byte_range& suffix, bool must_find_prefix) 
{ @@ -50,8 +55,12 @@ uint32_t heap_topk(InvertedIndex const& index, const range r, const uint32_t k, while (!q.empty()) { auto& z = q.top(); auto doc_id = *z; - topk_scores[results++] = doc_id; - if (results == k) return results; + bool alread_present = std::binary_search( + topk_scores.begin(), topk_scores.begin() + results, doc_id); + if (!alread_present) { + topk_scores[results++] = doc_id; + if (results == k) return results; + } z.next(); if (!z.has_next()) q.pop(); q.heapify(); diff --git a/include/compact_forward_index.hpp b/include/compact_forward_index.hpp index 21aaa7c..50267f4 100644 --- a/include/compact_forward_index.hpp +++ b/include/compact_forward_index.hpp @@ -104,6 +104,7 @@ struct compact_forward_index { }; forward_list_iterator_type iterator(id_type doc_id) { + assert(doc_id < num_docs()); uint64_t pos = m_pointers.access(doc_id); uint64_t n = m_pointers.access(doc_id + 1) - pos; return {m_data, pos, n}; diff --git a/include/minimal_docids.hpp b/include/minimal_docids.hpp new file mode 100644 index 0000000..a7cb8f8 --- /dev/null +++ b/include/minimal_docids.hpp @@ -0,0 +1,131 @@ +#pragma once + +#include "compact_vector.hpp" +#include "util_types.hpp" + +namespace autocomplete { + +template +struct minimal_docids { + static const uint32_t SCAN_THRESHOLD = 64; + typedef scored_range_with_list_iterator< + typename InvertedIndex::iterator_type> + range_type; + typedef scored_range_with_list_iterator_comparator< + typename range_type::iterator_type> + comparator_range_type; + + minimal_docids() {} + + void build(std::vector const& list) { + essentials::logger("building minimal_docids..."); + m_rmq.build(list, std::less()); + m_list.build(list.begin(), list.size()); + essentials::logger("DONE"); + } + + uint32_t topk(InvertedIndex const& index, const range r, const uint32_t k, + std::vector& topk_scores) { + range_type sr; + sr.r = {r.begin, r.end - 1}; // rmq needs inclusive ranges + sr.min_pos = m_rmq.rmq(sr.r.begin, sr.r.end); + sr.min_val = 
m_list.access(sr.min_pos); + + m_q.clear(); + m_q.push(sr); + + uint32_t results = 0; + while (!m_q.empty()) { + auto& min = m_q.top(); + auto docid = min.minimum(); + bool alread_present = std::binary_search( + topk_scores.begin(), topk_scores.begin() + results, docid); + if (!alread_present) { + topk_scores[results++] = docid; + if (results == k) break; + } + + if (min.is_open()) { + min.iterator.next(); + if (!min.iterator.has_next()) { + m_q.pop(); + } + m_q.heapify(); + } else { + // save + auto min_range = min.r; + auto min_pos = min.min_pos; + + min.set_iterator(index); + min.iterator.next(); + if (!min.iterator.has_next()) { + m_q.pop(); + } + + m_q.heapify(); + + if (min_pos > 0 and min_pos - 1 >= min_range.begin) { + range_type left; + left.r = {min_range.begin, min_pos - 1}; + if (left.r.end - left.r.begin <= SCAN_THRESHOLD) { + left.min_pos = rmq(left.r.begin, left.r.end); + } else { + left.min_pos = m_rmq.rmq(left.r.begin, left.r.end); + } + left.min_val = m_list.access(left.min_pos); + m_q.push(left); + } + + if (min_pos < size() - 1 and min_range.end >= min_pos + 1) { + range_type right; + right.r = {min_pos + 1, min_range.end}; + if (right.r.end - right.r.begin <= SCAN_THRESHOLD) { + right.min_pos = rmq(right.r.begin, right.r.end); + } else { + right.min_pos = m_rmq.rmq(right.r.begin, right.r.end); + } + right.min_val = m_list.access(right.min_pos); + m_q.push(right); + } + } + } + + return results; + } + + size_t size() const { + return m_list.size(); + } + + size_t bytes() const { + return m_rmq.bytes() + m_list.bytes(); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_rmq); + visitor.visit(m_list); + } + +private: + typedef min_heap min_priority_queue_type; + min_priority_queue_type m_q; + + RMQ m_rmq; + compact_vector m_list; + + uint64_t rmq(uint64_t lo, uint64_t hi) { // inclusive endpoints + uint64_t pos = lo; + id_type min = id_type(-1); + for (uint64_t i = lo; i <= hi; ++i) { + id_type val = m_list.access(i); + if (val < 
min) { + min = val; + pos = i; + } + } + return pos; + } +}; + +} // namespace autocomplete \ No newline at end of file diff --git a/include/scored_string_pool.hpp b/include/scored_string_pool.hpp index c679aeb..3f03f06 100644 --- a/include/scored_string_pool.hpp +++ b/include/scored_string_pool.hpp @@ -4,6 +4,11 @@ namespace autocomplete { +struct scored_byte_range { + byte_range string; + id_type score; +}; + struct scored_string_pool { void init() { push_back_offset(0); diff --git a/include/statistics.hpp b/include/statistics.hpp index aa1fbe0..42654ae 100644 --- a/include/statistics.hpp +++ b/include/statistics.hpp @@ -40,9 +40,9 @@ void completion_trie::print_stats() print_bps("sizes", sizes_bytes(), size()); } -template -void autocomplete +void autocomplete::print_stats() const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) @@ -86,10 +86,9 @@ void autocomplete -void autocomplete2::print_stats() const { +template +void autocomplete2::print_stats() + const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) << " [MiB]: " @@ -122,10 +121,9 @@ void autocomplete2 -void autocomplete3::print_stats() const { +template +void autocomplete3::print_stats() + const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) << " [MiB]: " @@ -148,10 +146,10 @@ void autocomplete3 -void autocomplete4::print_stats() const { +void autocomplete4::print_stats() + const { size_t total_bytes = bytes(); std::cout << "using " << essentials::convert(total_bytes, essentials::MiB) << " [MiB]: " diff --git a/include/types.hpp b/include/types.hpp index 6481276..659199d 100644 --- a/include/types.hpp +++ b/include/types.hpp @@ -3,75 +3,45 @@ #include "completion_trie.hpp" #include "fc_dictionary.hpp" #include "integer_fc_dictionary.hpp" -#include "unsorted_list.hpp" - -// #include "uint_vec.hpp" -// #include 
"uncompressed_list.hpp" - #include "compact_forward_index.hpp" - #include "inverted_index.hpp" #include "blocked_inverted_index.hpp" - #include "autocomplete.hpp" #include "autocomplete2.hpp" #include "autocomplete3.hpp" #include "autocomplete4.hpp" - #include "compact_vector.hpp" #include "ef/ef_sequence.hpp" #include "ef/compact_ef.hpp" -#include "succinct_rmq/cartesian_tree.hpp" namespace autocomplete { typedef uint_vec uint32_vec; typedef uint_vec uint64_vec; -// typedef completion_trie -// uint64_completion_trie; - typedef completion_trie ef_completion_trie; - typedef fc_dictionary<> fc_dictionary_type; typedef integer_fc_dictionary<> integer_fc_dictionary_type; - -typedef unsorted_list succinct_rmq; -// typedef uncompressed_list uncompressed_list32_t; - -// typedef inverted_index uncompressed_inverted_index; typedef inverted_index ef_inverted_index; - -// typedef blocked_inverted_index -// uncompressed_blocked_inverted_index; typedef blocked_inverted_index ef_blocked_inverted_index; -// typedef autocomplete -// uncompressed_autocomplete_type; - -// typedef autocomplete2 -// uncompressed_autocomplete_type2; - /* compressed indexes */ -typedef autocomplete +typedef autocomplete ef_autocomplete_type1; -typedef autocomplete2 +typedef autocomplete2 ef_autocomplete_type2; -typedef autocomplete3 +typedef autocomplete3 ef_autocomplete_type3; -typedef autocomplete4 +typedef autocomplete4 ef_autocomplete_type4; } // namespace autocomplete \ No newline at end of file diff --git a/include/unsorted_list.hpp b/include/unsorted_list.hpp index e7cfddd..bb06a86 100644 --- a/include/unsorted_list.hpp +++ b/include/unsorted_list.hpp @@ -1,48 +1,10 @@ #pragma once #include "compact_vector.hpp" +#include "util_types.hpp" namespace autocomplete { -struct scored_byte_range { - byte_range string; - id_type score; -}; - -typedef std::function - scored_range_comparator_type; -scored_range_comparator_type scored_range_comparator = - [](scored_range const& l, scored_range const& r) { - 
return l.min_val > r.min_val; - }; - -struct topk_queue { - void push(scored_range sr) { - m_q.push_back(sr); - std::push_heap(m_q.begin(), m_q.end(), scored_range_comparator); - } - - scored_range top() { - return m_q.front(); - } - - void pop() { - std::pop_heap(m_q.begin(), m_q.end(), scored_range_comparator); - m_q.pop_back(); - } - - void clear() { - m_q.clear(); - } - - bool empty() const { - return m_q.empty(); - } - -private: - std::vector m_q; -}; - template struct unsorted_list { static const uint32_t SCAN_THRESHOLD = 64; @@ -132,6 +94,40 @@ struct unsorted_list { } private: + struct topk_queue { + void push(scored_range sr) { + m_q.push_back(sr); + std::push_heap(m_q.begin(), m_q.end(), m_comparator); + } + + scored_range top() { + return m_q.front(); + } + + void pop() { + std::pop_heap(m_q.begin(), m_q.end(), m_comparator); + m_q.pop_back(); + } + + void clear() { + m_q.clear(); + } + + bool empty() const { + return m_q.empty(); + } + + private: + std::vector m_q; + + typedef std::function + scrored_range_comparator_type; + scrored_range_comparator_type m_comparator = [](scored_range const& l, + scored_range const& r) { + return scored_range::greater(l, r); + }; + }; + topk_queue m_q; RMQ m_rmq; compact_vector m_list; diff --git a/include/util_types.hpp b/include/util_types.hpp index 531e65d..0890002 100644 --- a/include/util_types.hpp +++ b/include/util_types.hpp @@ -62,6 +62,55 @@ struct scored_range { range r; uint32_t min_pos; id_type min_val; + + static bool greater(scored_range const& l, scored_range const& r) { + return l.min_val > r.min_val; + } +}; + +template +struct scored_range_with_list_iterator { + typedef Iterator iterator_type; + + scored_range_with_list_iterator() + : min_pos(global::invalid_term_id) + , m_open(false) {} + + range r; + uint32_t min_pos; + id_type min_val; + Iterator iterator; + + bool is_open() const { + return m_open; + } + + template + void set_iterator(InvertedIndex const& index) { + assert(min_pos != 
global::invalid_term_id); + m_open = true; + iterator = index.iterator(min_pos); + } + + id_type minimum() const { + return is_open() ? *iterator : min_val; + } + + // static bool greater(scored_range_with_list_iterator const& l, + // scored_range_with_list_iterator const& r) { + // return l.minimum() > r.minimum(); + // } + +private: + bool m_open; +}; + +template +struct scored_range_with_list_iterator_comparator { + bool operator()(scored_range_with_list_iterator const& l, + scored_range_with_list_iterator const& r) { + return l.minimum() > r.minimum(); + } }; struct byte_range { diff --git a/test/test_unsorted_list.cpp b/test/test_unsorted_list.cpp index 8b1ce0f..2760532 100644 --- a/test/test_unsorted_list.cpp +++ b/test/test_unsorted_list.cpp @@ -74,14 +74,14 @@ TEST_CASE("test unsorted_list on doc_ids") { // } // } - succinct_rmq list; + unsorted_list_type list; list.build(doc_ids); REQUIRE(list.size() == doc_ids.size()); - essentials::save(list, output_filename); + essentials::save(list, output_filename); } { - succinct_rmq list; + unsorted_list_type list; essentials::load(list, output_filename); std::vector topk(constants::MAX_K); @@ -137,14 +137,14 @@ TEST_CASE("test unsorted_list on minimal doc_ids") { input.close(); REQUIRE(doc_ids.size() == params.num_terms); - succinct_rmq list; + unsorted_list_type list; list.build(doc_ids); REQUIRE(list.size() == doc_ids.size()); - essentials::save(list, output_filename); + essentials::save(list, output_filename); } { - succinct_rmq list; + unsorted_list_type list; essentials::load(list, output_filename); std::vector topk(constants::MAX_K); From cba9599c74f84cbd6ef375abf1e957d809fab0da Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 12 May 2020 11:12:38 +0200 Subject: [PATCH 062/102] updated README with paper information --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1247d50..2ab2e1c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ 
Autocomplete ------------ -Query autocompletion in C++. +A Query Auto-Completion system based on the paper [Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf), by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +published in ACM SIGIR 2020. ##### Table of contents 1. [Installation and quick start](#install) From ff80c3ac41e3bc5b25a7457089d115a6fa4d413f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 12 May 2020 11:14:02 +0200 Subject: [PATCH 063/102] updated README with paper information --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ab2e1c..df9c825 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ Autocomplete ------------ -A Query Auto-Completion system based on the paper [Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf), by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, published in ACM SIGIR 2020. 
##### Table of contents From 8154ee69aafe1aad5e103e1994f4440536215b18 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 21:48:36 +0530 Subject: [PATCH 064/102] ci: github actions setup Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 37 ++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/continuous_integration.yml diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml new file mode 100644 index 0000000..7daf050 --- /dev/null +++ b/.github/workflows/continuous_integration.yml @@ -0,0 +1,37 @@ +name: Continuous Integration + +on: + [ push,pull_request ] + +jobs: + build: + name: Continuous Integration + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ ubuntu-latest ] + steps: + + - name: Checkout code + uses: actions/checkout@v2 + + - name: Checkout submodules + run: git submodule update --init --recursive + + - name: Check cmake version + run: cmake --version + + - name: Creating build directory + run: cmake -E make_directory ./build + + - name: Precompilation + working-directory: ./build + run: cmake .. -DCMAKE_BUILD_TYPE=Release + + - name: Compilation + working-directory: ./build + run: cmake --build . --config Release + + - name: Testing + working-directory: ./build + run: ctest From bb224284fadb62adecdcec33e01a0c341e5bb3bb Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 21:57:51 +0530 Subject: [PATCH 065/102] ci: added data preprocessing step Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 7daf050..b044478 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -32,6 +32,10 @@ jobs: working-directory: ./build run: cmake --build . 
--config Release + - name: Data preprocessing + working-directory: ./test_data + run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 + - name: Testing working-directory: ./build run: ctest From 36a78e5799fe8b7bd1de12733780da1f8a3bb1e7 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 22:07:42 +0530 Subject: [PATCH 066/102] ci: using python2 for data preprocessing Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index b044478..8cd6890 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -33,6 +33,10 @@ jobs: run: cmake --build . --config Release - name: Data preprocessing + uses: actions/setup-python@v2 + with: + python-version: 2.x + architecture: x64 working-directory: ./test_data run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 From e3ed47846cc2d7f817cc01f455bdd5ff3c0faae2 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 22:11:02 +0530 Subject: [PATCH 067/102] ci: fixed yml error Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 8cd6890..c2c76d8 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -32,11 +32,13 @@ jobs: working-directory: ./build run: cmake --build . 
--config Release - - name: Data preprocessing + - name: Setup python uses: actions/setup-python@v2 - with: - python-version: 2.x - architecture: x64 + with: + python-version: 2.x + architecture: x64 + + - name: Data preprocessing working-directory: ./test_data run: bash preprocess.sh trec_05_efficiency_queries/trec_05_efficiency_queries.completions 300 From 57be3895383f89f22fdc75b3c92a12ef201a1630 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Wed, 4 Aug 2021 22:14:31 +0530 Subject: [PATCH 068/102] ci: fixed yaml indentation Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index c2c76d8..4a8e6ce 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -34,9 +34,9 @@ jobs: - name: Setup python uses: actions/setup-python@v2 - with: - python-version: 2.x - architecture: x64 + with: + python-version: '2.x' + architecture: 'x64' - name: Data preprocessing working-directory: ./test_data From 35acea671e8e70cb79a58d3a93b03c419d14e0fc Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Thu, 5 Aug 2021 23:30:31 +0530 Subject: [PATCH 069/102] ci: added dockerfile and ci steps Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 9 +++++++++ Dockerfile | 11 +++++++++++ 2 files changed, 20 insertions(+) create mode 100644 Dockerfile diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 4a8e6ce..f605183 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -45,3 +45,12 @@ jobs: - name: Testing working-directory: ./build run: ctest + + - name: Building docker image + run: docker build -t ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest . 
+ + - name: Dockerhub Authentication + run: docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} + + - name: Publishing image to Container Registry + run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..0aefcb7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM alpine:latest + +RUN addgroup -S appgroup && adduser -S appuser -G appgroup + +USER appuser + +COPY ./build /app + +WORKDIR /app + +CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] \ No newline at end of file From 23f585163819af61ba83203897584d7c6b8d9ef2 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Fri, 6 Aug 2021 08:56:50 +0530 Subject: [PATCH 070/102] ci: docker base image changed from alpine to ubuntu Signed-off-by: Rajdeep Roy Chowdhury --- Dockerfile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0aefcb7..fe19d62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,15 @@ -FROM alpine:latest +FROM ubuntu:latest -RUN addgroup -S appgroup && adduser -S appuser -G appgroup +EXPOSE 8000 -USER appuser +RUN groupadd appgroup && useradd appuser -G appgroup COPY ./build /app WORKDIR /app -CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] \ No newline at end of file +RUN chmod +x web_server + +USER appuser + +CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] From 6b6f28a1a1f39ff9ae7a54a44f2240c8ed2667df Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Fri, 6 Aug 2021 21:50:09 +0530 Subject: [PATCH 071/102] ci: added binary dict building step Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index f605183..76645b2 100644 --- a/.github/workflows/continuous_integration.yml +++ 
b/.github/workflows/continuous_integration.yml @@ -46,6 +46,10 @@ jobs: working-directory: ./build run: ctest + - name: Build binary dictionary + working-directory: build + run: chmod +x build && ./build ef_type1 ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin + - name: Building docker image run: docker build -t ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest . From c58dee45ae1acfb0ad88982d1faf3fbd8e29ddb7 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 12:36:52 +0530 Subject: [PATCH 072/102] ci: using root user in docker Signed-off-by: Rajdeep Roy Chowdhury --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index fe19d62..b9acc33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,6 +10,6 @@ WORKDIR /app RUN chmod +x web_server -USER appuser +# USER appuser CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] From 05e3055d136edf8fcecf4911fcc72e4d3b1f5f13 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 19:12:35 +0530 Subject: [PATCH 073/102] ci: compiling in the dockerfile Signed-off-by: Rajdeep Roy Chowdhury --- Dockerfile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9acc33..1c462ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,12 +4,22 @@ EXPOSE 8000 RUN groupadd appgroup && useradd appuser -G appgroup -COPY ./build /app +COPY . /src WORKDIR /app -RUN chmod +x web_server +RUN apt update && apt install -y cmake g++ python -# USER appuser +RUN cmake /src && cmake --build . 
+ +RUN chmod +x web_server && chmod +x build + +RUN ./build ef_type1 /src/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin + +RUN apt purge -y cmake g++ python + +RUN rm -rf /src + +USER appuser CMD ["./web_server", "8000", "trec_05.ef_type1.bin"] From ce76e9a23a3dd190d0f6d3fd19e20bbcf0b4e175 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 21:15:56 +0530 Subject: [PATCH 074/102] ci(workflow): docker image pushed to container registry only on master Signed-off-by: Rajdeep Roy Chowdhury --- .github/workflows/continuous_integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index 76645b2..f45a3dd 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -57,4 +57,5 @@ jobs: run: docker login --username ${{ secrets.DOCKERHUB_USERNAME }} --password ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} - name: Publishing image to Container Registry + if: github.ref == 'refs/heads/master' run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/autocomplete:latest From 7e78b9e3991df0e5dad6b5e657a40fe7988b2ca5 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Sat, 7 Aug 2021 21:29:31 +0530 Subject: [PATCH 075/102] docs: docker instructions added in readme Signed-off-by: Rajdeep Roy Chowdhury --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index df9c825..4031280 100644 --- a/README.md +++ b/README.md @@ -182,3 +182,15 @@ Live demo Start the web server with the program `./web_server ` and access the demo at `localhost:`. + +Use a prebuilt docker image +---------- + +The following command pulls a prebuilt docker image and runs it locally. 
+ +```bash +docker pull jermp/autocomplete +docker run -p 8000:8000 -d jermp/autocomplete +``` + +The demo can be accessed at [http://localhost:8000](http://localhost:8000) From 1fa8497ce4e727439f0266e7c8019eeb577d7d72 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Tue, 10 Aug 2021 15:31:29 +0200 Subject: [PATCH 076/102] commented out docker information from readme --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4031280..7747a7d 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,18 @@ After that, for having a minimal running example, just run and then access the service [here](http://127.0.0.1:8000). + + Compiling the code ------------------ @@ -181,16 +193,4 @@ Live demo ---------- Start the web server with the program `./web_server ` and access the demo at -`localhost:`. - -Use a prebuilt docker image ----------- - -The following command pulls a prebuilt docker image and runs it locally. - -```bash -docker pull jermp/autocomplete -docker run -p 8000:8000 -d jermp/autocomplete -``` - -The demo can be accessed at [http://localhost:8000](http://localhost:8000) +`localhost:`. \ No newline at end of file From c91dd28fcd065e7279b91ce3de558d520e5c1c2f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 26 Aug 2021 11:10:12 +0200 Subject: [PATCH 077/102] added instructions for using a Docker image (credits to Razdeep) --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 7747a7d..40d263e 100644 --- a/README.md +++ b/README.md @@ -26,19 +26,16 @@ After that, for having a minimal running example, just run bash ./example.sh -and then access the service [here](http://127.0.0.1:8000). +and then access the service [from localhost](http://localhost:8000). - +And then access the service [from localhost](http://localhost:8000). 
Compiling the code ------------------ From 2bb6a118eb506519eb72227230e75fdb261bec7f Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Wed, 3 Nov 2021 22:04:26 +0100 Subject: [PATCH 078/102] removed unused copy assignment operator --- include/bit_vector.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/bit_vector.hpp b/include/bit_vector.hpp index 747faef..4afb7dd 100644 --- a/include/bit_vector.hpp +++ b/include/bit_vector.hpp @@ -242,12 +242,6 @@ struct bit_vector { build(in); } - bit_vector& operator=(bit_vector const& other) { - bit_vector tmp(other); - tmp.swap(*this); - return *this; - } - void swap(bit_vector& other) { std::swap(other.m_size, m_size); other.m_bits.swap(m_bits); From 83b921667dcb8064b02cee2cb9a8b72d7d616469 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Fri, 5 Nov 2021 09:07:13 +0100 Subject: [PATCH 079/102] added quality code badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 40d263e..12e7f61 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Language grade: C/C++](https://img.shields.io/lgtm/grade/cpp/g/jermp/autocomplete.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/jermp/autocomplete/context:cpp) + Autocomplete ------------ From 1e611a67718a89a1e56b5cb95a874c3880b33353 Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Mon, 7 Feb 2022 14:45:18 +0530 Subject: [PATCH 080/102] fix: util::find() issue fixed When linear scanning doesn't find the target element, the control must not go back to the binary search logic. 
Signed-off-by: Rajdeep Roy Chowdhury --- include/util.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/util.hpp b/include/util.hpp index bb20bdb..27942d3 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -45,6 +45,7 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) { return pos; } } + break; } uint64_t pos = lo + ((hi - lo) >> 1); uint64_t val = sequence.access(pos); From 9627eefbd01e79dd0ad55a47de5d1faf4c1bb91c Mon Sep 17 00:00:00 2001 From: Rajdeep Roy Chowdhury Date: Tue, 8 Feb 2022 12:38:46 +0530 Subject: [PATCH 081/102] fix: unsigned underflow handled Signed-off-by: Rajdeep Roy Chowdhury --- include/util.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/util.hpp b/include/util.hpp index 27942d3..b08b3b7 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -45,13 +45,17 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) { return pos; } } - break; +// break; } uint64_t pos = lo + ((hi - lo) >> 1); uint64_t val = sequence.access(pos); if (val == id) { return pos; } else if (val > id) { + // Rescuing hi from unsigned underflow + if (pos == 0) { + return global::not_found; + } hi = pos - 1; } else { lo = pos + 1; From 475450a5879fb390557fc4c3252b461cee8afd79 Mon Sep 17 00:00:00 2001 From: jermp Date: Tue, 8 Feb 2022 12:37:31 +0100 Subject: [PATCH 082/102] style --- include/util.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/util.hpp b/include/util.hpp index b08b3b7..4f0b89e 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -45,17 +45,13 @@ uint64_t find(S const& sequence, uint64_t id, uint64_t lo, uint64_t hi) { return pos; } } -// break; } uint64_t pos = lo + ((hi - lo) >> 1); uint64_t val = sequence.access(pos); if (val == id) { return pos; } else if (val > id) { - // Rescuing hi from unsigned underflow - if (pos == 0) { - return global::not_found; - } + if (pos == 0) return global::not_found; hi = pos - 1; } else { lo 
= pos + 1; From 5ddded0c6652ed25622d74eab98859bbb68cdbb8 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Thu, 14 Jul 2022 15:26:46 +0200 Subject: [PATCH 083/102] added script to build indexes for test --- script/build_indexes.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 script/build_indexes.py diff --git a/script/build_indexes.py b/script/build_indexes.py new file mode 100644 index 0000000..e01e1db --- /dev/null +++ b/script/build_indexes.py @@ -0,0 +1,6 @@ +import sys, os + +dataset_name = sys.argv[1] # e.g., aol +types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] +for t in types: + os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." + dataset_name + ".bin -c 0.0001") \ No newline at end of file From 3320ae51689fdc2123261f140913510ee981035c Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 8 Jan 2023 10:28:32 +0100 Subject: [PATCH 084/102] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 12e7f61..eec59b1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![Language grade: C/C++](https://img.shields.io/lgtm/grade/cpp/g/jermp/autocomplete.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/jermp/autocomplete/context:cpp) - Autocomplete ------------ @@ -192,4 +190,4 @@ Live demo ---------- Start the web server with the program `./web_server ` and access the demo at -`localhost:`. \ No newline at end of file +`localhost:`. 
From ced049a0fae414320c4ce459955719415d2b12e4 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:01:52 +0200 Subject: [PATCH 085/102] updated scripts in test_data to python3; added one extra assert (redundant) --- README.md | 6 +++--- include/ef/ef_sequence.hpp | 1 + test_data/build_inverted_and_forward.py | 2 +- test_data/build_stats.py | 2 +- test_data/extract_dict.py | 5 ++--- test_data/preprocess.sh | 10 +++++----- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index eec59b1..e77b01d 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ After that, for having a minimal running example, just run and then access the service [from localhost](http://localhost:8000). -### Or you can use a prebuilt Docker image +### Or you can use a prebuilt Docker image The following command pulls a prebuilt Docker image and runs it locally. @@ -160,7 +160,7 @@ They should have been created already if you have run the script `preprocess.sh`, otherwise you can use - python partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300 + python3 partition_queries_by_length.py trec_05_efficiency_queries/trec_05_efficiency_queries.completions trec_05_efficiency_queries/trec_05_efficiency_queries.completions.queries 300 to partition the input completions by number of query terms and retain 300 queries at random. @@ -179,7 +179,7 @@ of the prefix of the last token is retained. We automated the collection of results with the script `script/collected_topk_results_by_varying_percentage.py`. 
From within the `/build` directory, run - python ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 + python3 ../script/collect_results_by_varying_percentage.py ef_type1 topk trec05.ef_type1.bin ../test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions 10 300 To benchmark the dictionaries (Front-Coding and trie), just run the following script from within the `script` directory: diff --git a/include/ef/ef_sequence.hpp b/include/ef/ef_sequence.hpp index 0632f83..2e9e293 100644 --- a/include/ef/ef_sequence.hpp +++ b/include/ef/ef_sequence.hpp @@ -49,6 +49,7 @@ struct ef_sequence { ++within; } assert(values.size() == n); + assert(std::is_sorted(values.begin(), values.end())); compress(values.begin(), values.size(), values.back()); } diff --git a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py index acf4b8e..0966d99 100644 --- a/test_data/build_inverted_and_forward.py +++ b/test_data/build_inverted_and_forward.py @@ -52,7 +52,7 @@ if not discard: # NOTE: not sorted! 
if doc_id >= num_docs: - print doc_id,num_docs + print(doc_id,num_docs) forward_index[doc_id] = mapped; lines += 1 diff --git a/test_data/build_stats.py b/test_data/build_stats.py index 8e60a39..880bcd3 100644 --- a/test_data/build_stats.py +++ b/test_data/build_stats.py @@ -43,7 +43,7 @@ output_file.write(str(lines) + "\n") output_file.write(str(universe + 1) + "\n") output_file.write(str(len(nodes_per_level)) + "\n") -for key, value in sorted(nodes_per_level.iteritems(), key = lambda kv: kv[0]): +for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): output_file.write(str(value) + "\n") output_file.close() diff --git a/test_data/extract_dict.py b/test_data/extract_dict.py index e3c05b5..e9b48d0 100644 --- a/test_data/extract_dict.py +++ b/test_data/extract_dict.py @@ -1,9 +1,8 @@ import sys -from sets import Set input_filename = sys.argv[1] -tokens = Set({}) +tokens = set() lines = 0 print("parsing input file...") @@ -14,7 +13,7 @@ tokens.add(x[i]) lines += 1 if lines % 1000000 == 0: - print "processed " + str(lines) + " lines" + print("processed " + str(lines) + " lines") print("processed " + str(lines) + " lines") print("dictionary has " + str(len(tokens)) + " keys") diff --git a/test_data/preprocess.sh b/test_data/preprocess.sh index e3d96f7..b795bfe 100755 --- a/test_data/preprocess.sh +++ b/test_data/preprocess.sh @@ -2,8 +2,8 @@ echo $1 # input filename echo $2 # number of queries for each size -python extract_dict.py $1 -python map_dataset.py $1 -python build_stats.py $1.mapped -python build_inverted_and_forward.py $1 -python partition_queries_by_length.py $1 $1.queries $2 +python3 extract_dict.py $1 +python3 map_dataset.py $1 +python3 build_stats.py $1.mapped +python3 build_inverted_and_forward.py $1 +python3 partition_queries_by_length.py $1 $1.queries $2 From ee542b4afe133f81888be609adc05069bbe27840 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:14:51 +0200 Subject: [PATCH 086/102] updated doctest and 
CMake version --- CMakeLists.txt | 2 +- external/doctest | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2908d2c..9b3c162 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 3.5) project(AUTOCOMPLETE) if(CMAKE_BUILD_TYPE MATCHES Debug) diff --git a/external/doctest b/external/doctest index 7ac22cc..ae7a135 160000 --- a/external/doctest +++ b/external/doctest @@ -1 +1 @@ -Subproject commit 7ac22cc2190eb090ff66509015fb2d995bce957e +Subproject commit ae7a13539fb71f270b87eb2e874fbac80bc8dda2 From 36db40f5c8263d27afb8499c0ce5016bd515b2f6 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:25:36 +0200 Subject: [PATCH 087/102] updated python version in workflow file --- .github/workflows/continuous_integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml index f45a3dd..bf625be 100644 --- a/.github/workflows/continuous_integration.yml +++ b/.github/workflows/continuous_integration.yml @@ -33,9 +33,9 @@ jobs: run: cmake --build . --config Release - name: Setup python - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: - python-version: '2.x' + python-version: '3.x' architecture: 'x64' - name: Data preprocessing From 5a4d67dda8abca3a5da2ff6210e38389064bead6 Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sat, 23 Sep 2023 16:32:42 +0200 Subject: [PATCH 088/102] updated python version in docker file --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1c462ab..f29c164 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ COPY . /src WORKDIR /app -RUN apt update && apt install -y cmake g++ python +RUN apt update && apt install -y cmake g++ python3 RUN cmake /src && cmake --build . 
@@ -16,7 +16,7 @@ RUN chmod +x web_server && chmod +x build RUN ./build ef_type1 /src/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions -o trec_05.ef_type1.bin -RUN apt purge -y cmake g++ python +RUN apt purge -y cmake g++ python3 RUN rm -rf /src From 134e1a97cc864e334405f45cdb1eb0f23710633d Mon Sep 17 00:00:00 2001 From: Giulio Ermanno Pibiri Date: Sun, 24 Sep 2023 08:28:32 +0200 Subject: [PATCH 089/102] minor to readme --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e77b01d..69fe339 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ Autocomplete ------------ -A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](http://pages.di.unipi.it/pibiri/papers/SIGIR20.pdf)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, +A Query Auto-Completion system based on the paper *[Efficient and Effective Query Auto-Completion](https://dl.acm.org/doi/10.1145/3397271.3401432)*, by Simon Gog, Giulio Ermanno Pibiri, and Rossano Venturini, published in ACM SIGIR 2020. +Please, cite the paper if you use the data structures from this library. + ##### Table of contents 1. [Installation and quick start](#install) 2. [Compiling the code](#compiling) @@ -40,7 +42,8 @@ And then access the service [from localhost](http://localhost:8000). Compiling the code ------------------ -The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, 9.0.0 and on Mac 10.14 with `clang` 10.0.0. +The code has been tested on Linux with `gcc` 7.4.0, 8.3.0, 9.0.0, on Mac OS 10.14 and 12.4 with `clang` 10.0.0 and 13.0.0. + To build the code, [`CMake`](https://cmake.org/) is required. 
Clone the repository with From 3dfb83af25f52b90f2f8845762a1af0d1783c73e Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:24:03 -0400 Subject: [PATCH 090/102] initial commit, porting primitives --- .../workflows/continuous_integration.yml | 0 CMakeLists.txt => archive/CMakeLists.txt | 0 Dockerfile => archive/Dockerfile | 0 .../benchmark}/CMakeLists.txt | 0 .../benchmark}/benchmark_common.hpp | 0 .../benchmark}/benchmark_conjunctive_topk.cpp | 0 .../benchmark}/benchmark_fc_dictionary.cpp | 0 .../benchmark_integer_fc_dictionary.cpp | 0 .../benchmark}/benchmark_locate_prefix.cpp | 0 .../benchmark}/benchmark_prefix_topk.cpp | 0 .../benchmark}/benchmark_topk.cpp | 0 .../benchmark}/effectiveness.cpp | 0 example.sh => archive/example.sh | 0 {external => archive/external}/CMakeLists.txt | 0 {include => archive/include}/autocomplete.hpp | 0 .../include}/autocomplete2.hpp | 0 .../include}/autocomplete3.hpp | 0 .../include}/autocomplete4.hpp | 0 .../include}/autocomplete_common.hpp | 0 {include => archive/include}/bit_vector.hpp | 0 .../include}/blocked_inverted_index.hpp | 0 
.../include}/succinct_rmq/cartesian_tree.hpp | 0 .../include}/succinct_rmq/rs_bit_vector.hpp | 0 {include => archive/include}/types.hpp | 0 {include => archive/include}/uint_vec.hpp | 0 .../include}/uncompressed_list.hpp | 0 .../include}/unsorted_list.hpp | 0 {include => archive/include}/util.hpp | 0 {include => archive/include}/util_types.hpp | 0 install.sh => archive/install.sh | 0 .../script}/benchmark_dictionaries.sh | 0 ...ctiveness_results_by_varying_percentage.py | 0 ...te_prefix_results_by_varying_percentage.py | 0 .../collect_results_by_varying_percentage.py | 0 {src => archive/src}/CMakeLists.txt | 0 {src => archive/src}/check_topk.cpp | 0 {src => archive/src}/map_queries.cpp | 0 {src => archive/src}/output_ds2i_format.cpp | 0 {src => archive/src}/statistics.cpp | 0 {src => archive/src}/web_server.cpp | 0 {test => archive/test}/test_autocomplete.cpp | 0 .../test}/test_blocked_inverted_index.cpp | 0 {test => archive/test}/test_common.hpp | 0 .../test}/test_compact_forward_index.cpp | 0 .../test}/test_completion_trie.cpp | 0 {test => archive/test}/test_fc_dictionary.cpp | 0 .../test}/test_integer_fc_dictionary.cpp | 0 .../test}/test_inverted_index.cpp | 0 {test => archive/test}/test_locate_prefix.cpp | 0 {test => archive/test}/test_unsorted_list.cpp | 0 .../test_data}/extract_dict.py | 0 .../test_data}/filter_and_preprocess.sh | 0 .../test_data}/filter_dataset.py | 0 .../test_data}/map_dataset.py | 0 .../test_data}/partition_queries_by_length.py | 0 .../test_data}/preprocess.sh | 0 .../trec_05_efficiency_queries.completions | 0 {web => archive/web}/index.html | 0 {web => archive/web}/jquery-1.8.2.min.js | 0 {web => archive/web}/jquery.autocomplete.js | 0 {web => archive/web}/styles.css | 0 {web => archive/web}/topkcomp.js | 0 autocomplete-rs/Cargo.lock | 191 ++++++++++++++++++ autocomplete-rs/Cargo.toml | 9 + autocomplete-rs/README.md | 44 ++++ autocomplete-rs/src/constants.rs | 8 + autocomplete-rs/src/lib.rs | 7 + autocomplete-rs/src/main.rs | 3 + 
autocomplete-rs/src/parameters.rs | 115 +++++++++++ autocomplete-rs/src/probe.rs | 81 ++++++++ autocomplete-rs/tests/constants_tests.rs | 21 ++ autocomplete-rs/tests/parameters_tests.rs | 98 +++++++++ autocomplete-rs/tests/probe_tests.rs | 79 ++++++++ external/cmd_line_parser | 1 - external/doctest | 1 - external/essentials | 1 - external/jQuery-Autocomplete | 1 - external/mongoose | 1 - include/building_util.hpp | 39 ---- script/build_indexes.py | 6 - src/build.cpp | 62 ------ test_data/build_inverted_and_forward.py | 74 ------- test_data/build_stats.py | 49 ----- 104 files changed, 656 insertions(+), 235 deletions(-) rename {.github => archive/.github}/workflows/continuous_integration.yml (100%) rename CMakeLists.txt => archive/CMakeLists.txt (100%) rename Dockerfile => archive/Dockerfile (100%) rename {benchmark => archive/benchmark}/CMakeLists.txt (100%) rename {benchmark => archive/benchmark}/benchmark_common.hpp (100%) rename {benchmark => archive/benchmark}/benchmark_conjunctive_topk.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_fc_dictionary.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_integer_fc_dictionary.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_locate_prefix.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_prefix_topk.cpp (100%) rename {benchmark => archive/benchmark}/benchmark_topk.cpp (100%) rename {benchmark => archive/benchmark}/effectiveness.cpp (100%) rename example.sh => archive/example.sh (100%) rename {external => archive/external}/CMakeLists.txt (100%) rename {include => archive/include}/autocomplete.hpp (100%) rename {include => archive/include}/autocomplete2.hpp (100%) rename {include => archive/include}/autocomplete3.hpp (100%) rename {include => archive/include}/autocomplete4.hpp (100%) rename {include => archive/include}/autocomplete_common.hpp (100%) rename {include => archive/include}/bit_vector.hpp (100%) rename {include => archive/include}/blocked_inverted_index.hpp (100%) 
rename {include => archive/include}/compact_forward_index.hpp (100%) rename {include => archive/include}/compact_vector.hpp (100%) rename {include => archive/include}/completion_trie.hpp (100%) rename {include => archive/include}/constants.hpp (100%) rename {include => archive/include}/ef/compact_ef.hpp (100%) rename {include => archive/include}/ef/darray.hpp (100%) rename {include => archive/include}/ef/ef_parameters.hpp (100%) rename {include => archive/include}/ef/ef_sequence.hpp (100%) rename {include => archive/include}/fc_dictionary.hpp (100%) rename {include => archive/include}/integer_codes.hpp (100%) rename {include => archive/include}/integer_fc_dictionary.hpp (100%) rename {include => archive/include}/inverted_index.hpp (100%) rename {include => archive/include}/min_heap.hpp (100%) rename {include => archive/include}/minimal_docids.hpp (100%) rename {include => archive/include}/parameters.hpp (100%) rename {include => archive/include}/probe.hpp (100%) rename {include => archive/include}/scored_string_pool.hpp (100%) rename {include => archive/include}/statistics.hpp (100%) rename {include => archive/include}/succinct_rmq/README.md (100%) rename {include => archive/include}/succinct_rmq/bp_vector.hpp (100%) rename {include => archive/include}/succinct_rmq/bp_vector_support.hpp (100%) rename {include => archive/include}/succinct_rmq/cartesian_tree.hpp (100%) rename {include => archive/include}/succinct_rmq/rs_bit_vector.hpp (100%) rename {include => archive/include}/types.hpp (100%) rename {include => archive/include}/uint_vec.hpp (100%) rename {include => archive/include}/uncompressed_list.hpp (100%) rename {include => archive/include}/unsorted_list.hpp (100%) rename {include => archive/include}/util.hpp (100%) rename {include => archive/include}/util_types.hpp (100%) rename install.sh => archive/install.sh (100%) rename {script => archive/script}/benchmark_dictionaries.sh (100%) rename {script => 
archive/script}/collect_effectiveness_results_by_varying_percentage.py (100%) rename {script => archive/script}/collect_locate_prefix_results_by_varying_percentage.py (100%) rename {script => archive/script}/collect_results_by_varying_percentage.py (100%) rename {src => archive/src}/CMakeLists.txt (100%) rename {src => archive/src}/check_topk.cpp (100%) rename {src => archive/src}/map_queries.cpp (100%) rename {src => archive/src}/output_ds2i_format.cpp (100%) rename {src => archive/src}/statistics.cpp (100%) rename {src => archive/src}/web_server.cpp (100%) rename {test => archive/test}/test_autocomplete.cpp (100%) rename {test => archive/test}/test_blocked_inverted_index.cpp (100%) rename {test => archive/test}/test_common.hpp (100%) rename {test => archive/test}/test_compact_forward_index.cpp (100%) rename {test => archive/test}/test_completion_trie.cpp (100%) rename {test => archive/test}/test_fc_dictionary.cpp (100%) rename {test => archive/test}/test_integer_fc_dictionary.cpp (100%) rename {test => archive/test}/test_inverted_index.cpp (100%) rename {test => archive/test}/test_locate_prefix.cpp (100%) rename {test => archive/test}/test_unsorted_list.cpp (100%) rename {test_data => archive/test_data}/extract_dict.py (100%) rename {test_data => archive/test_data}/filter_and_preprocess.sh (100%) rename {test_data => archive/test_data}/filter_dataset.py (100%) rename {test_data => archive/test_data}/map_dataset.py (100%) rename {test_data => archive/test_data}/partition_queries_by_length.py (100%) rename {test_data => archive/test_data}/preprocess.sh (100%) rename {test_data => archive/test_data}/trec_05_efficiency_queries/trec_05_efficiency_queries.completions (100%) rename {web => archive/web}/index.html (100%) rename {web => archive/web}/jquery-1.8.2.min.js (100%) rename {web => archive/web}/jquery.autocomplete.js (100%) rename {web => archive/web}/styles.css (100%) rename {web => archive/web}/topkcomp.js (100%) create mode 100644 autocomplete-rs/Cargo.lock 
create mode 100644 autocomplete-rs/Cargo.toml create mode 100644 autocomplete-rs/README.md create mode 100644 autocomplete-rs/src/constants.rs create mode 100644 autocomplete-rs/src/lib.rs create mode 100644 autocomplete-rs/src/main.rs create mode 100644 autocomplete-rs/src/parameters.rs create mode 100644 autocomplete-rs/src/probe.rs create mode 100644 autocomplete-rs/tests/constants_tests.rs create mode 100644 autocomplete-rs/tests/parameters_tests.rs create mode 100644 autocomplete-rs/tests/probe_tests.rs delete mode 160000 external/cmd_line_parser delete mode 160000 external/doctest delete mode 160000 external/essentials delete mode 160000 external/jQuery-Autocomplete delete mode 160000 external/mongoose delete mode 100644 include/building_util.hpp delete mode 100644 script/build_indexes.py delete mode 100644 src/build.cpp delete mode 100644 test_data/build_inverted_and_forward.py delete mode 100644 test_data/build_stats.py diff --git a/.github/workflows/continuous_integration.yml b/archive/.github/workflows/continuous_integration.yml similarity index 100% rename from .github/workflows/continuous_integration.yml rename to archive/.github/workflows/continuous_integration.yml diff --git a/CMakeLists.txt b/archive/CMakeLists.txt similarity index 100% rename from CMakeLists.txt rename to archive/CMakeLists.txt diff --git a/Dockerfile b/archive/Dockerfile similarity index 100% rename from Dockerfile rename to archive/Dockerfile diff --git a/benchmark/CMakeLists.txt b/archive/benchmark/CMakeLists.txt similarity index 100% rename from benchmark/CMakeLists.txt rename to archive/benchmark/CMakeLists.txt diff --git a/benchmark/benchmark_common.hpp b/archive/benchmark/benchmark_common.hpp similarity index 100% rename from benchmark/benchmark_common.hpp rename to archive/benchmark/benchmark_common.hpp diff --git a/benchmark/benchmark_conjunctive_topk.cpp b/archive/benchmark/benchmark_conjunctive_topk.cpp similarity index 100% rename from 
benchmark/benchmark_conjunctive_topk.cpp rename to archive/benchmark/benchmark_conjunctive_topk.cpp diff --git a/benchmark/benchmark_fc_dictionary.cpp b/archive/benchmark/benchmark_fc_dictionary.cpp similarity index 100% rename from benchmark/benchmark_fc_dictionary.cpp rename to archive/benchmark/benchmark_fc_dictionary.cpp diff --git a/benchmark/benchmark_integer_fc_dictionary.cpp b/archive/benchmark/benchmark_integer_fc_dictionary.cpp similarity index 100% rename from benchmark/benchmark_integer_fc_dictionary.cpp rename to archive/benchmark/benchmark_integer_fc_dictionary.cpp diff --git a/benchmark/benchmark_locate_prefix.cpp b/archive/benchmark/benchmark_locate_prefix.cpp similarity index 100% rename from benchmark/benchmark_locate_prefix.cpp rename to archive/benchmark/benchmark_locate_prefix.cpp diff --git a/benchmark/benchmark_prefix_topk.cpp b/archive/benchmark/benchmark_prefix_topk.cpp similarity index 100% rename from benchmark/benchmark_prefix_topk.cpp rename to archive/benchmark/benchmark_prefix_topk.cpp diff --git a/benchmark/benchmark_topk.cpp b/archive/benchmark/benchmark_topk.cpp similarity index 100% rename from benchmark/benchmark_topk.cpp rename to archive/benchmark/benchmark_topk.cpp diff --git a/benchmark/effectiveness.cpp b/archive/benchmark/effectiveness.cpp similarity index 100% rename from benchmark/effectiveness.cpp rename to archive/benchmark/effectiveness.cpp diff --git a/example.sh b/archive/example.sh similarity index 100% rename from example.sh rename to archive/example.sh diff --git a/external/CMakeLists.txt b/archive/external/CMakeLists.txt similarity index 100% rename from external/CMakeLists.txt rename to archive/external/CMakeLists.txt diff --git a/include/autocomplete.hpp b/archive/include/autocomplete.hpp similarity index 100% rename from include/autocomplete.hpp rename to archive/include/autocomplete.hpp diff --git a/include/autocomplete2.hpp b/archive/include/autocomplete2.hpp similarity index 100% rename from 
include/autocomplete2.hpp rename to archive/include/autocomplete2.hpp diff --git a/include/autocomplete3.hpp b/archive/include/autocomplete3.hpp similarity index 100% rename from include/autocomplete3.hpp rename to archive/include/autocomplete3.hpp diff --git a/include/autocomplete4.hpp b/archive/include/autocomplete4.hpp similarity index 100% rename from include/autocomplete4.hpp rename to archive/include/autocomplete4.hpp diff --git a/include/autocomplete_common.hpp b/archive/include/autocomplete_common.hpp similarity index 100% rename from include/autocomplete_common.hpp rename to archive/include/autocomplete_common.hpp diff --git a/include/bit_vector.hpp b/archive/include/bit_vector.hpp similarity index 100% rename from include/bit_vector.hpp rename to archive/include/bit_vector.hpp diff --git a/include/blocked_inverted_index.hpp b/archive/include/blocked_inverted_index.hpp similarity index 100% rename from include/blocked_inverted_index.hpp rename to archive/include/blocked_inverted_index.hpp diff --git a/include/compact_forward_index.hpp b/archive/include/compact_forward_index.hpp similarity index 100% rename from include/compact_forward_index.hpp rename to archive/include/compact_forward_index.hpp diff --git a/include/compact_vector.hpp b/archive/include/compact_vector.hpp similarity index 100% rename from include/compact_vector.hpp rename to archive/include/compact_vector.hpp diff --git a/include/completion_trie.hpp b/archive/include/completion_trie.hpp similarity index 100% rename from include/completion_trie.hpp rename to archive/include/completion_trie.hpp diff --git a/include/constants.hpp b/archive/include/constants.hpp similarity index 100% rename from include/constants.hpp rename to archive/include/constants.hpp diff --git a/include/ef/compact_ef.hpp b/archive/include/ef/compact_ef.hpp similarity index 100% rename from include/ef/compact_ef.hpp rename to archive/include/ef/compact_ef.hpp diff --git a/include/ef/darray.hpp 
b/archive/include/ef/darray.hpp similarity index 100% rename from include/ef/darray.hpp rename to archive/include/ef/darray.hpp diff --git a/include/ef/ef_parameters.hpp b/archive/include/ef/ef_parameters.hpp similarity index 100% rename from include/ef/ef_parameters.hpp rename to archive/include/ef/ef_parameters.hpp diff --git a/include/ef/ef_sequence.hpp b/archive/include/ef/ef_sequence.hpp similarity index 100% rename from include/ef/ef_sequence.hpp rename to archive/include/ef/ef_sequence.hpp diff --git a/include/fc_dictionary.hpp b/archive/include/fc_dictionary.hpp similarity index 100% rename from include/fc_dictionary.hpp rename to archive/include/fc_dictionary.hpp diff --git a/include/integer_codes.hpp b/archive/include/integer_codes.hpp similarity index 100% rename from include/integer_codes.hpp rename to archive/include/integer_codes.hpp diff --git a/include/integer_fc_dictionary.hpp b/archive/include/integer_fc_dictionary.hpp similarity index 100% rename from include/integer_fc_dictionary.hpp rename to archive/include/integer_fc_dictionary.hpp diff --git a/include/inverted_index.hpp b/archive/include/inverted_index.hpp similarity index 100% rename from include/inverted_index.hpp rename to archive/include/inverted_index.hpp diff --git a/include/min_heap.hpp b/archive/include/min_heap.hpp similarity index 100% rename from include/min_heap.hpp rename to archive/include/min_heap.hpp diff --git a/include/minimal_docids.hpp b/archive/include/minimal_docids.hpp similarity index 100% rename from include/minimal_docids.hpp rename to archive/include/minimal_docids.hpp diff --git a/include/parameters.hpp b/archive/include/parameters.hpp similarity index 100% rename from include/parameters.hpp rename to archive/include/parameters.hpp diff --git a/include/probe.hpp b/archive/include/probe.hpp similarity index 100% rename from include/probe.hpp rename to archive/include/probe.hpp diff --git a/include/scored_string_pool.hpp b/archive/include/scored_string_pool.hpp 
similarity index 100% rename from include/scored_string_pool.hpp rename to archive/include/scored_string_pool.hpp diff --git a/include/statistics.hpp b/archive/include/statistics.hpp similarity index 100% rename from include/statistics.hpp rename to archive/include/statistics.hpp diff --git a/include/succinct_rmq/README.md b/archive/include/succinct_rmq/README.md similarity index 100% rename from include/succinct_rmq/README.md rename to archive/include/succinct_rmq/README.md diff --git a/include/succinct_rmq/bp_vector.hpp b/archive/include/succinct_rmq/bp_vector.hpp similarity index 100% rename from include/succinct_rmq/bp_vector.hpp rename to archive/include/succinct_rmq/bp_vector.hpp diff --git a/include/succinct_rmq/bp_vector_support.hpp b/archive/include/succinct_rmq/bp_vector_support.hpp similarity index 100% rename from include/succinct_rmq/bp_vector_support.hpp rename to archive/include/succinct_rmq/bp_vector_support.hpp diff --git a/include/succinct_rmq/cartesian_tree.hpp b/archive/include/succinct_rmq/cartesian_tree.hpp similarity index 100% rename from include/succinct_rmq/cartesian_tree.hpp rename to archive/include/succinct_rmq/cartesian_tree.hpp diff --git a/include/succinct_rmq/rs_bit_vector.hpp b/archive/include/succinct_rmq/rs_bit_vector.hpp similarity index 100% rename from include/succinct_rmq/rs_bit_vector.hpp rename to archive/include/succinct_rmq/rs_bit_vector.hpp diff --git a/include/types.hpp b/archive/include/types.hpp similarity index 100% rename from include/types.hpp rename to archive/include/types.hpp diff --git a/include/uint_vec.hpp b/archive/include/uint_vec.hpp similarity index 100% rename from include/uint_vec.hpp rename to archive/include/uint_vec.hpp diff --git a/include/uncompressed_list.hpp b/archive/include/uncompressed_list.hpp similarity index 100% rename from include/uncompressed_list.hpp rename to archive/include/uncompressed_list.hpp diff --git a/include/unsorted_list.hpp b/archive/include/unsorted_list.hpp similarity 
index 100% rename from include/unsorted_list.hpp rename to archive/include/unsorted_list.hpp diff --git a/include/util.hpp b/archive/include/util.hpp similarity index 100% rename from include/util.hpp rename to archive/include/util.hpp diff --git a/include/util_types.hpp b/archive/include/util_types.hpp similarity index 100% rename from include/util_types.hpp rename to archive/include/util_types.hpp diff --git a/install.sh b/archive/install.sh similarity index 100% rename from install.sh rename to archive/install.sh diff --git a/script/benchmark_dictionaries.sh b/archive/script/benchmark_dictionaries.sh similarity index 100% rename from script/benchmark_dictionaries.sh rename to archive/script/benchmark_dictionaries.sh diff --git a/script/collect_effectiveness_results_by_varying_percentage.py b/archive/script/collect_effectiveness_results_by_varying_percentage.py similarity index 100% rename from script/collect_effectiveness_results_by_varying_percentage.py rename to archive/script/collect_effectiveness_results_by_varying_percentage.py diff --git a/script/collect_locate_prefix_results_by_varying_percentage.py b/archive/script/collect_locate_prefix_results_by_varying_percentage.py similarity index 100% rename from script/collect_locate_prefix_results_by_varying_percentage.py rename to archive/script/collect_locate_prefix_results_by_varying_percentage.py diff --git a/script/collect_results_by_varying_percentage.py b/archive/script/collect_results_by_varying_percentage.py similarity index 100% rename from script/collect_results_by_varying_percentage.py rename to archive/script/collect_results_by_varying_percentage.py diff --git a/src/CMakeLists.txt b/archive/src/CMakeLists.txt similarity index 100% rename from src/CMakeLists.txt rename to archive/src/CMakeLists.txt diff --git a/src/check_topk.cpp b/archive/src/check_topk.cpp similarity index 100% rename from src/check_topk.cpp rename to archive/src/check_topk.cpp diff --git a/src/map_queries.cpp 
b/archive/src/map_queries.cpp similarity index 100% rename from src/map_queries.cpp rename to archive/src/map_queries.cpp diff --git a/src/output_ds2i_format.cpp b/archive/src/output_ds2i_format.cpp similarity index 100% rename from src/output_ds2i_format.cpp rename to archive/src/output_ds2i_format.cpp diff --git a/src/statistics.cpp b/archive/src/statistics.cpp similarity index 100% rename from src/statistics.cpp rename to archive/src/statistics.cpp diff --git a/src/web_server.cpp b/archive/src/web_server.cpp similarity index 100% rename from src/web_server.cpp rename to archive/src/web_server.cpp diff --git a/test/test_autocomplete.cpp b/archive/test/test_autocomplete.cpp similarity index 100% rename from test/test_autocomplete.cpp rename to archive/test/test_autocomplete.cpp diff --git a/test/test_blocked_inverted_index.cpp b/archive/test/test_blocked_inverted_index.cpp similarity index 100% rename from test/test_blocked_inverted_index.cpp rename to archive/test/test_blocked_inverted_index.cpp diff --git a/test/test_common.hpp b/archive/test/test_common.hpp similarity index 100% rename from test/test_common.hpp rename to archive/test/test_common.hpp diff --git a/test/test_compact_forward_index.cpp b/archive/test/test_compact_forward_index.cpp similarity index 100% rename from test/test_compact_forward_index.cpp rename to archive/test/test_compact_forward_index.cpp diff --git a/test/test_completion_trie.cpp b/archive/test/test_completion_trie.cpp similarity index 100% rename from test/test_completion_trie.cpp rename to archive/test/test_completion_trie.cpp diff --git a/test/test_fc_dictionary.cpp b/archive/test/test_fc_dictionary.cpp similarity index 100% rename from test/test_fc_dictionary.cpp rename to archive/test/test_fc_dictionary.cpp diff --git a/test/test_integer_fc_dictionary.cpp b/archive/test/test_integer_fc_dictionary.cpp similarity index 100% rename from test/test_integer_fc_dictionary.cpp rename to archive/test/test_integer_fc_dictionary.cpp diff 
--git a/test/test_inverted_index.cpp b/archive/test/test_inverted_index.cpp similarity index 100% rename from test/test_inverted_index.cpp rename to archive/test/test_inverted_index.cpp diff --git a/test/test_locate_prefix.cpp b/archive/test/test_locate_prefix.cpp similarity index 100% rename from test/test_locate_prefix.cpp rename to archive/test/test_locate_prefix.cpp diff --git a/test/test_unsorted_list.cpp b/archive/test/test_unsorted_list.cpp similarity index 100% rename from test/test_unsorted_list.cpp rename to archive/test/test_unsorted_list.cpp diff --git a/test_data/extract_dict.py b/archive/test_data/extract_dict.py similarity index 100% rename from test_data/extract_dict.py rename to archive/test_data/extract_dict.py diff --git a/test_data/filter_and_preprocess.sh b/archive/test_data/filter_and_preprocess.sh similarity index 100% rename from test_data/filter_and_preprocess.sh rename to archive/test_data/filter_and_preprocess.sh diff --git a/test_data/filter_dataset.py b/archive/test_data/filter_dataset.py similarity index 100% rename from test_data/filter_dataset.py rename to archive/test_data/filter_dataset.py diff --git a/test_data/map_dataset.py b/archive/test_data/map_dataset.py similarity index 100% rename from test_data/map_dataset.py rename to archive/test_data/map_dataset.py diff --git a/test_data/partition_queries_by_length.py b/archive/test_data/partition_queries_by_length.py similarity index 100% rename from test_data/partition_queries_by_length.py rename to archive/test_data/partition_queries_by_length.py diff --git a/test_data/preprocess.sh b/archive/test_data/preprocess.sh similarity index 100% rename from test_data/preprocess.sh rename to archive/test_data/preprocess.sh diff --git a/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions b/archive/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions similarity index 100% rename from 
test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions rename to archive/test_data/trec_05_efficiency_queries/trec_05_efficiency_queries.completions diff --git a/web/index.html b/archive/web/index.html similarity index 100% rename from web/index.html rename to archive/web/index.html diff --git a/web/jquery-1.8.2.min.js b/archive/web/jquery-1.8.2.min.js similarity index 100% rename from web/jquery-1.8.2.min.js rename to archive/web/jquery-1.8.2.min.js diff --git a/web/jquery.autocomplete.js b/archive/web/jquery.autocomplete.js similarity index 100% rename from web/jquery.autocomplete.js rename to archive/web/jquery.autocomplete.js diff --git a/web/styles.css b/archive/web/styles.css similarity index 100% rename from web/styles.css rename to archive/web/styles.css diff --git a/web/topkcomp.js b/archive/web/topkcomp.js similarity index 100% rename from web/topkcomp.js rename to archive/web/topkcomp.js diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock new file mode 100644 index 0000000..6cb35bc --- /dev/null +++ b/autocomplete-rs/Cargo.lock @@ -0,0 +1,191 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "autocomplete-rs" +version = "0.1.0" +dependencies = [ + "tempfile", +] + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + +[[package]] +name = "rustix" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "tempfile" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + 
+[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml new file mode 100644 index 0000000..7d62c58 --- /dev/null +++ b/autocomplete-rs/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "autocomplete-rs" +version = "0.1.0" +edition = "2021" + +[dependencies] + +[dev-dependencies] +tempfile = "3.8" diff --git a/autocomplete-rs/README.md b/autocomplete-rs/README.md new file mode 100644 index 0000000..801e4b2 --- /dev/null +++ b/autocomplete-rs/README.md @@ -0,0 +1,44 @@ +# Autocomplete-rs + 
+This project is a Rust port of the original C++ autocomplete system. The goal is to maintain the same functionality while leveraging Rust's safety guarantees and modern tooling. + +## Project Status + +Currently, we are in the process of porting the core components from C++ to Rust. The following components have been ported: + +- Basic constants and configuration +- Parameters management +- Performance measurement probes + +## Next Steps + +1. Continue porting core components: + - Scored string pool + - Completion trie + - Blocked inverted index + - Front-coded dictionary + +2. Port and adapt unit tests to ensure functionality matches the original implementation + +3. Containerize the application using Docker for easy deployment and testing + +## Building and Testing + +```bash +# Build the project +cargo build + +# Run tests +cargo test + +# Run with specific test +cargo test test_name -- --nocapture +``` + +## Original Project + +This is a port of the original C++ autocomplete system, which provides efficient string completion functionality. The original implementation can be found in the `archive` directory. + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. 
\ No newline at end of file diff --git a/autocomplete-rs/src/constants.rs b/autocomplete-rs/src/constants.rs new file mode 100644 index 0000000..b949eb7 --- /dev/null +++ b/autocomplete-rs/src/constants.rs @@ -0,0 +1,8 @@ +// Constants for the autocomplete system +pub const MAX_K: u32 = 15; +pub const MAX_NUM_TERMS_PER_QUERY: u32 = 64; +pub const MAX_NUM_CHARS_PER_QUERY: u32 = 128; +pub const POOL_SIZE: usize = (MAX_K as usize) * (MAX_NUM_CHARS_PER_QUERY as usize); + +// Compile-time assertion +const _: () = assert!(MAX_NUM_TERMS_PER_QUERY < 256, "MAX_NUM_TERMS_PER_QUERY must be < 256"); \ No newline at end of file diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs new file mode 100644 index 0000000..c5c3755 --- /dev/null +++ b/autocomplete-rs/src/lib.rs @@ -0,0 +1,7 @@ +pub mod constants; +pub mod parameters; +pub mod probe; + +pub use constants::*; +pub use parameters::*; +pub use probe::*; \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs new file mode 100644 index 0000000..e7a11a9 --- /dev/null +++ b/autocomplete-rs/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} diff --git a/autocomplete-rs/src/parameters.rs b/autocomplete-rs/src/parameters.rs new file mode 100644 index 0000000..38d5fec --- /dev/null +++ b/autocomplete-rs/src/parameters.rs @@ -0,0 +1,115 @@ +use std::fs::File; +use std::io::{self, BufRead, BufReader}; +use std::path::Path; + +use crate::constants::{MAX_NUM_CHARS_PER_QUERY, MAX_NUM_TERMS_PER_QUERY}; + +/// Parameters for the autocomplete system +#[derive(Debug, Default)] +pub struct Parameters { + pub num_terms: u32, + pub max_string_length: u32, + pub num_completions: u32, + pub universe: u32, + pub num_levels: u32, + pub nodes_per_level: Vec, + pub collection_basename: String, +} + +impl Parameters { + /// Creates a new empty Parameters instance + pub fn new() -> Self { + Self::default() + } + + /// Loads parameters from a statistics file + pub fn 
load(&mut self) -> io::Result<()> { + let stats_path = if self.collection_basename.ends_with(".mapped.stats") { + Path::new(&self.collection_basename).to_path_buf() + } else { + Path::new(&self.collection_basename).with_extension("mapped.stats") + }; + + let file = File::open(stats_path)?; + let reader = BufReader::new(file); + let mut lines = reader.lines(); + + // Read basic statistics + self.num_terms = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_terms"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.max_string_length = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing max_string_length"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.num_completions = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_completions"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.universe = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing universe"))?? + .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.num_levels = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing num_levels"))?? 
+ .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + // Validate basic statistics + if self.num_terms == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "num_terms must be > 0")); + } + if self.max_string_length == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "max_string_length must be > 0")); + } + if self.num_completions == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "num_completions must be > 0")); + } + if self.universe < self.num_completions { + return Err(io::Error::new(io::ErrorKind::InvalidData, "universe must be >= num_completions")); + } + if self.num_levels == 0 { + return Err(io::Error::new(io::ErrorKind::InvalidData, "num_levels must be > 0")); + } + + // Validate against constants + if self.max_string_length > MAX_NUM_CHARS_PER_QUERY { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("max_string_length ({}) exceeds MAX_NUM_CHARS_PER_QUERY ({})", + self.max_string_length, MAX_NUM_CHARS_PER_QUERY) + )); + } + if self.num_levels > MAX_NUM_TERMS_PER_QUERY { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("num_levels ({}) exceeds MAX_NUM_TERMS_PER_QUERY ({})", + self.num_levels, MAX_NUM_TERMS_PER_QUERY) + )); + } + + // Read nodes per level + self.nodes_per_level = Vec::with_capacity(self.num_levels as usize); + for _ in 0..self.num_levels { + let count = lines.next() + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Missing nodes_per_level data"))?? 
+ .parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + self.nodes_per_level.push(count); + } + + if self.nodes_per_level.len() != self.num_levels as usize { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "File with statistics may be truncated or malformed" + )); + } + + Ok(()) + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/probe.rs b/autocomplete-rs/src/probe.rs new file mode 100644 index 0000000..c82f825 --- /dev/null +++ b/autocomplete-rs/src/probe.rs @@ -0,0 +1,81 @@ +use std::time::{Duration, Instant}; + +/// A trait for performance measurement probes +pub trait Probe { + /// Start timing an operation + fn start(&mut self, id: u64); + /// Stop timing an operation + fn stop(&mut self, id: u64); +} + +/// A no-operation probe that does nothing +#[derive(Debug, Default)] +pub struct NopProbe; + +impl Probe for NopProbe { + fn start(&mut self, _id: u64) {} + fn stop(&mut self, _id: u64) {} +} + +/// A timer probe that measures operation durations +#[derive(Debug)] +pub struct TimerProbe { + timers: Vec, +} + +#[derive(Debug, Default, Clone)] +struct Timer { + start_time: Option, + total_duration: Duration, +} + +impl Timer { + fn new() -> Self { + Self { + start_time: None, + total_duration: Duration::default(), + } + } + + fn start(&mut self) { + self.start_time = Some(Instant::now()); + } + + fn stop(&mut self) { + if let Some(start) = self.start_time { + self.total_duration += start.elapsed(); + self.start_time = None; + } + } + + fn get_duration(&self) -> Duration { + self.total_duration + } +} + +impl TimerProbe { + /// Creates a new TimerProbe with the specified number of timers + pub fn new(num_timers: u64) -> Self { + Self { + timers: vec![Timer::new(); num_timers as usize], + } + } + + /// Gets the total duration for a specific timer + pub fn get_duration(&self, id: u64) -> Duration { + assert!(id < self.timers.len() as u64, "Timer ID out of bounds"); + self.timers[id as usize].get_duration() + } +} 
+ +impl Probe for TimerProbe { + fn start(&mut self, id: u64) { + assert!(id < self.timers.len() as u64, "Timer ID out of bounds"); + self.timers[id as usize].start(); + } + + fn stop(&mut self, id: u64) { + assert!(id < self.timers.len() as u64, "Timer ID out of bounds"); + self.timers[id as usize].stop(); + } +} \ No newline at end of file diff --git a/autocomplete-rs/tests/constants_tests.rs b/autocomplete-rs/tests/constants_tests.rs new file mode 100644 index 0000000..94123cc --- /dev/null +++ b/autocomplete-rs/tests/constants_tests.rs @@ -0,0 +1,21 @@ +use autocomplete_rs::constants::*; + +#[test] +fn test_constants() { + // Test MAX_K + assert!(MAX_K > 0, "MAX_K should be positive"); + assert!(MAX_K <= 100, "MAX_K should be reasonably small"); + + // Test MAX_NUM_TERMS_PER_QUERY + assert!(MAX_NUM_TERMS_PER_QUERY > 0, "MAX_NUM_TERMS_PER_QUERY should be positive"); + assert!(MAX_NUM_TERMS_PER_QUERY < 256, "MAX_NUM_TERMS_PER_QUERY must be < 256"); + + // Test MAX_NUM_CHARS_PER_QUERY + assert!(MAX_NUM_CHARS_PER_QUERY > 0, "MAX_NUM_CHARS_PER_QUERY should be positive"); + assert!(MAX_NUM_CHARS_PER_QUERY >= MAX_K, "MAX_NUM_CHARS_PER_QUERY should be >= MAX_K"); + + // Test POOL_SIZE + assert!(POOL_SIZE > 0, "POOL_SIZE should be positive"); + assert_eq!(POOL_SIZE, (MAX_K as usize) * (MAX_NUM_CHARS_PER_QUERY as usize), + "POOL_SIZE should be MAX_K * MAX_NUM_CHARS_PER_QUERY"); +} \ No newline at end of file diff --git a/autocomplete-rs/tests/parameters_tests.rs b/autocomplete-rs/tests/parameters_tests.rs new file mode 100644 index 0000000..2bd6762 --- /dev/null +++ b/autocomplete-rs/tests/parameters_tests.rs @@ -0,0 +1,98 @@ +use std::fs::File; +use std::io::Write; +use std::path::Path; +use tempfile::NamedTempFile; +use autocomplete_rs::parameters::Parameters; +use autocomplete_rs::constants::{MAX_NUM_CHARS_PER_QUERY, MAX_NUM_TERMS_PER_QUERY}; + +fn create_test_stats_file() -> NamedTempFile { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, 
"1000").unwrap(); // num_terms + writeln!(file, "50").unwrap(); // max_string_length + writeln!(file, "500").unwrap(); // num_completions + writeln!(file, "1000").unwrap(); // universe + writeln!(file, "3").unwrap(); // num_levels + writeln!(file, "100").unwrap(); // nodes_per_level[0] + writeln!(file, "200").unwrap(); // nodes_per_level[1] + writeln!(file, "300").unwrap(); // nodes_per_level[2] + file +} + +#[test] +fn test_parameters_load_valid() { + let test_file = create_test_stats_file(); + let mut params = Parameters::new(); + let path = test_file.path().to_str().unwrap().to_string(); + println!("Test file path: {}", path); + params.collection_basename = path; + + match params.load() { + Ok(_) => println!("Load succeeded"), + Err(e) => println!("Load failed: {}", e), + } + + assert!(params.load().is_ok()); + assert_eq!(params.num_terms, 1000); + assert_eq!(params.max_string_length, 50); + assert_eq!(params.num_completions, 500); + assert_eq!(params.universe, 1000); + assert_eq!(params.num_levels, 3); + assert_eq!(params.nodes_per_level, vec![100, 200, 300]); +} + +#[test] +fn test_parameters_load_invalid_file() { + let mut params = Parameters::new(); + params.collection_basename = "nonexistent_file".to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_invalid_data() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "0").unwrap(); // invalid num_terms + writeln!(file, "50").unwrap(); + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + writeln!(file, "200").unwrap(); + writeln!(file, "300").unwrap(); + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_invalid_constants() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "{}", 
MAX_NUM_CHARS_PER_QUERY + 1).unwrap(); // exceeds MAX_NUM_CHARS_PER_QUERY + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + writeln!(file, "200").unwrap(); + writeln!(file, "300").unwrap(); + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} + +#[test] +fn test_parameters_load_truncated() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "50").unwrap(); + writeln!(file, "500").unwrap(); + writeln!(file, "1000").unwrap(); + writeln!(file, "3").unwrap(); + writeln!(file, "100").unwrap(); + // Missing nodes_per_level entries + + let mut params = Parameters::new(); + params.collection_basename = file.path().to_str().unwrap().to_string(); + assert!(params.load().is_err()); +} \ No newline at end of file diff --git a/autocomplete-rs/tests/probe_tests.rs b/autocomplete-rs/tests/probe_tests.rs new file mode 100644 index 0000000..7e869e1 --- /dev/null +++ b/autocomplete-rs/tests/probe_tests.rs @@ -0,0 +1,79 @@ +use std::thread; +use std::time::Duration; +use autocomplete_rs::probe::{Probe, NopProbe, TimerProbe}; + +#[test] +fn test_nop_probe() { + let mut probe = NopProbe; + // These should not panic + probe.start(0); + probe.stop(0); +} + +#[test] +fn test_timer_probe_single() { + let mut probe = TimerProbe::new(1); + + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + let duration = probe.get_duration(0); + assert!(duration >= Duration::from_millis(100)); +} + +#[test] +fn test_timer_probe_multiple() { + let mut probe = TimerProbe::new(3); + + // Timer 0 + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + // Timer 1 + probe.start(1); + thread::sleep(Duration::from_millis(200)); + probe.stop(1); + + // Timer 2 + probe.start(2); + thread::sleep(Duration::from_millis(300)); + 
probe.stop(2); + + assert!(probe.get_duration(0) >= Duration::from_millis(100)); + assert!(probe.get_duration(1) >= Duration::from_millis(200)); + assert!(probe.get_duration(2) >= Duration::from_millis(300)); +} + +#[test] +fn test_timer_probe_accumulation() { + let mut probe = TimerProbe::new(1); + + // First interval + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + // Second interval + probe.start(0); + thread::sleep(Duration::from_millis(100)); + probe.stop(0); + + let duration = probe.get_duration(0); + assert!(duration >= Duration::from_millis(200)); +} + +#[test] +#[should_panic(expected = "Timer ID out of bounds")] +fn test_timer_probe_invalid_id() { + let mut probe = TimerProbe::new(1); + probe.start(1); // Should panic as we only have timer 0 +} + +#[test] +#[should_panic(expected = "Timer ID out of bounds")] +fn test_timer_probe_get_invalid_id() { + let probe = TimerProbe::new(1); + probe.get_duration(1); // Should panic as we only have timer 0 +} \ No newline at end of file diff --git a/external/cmd_line_parser b/external/cmd_line_parser deleted file mode 160000 index 1776808..0000000 --- a/external/cmd_line_parser +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1776808718445425dcad42ba2d1b6adf2cb5e496 diff --git a/external/doctest b/external/doctest deleted file mode 160000 index ae7a135..0000000 --- a/external/doctest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ae7a13539fb71f270b87eb2e874fbac80bc8dda2 diff --git a/external/essentials b/external/essentials deleted file mode 160000 index da66810..0000000 --- a/external/essentials +++ /dev/null @@ -1 +0,0 @@ -Subproject commit da6681019cbad6bef62804927801dd09832e512e diff --git a/external/jQuery-Autocomplete b/external/jQuery-Autocomplete deleted file mode 160000 index 0ba2565..0000000 --- a/external/jQuery-Autocomplete +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0ba256501bc365814f43066999f51f0619e739a9 diff --git a/external/mongoose b/external/mongoose deleted file 
mode 160000 index dce60c6..0000000 --- a/external/mongoose +++ /dev/null @@ -1 +0,0 @@ -Subproject commit dce60c6dbb096f3b96e1a45cbfdfd55e18b38bb6 diff --git a/include/building_util.hpp b/include/building_util.hpp deleted file mode 100644 index 0398879..0000000 --- a/include/building_util.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include "util.hpp" -#include "bit_vector.hpp" - -namespace autocomplete { -namespace util { - -std::vector invert(std::vector const& docid_to_lexid, - uint64_t size) { - std::vector lexid_to_docid(size); - for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { - if (docid_to_lexid[doc_id] < size) { - lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; - } - } - return lexid_to_docid; -} - -void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { - uint64_t mod = bvb.size() % alignment; - if (mod) { - uint64_t pad = alignment - mod; - bvb.append_bits(0, pad); - assert(bvb.size() % alignment == 0); - } -} - -void eat_pad(bits_iterator& it, uint64_t alignment = 8) { - uint64_t mod = it.position() % alignment; - if (mod) { - uint64_t pad = alignment - mod; - it.get_bits(pad); - assert(it.position() % alignment == 0); - } -} - -} // namespace util -} // namespace autocomplete \ No newline at end of file diff --git a/script/build_indexes.py b/script/build_indexes.py deleted file mode 100644 index e01e1db..0000000 --- a/script/build_indexes.py +++ /dev/null @@ -1,6 +0,0 @@ -import sys, os - -dataset_name = sys.argv[1] # e.g., aol -types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] -for t in types: - os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." 
+ dataset_name + ".bin -c 0.0001") \ No newline at end of file diff --git a/src/build.cpp b/src/build.cpp deleted file mode 100644 index ba73954..0000000 --- a/src/build.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include - -#include "types.hpp" -#include "statistics.hpp" -#include "../external/cmd_line_parser/include/parser.hpp" - -using namespace autocomplete; - -template -void build(parameters const& params, std::string const& output_filename) { - Index index(params); - index.print_stats(); - if (output_filename != "") { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename.c_str()); - essentials::logger("DONE"); - } -} - -void build_type4(parameters const& params, const float c, - std::string const& output_filename) { - ef_autocomplete_type4 index(params, c); - index.print_stats(); - if (output_filename != "") { - essentials::logger("saving data structure to disk..."); - essentials::save(index, output_filename.c_str()); - essentials::logger("DONE"); - } -} - -int main(int argc, char** argv) { - cmd_line_parser::parser parser(argc, argv); - parser.add("type", "Index type."); - parser.add("collection_basename", "Collection basename."); - parser.add("output_filename", "Output filename.", "-o", false); - parser.add( - "c", - "Value for Bast and Weber's technique: c must be a float in (0,1].", - "-c", false); - if (!parser.parse()) return 1; - - auto type = parser.get("type"); - parameters params; - params.collection_basename = parser.get("collection_basename"); - params.load(); - auto output_filename = parser.get("output_filename"); - - if (type == "ef_type1") { - build(params, output_filename); - } else if (type == "ef_type2") { - build(params, output_filename); - } else if (type == "ef_type3") { - build(params, output_filename); - } else if (type == "ef_type4") { - auto c = parser.get("c"); - build_type4(params, c, output_filename); - } else { - return 1; - } - - return 0; -} \ No newline at end of file diff --git 
a/test_data/build_inverted_and_forward.py b/test_data/build_inverted_and_forward.py deleted file mode 100644 index 0966d99..0000000 --- a/test_data/build_inverted_and_forward.py +++ /dev/null @@ -1,74 +0,0 @@ -import sys - -input_filename = sys.argv[1] - -tokens = {} -print("building dictionary...") -id = 1 # reserve id 0 to mark the end of a string -with open(input_filename + ".dict") as f: - for line in f: - t = line.rstrip('\n') - tokens[t] = id - id += 1 - -lines = 0 -inverted = open(input_filename + ".inverted", 'w') -forward = open(input_filename + ".forward", 'w') - -num_terms = 0 -num_docs = 0 -with open(input_filename + ".mapped.stats") as f: - num_terms = int(f.readline()) - print("terms: " + str(num_terms)) - f.readline() # skip line: max num. of query terms - f.readline() # skip line: num. of completions - num_docs = int(f.readline()) - print("universe: " + str(num_docs)) - -inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned -forward_index = [[] for i in range(num_docs)] - -with open(input_filename, 'r') as f: - for line in f: - x = line.rstrip('\n').split() - mapped = [] - doc_id = int(x[0]) - discard = False - for i in range(1, len(x)): - try: - term = x[i] - try: - term_id = tokens[term] - if term_id not in mapped: - inverted_index[term_id].append(doc_id) - mapped.append(term_id) - except KeyError: - print("'" + term + "' not found in dictionary") - print(line) - exit() - except UnicodeDecodeError: - discard = True - - if not discard: - # NOTE: not sorted! 
- if doc_id >= num_docs: - print(doc_id,num_docs) - forward_index[doc_id] = mapped; - - lines += 1 - if lines % 1000000 == 0: - print("processed " + str(lines) + " lines") - -for i in range(0, num_docs): - s = [str(k) for k in forward_index[i]] - forward.write(str(len(forward_index[i])) + " ") - forward.write(" ".join(s) + "\n") -forward.close() - -for i in range(1, num_terms + 1): - posting_list = inverted_index[i] - unique = sorted(set(posting_list)); - s = [str(i) for i in unique] # remove any possible duplicate - inverted.write(str(len(unique)) + " ") - inverted.write(" ".join(s) + "\n") -inverted.close() diff --git a/test_data/build_stats.py b/test_data/build_stats.py deleted file mode 100644 index 880bcd3..0000000 --- a/test_data/build_stats.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys - -input_filename = sys.argv[1] # e.g., "completions.mapped" - -nodes_per_level = {} # (level_id, num_nodes) -lines = 0 -print("building stats...") - -output_file = open(input_filename + ".stats", 'a') -prev = [] -universe = 0; -with open(input_filename, 'r') as f: - for line in f: - x = line.rstrip('\n').split() - docid = int(x[0]) - - if docid > universe: - universe = docid - - q = x[1:len(x)] - - level_id = 0 - while level_id < len(q) and level_id < len(prev) and q[level_id] == prev[level_id]: - level_id += 1 - - while level_id < len(q): - if level_id in nodes_per_level: - nodes_per_level[level_id] += 1 - else: - nodes_per_level[level_id] = 1 - level_id += 1 - - prev = q - lines += 1 - if lines % 1000000 == 0: - print("processed " + str(lines) + " lines") - -# number of completions -# number of levels in the trie -# number of nodes for each level -print("universe: " + str(universe + 1)) -print("completions: " + str(lines)) -output_file.write(str(lines) + "\n") -output_file.write(str(universe + 1) + "\n") -output_file.write(str(len(nodes_per_level)) + "\n") -for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): - output_file.write(str(value) + "\n") 
-output_file.close() - From 6d159eba0efc798417ff03d15f0d4467d10c9072 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:26:59 -0400 Subject: [PATCH 091/102] initial commit, porting primitieves --- .gitignore | 84 ++++++++++++++++++- README.md | 42 ++++++++++ archive/include/building_util.hpp | 39 +++++++++ archive/script/build_indexes.py | 6 ++ archive/src/build.cpp | 62 ++++++++++++++ .../test_data/build_inverted_and_forward.py | 74 ++++++++++++++++ archive/test_data/build_stats.py | 49 +++++++++++ 7 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 archive/include/building_util.hpp create mode 100644 archive/script/build_indexes.py create mode 100644 archive/src/build.cpp create mode 100644 archive/test_data/build_inverted_and_forward.py create mode 100644 archive/test_data/build_stats.py diff --git a/.gitignore b/.gitignore index 51855af..69bd68e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,84 @@ +# Rust specific +/target/ +**/*.rs.bk +Cargo.lock +*.pdb + +# C++ specific +*.o +*.obj +*.exe +*.out +*.app +*.dll +*.so +*.dylib +*.a +*.lib +*.d +*.lo +*.la +*.lai +*.Plo +*.Pla +*.l +*.o +*.obj +*.elf +*.bin +*.hex +*.map +*.lst +*.sym +*.lss +*.eep +*.elf +*.hex +*.bin +*.map +*.lst +*.sym +*.lss +*.eep +*.elf +*.hex +*.bin +*.map +*.lst +*.sym +*.lss +*.eep + +# Build directories +/build/ +/debug_build/ +/CMakeFiles/ +/CMakeCache.txt +/CMakeScripts/ +/Testing/ +/Makefile +/cmake_install.cmake +/install_manifest.txt +/compile_commands.json +/CTestTestfile.cmake +/_deps +/.cmake + +# IDE specific +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS specific .DS_Store -build* +Thumbs.db + +# Project specific +*.mapped +*.mapped.stats +*.dict +*.inverted +*.forward +*.bin diff --git a/README.md b/README.md index 69fe339..624670f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,45 @@ +# Autocomplete System + +This repository contains an autocomplete system implementation. 
The original C++ implementation is being ported to Rust and will be containerized for easier deployment and testing. + +## Project Structure + +- `autocomplete-rs/`: The Rust port of the original C++ implementation +- `archive/`: Original C++ implementation and related files + +## Goals + +1. Port the C++ implementation to Rust while maintaining the same functionality +2. Leverage Rust's safety guarantees and modern tooling +3. Containerize the application using Docker for easy deployment and testing + +## Current Status + +The porting process is ongoing. The following components have been ported to Rust: + +- Basic constants and configuration +- Parameters management +- Performance measurement probes + +## Building and Testing + +### Original C++ Implementation +```bash +cd archive +make +``` + +### Rust Implementation +```bash +cd autocomplete-rs +cargo build +cargo test +``` + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + Autocomplete ------------ diff --git a/archive/include/building_util.hpp b/archive/include/building_util.hpp new file mode 100644 index 0000000..0398879 --- /dev/null +++ b/archive/include/building_util.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "util.hpp" +#include "bit_vector.hpp" + +namespace autocomplete { +namespace util { + +std::vector invert(std::vector const& docid_to_lexid, + uint64_t size) { + std::vector lexid_to_docid(size); + for (uint64_t doc_id = 0; doc_id != docid_to_lexid.size(); ++doc_id) { + if (docid_to_lexid[doc_id] < size) { + lexid_to_docid[docid_to_lexid[doc_id]] = doc_id; + } + } + return lexid_to_docid; +} + +void push_pad(bit_vector_builder& bvb, uint64_t alignment = 8) { + uint64_t mod = bvb.size() % alignment; + if (mod) { + uint64_t pad = alignment - mod; + bvb.append_bits(0, pad); + assert(bvb.size() % alignment == 0); + } +} + +void eat_pad(bits_iterator& it, uint64_t alignment = 8) { + uint64_t mod = it.position() % alignment; + if (mod) { + uint64_t pad 
= alignment - mod; + it.get_bits(pad); + assert(it.position() % alignment == 0); + } +} + +} // namespace util +} // namespace autocomplete \ No newline at end of file diff --git a/archive/script/build_indexes.py b/archive/script/build_indexes.py new file mode 100644 index 0000000..e01e1db --- /dev/null +++ b/archive/script/build_indexes.py @@ -0,0 +1,6 @@ +import sys, os + +dataset_name = sys.argv[1] # e.g., aol +types = ["ef_type1", "ef_type2", "ef_type3", "ef_type4"] +for t in types: + os.system("./build " + t + " ../test_data/" + dataset_name + "/" + dataset_name + ".completions -o " + t + "." + dataset_name + ".bin -c 0.0001") \ No newline at end of file diff --git a/archive/src/build.cpp b/archive/src/build.cpp new file mode 100644 index 0000000..ba73954 --- /dev/null +++ b/archive/src/build.cpp @@ -0,0 +1,62 @@ +#include + +#include "types.hpp" +#include "statistics.hpp" +#include "../external/cmd_line_parser/include/parser.hpp" + +using namespace autocomplete; + +template +void build(parameters const& params, std::string const& output_filename) { + Index index(params); + index.print_stats(); + if (output_filename != "") { + essentials::logger("saving data structure to disk..."); + essentials::save(index, output_filename.c_str()); + essentials::logger("DONE"); + } +} + +void build_type4(parameters const& params, const float c, + std::string const& output_filename) { + ef_autocomplete_type4 index(params, c); + index.print_stats(); + if (output_filename != "") { + essentials::logger("saving data structure to disk..."); + essentials::save(index, output_filename.c_str()); + essentials::logger("DONE"); + } +} + +int main(int argc, char** argv) { + cmd_line_parser::parser parser(argc, argv); + parser.add("type", "Index type."); + parser.add("collection_basename", "Collection basename."); + parser.add("output_filename", "Output filename.", "-o", false); + parser.add( + "c", + "Value for Bast and Weber's technique: c must be a float in (0,1].", + "-c", false); + if 
(!parser.parse()) return 1; + + auto type = parser.get("type"); + parameters params; + params.collection_basename = parser.get("collection_basename"); + params.load(); + auto output_filename = parser.get("output_filename"); + + if (type == "ef_type1") { + build(params, output_filename); + } else if (type == "ef_type2") { + build(params, output_filename); + } else if (type == "ef_type3") { + build(params, output_filename); + } else if (type == "ef_type4") { + auto c = parser.get("c"); + build_type4(params, c, output_filename); + } else { + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/archive/test_data/build_inverted_and_forward.py b/archive/test_data/build_inverted_and_forward.py new file mode 100644 index 0000000..0966d99 --- /dev/null +++ b/archive/test_data/build_inverted_and_forward.py @@ -0,0 +1,74 @@ +import sys + +input_filename = sys.argv[1] + +tokens = {} +print("building dictionary...") +id = 1 # reserve id 0 to mark the end of a string +with open(input_filename + ".dict") as f: + for line in f: + t = line.rstrip('\n') + tokens[t] = id + id += 1 + +lines = 0 +inverted = open(input_filename + ".inverted", 'w') +forward = open(input_filename + ".forward", 'w') + +num_terms = 0 +num_docs = 0 +with open(input_filename + ".mapped.stats") as f: + num_terms = int(f.readline()) + print("terms: " + str(num_terms)) + f.readline() # skip line: max num. of query terms + f.readline() # skip line: num. 
of completions + num_docs = int(f.readline()) + print("universe: " + str(num_docs)) + +inverted_index = [[] for i in range(num_terms + 1)] # id 0 is not assigned +forward_index = [[] for i in range(num_docs)] + +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + mapped = [] + doc_id = int(x[0]) + discard = False + for i in range(1, len(x)): + try: + term = x[i] + try: + term_id = tokens[term] + if term_id not in mapped: + inverted_index[term_id].append(doc_id) + mapped.append(term_id) + except KeyError: + print("'" + term + "' not found in dictionary") + print(line) + exit() + except UnicodeDecodeError: + discard = True + + if not discard: + # NOTE: not sorted! + if doc_id >= num_docs: + print(doc_id,num_docs) + forward_index[doc_id] = mapped; + + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") + +for i in range(0, num_docs): + s = [str(k) for k in forward_index[i]] + forward.write(str(len(forward_index[i])) + " ") + forward.write(" ".join(s) + "\n") +forward.close() + +for i in range(1, num_terms + 1): + posting_list = inverted_index[i] + unique = sorted(set(posting_list)); + s = [str(i) for i in unique] # remove any possible duplicate + inverted.write(str(len(unique)) + " ") + inverted.write(" ".join(s) + "\n") +inverted.close() diff --git a/archive/test_data/build_stats.py b/archive/test_data/build_stats.py new file mode 100644 index 0000000..880bcd3 --- /dev/null +++ b/archive/test_data/build_stats.py @@ -0,0 +1,49 @@ +import sys + +input_filename = sys.argv[1] # e.g., "completions.mapped" + +nodes_per_level = {} # (level_id, num_nodes) +lines = 0 +print("building stats...") + +output_file = open(input_filename + ".stats", 'a') +prev = [] +universe = 0; +with open(input_filename, 'r') as f: + for line in f: + x = line.rstrip('\n').split() + docid = int(x[0]) + + if docid > universe: + universe = docid + + q = x[1:len(x)] + + level_id = 0 + while level_id < len(q) and level_id < len(prev) 
and q[level_id] == prev[level_id]: + level_id += 1 + + while level_id < len(q): + if level_id in nodes_per_level: + nodes_per_level[level_id] += 1 + else: + nodes_per_level[level_id] = 1 + level_id += 1 + + prev = q + lines += 1 + if lines % 1000000 == 0: + print("processed " + str(lines) + " lines") + +# number of completions +# number of levels in the trie +# number of nodes for each level +print("universe: " + str(universe + 1)) +print("completions: " + str(lines)) +output_file.write(str(lines) + "\n") +output_file.write(str(universe + 1) + "\n") +output_file.write(str(len(nodes_per_level)) + "\n") +for key, value in sorted(nodes_per_level.items(), key = lambda kv: kv[0]): + output_file.write(str(value) + "\n") +output_file.close() + From 423c6d409c782ae137162afefc1aa509c4e45aba Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:29:16 -0400 Subject: [PATCH 092/102] add doc --- doc/component_diagram.md | 45 ++++++++++++ doc/cpp_structure.md | 153 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) create mode 100644 doc/component_diagram.md create mode 100644 doc/cpp_structure.md diff --git a/doc/component_diagram.md b/doc/component_diagram.md new file mode 100644 index 0000000..5c9fd83 --- /dev/null +++ b/doc/component_diagram.md @@ -0,0 +1,45 @@ +# Component Relationships + +```mermaid +graph TD + subgraph Core + Constants[Constants] + Parameters[Parameters] + Probe[Performance Probe] + end + + subgraph Data Structures + StringPool[String Pool] + Trie[Completion Trie] + Dictionary[Front-Coded Dictionary] + Index[Blocked Inverted Index] + end + + subgraph Pipeline + Input[Input Processing] + Build[Index Building] + Query[Query Processing] + end + + %% Core Dependencies + Constants --> Parameters + Parameters --> StringPool + Parameters --> Trie + Parameters --> Dictionary + Parameters --> Index + Probe --> Query + + %% Data Structure Dependencies + Dictionary --> Trie + Trie --> Index + StringPool --> Dictionary + 
StringPool --> Trie + StringPool --> Index + + %% Pipeline Dependencies + Input --> Build + Build --> Query + Query --> Trie + Query --> Index + Query --> Dictionary +``` \ No newline at end of file diff --git a/doc/cpp_structure.md b/doc/cpp_structure.md new file mode 100644 index 0000000..bfa42d0 --- /dev/null +++ b/doc/cpp_structure.md @@ -0,0 +1,153 @@ +# C++ Code Structure Documentation + +This document outlines the structure of the original C++ implementation that is being ported to Rust. + +## Core Components + +### 1. Constants and Configuration +- **File**: `constants.hpp` +- **Purpose**: Defines system-wide constants and limits +- **Key Constants**: + - `MAX_K`: Maximum number of completions + - `MAX_NUM_TERMS_PER_QUERY`: Maximum terms per query + - `MAX_NUM_CHARS_PER_QUERY`: Maximum characters per query + - `POOL_SIZE`: Size of the string pool + +### 2. Parameters Management +- **File**: `parameters.hpp` +- **Purpose**: Manages system configuration parameters +- **Key Struct**: `parameters` + - `num_terms`: Total number of terms + - `max_string_length`: Maximum string length + - `num_completions`: Number of completions + - `universe`: Size of the universe + - `num_levels`: Number of levels in the index + - `nodes_per_level`: Vector of nodes per level + - `collection_basename`: Base name for collection files + +### 3. Performance Measurement +- **File**: `probe.hpp` +- **Purpose**: Performance measurement and timing +- **Key Structs**: + - `nop_probe`: No-operation probe + - `timer_probe`: Timer-based performance measurement + +### 4. String Pool Management +- **File**: `scored_string_pool.hpp` +- **Purpose**: Manages a pool of scored strings +- **Key Components**: + - String storage + - Score management + - Pool operations + +### 5. Completion Trie +- **File**: `completion_trie.hpp` +- **Purpose**: Implements the completion trie data structure +- **Key Features**: + - Prefix-based completion + - Node management + - Traversal operations + +### 6. 
Blocked Inverted Index +- **File**: `blocked_inverted_index.hpp` +- **Purpose**: Implements blocked inverted indexing +- **Key Components**: + - Block management + - Index operations + - Query processing + +### 7. Front-Coded Dictionary +- **File**: `fc_dictionary.hpp` +- **Purpose**: Implements front-coding for dictionary compression +- **Key Features**: + - String compression + - Dictionary operations + - Lookup functionality + +## Data Pipeline + +1. **Input Processing** + - Read input completions + - Sort lexicographically + - Generate statistics + +2. **Index Building** + - Build front-coded dictionary + - Construct completion trie + - Create blocked inverted index + +3. **Query Processing** + - Parse input query + - Traverse completion trie + - Search inverted index + - Return top-k completions + +## Key Methods and Operations + +### Dictionary Operations +```cpp +// Front-coded dictionary +void build_dictionary(); +void compress_strings(); +std::string lookup(uint32_t id); +``` + +### Trie Operations +```cpp +// Completion trie +void insert(const std::string& completion); +std::vector complete(const std::string& prefix); +``` + +### Index Operations +```cpp +// Blocked inverted index +void build_index(); +std::vector search(const std::vector& terms); +``` + +### Query Processing +```cpp +// Query handling +std::vector process_query(const std::string& query); +void rank_completions(std::vector& completions); +``` + +## Dependencies and Relationships + +1. **Core Dependencies** + - Constants → Parameters + - Parameters → All major components + - Probe → Performance measurement + +2. **Data Structure Dependencies** + - Front-coded Dictionary → Completion Trie + - Completion Trie → Blocked Inverted Index + - All components → String Pool + +3. **Pipeline Dependencies** + - Input Processing → Index Building + - Index Building → Query Processing + - Query Processing → All components + +## Porting Strategy + +1. 
**Phase 1: Core Components** + - Constants and configuration + - Parameters management + - Performance measurement + +2. **Phase 2: Data Structures** + - String pool + - Completion trie + - Front-coded dictionary + +3. **Phase 3: Index and Query** + - Blocked inverted index + - Query processing + - Pipeline integration + +4. **Phase 4: Testing and Optimization** + - Unit tests + - Integration tests + - Performance optimization \ No newline at end of file From 64a1902a17c077f7da087cea4ecd984d5632229c Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 10:32:23 -0400 Subject: [PATCH 093/102] add ds doc --- doc/data_structures.md | 253 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 doc/data_structures.md diff --git a/doc/data_structures.md b/doc/data_structures.md new file mode 100644 index 0000000..9da8761 --- /dev/null +++ b/doc/data_structures.md @@ -0,0 +1,253 @@ +# Data Structures Documentation + +This document details the key data structures used in the autocomplete system. + +## 1. Scored String Pool + +### Purpose +Manages a fixed-size pool of strings with associated scores, optimized for fast retrieval and updates. 
+ +### Structure +```cpp +struct scored_string_pool { + std::vector strings; // String storage + std::vector scores; // Associated scores + size_t size; // Current pool size + size_t capacity; // Maximum capacity +}; +``` + +### Visualization +```mermaid +graph TD + subgraph String Pool + direction LR + S1[String 1] --> SC1[Score 0.8] + S2[String 2] --> SC2[Score 0.6] + S3[String 3] --> SC3[Score 0.9] + S4[String 4] --> SC4[Score 0.7] + end + style String Pool fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `insert(string, score)`: Add a new string with its score +- `get_score(index)`: Retrieve score for a string +- `get_string(index)`: Retrieve string by index +- `update_score(index, score)`: Update score for a string +- `clear()`: Reset the pool + +### Memory Management +- Fixed-size allocation to prevent reallocations +- Contiguous memory layout for cache efficiency +- Score and string data stored separately for better cache utilization + +## 2. Completion Trie + +### Purpose +Efficient prefix-based string completion using a trie data structure. 
+ +### Structure +```cpp +struct trie_node { + std::unordered_map children; + bool is_terminal; + std::vector completion_ids; +}; + +struct completion_trie { + trie_node* root; + size_t num_nodes; + size_t num_completions; +}; +``` + +### Visualization +```mermaid +graph TD + Root((Root)) --> H((h)) + H --> HE((e)) + HE --> HEL((l)) + HEL --> HELL((l)) + HELL --> HELLO((o)) + HELLO --> HELLOW((w)) + HELLOW --> HELLOWO((o)) + HELLOWO --> HELLOWOR((r)) + HELLOWOR --> HELLOWORL((l)) + HELLOWORL --> HELLOWORLD((d)) + + style Root fill:#f9f,stroke:#333,stroke-width:2px + style HELLOWORLD fill:#9f9,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `insert(completion)`: Add a new completion string +- `complete(prefix)`: Find all completions for a prefix +- `remove(completion)`: Remove a completion string +- `clear()`: Reset the trie + +### Optimizations +- Path compression for common prefixes +- Node sharing for similar completions +- Lazy deletion for better performance + +## 3. Front-Coded Dictionary + +### Purpose +Compressed string dictionary using front-coding technique. 
+ +### Structure +```cpp +struct fc_dictionary { + std::vector data; // Compressed string data + std::vector offsets; // String offsets + size_t num_strings; // Number of strings + size_t total_size; // Total compressed size +}; +``` + +### Visualization +```mermaid +graph LR + subgraph Front-Coded Dictionary + direction LR + S1[hello] --> |shared prefix| S2[helloworld] + S2 --> |shared prefix| S3[hellothere] + S3 --> |shared prefix| S4[hellokitty] + end + style Front-Coded Dictionary fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `build(strings)`: Build dictionary from string list +- `lookup(id)`: Retrieve string by ID +- `compress()`: Apply front-coding compression +- `decompress(id)`: Decompress a specific string + +### Compression Details +- Common prefixes shared between consecutive strings +- Variable-length encoding for shared prefix lengths +- Delta encoding for string differences + +## 4. Blocked Inverted Index + +### Purpose +Efficient term-based search using blocked inverted indexing. 
+ +### Structure +```cpp +struct block { + std::vector doc_ids; // Document IDs in block + uint32_t min_doc_id; // Minimum doc ID in block + uint32_t max_doc_id; // Maximum doc ID in block +}; + +struct inverted_index { + std::vector blocks; // Index blocks + std::unordered_map> term_to_blocks; + size_t block_size; // Size of each block +}; +``` + +### Visualization +```mermaid +graph TD + subgraph Inverted Index + direction TB + T1[Term 1] --> B1[Block 1] + T1 --> B2[Block 2] + T2[Term 2] --> B2 + T2 --> B3[Block 3] + T3[Term 3] --> B1 + T3 --> B3 + + subgraph Block 1 + D1[Doc 1] + D2[Doc 2] + D3[Doc 3] + end + + subgraph Block 2 + D4[Doc 4] + D5[Doc 5] + end + + subgraph Block 3 + D6[Doc 6] + D7[Doc 7] + end + end + style Inverted Index fill:#f9f,stroke:#333,stroke-width:2px +``` + +### Key Operations +- `add_document(doc_id, terms)`: Add document to index +- `search(terms)`: Find documents containing terms +- `merge_blocks()`: Optimize block structure +- `clear()`: Reset the index + +### Blocking Strategy +- Fixed-size blocks for predictable memory usage +- Block-level compression for space efficiency +- Skip pointers for faster traversal + +## Memory and Performance Considerations + +### Memory Layout +1. **Contiguous Storage** + - Strings stored in contiguous memory + - Scores aligned for SIMD operations + - Block data packed efficiently + +2. **Cache Optimization** + - Hot data kept together + - Cold data separated + - Alignment for cache lines + +### Performance Optimizations +1. **String Operations** + - String interning for deduplication + - Small string optimization + - Custom string comparison + +2. **Search Optimizations** + - Block-level skipping + - Term frequency caching + - Result set intersection optimization + +3. 
**Memory Management** + - Custom allocators for specific structures + - Memory pooling for frequent allocations + - Lazy initialization where appropriate + +## Usage Examples + +### String Pool Usage +```cpp +scored_string_pool pool(POOL_SIZE); +pool.insert("completion1", 0.8); +pool.insert("completion2", 0.6); +auto completions = pool.get_top_k(10); +``` + +### Trie Usage +```cpp +completion_trie trie; +trie.insert("hello world"); +trie.insert("hello there"); +auto results = trie.complete("hello"); +``` + +### Dictionary Usage +```cpp +fc_dictionary dict; +dict.build(strings); +auto str = dict.lookup(42); +``` + +### Index Usage +```cpp +inverted_index index; +index.add_document(1, {"term1", "term2"}); +auto docs = index.search({"term1", "term2"}); +``` \ No newline at end of file From 79987bdfdcc103899de9a5deecab171a116dcdec Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 13:55:27 -0400 Subject: [PATCH 094/102] add class diagram --- doc/class_diagram.md | 219 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 doc/class_diagram.md diff --git a/doc/class_diagram.md b/doc/class_diagram.md new file mode 100644 index 0000000..6561e4a --- /dev/null +++ b/doc/class_diagram.md @@ -0,0 +1,219 @@ +# C++ Class Diagram + +This document provides a comprehensive view of all classes in the C++ implementation and their relationships. 
+ +## Main Class Diagram + +```mermaid +classDiagram + class Parameters { + +uint32_t num_terms + +uint32_t max_string_length + +uint32_t num_completions + +uint32_t universe + +uint32_t num_levels + +vector~uint32_t~ nodes_per_level + +string collection_basename + +load() + } + + class Probe { + <> + +start(id: uint64_t) + +stop(id: uint64_t) + } + + class NopProbe { + +start(id: uint64_t) + +stop(id: uint64_t) + } + + class TimerProbe { + -vector~Timer~ timers + +start(id: uint64_t) + +stop(id: uint64_t) + +get_duration(id: uint64_t) + } + + class Timer { + -Instant start_time + -Duration total_duration + +start() + +stop() + +get_duration() + } + + class ScoredStringPool { + -vector~string~ strings + -vector~float~ scores + -size_t size + -size_t capacity + +insert(string, float) + +get_score(size_t) + +get_string(size_t) + +update_score(size_t, float) + +clear() + } + + class TrieNode { + -unordered_map~char, TrieNode*~ children + -bool is_terminal + -vector~uint32_t~ completion_ids + +add_child(char) + +get_child(char) + +is_terminal() + } + + class CompletionTrie { + -TrieNode* root + -size_t num_nodes + -size_t num_completions + +insert(string) + +complete(string) + +remove(string) + +clear() + } + + class FCDictionary { + -vector~char~ data + -vector~uint32_t~ offsets + -size_t num_strings + -size_t total_size + +build(vector~string~) + +lookup(uint32_t) + +compress() + +decompress(uint32_t) + } + + class Block { + -vector~uint32_t~ doc_ids + -uint32_t min_doc_id + -uint32_t max_doc_id + +add_doc(uint32_t) + +get_docs() + +get_range() + } + + class InvertedIndex { + -vector~Block~ blocks + -unordered_map~string, vector~uint32_t~~ term_to_blocks + -size_t block_size + +add_document(uint32_t, vector~string~) + +search(vector~string~) + +merge_blocks() + +clear() + } + + class Autocomplete { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + +build_index(string) + +complete(string) + 
+search(vector~string~) + } + + %% Relationships + Probe <|-- NopProbe + Probe <|-- TimerProbe + TimerProbe *-- Timer + Autocomplete *-- Parameters + Autocomplete *-- ScoredStringPool + Autocomplete *-- CompletionTrie + Autocomplete *-- FCDictionary + Autocomplete *-- InvertedIndex + CompletionTrie *-- TrieNode + InvertedIndex *-- Block +``` + +## Component Dependencies + +```mermaid +graph TD + subgraph Core + Parameters + Probe + end + + subgraph Data Structures + ScoredStringPool + CompletionTrie + FCDictionary + InvertedIndex + end + + subgraph Implementation + Autocomplete + end + + %% Dependencies + Parameters --> ScoredStringPool + Parameters --> CompletionTrie + Parameters --> FCDictionary + Parameters --> InvertedIndex + + ScoredStringPool --> Autocomplete + CompletionTrie --> Autocomplete + FCDictionary --> Autocomplete + InvertedIndex --> Autocomplete + + style Core fill:#f9f,stroke:#333,stroke-width:2px + style Data Structures fill:#9f9,stroke:#333,stroke-width:2px + style Implementation fill:#99f,stroke:#333,stroke-width:2px +``` + +## Memory Layout + +```mermaid +graph TD + subgraph Memory Organization + direction TB + Stack[Stack Memory] --> Heap[Heap Memory] + Heap --> Data[Data Structures] + Data --> Strings[String Pool] + Data --> Trie[Trie Nodes] + Data --> Dict[Dictionary] + Data --> Index[Inverted Index] + end + + style Memory Organization fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Key Features and Methods + +### Core Components +- **Parameters**: Configuration management +- **Probe**: Performance measurement interface +- **Timer**: Time tracking implementation + +### Data Structures +- **ScoredStringPool**: String and score management +- **CompletionTrie**: Prefix-based completion +- **FCDictionary**: String compression +- **InvertedIndex**: Term-based search + +### Main Implementation +- **Autocomplete**: Orchestrates all components + +## Usage Example + +```cpp +// Initialize components +Parameters params; 
+params.load("config.stats"); + +ScoredStringPool pool(POOL_SIZE); +CompletionTrie trie; +FCDictionary dict; +InvertedIndex index; + +// Build autocomplete system +Autocomplete ac(params, pool, trie, dict, index); +ac.build_index("data.txt"); + +// Use the system +auto completions = ac.complete("hello"); +auto results = ac.search({"hello", "world"}); +``` \ No newline at end of file From 601fad7cbac54eb6631e814099d0067e67a7a084 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 13:58:58 -0400 Subject: [PATCH 095/102] add more details --- doc/activity_diagram.md | 157 ++++++++++++++++++++++++++++++++++++++++ doc/class_diagram.md | 132 ++++++++++++++++++++++++++++++--- 2 files changed, 280 insertions(+), 9 deletions(-) create mode 100644 doc/activity_diagram.md diff --git a/doc/activity_diagram.md b/doc/activity_diagram.md new file mode 100644 index 0000000..993101b --- /dev/null +++ b/doc/activity_diagram.md @@ -0,0 +1,157 @@ +# Activity Diagrams + +This document provides activity diagrams for the main workflows in the autocomplete system. 
+ +## System Initialization and Index Building + +```mermaid +graph TD + Start([Start]) --> LoadParams[Load Parameters] + LoadParams --> InitComponents[Initialize Components] + InitComponents --> BuildTrie[Build Completion Trie] + BuildTrie --> BuildDict[Build Front-Coded Dictionary] + BuildDict --> BuildIndex[Build Inverted Index] + BuildIndex --> BuildForwardIndex[Build Forward Index] + BuildForwardIndex --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Autocomplete Query Processing + +```mermaid +graph TD + Start([Start]) --> InputQuery[Input Query] + InputQuery --> ParseQuery[Parse Query Terms] + ParseQuery --> CheckPrefix[Check Prefix in Trie] + + CheckPrefix -->|Prefix Found| GetCompletions[Get Completions] + CheckPrefix -->|No Prefix| ReturnEmpty[Return Empty Results] + + GetCompletions --> ScoreCompletions[Score Completions] + ScoreCompletions --> SortResults[Sort by Score] + SortResults --> ReturnResults[Return Top-K Results] + + ReturnEmpty --> End([End]) + ReturnResults --> End + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Search Operation Flow + +```mermaid +graph TD + Start([Start]) --> InputTerms[Input Search Terms] + InputTerms --> ParseTerms[Parse Search Terms] + ParseTerms --> LookupTerms[Lookup Terms in Dictionary] + + LookupTerms -->|All Terms Found| GetPostings[Get Posting Lists] + LookupTerms -->|Terms Not Found| ReturnEmpty[Return Empty Results] + + GetPostings --> IntersectLists[Intersect Posting Lists] + IntersectLists --> ScoreDocs[Score Documents] + ScoreDocs --> SortResults[Sort by Score] + SortResults --> ReturnResults[Return Top-K Results] + + ReturnEmpty --> End([End]) + ReturnResults --> End + + style Start fill:#f9f,stroke:#333,stroke-width:2px +``` + +## String Pool Management + +```mermaid +graph TD + Start([Start]) --> CheckCapacity[Check Pool Capacity] + CheckCapacity -->|Full| 
RemoveLowest[Remove Lowest Score] + CheckCapacity -->|Space Available| AddString[Add New String] + + RemoveLowest --> AddString + AddString --> UpdateScores[Update Scores] + UpdateScores --> SortPool[Sort Pool by Score] + SortPool --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Blocked Inverted Index Operations + +```mermaid +graph TD + Start([Start]) --> InputDoc[Input Document] + InputDoc --> ExtractTerms[Extract Terms] + ExtractTerms --> CheckBlocks[Check Existing Blocks] + + CheckBlocks -->|Block Found| UpdateBlock[Update Block] + CheckBlocks -->|New Block| CreateBlock[Create New Block] + + UpdateBlock --> MergeCheck[Check Merge Condition] + CreateBlock --> MergeCheck + + MergeCheck -->|Merge Needed| MergeBlocks[Merge Blocks] + MergeCheck -->|No Merge| UpdateIndex[Update Index] + + MergeBlocks --> UpdateIndex + UpdateIndex --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Performance Measurement Flow + +```mermaid +graph TD + Start([Start]) --> StartTimer[Start Timer] + StartTimer --> Operation[Perform Operation] + Operation --> StopTimer[Stop Timer] + StopTimer --> RecordMetrics[Record Metrics] + RecordMetrics --> AnalyzePerformance[Analyze Performance] + AnalyzePerformance --> End([End]) + + style Start fill:#f9f,stroke:#333,stroke-width:2px + style End fill:#f9f,stroke:#333,stroke-width:2px +``` + +## Key Operations Description + +### System Initialization +1. Load configuration parameters +2. Initialize core components +3. Build data structures +4. Set up indexes + +### Query Processing +1. Parse and validate input +2. Check prefix in trie +3. Retrieve and score completions +4. Sort and return results + +### Search Operations +1. Process search terms +2. Lookup in dictionary +3. Retrieve and intersect posting lists +4. Score and rank results + +### String Pool Management +1. 
Maintain fixed-size pool +2. Handle insertions and removals +3. Update and sort scores +4. Manage memory efficiently + +### Blocked Index Operations +1. Process document updates +2. Manage block structure +3. Handle block merges +4. Maintain index consistency + +### Performance Measurement +1. Track operation timing +2. Record performance metrics +3. Analyze system behavior +4. Optimize based on results \ No newline at end of file diff --git a/doc/class_diagram.md b/doc/class_diagram.md index 6561e4a..4a5f4db 100644 --- a/doc/class_diagram.md +++ b/doc/class_diagram.md @@ -44,15 +44,23 @@ classDiagram } class ScoredStringPool { - -vector~string~ strings - -vector~float~ scores - -size_t size - -size_t capacity - +insert(string, float) - +get_score(size_t) - +get_string(size_t) - +update_score(size_t, float) + -vector~id_type~ m_scores + -vector~size_t~ m_offsets + -vector~uint8_t~ m_data + +init() + +resize(size_t, uint32_t) +clear() + +size() + +bytes() + +data() + +push_back_offset(size_t) + +scores() + +const_scores() + } + + class ScoredByteRange { + +byte_range string + +id_type score } class TrieNode { @@ -85,6 +93,15 @@ classDiagram +decompress(uint32_t) } + class IntegerFCDictionary { + -vector~uint32_t~ m_headers + -vector~uint8_t~ m_buckets + -size_t m_size + +build(vector~string~) + +lookup(uint32_t) + +extract(id_type, completion_type) + } + class Block { -vector~uint32_t~ doc_ids -uint32_t min_doc_id @@ -104,6 +121,37 @@ classDiagram +clear() } + class CompactVector { + -vector~uint64_t~ m_bits + -uint8_t m_width + -uint64_t m_mask + +build(vector~uint64_t~) + +access(uint64_t) + +size() + } + + class BitVector { + -vector~uint64_t~ m_bits + -size_t m_size + +build(bit_vector_builder*) + +size() + +bytes() + +operator[](uint64_t) + +get_bits(uint64_t, uint64_t) + } + + class MinHeap { + -vector~T~ m_q + -Comparator m_comparator + +reserve(uint64_t) + +top() + +push(T) + +pop() + +clear() + +empty() + +size() + } + class Autocomplete { -Parameters 
params -ScoredStringPool string_pool @@ -115,6 +163,41 @@ classDiagram +search(vector~string~) } + class Autocomplete2 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + -CompactVector docid_to_lexid + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete3 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -InvertedIndex index + -MinHeap min_priority_queue + +build_index(string) + +complete(string) + +search(vector~string~) + } + + class Autocomplete4 { + -Parameters params + -ScoredStringPool string_pool + -CompletionTrie trie + -FCDictionary dictionary + -BlockedInvertedIndex index + +build_index(string) + +complete(string) + +search(vector~string~) + } + %% Relationships Probe <|-- NopProbe Probe <|-- TimerProbe @@ -126,6 +209,13 @@ classDiagram Autocomplete *-- InvertedIndex CompletionTrie *-- TrieNode InvertedIndex *-- Block + ScoredStringPool *-- ScoredByteRange + Autocomplete2 --|> Autocomplete + Autocomplete3 --|> Autocomplete + Autocomplete4 --|> Autocomplete + Autocomplete3 *-- MinHeap + Autocomplete2 *-- CompactVector + Autocomplete4 *-- BlockedInvertedIndex ``` ## Component Dependencies @@ -135,17 +225,26 @@ graph TD subgraph Core Parameters Probe + Timer end subgraph Data Structures ScoredStringPool CompletionTrie FCDictionary + IntegerFCDictionary InvertedIndex + BlockedInvertedIndex + CompactVector + BitVector + MinHeap end subgraph Implementation Autocomplete + Autocomplete2 + Autocomplete3 + Autocomplete4 end %% Dependencies @@ -153,11 +252,16 @@ graph TD Parameters --> CompletionTrie Parameters --> FCDictionary Parameters --> InvertedIndex + Parameters --> IntegerFCDictionary ScoredStringPool --> Autocomplete CompletionTrie --> Autocomplete FCDictionary --> Autocomplete InvertedIndex --> Autocomplete + IntegerFCDictionary --> Autocomplete2 + CompactVector --> Autocomplete2 + 
MinHeap --> Autocomplete3 + BlockedInvertedIndex --> Autocomplete4 style Core fill:#f9f,stroke:#333,stroke-width:2px style Data Structures fill:#9f9,stroke:#333,stroke-width:2px @@ -176,6 +280,8 @@ graph TD Data --> Trie[Trie Nodes] Data --> Dict[Dictionary] Data --> Index[Inverted Index] + Data --> Compact[Compact Vectors] + Data --> BitVec[Bit Vectors] end style Memory Organization fill:#f9f,stroke:#333,stroke-width:2px @@ -192,10 +298,18 @@ graph TD - **ScoredStringPool**: String and score management - **CompletionTrie**: Prefix-based completion - **FCDictionary**: String compression +- **IntegerFCDictionary**: Integer-based dictionary - **InvertedIndex**: Term-based search +- **BlockedInvertedIndex**: Blocked term-based search +- **CompactVector**: Space-efficient vector +- **BitVector**: Bit-level operations +- **MinHeap**: Priority queue implementation ### Main Implementation -- **Autocomplete**: Orchestrates all components +- **Autocomplete**: Base implementation +- **Autocomplete2**: Integer-based optimization +- **Autocomplete3**: Min-heap based optimization +- **Autocomplete4**: Blocked index optimization ## Usage Example From 8921c936396149c06d83f1ca390986aa51260933 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 14:30:31 -0400 Subject: [PATCH 096/102] unstable --- .gitignore | 28 + autocomplete-rs/Cargo.lock | 1047 +++++++++++++++++++++- autocomplete-rs/Cargo.toml | 7 + autocomplete-rs/build.rs | 4 + autocomplete-rs/examples/client.rs | 36 + autocomplete-rs/proto/autocomplete.proto | 58 ++ autocomplete-rs/src/autocomplete.rs | 222 +++++ autocomplete-rs/src/dictionary.rs | 199 ++++ autocomplete-rs/src/index.rs | 204 +++++ autocomplete-rs/src/lib.rs | 14 +- autocomplete-rs/src/server.rs | 89 ++ autocomplete-rs/src/string_pool.rs | 151 ++++ autocomplete-rs/src/trie.rs | 182 ++++ autocomplete-rs/src/types.rs | 92 ++ 14 files changed, 2325 insertions(+), 8 deletions(-) create mode 100644 autocomplete-rs/build.rs create mode 100644 
autocomplete-rs/examples/client.rs create mode 100644 autocomplete-rs/proto/autocomplete.proto create mode 100644 autocomplete-rs/src/autocomplete.rs create mode 100644 autocomplete-rs/src/dictionary.rs create mode 100644 autocomplete-rs/src/index.rs create mode 100644 autocomplete-rs/src/server.rs create mode 100644 autocomplete-rs/src/string_pool.rs create mode 100644 autocomplete-rs/src/trie.rs create mode 100644 autocomplete-rs/src/types.rs diff --git a/.gitignore b/.gitignore index 69bd68e..2d7573c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,24 @@ # Rust specific /target/ +**/target/ **/*.rs.bk Cargo.lock *.pdb +# Protocol Buffers +*.pb.h +*.pb.cc +*.pb.go +*.pb.swift +*.pb.dart +*.pb.js +*.pb.ts +*.pb.rs + +# Generated Rust files +/src/autocomplete_proto.rs +/src/autocomplete_proto/*.rs + # C++ specific *.o *.obj @@ -51,18 +66,31 @@ Cargo.lock # Build directories /build/ +**/build/ /debug_build/ +**/debug_build/ /CMakeFiles/ +**/CMakeFiles/ /CMakeCache.txt +**/CMakeCache.txt /CMakeScripts/ +**/CMakeScripts/ /Testing/ +**/Testing/ /Makefile +**/Makefile /cmake_install.cmake +**/cmake_install.cmake /install_manifest.txt +**/install_manifest.txt /compile_commands.json +**/compile_commands.json /CTestTestfile.cmake +**/CTestTestfile.cmake /_deps +**/_deps /.cmake +**/.cmake # IDE specific .vscode/ diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index 6cb35bc..d1b8fd2 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -2,25 +2,189 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.88" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + [[package]] name = "autocomplete-rs" version = "0.1.0" dependencies = [ + "futures", + "prost", "tempfile", + "tokio", 
+ "tonic", + "tonic-build", +] + +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "errno" version = "0.3.12" @@ -28,7 +192,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -37,6 +201,118 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.3.3" @@ -46,9 +322,157 @@ dependencies = [ "cfg-if", "libc", "r-efi", - "wasi", + "wasi 0.14.2+wasi-0.2.4", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "h2" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 2.9.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + 
"bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", ] +[[package]] +name = "indexmap" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +dependencies = [ + "equivalent", + "hashbrown 0.15.3", +] + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + [[package]] name = "libc" version = "0.2.172" @@ -61,31 +485,418 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap 2.9.0", +] + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "664ec5419c51e34154eec046ebcba56312d5a2fc3b09a06da188e1ad21afadf6" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + [[package]] name = "r-efi" version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] + +[[package]] +name = "redox_syscall" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +dependencies = [ + "bitflags 2.9.1", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + [[package]] name = "rustix" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags", + "bitflags 2.9.1", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +dependencies = [ + "libc", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" + +[[package]] +name = "socket2" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + [[package]] name = "tempfile" version = "3.20.0" @@ -93,12 +904,205 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ "fastrand", - "getrandom", + "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio" +version = "1.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" 
+dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" +dependencies = [ + 
"prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", ] +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" @@ -108,6 +1112,15 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -187,5 +1200,25 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags", + "bitflags 2.9.1", +] + +[[package]] +name = "zerocopy" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index 7d62c58..fc1d1f9 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -4,6 +4,13 @@ version = "0.1.0" edition = "2021" [dependencies] +tonic = "0.10" +prost = "0.12" +tokio = { version = "1.0", features = ["full"] } 
+futures = "0.3" [dev-dependencies] tempfile = "3.8" + +[build-dependencies] +tonic-build = "0.10" diff --git a/autocomplete-rs/build.rs b/autocomplete-rs/build.rs new file mode 100644 index 0000000..7d082f1 --- /dev/null +++ b/autocomplete-rs/build.rs @@ -0,0 +1,4 @@ +fn main() -> Result<(), Box> { + tonic_build::compile_protos("proto/autocomplete.proto")?; + Ok(()) +} \ No newline at end of file diff --git a/autocomplete-rs/examples/client.rs b/autocomplete-rs/examples/client.rs new file mode 100644 index 0000000..cbdb2c9 --- /dev/null +++ b/autocomplete-rs/examples/client.rs @@ -0,0 +1,36 @@ +use autocomplete_proto::{ + autocomplete_service_client::AutocompleteServiceClient, + CompleteRequest, InitRequest, StringScore, +}; + +pub mod autocomplete_proto { + tonic::include_proto!("autocomplete"); +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let mut client = AutocompleteServiceClient::connect("http://[::1]:50051").await?; + + // Initialize with some test data + let init_request = InitRequest { + strings: vec![ + StringScore { text: "hello".to_string(), score: 1.0 }, + StringScore { text: "help".to_string(), score: 0.8 }, + StringScore { text: "hell".to_string(), score: 0.6 }, + ], + }; + + let response = client.init(init_request).await?; + println!("INIT RESPONSE: {:?}", response); + + // Get completions + let request = CompleteRequest { + prefix: "hel".to_string(), + max_results: 10, + }; + + let response = client.complete(request).await?; + println!("COMPLETE RESPONSE: {:?}", response); + + Ok(()) +} \ No newline at end of file diff --git a/autocomplete-rs/proto/autocomplete.proto b/autocomplete-rs/proto/autocomplete.proto new file mode 100644 index 0000000..12c2e74 --- /dev/null +++ b/autocomplete-rs/proto/autocomplete.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package autocomplete; + +// The autocomplete service definition +service AutocompleteService { + // Get completions for a prefix + rpc Complete (CompleteRequest) returns 
(CompleteResponse) {} + + // Initialize the autocomplete system with strings and scores + rpc Init (InitRequest) returns (InitResponse) {} + + // Get system statistics + rpc GetStats (StatsRequest) returns (StatsResponse) {} +} + +// Request message for completion +message CompleteRequest { + string prefix = 1; + int32 max_results = 2; // Optional: limit number of results +} + +// Response message containing completions +message CompleteResponse { + repeated Completion completions = 1; +} + +// A single completion result +message Completion { + string text = 1; + float score = 2; +} + +// Request message for initialization +message InitRequest { + repeated StringScore strings = 1; +} + +// A string with its score +message StringScore { + string text = 1; + float score = 2; +} + +// Response message for initialization +message InitResponse { + bool success = 1; + string error = 2; // Empty if success is true +} + +// Request message for stats +message StatsRequest {} + +// Response message containing system statistics +message StatsResponse { + int32 num_terms = 1; + int64 memory_bytes = 2; +} \ No newline at end of file diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs new file mode 100644 index 0000000..1191a75 --- /dev/null +++ b/autocomplete-rs/src/autocomplete.rs @@ -0,0 +1,222 @@ +use std::collections::HashMap; +use crate::types::{IdType, ByteRange, global}; +use crate::string_pool::ScoredStringPool; +use crate::trie::CompletionTrie; +use crate::dictionary::{FCDictionary, IntegerFCDictionary}; +use crate::index::{BlockedInvertedIndex, CompactVector, BitVector}; + +const BLOCK_SIZE: usize = 1024; + +/// Main autocomplete implementation +pub struct Autocomplete { + string_pool: ScoredStringPool, + trie: CompletionTrie, + dictionary: FCDictionary, + index: BlockedInvertedIndex, + term_to_id: HashMap, + id_to_term: Vec, + num_terms: usize, +} + +impl Autocomplete { + /// Create a new autocomplete instance + pub fn new() -> Self 
{ + Self { + string_pool: ScoredStringPool::new(), + trie: CompletionTrie::new(), + dictionary: FCDictionary::new(), + index: BlockedInvertedIndex::new(BLOCK_SIZE), + term_to_id: HashMap::new(), + id_to_term: Vec::new(), + num_terms: 0, + } + } + + /// Initialize the autocomplete system + pub fn init(&mut self, strings: &[String], scores: &[IdType]) { + assert_eq!(strings.len(), scores.len()); + + // Build string pool + self.string_pool = ScoredStringPool::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + let mut all_scores = Vec::with_capacity(strings.len()); + let mut total_bytes = 0; + offsets.push(0); + for (string, &score) in strings.iter().zip(scores) { + total_bytes += string.len(); + offsets.push(total_bytes); + all_scores.push(score); + } + self.string_pool.set_offsets(offsets); + self.string_pool.set_scores(all_scores); + self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); + + // Build dictionary + self.dictionary.build(strings); + + // Build term mappings + self.term_to_id.clear(); + self.id_to_term.clear(); + for (i, string) in strings.iter().enumerate() { + self.term_to_id.insert(string.clone(), (i + 1) as IdType); + self.id_to_term.push(string.clone()); + } + self.num_terms = strings.len(); + + // Build trie + self.trie.clear(); + for (i, string) in strings.iter().enumerate() { + self.trie.insert(string, (i + 1) as IdType); + } + + // Build index + self.index.clear(); + for (i, _string) in strings.iter().enumerate() { + let term_id = (i + 1) as IdType; + self.index.add_doc(term_id, term_id); + } + } + + /// Find completions for a prefix + pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { + let mut results = Vec::new(); + + // Get completion IDs from trie + let completion_ids = self.trie.complete(prefix); + + // Look up strings and scores + for &id in &completion_ids { + if let Some(string) = self.dictionary.lookup(id) { + let scored_range = self.string_pool.get(id as usize); + 
results.push((string, scored_range.score)); + } + } + + // Sort by score (descending) + results.sort_by(|a, b| b.1.cmp(&a.1)); + results + } + + /// Get the number of terms + pub fn num_terms(&self) -> usize { + self.num_terms + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.string_pool.bytes() + + self.trie.num_nodes() * std::mem::size_of::() + + self.dictionary.bytes() + + self.index.num_blocks() * std::mem::size_of::() + + self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + + self.id_to_term.capacity() * std::mem::size_of::() + } +} + +/// Integer-based autocomplete implementation +pub struct Autocomplete2 { + string_pool: ScoredStringPool, + trie: CompletionTrie, + dictionary: IntegerFCDictionary, + index: BlockedInvertedIndex, + term_to_id: HashMap, + id_to_term: Vec, + num_terms: usize, +} + +impl Autocomplete2 { + /// Create a new integer-based autocomplete instance + pub fn new() -> Self { + Self { + string_pool: ScoredStringPool::new(), + trie: CompletionTrie::new(), + dictionary: IntegerFCDictionary::new(), + index: BlockedInvertedIndex::new(BLOCK_SIZE), + term_to_id: HashMap::new(), + id_to_term: Vec::new(), + num_terms: 0, + } + } + + /// Initialize the autocomplete system + pub fn init(&mut self, strings: &[String], scores: &[IdType]) { + assert_eq!(strings.len(), scores.len()); + + // Build string pool + self.string_pool = ScoredStringPool::new(); + let mut offsets = Vec::with_capacity(strings.len() + 1); + let mut all_scores = Vec::with_capacity(strings.len()); + let mut total_bytes = 0; + offsets.push(0); + for (string, &score) in strings.iter().zip(scores) { + total_bytes += string.len(); + offsets.push(total_bytes); + all_scores.push(score); + } + self.string_pool.set_offsets(offsets); + self.string_pool.set_scores(all_scores); + self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); + + // Build dictionary + self.dictionary.build(strings); + + // Build term mappings + 
self.term_to_id.clear(); + self.id_to_term.clear(); + for (i, string) in strings.iter().enumerate() { + self.term_to_id.insert(string.clone(), (i + 1) as IdType); + self.id_to_term.push(string.clone()); + } + self.num_terms = strings.len(); + + // Build trie + self.trie.clear(); + for (i, string) in strings.iter().enumerate() { + self.trie.insert(string, (i + 1) as IdType); + } + + // Build index + self.index.clear(); + for (i, _string) in strings.iter().enumerate() { + let term_id = (i + 1) as IdType; + self.index.add_doc(term_id, term_id); + } + } + + /// Find completions for a prefix + pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { + let mut results = Vec::new(); + let mut completion = Vec::new(); + + // Get completion IDs from trie + let completion_ids = self.trie.complete(prefix); + + // Look up strings and scores + for &id in &completion_ids { + let len = self.dictionary.extract(id, &mut completion); + if len > 0 { + let scored_range = self.string_pool.get(id as usize); + let string = String::from_utf8_lossy(&completion).into_owned(); + results.push((string, scored_range.score)); + } + } + + // Sort by score (descending) + results.sort_by(|a, b| b.1.cmp(&a.1)); + results + } + + /// Get the number of terms + pub fn num_terms(&self) -> usize { + self.num_terms + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.string_pool.bytes() + + self.trie.num_nodes() * std::mem::size_of::() + + self.index.num_blocks() * std::mem::size_of::() + + self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + + self.id_to_term.capacity() * std::mem::size_of::() + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/dictionary.rs b/autocomplete-rs/src/dictionary.rs new file mode 100644 index 0000000..99a37b2 --- /dev/null +++ b/autocomplete-rs/src/dictionary.rs @@ -0,0 +1,199 @@ +use std::collections::HashMap; +use crate::types::{ByteRange, IdType, global}; + +/// Front-coded dictionary for string compression +pub 
struct FCDictionary { + data: Vec, + offsets: Vec, + num_strings: usize, + total_size: usize, +} + +impl FCDictionary { + /// Create a new front-coded dictionary + pub fn new() -> Self { + Self { + data: Vec::new(), + offsets: Vec::new(), + num_strings: 0, + total_size: 0, + } + } + + /// Build the dictionary from a list of strings + pub fn build(&mut self, strings: &[String]) { + if strings.is_empty() { + return; + } + + self.num_strings = strings.len(); + self.offsets.clear(); + self.data.clear(); + self.total_size = 0; + + // Sort strings for better compression + let mut sorted_strings: Vec<_> = strings.iter().collect(); + sorted_strings.sort(); + + // First string is stored completely + let first = sorted_strings[0]; + self.offsets.push(0); + self.data.extend_from_slice(first.as_bytes()); + self.total_size += first.len(); + + // Process remaining strings + for i in 1..sorted_strings.len() { + let prev = sorted_strings[i - 1]; + let curr = sorted_strings[i]; + + // Find common prefix + let lcp = self.longest_common_prefix(prev, curr); + + // Store offset and remaining string + self.offsets.push(self.total_size as u32); + self.data.push(lcp as u8); + self.data.extend_from_slice(&curr.as_bytes()[lcp..]); + self.total_size += 1 + curr.len() - lcp; + } + } + + /// Find the longest common prefix between two strings + fn longest_common_prefix(&self, a: &str, b: &str) -> usize { + a.bytes() + .zip(b.bytes()) + .take_while(|(x, y)| x == y) + .count() + } + + /// Look up a string in the dictionary + pub fn lookup(&self, id: IdType) -> Option { + if id == 0 || id > self.num_strings as IdType { + return None; + } + + let id = (id - 1) as usize; + let offset = self.offsets[id] as usize; + + if id == 0 { + // First string is stored completely + let end = if id + 1 < self.offsets.len() { + self.offsets[id + 1] as usize + } else { + self.data.len() + }; + Some(String::from_utf8_lossy(&self.data[offset..end]).into_owned()) + } else { + // Other strings are front-coded + let lcp 
= self.data[offset] as usize; + let prev = self.lookup(id as IdType - 1)?; + let mut result = prev[..lcp].to_string(); + let end = if id + 1 < self.offsets.len() { + self.offsets[id + 1] as usize + } else { + self.data.len() + }; + result.push_str(std::str::from_utf8(&self.data[offset + 1..end]).unwrap()); + Some(result) + } + } + + /// Get the number of strings in the dictionary + pub fn size(&self) -> usize { + self.num_strings + } + + /// Get the total size of the compressed data + pub fn total_size(&self) -> usize { + self.total_size + } + + /// Get the size of the dictionary in bytes + pub fn bytes(&self) -> usize { + std::mem::size_of_val(&self.num_strings) + + std::mem::size_of_val(&self.total_size) + + self.offsets.len() * std::mem::size_of::() + + self.data.len() + } +} + +/// Integer-based front-coded dictionary +pub struct IntegerFCDictionary { + headers: Vec, + buckets: Vec, + size: usize, +} + +impl IntegerFCDictionary { + /// Create a new integer-based front-coded dictionary + pub fn new() -> Self { + Self { + headers: Vec::new(), + buckets: Vec::new(), + size: 0, + } + } + + /// Build the dictionary from a list of strings + pub fn build(&mut self, strings: &[String]) { + if strings.is_empty() { + return; + } + + self.size = strings.len(); + self.headers.clear(); + self.buckets.clear(); + + // Sort strings for better compression + let mut sorted_strings: Vec<_> = strings.iter().collect(); + sorted_strings.sort(); + + // Process strings + for i in 0..sorted_strings.len() { + let curr = sorted_strings[i]; + let lcp = if i > 0 { + self.longest_common_prefix(sorted_strings[i - 1], curr) + } else { + 0 + }; + + // Store header + self.headers.extend_from_slice(curr.as_bytes()); + + // Store bucket + self.buckets.push(lcp as u8); + self.buckets.push((curr.len() - lcp) as u8); + self.buckets.extend_from_slice(&curr.as_bytes()[lcp..]); + } + } + + /// Find the longest common prefix between two strings + fn longest_common_prefix(&self, a: &str, b: &str) -> 
usize { + a.bytes() + .zip(b.bytes()) + .take_while(|(x, y)| x == y) + .count() + } + + /// Extract a string from the dictionary + pub fn extract(&self, id: IdType, completion: &mut Vec) -> u8 { + if id == 0 || id > self.size as IdType { + return 0; + } + + let id = (id - 1) as usize; + let bucket_start = id * 2; + let lcp = self.buckets[bucket_start] as usize; + let remaining = self.buckets[bucket_start + 1] as usize; + + completion.clear(); + completion.extend_from_slice(&self.headers[id..id + lcp]); + completion.extend_from_slice(&self.buckets[bucket_start + 2..bucket_start + 2 + remaining]); + + (lcp + remaining) as u8 + } + + /// Get the number of strings in the dictionary + pub fn size(&self) -> usize { + self.size + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs new file mode 100644 index 0000000..47d176e --- /dev/null +++ b/autocomplete-rs/src/index.rs @@ -0,0 +1,204 @@ +use std::collections::HashMap; +use crate::types::{IdType, global}; + +/// Block in the inverted index +struct Block { + term_id: IdType, + num_docs: usize, + docs: Vec, +} + +impl Block { + /// Create a new block + fn new(term_id: IdType) -> Self { + Self { + term_id, + num_docs: 0, + docs: Vec::new(), + } + } + + /// Add a document to the block + fn add_doc(&mut self, doc_id: IdType) { + self.docs.push(doc_id); + self.num_docs += 1; + } + + /// Get the number of documents in the block + fn size(&self) -> usize { + self.num_docs + } +} + +/// Blocked inverted index for efficient document retrieval +pub struct BlockedInvertedIndex { + blocks: Vec, + term_to_block: HashMap, + block_size: usize, +} + +impl BlockedInvertedIndex { + /// Create a new blocked inverted index + pub fn new(block_size: usize) -> Self { + Self { + blocks: Vec::new(), + term_to_block: HashMap::new(), + block_size, + } + } + + /// Add a document to the index + pub fn add_doc(&mut self, term_id: IdType, doc_id: IdType) { + let block_idx = 
self.term_to_block.entry(term_id).or_insert_with(|| { + self.blocks.push(Block::new(term_id)); + self.blocks.len() - 1 + }); + + let block = &mut self.blocks[*block_idx]; + block.add_doc(doc_id); + + // If block is full, create a new one + if block.size() >= self.block_size { + self.blocks.push(Block::new(term_id)); + *block_idx = self.blocks.len() - 1; + } + } + + /// Get documents for a term + pub fn get_docs(&self, term_id: IdType) -> Vec { + let mut docs = Vec::new(); + + // Find all blocks for the term + let mut current_idx = self.term_to_block.get(&term_id).copied(); + while let Some(idx) = current_idx { + let block = &self.blocks[idx]; + docs.extend_from_slice(&block.docs); + + // Check if there's a next block for the same term + current_idx = if idx + 1 < self.blocks.len() && self.blocks[idx + 1].term_id == term_id { + Some(idx + 1) + } else { + None + }; + } + + docs + } + + /// Get the number of blocks + pub fn num_blocks(&self) -> usize { + self.blocks.len() + } + + /// Get the total number of documents + pub fn num_docs(&self) -> usize { + self.blocks.iter().map(|b| b.size()).sum() + } + + /// Clear the index + pub fn clear(&mut self) { + self.blocks.clear(); + self.term_to_block.clear(); + } +} + +/// Compact vector for efficient storage +pub struct CompactVector { + data: Vec, + element_size: usize, + num_elements: usize, +} + +impl CompactVector { + /// Create a new compact vector + pub fn new(element_size: usize) -> Self { + Self { + data: Vec::new(), + element_size, + num_elements: 0, + } + } + + /// Add an element to the vector + pub fn push(&mut self, element: &[u8]) { + assert_eq!(element.len(), self.element_size); + self.data.extend_from_slice(element); + self.num_elements += 1; + } + + /// Get an element from the vector + pub fn get(&self, index: usize) -> Option<&[u8]> { + if index >= self.num_elements { + return None; + } + let start = index * self.element_size; + let end = start + self.element_size; + Some(&self.data[start..end]) + } + + 
/// Get the number of elements + pub fn size(&self) -> usize { + self.num_elements + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.data.len() + } +} + +/// Bit vector for efficient bit-level operations +pub struct BitVector { + data: Vec, + num_bits: usize, +} + +impl BitVector { + /// Create a new bit vector + pub fn new(num_bits: usize) -> Self { + let num_bytes = (num_bits + 7) / 8; + Self { + data: vec![0; num_bytes], + num_bits, + } + } + + /// Set a bit + pub fn set(&mut self, index: usize) { + if index < self.num_bits { + let byte_idx = index / 8; + let bit_idx = index % 8; + self.data[byte_idx] |= 1 << bit_idx; + } + } + + /// Clear a bit + pub fn clear(&mut self, index: usize) { + if index < self.num_bits { + let byte_idx = index / 8; + let bit_idx = index % 8; + self.data[byte_idx] &= !(1 << bit_idx); + } + } + + /// Test a bit + pub fn test(&self, index: usize) -> bool { + if index < self.num_bits { + let byte_idx = index / 8; + let bit_idx = index % 8; + (self.data[byte_idx] & (1 << bit_idx)) != 0 + } else { + false + } + } + + /// Get the number of bits + pub fn size(&self) -> usize { + self.num_bits + } + + /// Get the size in bytes + pub fn bytes(&self) -> usize { + self.data.len() + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index c5c3755..4004de9 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -1,7 +1,19 @@ pub mod constants; pub mod parameters; pub mod probe; +pub mod types; +pub mod string_pool; +pub mod trie; +pub mod dictionary; +pub mod index; +pub mod autocomplete; pub use constants::*; pub use parameters::*; -pub use probe::*; \ No newline at end of file +pub use probe::*; +pub use types::*; +pub use string_pool::*; +pub use trie::*; +pub use dictionary::*; +pub use index::*; +pub use autocomplete::*; \ No newline at end of file diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs new file mode 100644 index 
0000000..6ddacbd --- /dev/null +++ b/autocomplete-rs/src/server.rs @@ -0,0 +1,89 @@ +use tonic::{transport::Server, Request, Response, Status}; +use crate::autocomplete::{Autocomplete, Autocomplete2}; + +pub mod autocomplete_proto { + tonic::include_proto!("autocomplete"); +} + +use autocomplete_proto::{ + autocomplete_service_server::{AutocompleteService, AutocompleteServiceServer}, + CompleteRequest, CompleteResponse, Completion, + InitRequest, InitResponse, + StatsRequest, StatsResponse, +}; + +pub struct AutocompleteServiceImpl { + autocomplete: Autocomplete, +} + +#[tonic::async_trait] +impl AutocompleteService for AutocompleteServiceImpl { + async fn complete( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let completions = self.autocomplete.complete(&req.prefix); + + let response = CompleteResponse { + completions: completions.into_iter() + .map(|(text, score)| Completion { + text, + score, + }) + .collect(), + }; + + Ok(Response::new(response)) + } + + async fn init( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let strings: Vec<(String, f32)> = req.strings + .into_iter() + .map(|s| (s.text, s.score)) + .collect(); + + match self.autocomplete.init(&strings) { + Ok(_) => Ok(Response::new(InitResponse { + success: true, + error: String::new(), + })), + Err(e) => Ok(Response::new(InitResponse { + success: false, + error: e.to_string(), + })), + } + } + + async fn get_stats( + &self, + _request: Request, + ) -> Result, Status> { + let response = StatsResponse { + num_terms: self.autocomplete.num_terms() as i32, + memory_bytes: self.autocomplete.bytes() as i64, + }; + + Ok(Response::new(response)) + } +} + +pub async fn run_server(addr: &str) -> Result<(), Box> { + let addr = addr.parse()?; + let service = AutocompleteServiceImpl { + autocomplete: Autocomplete::new(), + }; + + println!("Autocomplete server listening on {}", addr); + + Server::builder() + 
.add_service(AutocompleteServiceServer::new(service)) + .serve(addr) + .await?; + + Ok(()) +} \ No newline at end of file diff --git a/autocomplete-rs/src/string_pool.rs b/autocomplete-rs/src/string_pool.rs new file mode 100644 index 0000000..0dc8ea5 --- /dev/null +++ b/autocomplete-rs/src/string_pool.rs @@ -0,0 +1,151 @@ +use crate::types::{ByteRange, IdType}; + +/// Represents a scored byte range +#[derive(Debug, Clone)] +pub struct ScoredByteRange { + pub string: ByteRange, + pub score: IdType, +} + +/// Manages a pool of scored strings +pub struct ScoredStringPool { + scores: Vec, + offsets: Vec, + data: Vec, +} + +impl ScoredStringPool { + /// Create a new empty string pool + pub fn new() -> Self { + let mut pool = Self { + scores: Vec::new(), + offsets: Vec::new(), + data: Vec::new(), + }; + pool.init(); + pool + } + + /// Initialize the pool + pub fn init(&mut self) { + self.push_back_offset(0); + } + + /// Resize the pool + pub fn resize(&mut self, num_bytes: usize, k: u32) { + self.scores.resize(k as usize, 0); + self.data.resize(num_bytes, 0); + } + + /// Clear the pool + pub fn clear(&mut self) { + self.offsets.clear(); + } + + /// Get the number of strings in the pool + pub fn size(&self) -> usize { + assert!(!self.offsets.is_empty()); + self.offsets.len() - 1 + } + + /// Get the total number of bytes used + pub fn bytes(&self) -> usize { + self.offsets.last().copied().unwrap_or(0) + } + + /// Get a mutable reference to the data + pub fn data_mut(&mut self) -> &mut [u8] { + &mut self.data + } + + /// Add a new offset + pub fn push_back_offset(&mut self, offset: usize) { + self.offsets.push(offset); + } + + /// Get a mutable reference to the scores + pub fn scores_mut(&mut self) -> &mut [IdType] { + &mut self.scores + } + + /// Get a reference to the scores + pub fn scores(&self) -> &[IdType] { + &self.scores + } + + /// Get a scored byte range at the given index + pub fn get(&self, i: usize) -> ScoredByteRange { + assert!(i < self.size()); + 
ScoredByteRange { + string: ByteRange { + begin: unsafe { self.data.as_ptr().add(self.offsets[i]) }, + end: unsafe { self.data.as_ptr().add(self.offsets[i + 1]) }, + }, + score: self.scores[i], + } + } + + /// Set the offsets vector + pub fn set_offsets(&mut self, offsets: Vec) { + self.offsets = offsets; + } + + /// Set the scores vector + pub fn set_scores(&mut self, scores: Vec) { + self.scores = scores; + } + + /// Set the data vector + pub fn set_data(&mut self, data: Vec) { + self.data = data; + } +} + +/// Iterator over scored strings in the pool +pub struct ScoredStringPoolIterator<'a> { + pool: &'a ScoredStringPool, + pos: usize, +} + +impl<'a> ScoredStringPoolIterator<'a> { + /// Create a new iterator + pub fn new(pool: &'a ScoredStringPool, pos: usize) -> Self { + Self { pool, pos } + } + + /// Check if the iterator is empty + pub fn empty(&self) -> bool { + self.size() == 0 + } + + /// Get the number of strings + pub fn size(&self) -> usize { + self.pool.size() + } + + /// Get the pool + pub fn pool(&self) -> &ScoredStringPool { + self.pool + } +} + +impl<'a> Iterator for ScoredStringPoolIterator<'a> { + type Item = ScoredByteRange; + + fn next(&mut self) -> Option { + if self.pos < self.pool.size() { + let item = self.pool.get(self.pos); + self.pos += 1; + Some(item) + } else { + None + } + } +} + +impl ScoredStringPool { + /// Get an iterator over the scored strings + pub fn iter(&self) -> ScoredStringPoolIterator { + ScoredStringPoolIterator::new(self, 0) + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/trie.rs b/autocomplete-rs/src/trie.rs new file mode 100644 index 0000000..1b24c73 --- /dev/null +++ b/autocomplete-rs/src/trie.rs @@ -0,0 +1,182 @@ +use std::collections::HashMap; +use crate::types::{IdType, CompletionType}; + +/// A node in the completion trie +pub struct TrieNode { + children: HashMap, + is_terminal: bool, + completion_ids: Vec, +} + +impl TrieNode { + /// Create a new trie node + pub fn new() -> Self { + Self { + 
children: HashMap::new(), + is_terminal: false, + completion_ids: Vec::new(), + } + } + + /// Add a child node + pub fn add_child(&mut self, c: char) -> &mut TrieNode { + self.children.entry(c).or_insert_with(TrieNode::new) + } + + /// Get a child node + pub fn get_child(&self, c: char) -> Option<&TrieNode> { + self.children.get(&c) + } + + /// Check if this is a terminal node + pub fn is_terminal(&self) -> bool { + self.is_terminal + } + + /// Set this node as terminal + pub fn set_terminal(&mut self) { + self.is_terminal = true; + } + + /// Add a completion ID + pub fn add_completion_id(&mut self, id: IdType) { + self.completion_ids.push(id); + } + + /// Get completion IDs + pub fn completion_ids(&self) -> &[IdType] { + &self.completion_ids + } +} + +/// A trie for prefix-based completion +pub struct CompletionTrie { + root: TrieNode, + num_nodes: usize, + num_completions: usize, +} + +impl CompletionTrie { + /// Create a new completion trie + pub fn new() -> Self { + Self { + root: TrieNode::new(), + num_nodes: 1, + num_completions: 0, + } + } + + /// Insert a completion string + pub fn insert(&mut self, completion: &str, id: IdType) { + let mut node = &mut self.root; + for c in completion.chars() { + node = node.add_child(c); + self.num_nodes += 1; + } + node.set_terminal(); + node.add_completion_id(id); + self.num_completions += 1; + } + + /// Find all completions for a prefix + pub fn complete(&self, prefix: &str) -> Vec { + let mut node = &self.root; + for c in prefix.chars() { + match node.get_child(c) { + Some(next) => node = next, + None => return Vec::new(), + } + } + self.collect_completions(node) + } + + /// Collect all completion IDs from a node and its children + fn collect_completions(&self, node: &TrieNode) -> Vec { + let mut completions = Vec::new(); + self.collect_completions_recursive(node, &mut completions); + completions + } + + /// Recursive helper for collecting completions + fn collect_completions_recursive(&self, node: &TrieNode, 
completions: &mut Vec) { + if node.is_terminal() { + completions.extend_from_slice(node.completion_ids()); + } + for child in node.children.values() { + self.collect_completions_recursive(child, completions); + } + } + + /// Remove a completion string + pub fn remove(&mut self, completion: &str) -> bool { + let mut chars: Vec = completion.chars().collect(); + if chars.is_empty() { + return false; + } + + // First, find if the completion exists and build the path + let mut path = Vec::new(); + let mut current = &self.root; + + for &c in &chars { + match current.get_child(c) { + Some(next) => { + path.push(c); + current = next; + } + None => return false, + } + } + + if !current.is_terminal() { + return false; + } + + // Now remove it by traversing the path again + let mut current = &mut self.root; + let mut parent = None; + + for &c in &path { + if let Some(next) = current.children.get_mut(&c) { + parent = Some((c, current)); + current = next; + } + } + + // Remove the completion + current.completion_ids.clear(); + current.is_terminal = false; + self.num_completions -= 1; + + // Clean up empty nodes + while let Some((c, p)) = parent { + if current.children.is_empty() && !current.is_terminal() { + p.children.remove(&c); + self.num_nodes -= 1; + current = p; + parent = None; + } else { + break; + } + } + + true + } + + /// Clear the trie + pub fn clear(&mut self) { + self.root = TrieNode::new(); + self.num_nodes = 1; + self.num_completions = 0; + } + + /// Get the number of nodes + pub fn num_nodes(&self) -> usize { + self.num_nodes + } + + /// Get the number of completions + pub fn num_completions(&self) -> usize { + self.num_completions + } +} \ No newline at end of file diff --git a/autocomplete-rs/src/types.rs b/autocomplete-rs/src/types.rs new file mode 100644 index 0000000..5490d59 --- /dev/null +++ b/autocomplete-rs/src/types.rs @@ -0,0 +1,92 @@ +use std::ops::Range; + +/// Type alias for document and term IDs +pub type IdType = u32; + +/// Type alias for 
completion type (vector of term IDs) +pub type CompletionType = Vec; + +/// Represents a range of values +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ValueRange { + pub begin: u64, + pub end: u64, +} + +impl ValueRange { + /// Check if the range is invalid + pub fn is_invalid(&self) -> bool { + self.begin == u64::MAX || self.end == u64::MAX || self.begin > self.end + } + + /// Check if the range is valid + pub fn is_valid(&self) -> bool { + !self.is_invalid() + } + + /// Check if a value is contained in the range + pub fn contains(&self, val: u64) -> bool { + val >= self.begin && val <= self.end + } +} + +/// Represents a scored range +#[derive(Debug, Clone)] +pub struct ScoredRange { + pub range: ValueRange, + pub min_pos: u32, + pub min_val: IdType, +} + +impl ScoredRange { + /// Compare two scored ranges + pub fn greater(l: &ScoredRange, r: &ScoredRange) -> bool { + l.min_val > r.min_val + } +} + +/// Represents a byte range +#[derive(Debug, Clone, Copy)] +pub struct ByteRange { + pub begin: *const u8, + pub end: *const u8, +} + +/// Represents a range of 32-bit integers +#[derive(Debug, Clone, Copy)] +pub struct Uint32Range { + pub begin: *const u32, + pub end: *const u32, +} + +/// Global constants +pub mod global { + use super::IdType; + + /// Invalid term ID + pub const INVALID_TERM_ID: IdType = IdType::MAX; + + /// Terminator value + pub const TERMINATOR: IdType = 0; + + /// Not found value + pub const NOT_FOUND: u64 = u64::MAX; + + /// Linear scan threshold + pub const LINEAR_SCAN_THRESHOLD: u64 = 8; +} + +/// Convert a string to a byte range +pub fn string_to_byte_range(s: &str) -> ByteRange { + let begin = s.as_ptr(); + let end = unsafe { begin.add(s.len()) }; + ByteRange { begin, end } +} + +/// Convert a completion to a uint32 range +pub fn completion_to_uint32_range(c: &CompletionType) -> Uint32Range { + Uint32Range { + begin: c.as_ptr(), + end: unsafe { c.as_ptr().add(c.len()) }, + } +} \ No newline at end of file From 
97293449a961c4c42ac6e5ffa1b20afce99d3903 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 14:34:04 -0400 Subject: [PATCH 097/102] define entry points --- autocomplete-rs/Cargo.lock | 1194 ++++++++++++++++++++++++- autocomplete-rs/Cargo.toml | 4 + autocomplete-rs/schema/schema.graphql | 41 + autocomplete-rs/src/graphql.rs | 90 ++ autocomplete-rs/src/main.rs | 39 +- autocomplete-rs/src/server.rs | 59 +- 6 files changed, 1386 insertions(+), 41 deletions(-) create mode 100644 autocomplete-rs/schema/schema.graphql create mode 100644 autocomplete-rs/src/graphql.rs diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index d1b8fd2..486a6ac 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -2,6 +2,16 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "Inflector" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" +dependencies = [ + "lazy_static", + "regex", +] + [[package]] name = "addr2line" version = "0.24.2" @@ -26,12 +36,160 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" 
+version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.59.0", +] + [[package]] name = "anyhow" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +[[package]] +name = "ascii_utils" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71938f30533e4d95a6d17aa530939da3842c2ab6f4f84b9dae68447e4129f74a" + +[[package]] +name = "async-graphql" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298a5d587d6e6fdb271bf56af2dc325a80eb291fd0fc979146584b9a05494a8c" +dependencies = [ + "async-graphql-derive", + "async-graphql-parser", + "async-graphql-value", + "async-stream", + "async-trait", + "base64 0.13.1", + "bytes", + "fast_chemail", + "fnv", + "futures-util", + "handlebars", + "http 0.2.12", + "indexmap 2.9.0", + "mime", + "multer", + "num-traits", + "once_cell", + "pin-project-lite", + "regex", + "serde", + "serde_json", + "serde_urlencoded", + "static_assertions", + "tempfile", + "thiserror 1.0.69", +] + +[[package]] +name = "async-graphql-axum" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a1c20a2059bffbc95130715b23435a05168c518fba9709c81fa2a38eed990c" +dependencies = [ + "async-graphql", + "async-trait", + "axum 0.6.20", + "bytes", + "futures-util", + "serde_json", + "tokio", + "tokio-stream", + "tokio-util", + "tower-service", +] + +[[package]] +name = "async-graphql-derive" +version = 
"6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f329c7eb9b646a72f70c9c4b516c70867d356ec46cb00dcac8ad343fd006b0" +dependencies = [ + "Inflector", + "async-graphql-parser", + "darling", + "proc-macro-crate", + "proc-macro2", + "quote", + "strum", + "syn", + "thiserror 1.0.69", +] + +[[package]] +name = "async-graphql-parser" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6139181845757fd6a73fbb8839f3d036d7150b798db0e9bb3c6e83cdd65bd53b" +dependencies = [ + "async-graphql-value", + "pest", + "serde", + "serde_json", +] + +[[package]] +name = "async-graphql-value" +version = "6.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "323a5143f5bdd2030f45e3f2e0c821c9b1d36e79cf382129c64299c50a7f3750" +dependencies = [ + "bytes", + "indexmap 2.9.0", + "serde", + "serde_json", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -75,6 +233,10 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "autocomplete-rs" version = "0.1.0" dependencies = [ + "async-graphql", + "async-graphql-axum", + "axum 0.7.9", + "clap", "futures", "prost", "tempfile", @@ -90,13 +252,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core", + "axum-core 0.3.4", + "base64 0.21.7", "bitflags 1.3.2", "bytes", "futures-util", - "http", - "http-body", - "hyper", + "headers", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "itoa", "matchit", "memchr", @@ -105,12 +269,52 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper", - "tower", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sha1", + "sync_wrapper 0.1.2", + "tokio", + "tokio-tungstenite", + "tower 0.4.13", "tower-layer", "tower-service", ] +[[package]] +name = "axum" +version = "0.7.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper 1.6.0", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper 1.0.2", + "tokio", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "axum-core" version = "0.3.4" @@ -120,12 +324,33 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", "mime", + "pin-project-lite", "rustversion", + "sync_wrapper 1.0.2", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -143,6 +368,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -161,11 +392,29 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" 
+dependencies = [ + "generic-array", +] + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +dependencies = [ + "serde", +] [[package]] name = "cfg-if" @@ -173,12 +422,148 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "clap" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "data-encoding" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "either" version = "1.15.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -195,6 +580,15 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fast_chemail" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "495a39d30d624c2caabe6312bfead73e7717692b44e0b32df168c275a2e8e9e4" +dependencies = [ + "ascii_utils", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -213,6 +607,15 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futures" version = "0.3.31" @@ -302,6 +705,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -342,7 +755,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.12", "indexmap 2.9.0", "slab", "tokio", @@ -350,6 +763,20 @@ dependencies = [ "tracing", ] +[[package]] +name = "handlebars" +version = "4.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faa67bab9ff362228eb3d00bd024a4965d8231bbb7921167f0cfa66c6626b225" 
+dependencies = [ + "log", + "pest", + "pest_derive", + "serde", + "serde_json", + "thiserror 1.0.69", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -362,6 +789,36 @@ version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" +[[package]] +name = "headers" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06683b93020a07e3dbcf5f8c0f6d40080d725bea7936fc01ad345c01b97dc270" +dependencies = [ + "base64 0.21.7", + "bytes", + "headers-core", + "http 0.2.12", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http 0.2.12", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -379,6 +836,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.6" @@ -386,7 +854,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.3.1", +] + +[[package]] +name = 
"http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.3.1", + "http-body 1.0.1", "pin-project-lite", ] @@ -413,8 +904,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -426,18 +917,166 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.32", "pin-project-lite", "tokio", "tokio-io-timeout", ] +[[package]] +name = "hyper-util" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710" +dependencies = [ + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "hyper 1.6.0", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -456,8 +1095,15 @@ checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown 0.15.3", + "serde", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.12.1" @@ -473,6 +1119,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.172" @@ -485,6 +1137,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + [[package]] name = "lock_api" version = "0.4.12" @@ -539,12 +1197,39 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "multer" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01acbdc23469fd8fe07ab135923371d5f5a422fbf9c522158677c8eb15bc51c2" 
+dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http 0.2.12", + "httparse", + "log", + "memchr", + "mime", + "spin", + "version_check", +] + [[package]] name = "multimap" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "object" version = "0.36.7" @@ -560,6 +1245,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + [[package]] name = "parking_lot" version = "0.12.3" @@ -574,20 +1265,65 @@ dependencies = [ name = "parking_lot_core" version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pest" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" +dependencies = [ + "memchr", + "thiserror 2.0.12", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version 
= "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d725d9cfd79e87dccc9341a2ef39d1b6f6353d68c4b33c177febbe1a402c97c5" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db7d01726be8ab66ab32f9df467ae8b1148906685bbe75c82d1e65d7f5b3f841" dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets", + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "percent-encoding" -version = "2.3.1" +name = "pest_meta" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "7f9f832470494906d1fca5329f8ab5791cc60beb230c74815dff541cbd2b5ca0" +dependencies = [ + "once_cell", + "pest", + "sha2", +] [[package]] name = "petgraph" @@ -631,6 +1367,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -650,6 +1395,16 @@ dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" +dependencies = [ + "once_cell", + "toml_edit", +] + [[package]] name = "proc-macro2" version = "1.0.95" @@ -676,7 +1431,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - 
"heck", + "heck 0.5.0", "itertools", "log", "multimap", @@ -820,6 +1575,12 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + [[package]] name = "scopeguard" version = "1.2.0" @@ -846,6 +1607,62 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" +dependencies = [ + "itoa", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "signal-hook-registry" version = "1.4.5" @@ -880,6 +1697,52 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "spin" 
+version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" version = "2.0.101" @@ -897,6 +1760,23 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + 
[[package]] name = "tempfile" version = "3.20.0" @@ -910,6 +1790,56 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl 2.0.12", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tokio" version = "1.45.0" @@ -960,6 +1890,18 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-tungstenite" +version = "0.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "212d5dcb2a1ce06d81107c3d0ffa3121fe974b73f068c8282cb1c32328113b6c" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite", +] + [[package]] name = "tokio-util" version = "0.7.15" @@ -968,11 +1910,29 @@ checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" dependencies = [ "bytes", "futures-core", + "futures-io", "futures-sink", 
"pin-project-lite", "tokio", ] +[[package]] +name = "toml_datetime" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" + +[[package]] +name = "toml_edit" +version = "0.19.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" +dependencies = [ + "indexmap 2.9.0", + "toml_datetime", + "winnow", +] + [[package]] name = "tonic" version = "0.10.2" @@ -981,20 +1941,20 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" dependencies = [ "async-stream", "async-trait", - "axum", - "base64", + "axum 0.6.20", + "base64 0.21.7", "bytes", "h2", - "http", - "http-body", - "hyper", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "hyper-timeout", "percent-encoding", "pin-project", "prost", "tokio", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -1033,6 +1993,22 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.2", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tower-layer" version = "0.3.3" @@ -1051,6 +2027,7 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -1082,12 +2059,78 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "tungstenite" +version = "0.20.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http 0.2.12", + "httparse", + "log", + "rand", + "sha1", + "thiserror 1.0.69", + "url", + "utf-8", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "want" 
version = "0.3.1" @@ -1194,6 +2237,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen-rt" version = "0.39.0" @@ -1203,6 +2255,36 @@ dependencies = [ "bitflags 2.9.1", ] +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.25" @@ -1222,3 +2304,57 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index fc1d1f9..74c25c0 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -8,6 +8,10 @@ tonic = "0.10" prost = "0.12" tokio = { version = "1.0", features = ["full"] } futures = "0.3" +async-graphql = "6.0" +async-graphql-axum = "6.0" +axum = "0.7" +clap = { version = "4.4", features = ["derive"] } [dev-dependencies] tempfile = "3.8" diff --git a/autocomplete-rs/schema/schema.graphql b/autocomplete-rs/schema/schema.graphql new file mode 100644 index 0000000..70da230 --- /dev/null +++ b/autocomplete-rs/schema/schema.graphql @@ -0,0 +1,41 @@ +type Query { + # Get completions for a prefix + complete(prefix: String!, maxResults: Int): CompleteResponse! + + # Get system statistics + stats: Stats! +} + +type Mutation { + # Initialize the autocomplete system with strings and scores + init(strings: [StringScoreInput!]!): InitResponse! +} + +# Input type for string with score +input StringScoreInput { + text: String! + score: Float! +} + +# Response type for completions +type CompleteResponse { + completions: [Completion!]! +} + +# A single completion result +type Completion { + text: String! + score: Float! 
+} + +# Response type for initialization +type InitResponse { + success: Boolean! + error: String +} + +# System statistics +type Stats { + numTerms: Int! + memoryBytes: Int! +} \ No newline at end of file diff --git a/autocomplete-rs/src/graphql.rs b/autocomplete-rs/src/graphql.rs new file mode 100644 index 0000000..197c180 --- /dev/null +++ b/autocomplete-rs/src/graphql.rs @@ -0,0 +1,90 @@ +use async_graphql::{Object, Schema, SimpleObject, InputObject}; +use crate::autocomplete::Autocomplete; + +#[derive(SimpleObject)] +struct Completion { + text: String, + score: f32, +} + +#[derive(SimpleObject)] +struct CompleteResponse { + completions: Vec, +} + +#[derive(SimpleObject)] +struct Stats { + num_terms: i32, + memory_bytes: i64, +} + +#[derive(SimpleObject)] +struct InitResponse { + success: bool, + error: Option, +} + +#[derive(InputObject)] +struct StringScoreInput { + text: String, + score: f32, +} + +pub struct QueryRoot { + autocomplete: Autocomplete, +} + +#[Object] +impl QueryRoot { + async fn complete(&self, prefix: String, max_results: Option) -> CompleteResponse { + let completions = self.autocomplete.complete(&prefix); + let completions = completions.into_iter() + .map(|(text, score)| Completion { text, score }) + .collect(); + + CompleteResponse { completions } + } + + async fn stats(&self) -> Stats { + Stats { + num_terms: self.autocomplete.num_terms() as i32, + memory_bytes: self.autocomplete.bytes() as i64, + } + } +} + +pub struct MutationRoot { + autocomplete: Autocomplete, +} + +#[Object] +impl MutationRoot { + async fn init(&self, strings: Vec) -> InitResponse { + let strings: Vec<(String, f32)> = strings + .into_iter() + .map(|s| (s.text, s.score)) + .collect(); + + match self.autocomplete.init(&strings) { + Ok(_) => InitResponse { + success: true, + error: None, + }, + Err(e) => InitResponse { + success: false, + error: Some(e.to_string()), + }, + } + } +} + +pub type AppSchema = Schema; + +pub fn create_schema(autocomplete: Autocomplete) -> 
AppSchema { + Schema::build( + QueryRoot { autocomplete: autocomplete.clone() }, + MutationRoot { autocomplete }, + async_graphql::EmptySubscription, + ) + .finish() +} \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index e7a11a9..ee606d7 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -1,3 +1,38 @@ -fn main() { - println!("Hello, world!"); +use std::error::Error; +use clap::Parser; + +mod autocomplete; +mod graphql; +mod server; +mod string_pool; +mod trie; +mod types; +mod utils; + +/// Autocomplete service with gRPC and GraphQL support +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// gRPC server address + #[arg(short, long, default_value = "[::1]:50051")] + grpc_addr: String, + + /// GraphQL server address + #[arg(short, long, default_value = "[::1]:8000")] + graphql_addr: String, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + + println!("Starting Autocomplete Service..."); + println!("gRPC server will listen on: {}", args.grpc_addr); + println!("GraphQL server will listen on: {}", args.graphql_addr); + println!("GraphQL Playground available at: http://{}/playground", args.graphql_addr); + + // Start both servers + server::run_server(&args.grpc_addr, &args.graphql_addr).await?; + + Ok(()) } diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs index 6ddacbd..733afef 100644 --- a/autocomplete-rs/src/server.rs +++ b/autocomplete-rs/src/server.rs @@ -1,5 +1,15 @@ -use tonic::{transport::Server, Request, Response, Status}; +use std::net::SocketAddr; +use tonic::{transport::Server as TonicServer, Request, Response, Status}; +use axum::{ + routing::{get, post}, + Router, + extract::State, + response::IntoResponse, + Json, +}; +use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; use crate::autocomplete::{Autocomplete, Autocomplete2}; +use 
crate::graphql::{create_schema, AppSchema}; pub mod autocomplete_proto { tonic::include_proto!("autocomplete"); @@ -72,18 +82,47 @@ impl AutocompleteService for AutocompleteServiceImpl { } } -pub async fn run_server(addr: &str) -> Result<(), Box> { - let addr = addr.parse()?; - let service = AutocompleteServiceImpl { - autocomplete: Autocomplete::new(), +async fn graphql_handler( + State(schema): State, + req: GraphQLRequest, +) -> GraphQLResponse { + schema.execute(req.into_inner()).await.into() +} + +async fn graphql_playground() -> impl IntoResponse { + async_graphql::http::playground_source( + async_graphql::http::GraphQLPlaygroundConfig::new("/graphql") + ) +} + +pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box> { + let autocomplete = Autocomplete::new(); + let schema = create_schema(autocomplete.clone()); + + // Create gRPC service + let grpc_service = AutocompleteServiceImpl { + autocomplete: autocomplete.clone(), }; - println!("Autocomplete server listening on {}", addr); + // Create GraphQL router + let app = Router::new() + .route("/graphql", post(graphql_handler)) + .route("/playground", get(graphql_playground)) + .with_state(schema); + + // Start both servers + let grpc_addr = grpc_addr.parse()?; + let graphql_addr = graphql_addr.parse()?; + + println!("gRPC server listening on {}", grpc_addr); + println!("GraphQL server listening on {}", graphql_addr); - Server::builder() - .add_service(AutocompleteServiceServer::new(service)) - .serve(addr) - .await?; + tokio::join!( + TonicServer::builder() + .add_service(AutocompleteServiceServer::new(grpc_service)) + .serve(grpc_addr), + axum::Server::bind(&graphql_addr).serve(app.into_make_service()) + ); Ok(()) } \ No newline at end of file From 4d3cb52425949714ca31900c71dd2293e8de1277 Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 14:52:34 -0400 Subject: [PATCH 098/102] partial fix --- autocomplete-rs/Cargo.lock | 199 ++++++------------------ 
autocomplete-rs/Cargo.toml | 7 +- autocomplete-rs/LICENSE | 21 +++ autocomplete-rs/README.md | 122 ++++++++++++--- autocomplete-rs/src/autocomplete.rs | 225 +++------------------------ autocomplete-rs/src/dictionary.rs | 205 ++++-------------------- autocomplete-rs/src/graphql.rs | 27 ++-- autocomplete-rs/src/index.rs | 56 ++----- autocomplete-rs/src/lib.rs | 2 + autocomplete-rs/src/main.rs | 1 - autocomplete-rs/src/server.rs | 25 +-- autocomplete-rs/src/string_pool.rs | 50 +++--- autocomplete-rs/src/trie.rs | 233 ++++++++++++---------------- autocomplete-rs/src/types.rs | 26 +++- 14 files changed, 411 insertions(+), 788 deletions(-) create mode 100644 autocomplete-rs/LICENSE diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index 486a6ac..bd45602 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -115,7 +115,7 @@ dependencies = [ "fnv", "futures-util", "handlebars", - "http 0.2.12", + "http", "indexmap 2.9.0", "mime", "multer", @@ -139,7 +139,7 @@ checksum = "01a1c20a2059bffbc95130715b23435a05168c518fba9709c81fa2a38eed990c" dependencies = [ "async-graphql", "async-trait", - "axum 0.6.20", + "axum", "bytes", "futures-util", "serde_json", @@ -235,14 +235,17 @@ version = "0.1.0" dependencies = [ "async-graphql", "async-graphql-axum", - "axum 0.7.9", + "axum", "clap", "futures", + "hyper", "prost", "tempfile", "tokio", "tonic", "tonic-build", + "tower", + "tower-http", ] [[package]] @@ -252,15 +255,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core 0.3.4", + "axum-core", + "axum-macros", "base64 0.21.7", "bitflags 1.3.2", "bytes", "futures-util", "headers", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.32", + "http", + "http-body", + "hyper", "itoa", "matchit", "memchr", @@ -273,48 +277,14 @@ dependencies = [ "serde_path_to_error", "serde_urlencoded", "sha1", - "sync_wrapper 
0.1.2", + "sync_wrapper", "tokio", "tokio-tungstenite", - "tower 0.4.13", + "tower", "tower-layer", "tower-service", ] -[[package]] -name = "axum" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" -dependencies = [ - "async-trait", - "axum-core 0.4.5", - "bytes", - "futures-util", - "http 1.3.1", - "http-body 1.0.1", - "http-body-util", - "hyper 1.6.0", - "hyper-util", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sync_wrapper 1.0.2", - "tokio", - "tower 0.5.2", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "axum-core" version = "0.3.4" @@ -324,8 +294,8 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 0.2.12", - "http-body 0.4.6", + "http", + "http-body", "mime", "rustversion", "tower-layer", @@ -333,24 +303,15 @@ dependencies = [ ] [[package]] -name = "axum-core" -version = "0.4.5" +name = "axum-macros" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "cdca6a10ecad987bda04e95606ef85a5417dcaac1a78455242d72e031e2b6b62" dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http 1.3.1", - "http-body 1.0.1", - "http-body-util", - "mime", - "pin-project-lite", - "rustversion", - "sync_wrapper 1.0.2", - "tower-layer", - "tower-service", - "tracing", + "heck 0.4.1", + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -755,7 +716,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http 0.2.12", + "http", "indexmap 2.9.0", "slab", "tokio", @@ -798,7 +759,7 @@ dependencies = [ "base64 0.21.7", "bytes", "headers-core", - "http 0.2.12", + "http", "httpdate", "mime", "sha1", @@ -810,7 +771,7 @@ version = "0.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" dependencies = [ - "http 0.2.12", + "http", ] [[package]] @@ -836,17 +797,6 @@ dependencies = [ "itoa", ] -[[package]] -name = "http" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - [[package]] name = "http-body" version = "0.4.6" @@ -854,32 +804,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http 0.2.12", + "http", "pin-project-lite", ] [[package]] -name = "http-body" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" -dependencies = [ - "bytes", - "http 1.3.1", -] - -[[package]] -name = "http-body-util" -version = "0.1.3" +name = "http-range-header" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" -dependencies = [ - "bytes", - "futures-core", - "http 1.3.1", - "http-body 1.0.1", - "pin-project-lite", -] +checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" [[package]] name = "httparse" @@ -904,8 +837,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 0.2.12", - "http-body 0.4.6", + "http", + "http-body", "httparse", "httpdate", "itoa", @@ -917,53 +850,18 @@ dependencies = [ "want", ] -[[package]] -name = "hyper" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http 1.3.1", - "http-body 
1.0.1", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "smallvec", - "tokio", -] - [[package]] name = "hyper-timeout" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper 0.14.32", + "hyper", "pin-project-lite", "tokio", "tokio-io-timeout", ] -[[package]] -name = "hyper-util" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710" -dependencies = [ - "bytes", - "futures-util", - "http 1.3.1", - "http-body 1.0.1", - "hyper 1.6.0", - "pin-project-lite", - "tokio", - "tower-service", -] - [[package]] name = "icu_collections" version = "2.0.0" @@ -1206,7 +1104,7 @@ dependencies = [ "bytes", "encoding_rs", "futures-util", - "http 0.2.12", + "http", "httparse", "log", "memchr", @@ -1760,12 +1658,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" -[[package]] -name = "sync_wrapper" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" - [[package]] name = "synstructure" version = "0.13.2" @@ -1941,20 +1833,20 @@ checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" dependencies = [ "async-stream", "async-trait", - "axum 0.6.20", + "axum", "base64 0.21.7", "bytes", "h2", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.32", + "http", + "http-body", + "hyper", "hyper-timeout", "percent-encoding", "pin-project", "prost", "tokio", "tokio-stream", - "tower 0.4.13", + "tower", "tower-layer", "tower-service", "tracing", @@ -1994,16 +1886,19 @@ dependencies = [ ] [[package]] -name = "tower" -version = "0.5.2" +name = "tower-http" +version = "0.4.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" dependencies = [ + "bitflags 2.9.1", + "bytes", "futures-core", "futures-util", + "http", + "http-body", + "http-range-header", "pin-project-lite", - "sync_wrapper 1.0.2", - "tokio", "tower-layer", "tower-service", "tracing", @@ -2068,7 +1963,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http 0.2.12", + "http", "httparse", "log", "rand", diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index 74c25c0..b799bbe 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -4,13 +4,16 @@ version = "0.1.0" edition = "2021" [dependencies] -tonic = "0.10" +tonic = { version = "0.10", features = ["transport"] } prost = "0.12" tokio = { version = "1.0", features = ["full"] } futures = "0.3" async-graphql = "6.0" async-graphql-axum = "6.0" -axum = "0.7" +axum = { version = "0.6", features = ["macros"] } +tower = "0.4" +tower-http = { version = "0.4", features = ["trace"] } +hyper = { version = "0.14", features = ["full"] } clap = { version = "4.4", features = ["derive"] } [dev-dependencies] diff --git a/autocomplete-rs/LICENSE b/autocomplete-rs/LICENSE new file mode 100644 index 0000000..d874d0b --- /dev/null +++ b/autocomplete-rs/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Autocomplete Service Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice 
shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/autocomplete-rs/README.md b/autocomplete-rs/README.md index 801e4b2..4c11811 100644 --- a/autocomplete-rs/README.md +++ b/autocomplete-rs/README.md @@ -1,43 +1,117 @@ -# Autocomplete-rs +# Autocomplete Service -This project is a Rust port of the original C++ autocomplete system. The goal is to maintain the same functionality while leveraging Rust's safety guarantees and modern tooling. +A high-performance autocomplete service written in Rust, supporting both gRPC and GraphQL interfaces. -## Project Status +## Features + +- **Dual API Support** + - gRPC interface for high-performance RPC calls + - GraphQL interface for flexible querying + - Shared backend implementation for both APIs -Currently, we are in the process of porting the core components from C++ to Rust. The following components have been ported: +- **Core Features** + - Fast prefix-based autocomplete + - Score-based ranking of suggestions + - Memory-efficient string storage + - Concurrent request handling -- Basic constants and configuration -- Parameters management -- Performance measurement probes +- **API Endpoints** + - gRPC: `[::1]:50051` (configurable) + - GraphQL: `[::1]:8000/graphql` (configurable) + - GraphQL Playground: `[::1]:8000/playground` -## Next Steps +## Project Status -1. 
Continue porting core components: - - Scored string pool - - Completion trie - - Blocked inverted index - - Front-coded dictionary +### Completed +- ✅ Basic autocomplete implementation +- ✅ gRPC server implementation +- ✅ GraphQL server implementation +- ✅ Command-line configuration +- ✅ Shared backend between APIs -2. Port and adapt unit tests to ensure functionality matches the original implementation +### In Progress +- 🔄 Documentation +- 🔄 Testing suite +- 🔄 Performance benchmarks -3. Containerize the application using Docker for easy deployment and testing +### Planned +- ⏳ Authentication +- ⏳ Rate limiting +- ⏳ Metrics and monitoring +- ⏳ Docker support +- ⏳ Client examples in multiple languages -## Building and Testing +## Getting Started +### Prerequisites +- Rust 1.70 or later +- Cargo + +### Building ```bash -# Build the project -cargo build +cargo build --release +``` -# Run tests -cargo test +### Running +```bash +# Default configuration +cargo run + +# Custom addresses +cargo run -- --grpc-addr 127.0.0.1:50051 --graphql-addr 127.0.0.1:8000 + +# Show help +cargo run -- --help +``` -# Run with specific test -cargo test test_name -- --nocapture +## API Usage + +### gRPC +```protobuf +service AutocompleteService { + rpc Complete(CompleteRequest) returns (CompleteResponse); + rpc Init(InitRequest) returns (InitResponse); + rpc GetStats(StatsRequest) returns (StatsResponse); +} +``` + +### GraphQL +```graphql +type Query { + complete(prefix: String!, maxResults: Int): CompleteResponse! + stats: StatsResponse! +} + +type Mutation { + init(strings: [StringInput!]!): InitResponse! 
+} +``` + +## Project Structure + +``` +autocomplete-rs/ +├── src/ +│ ├── main.rs # Entry point and CLI +│ ├── autocomplete.rs # Core autocomplete logic +│ ├── graphql.rs # GraphQL schema and resolvers +│ ├── server.rs # Server implementations +│ ├── string_pool.rs # String interning +│ ├── trie.rs # Trie data structure +│ └── types.rs # Common types +├── proto/ +│ └── autocomplete.proto # gRPC service definition +└── schema/ + └── schema.graphql # GraphQL schema ``` -## Original Project +## Contributing -This is a port of the original C++ autocomplete system, which provides efficient string completion functionality. The original implementation can be found in the `archive` directory. +1. Fork the repository +2. Create your feature branch (`git checkout -b feature/amazing-feature`) +3. Commit your changes (`git commit -m 'Add amazing feature'`) +4. Push to the branch (`git push origin feature/amazing-feature`) +5. Open a Pull Request ## License diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs index 1191a75..b910078 100644 --- a/autocomplete-rs/src/autocomplete.rs +++ b/autocomplete-rs/src/autocomplete.rs @@ -1,222 +1,45 @@ -use std::collections::HashMap; -use crate::types::{IdType, ByteRange, global}; -use crate::string_pool::ScoredStringPool; -use crate::trie::CompletionTrie; -use crate::dictionary::{FCDictionary, IntegerFCDictionary}; -use crate::index::{BlockedInvertedIndex, CompactVector, BitVector}; +use crate::types::ScoreType; +use crate::trie::Trie; +use crate::dictionary::Dictionary; -const BLOCK_SIZE: usize = 1024; - -/// Main autocomplete implementation +#[derive(Clone)] pub struct Autocomplete { - string_pool: ScoredStringPool, - trie: CompletionTrie, - dictionary: FCDictionary, - index: BlockedInvertedIndex, - term_to_id: HashMap, - id_to_term: Vec, - num_terms: usize, + trie: Trie, + dictionary: Dictionary, } impl Autocomplete { - /// Create a new autocomplete instance - pub fn new() -> Self { - Self { - 
string_pool: ScoredStringPool::new(), - trie: CompletionTrie::new(), - dictionary: FCDictionary::new(), - index: BlockedInvertedIndex::new(BLOCK_SIZE), - term_to_id: HashMap::new(), - id_to_term: Vec::new(), - num_terms: 0, - } - } - - /// Initialize the autocomplete system - pub fn init(&mut self, strings: &[String], scores: &[IdType]) { - assert_eq!(strings.len(), scores.len()); - - // Build string pool - self.string_pool = ScoredStringPool::new(); - let mut offsets = Vec::with_capacity(strings.len() + 1); - let mut all_scores = Vec::with_capacity(strings.len()); - let mut total_bytes = 0; - offsets.push(0); - for (string, &score) in strings.iter().zip(scores) { - total_bytes += string.len(); - offsets.push(total_bytes); - all_scores.push(score); - } - self.string_pool.set_offsets(offsets); - self.string_pool.set_scores(all_scores); - self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); - - // Build dictionary - self.dictionary.build(strings); - - // Build term mappings - self.term_to_id.clear(); - self.id_to_term.clear(); - for (i, string) in strings.iter().enumerate() { - self.term_to_id.insert(string.clone(), (i + 1) as IdType); - self.id_to_term.push(string.clone()); - } - self.num_terms = strings.len(); - - // Build trie - self.trie.clear(); - for (i, string) in strings.iter().enumerate() { - self.trie.insert(string, (i + 1) as IdType); - } - - // Build index - self.index.clear(); - for (i, _string) in strings.iter().enumerate() { - let term_id = (i + 1) as IdType; - self.index.add_doc(term_id, term_id); - } - } - - /// Find completions for a prefix - pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { - let mut results = Vec::new(); - - // Get completion IDs from trie - let completion_ids = self.trie.complete(prefix); - - // Look up strings and scores - for &id in &completion_ids { - if let Some(string) = self.dictionary.lookup(id) { - let scored_range = self.string_pool.get(id as usize); - 
results.push((string, scored_range.score)); - } - } - - // Sort by score (descending) - results.sort_by(|a, b| b.1.cmp(&a.1)); - results - } - - /// Get the number of terms - pub fn num_terms(&self) -> usize { - self.num_terms - } - - /// Get the size in bytes - pub fn bytes(&self) -> usize { - self.string_pool.bytes() + - self.trie.num_nodes() * std::mem::size_of::() + - self.dictionary.bytes() + - self.index.num_blocks() * std::mem::size_of::() + - self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + - self.id_to_term.capacity() * std::mem::size_of::() - } -} - -/// Integer-based autocomplete implementation -pub struct Autocomplete2 { - string_pool: ScoredStringPool, - trie: CompletionTrie, - dictionary: IntegerFCDictionary, - index: BlockedInvertedIndex, - term_to_id: HashMap, - id_to_term: Vec, - num_terms: usize, -} - -impl Autocomplete2 { - /// Create a new integer-based autocomplete instance pub fn new() -> Self { Self { - string_pool: ScoredStringPool::new(), - trie: CompletionTrie::new(), - dictionary: IntegerFCDictionary::new(), - index: BlockedInvertedIndex::new(BLOCK_SIZE), - term_to_id: HashMap::new(), - id_to_term: Vec::new(), - num_terms: 0, + trie: Trie::new(), + dictionary: Dictionary::new(), } } - /// Initialize the autocomplete system - pub fn init(&mut self, strings: &[String], scores: &[IdType]) { - assert_eq!(strings.len(), scores.len()); - - // Build string pool - self.string_pool = ScoredStringPool::new(); - let mut offsets = Vec::with_capacity(strings.len() + 1); - let mut all_scores = Vec::with_capacity(strings.len()); - let mut total_bytes = 0; - offsets.push(0); - for (string, &score) in strings.iter().zip(scores) { - total_bytes += string.len(); - offsets.push(total_bytes); - all_scores.push(score); - } - self.string_pool.set_offsets(offsets); - self.string_pool.set_scores(all_scores); - self.string_pool.set_data(strings.iter().flat_map(|s| s.as_bytes()).cloned().collect()); - - // Build dictionary - 
self.dictionary.build(strings); - - // Build term mappings - self.term_to_id.clear(); - self.id_to_term.clear(); - for (i, string) in strings.iter().enumerate() { - self.term_to_id.insert(string.clone(), (i + 1) as IdType); - self.id_to_term.push(string.clone()); - } - self.num_terms = strings.len(); - - // Build trie - self.trie.clear(); - for (i, string) in strings.iter().enumerate() { - self.trie.insert(string, (i + 1) as IdType); - } - - // Build index - self.index.clear(); - for (i, _string) in strings.iter().enumerate() { - let term_id = (i + 1) as IdType; - self.index.add_doc(term_id, term_id); + pub fn init(&mut self, strings: &[(String, ScoreType)]) -> Result<(), String> { + for (string, score) in strings { + let id = self.dictionary.insert(string.clone()); + self.trie.insert(string, id, *score); } + Ok(()) } - /// Find completions for a prefix - pub fn complete(&self, prefix: &str) -> Vec<(String, IdType)> { - let mut results = Vec::new(); - let mut completion = Vec::new(); - - // Get completion IDs from trie - let completion_ids = self.trie.complete(prefix); - - // Look up strings and scores - for &id in &completion_ids { - let len = self.dictionary.extract(id, &mut completion); - if len > 0 { - let scored_range = self.string_pool.get(id as usize); - let string = String::from_utf8_lossy(&completion).into_owned(); - results.push((string, scored_range.score)); - } - } - - // Sort by score (descending) - results.sort_by(|a, b| b.1.cmp(&a.1)); - results + pub fn complete(&self, prefix: &str) -> Vec<(String, ScoreType)> { + let completions = self.trie.complete(prefix); + completions + .into_iter() + .filter_map(|(id, score)| { + self.dictionary.get(id).map(|text| (text.to_string(), score)) + }) + .collect() } - /// Get the number of terms pub fn num_terms(&self) -> usize { - self.num_terms + self.dictionary.len() } - /// Get the size in bytes pub fn bytes(&self) -> usize { - self.string_pool.bytes() + - self.trie.num_nodes() * std::mem::size_of::() + - 
self.index.num_blocks() * std::mem::size_of::() + - self.term_to_id.capacity() * std::mem::size_of::<(String, IdType)>() + - self.id_to_term.capacity() * std::mem::size_of::() + // TODO: Implement actual memory usage calculation + 0 } } \ No newline at end of file diff --git a/autocomplete-rs/src/dictionary.rs b/autocomplete-rs/src/dictionary.rs index 99a37b2..a09adda 100644 --- a/autocomplete-rs/src/dictionary.rs +++ b/autocomplete-rs/src/dictionary.rs @@ -1,199 +1,46 @@ -use std::collections::HashMap; -use crate::types::{ByteRange, IdType, global}; +use crate::types::IdType; -/// Front-coded dictionary for string compression -pub struct FCDictionary { - data: Vec, - offsets: Vec, - num_strings: usize, - total_size: usize, +#[derive(Clone)] +pub struct Dictionary { + strings: Vec, + id_map: std::collections::HashMap, + next_id: IdType, } -impl FCDictionary { - /// Create a new front-coded dictionary +impl Dictionary { pub fn new() -> Self { Self { - data: Vec::new(), - offsets: Vec::new(), - num_strings: 0, - total_size: 0, + strings: Vec::new(), + id_map: std::collections::HashMap::new(), + next_id: 0, } } - /// Build the dictionary from a list of strings - pub fn build(&mut self, strings: &[String]) { - if strings.is_empty() { - return; + pub fn insert(&mut self, string: String) -> IdType { + if let Some(&id) = self.id_map.get(&string) { + return id; } - self.num_strings = strings.len(); - self.offsets.clear(); - self.data.clear(); - self.total_size = 0; - - // Sort strings for better compression - let mut sorted_strings: Vec<_> = strings.iter().collect(); - sorted_strings.sort(); - - // First string is stored completely - let first = sorted_strings[0]; - self.offsets.push(0); - self.data.extend_from_slice(first.as_bytes()); - self.total_size += first.len(); - - // Process remaining strings - for i in 1..sorted_strings.len() { - let prev = sorted_strings[i - 1]; - let curr = sorted_strings[i]; - - // Find common prefix - let lcp = 
self.longest_common_prefix(prev, curr); - - // Store offset and remaining string - self.offsets.push(self.total_size as u32); - self.data.push(lcp as u8); - self.data.extend_from_slice(&curr.as_bytes()[lcp..]); - self.total_size += 1 + curr.len() - lcp; - } - } - - /// Find the longest common prefix between two strings - fn longest_common_prefix(&self, a: &str, b: &str) -> usize { - a.bytes() - .zip(b.bytes()) - .take_while(|(x, y)| x == y) - .count() - } - - /// Look up a string in the dictionary - pub fn lookup(&self, id: IdType) -> Option { - if id == 0 || id > self.num_strings as IdType { - return None; - } - - let id = (id - 1) as usize; - let offset = self.offsets[id] as usize; - - if id == 0 { - // First string is stored completely - let end = if id + 1 < self.offsets.len() { - self.offsets[id + 1] as usize - } else { - self.data.len() - }; - Some(String::from_utf8_lossy(&self.data[offset..end]).into_owned()) - } else { - // Other strings are front-coded - let lcp = self.data[offset] as usize; - let prev = self.lookup(id as IdType - 1)?; - let mut result = prev[..lcp].to_string(); - let end = if id + 1 < self.offsets.len() { - self.offsets[id + 1] as usize - } else { - self.data.len() - }; - result.push_str(std::str::from_utf8(&self.data[offset + 1..end]).unwrap()); - Some(result) - } + let id = self.next_id; + self.next_id += 1; + self.strings.push(string.clone()); + self.id_map.insert(string, id); + id } - /// Get the number of strings in the dictionary - pub fn size(&self) -> usize { - self.num_strings + pub fn get(&self, id: IdType) -> Option<&str> { + self.strings.get(id as usize).map(|s| s.as_str()) } - /// Get the total size of the compressed data - pub fn total_size(&self) -> usize { - self.total_size + pub fn get_id(&self, string: &str) -> Option { + self.id_map.get(string).copied() } - /// Get the size of the dictionary in bytes - pub fn bytes(&self) -> usize { - std::mem::size_of_val(&self.num_strings) + - std::mem::size_of_val(&self.total_size) + 
- self.offsets.len() * std::mem::size_of::() + - self.data.len() - } -} - -/// Integer-based front-coded dictionary -pub struct IntegerFCDictionary { - headers: Vec, - buckets: Vec, - size: usize, -} - -impl IntegerFCDictionary { - /// Create a new integer-based front-coded dictionary - pub fn new() -> Self { - Self { - headers: Vec::new(), - buckets: Vec::new(), - size: 0, - } - } - - /// Build the dictionary from a list of strings - pub fn build(&mut self, strings: &[String]) { - if strings.is_empty() { - return; - } - - self.size = strings.len(); - self.headers.clear(); - self.buckets.clear(); - - // Sort strings for better compression - let mut sorted_strings: Vec<_> = strings.iter().collect(); - sorted_strings.sort(); - - // Process strings - for i in 0..sorted_strings.len() { - let curr = sorted_strings[i]; - let lcp = if i > 0 { - self.longest_common_prefix(sorted_strings[i - 1], curr) - } else { - 0 - }; - - // Store header - self.headers.extend_from_slice(curr.as_bytes()); - - // Store bucket - self.buckets.push(lcp as u8); - self.buckets.push((curr.len() - lcp) as u8); - self.buckets.extend_from_slice(&curr.as_bytes()[lcp..]); - } - } - - /// Find the longest common prefix between two strings - fn longest_common_prefix(&self, a: &str, b: &str) -> usize { - a.bytes() - .zip(b.bytes()) - .take_while(|(x, y)| x == y) - .count() - } - - /// Extract a string from the dictionary - pub fn extract(&self, id: IdType, completion: &mut Vec) -> u8 { - if id == 0 || id > self.size as IdType { - return 0; - } - - let id = (id - 1) as usize; - let bucket_start = id * 2; - let lcp = self.buckets[bucket_start] as usize; - let remaining = self.buckets[bucket_start + 1] as usize; - - completion.clear(); - completion.extend_from_slice(&self.headers[id..id + lcp]); - completion.extend_from_slice(&self.buckets[bucket_start + 2..bucket_start + 2 + remaining]); - - (lcp + remaining) as u8 + pub fn len(&self) -> usize { + self.strings.len() } - /// Get the number of strings in 
the dictionary - pub fn size(&self) -> usize { - self.size + pub fn is_empty(&self) -> bool { + self.strings.is_empty() } } \ No newline at end of file diff --git a/autocomplete-rs/src/graphql.rs b/autocomplete-rs/src/graphql.rs index 197c180..daf52ab 100644 --- a/autocomplete-rs/src/graphql.rs +++ b/autocomplete-rs/src/graphql.rs @@ -1,5 +1,7 @@ -use async_graphql::{Object, Schema, SimpleObject, InputObject}; +use async_graphql::{Object, Schema, SimpleObject, InputObject, EmptySubscription}; use crate::autocomplete::Autocomplete; +use std::sync::Arc; +use tokio::sync::Mutex; #[derive(SimpleObject)] struct Completion { @@ -31,13 +33,14 @@ struct StringScoreInput { } pub struct QueryRoot { - autocomplete: Autocomplete, + autocomplete: Arc>, } #[Object] impl QueryRoot { - async fn complete(&self, prefix: String, max_results: Option) -> CompleteResponse { - let completions = self.autocomplete.complete(&prefix); + async fn complete(&self, prefix: String, _max_results: Option) -> CompleteResponse { + let autocomplete = self.autocomplete.lock().await; + let completions = autocomplete.complete(&prefix); let completions = completions.into_iter() .map(|(text, score)| Completion { text, score }) .collect(); @@ -46,15 +49,16 @@ impl QueryRoot { } async fn stats(&self) -> Stats { + let autocomplete = self.autocomplete.lock().await; Stats { - num_terms: self.autocomplete.num_terms() as i32, - memory_bytes: self.autocomplete.bytes() as i64, + num_terms: autocomplete.num_terms() as i32, + memory_bytes: autocomplete.bytes() as i64, } } } pub struct MutationRoot { - autocomplete: Autocomplete, + autocomplete: Arc>, } #[Object] @@ -65,7 +69,8 @@ impl MutationRoot { .map(|s| (s.text, s.score)) .collect(); - match self.autocomplete.init(&strings) { + let mut autocomplete = self.autocomplete.lock().await; + match autocomplete.init(&strings) { Ok(_) => InitResponse { success: true, error: None, @@ -78,13 +83,13 @@ impl MutationRoot { } } -pub type AppSchema = Schema; +pub type AppSchema 
= Schema; -pub fn create_schema(autocomplete: Autocomplete) -> AppSchema { +pub fn create_schema(autocomplete: Arc>) -> AppSchema { Schema::build( QueryRoot { autocomplete: autocomplete.clone() }, MutationRoot { autocomplete }, - async_graphql::EmptySubscription, + EmptySubscription, ) .finish() } \ No newline at end of file diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs index 47d176e..2115e21 100644 --- a/autocomplete-rs/src/index.rs +++ b/autocomplete-rs/src/index.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; -use crate::types::{IdType, global}; +use crate::types::IdType; /// Block in the inverted index struct Block { @@ -32,8 +31,7 @@ impl Block { /// Blocked inverted index for efficient document retrieval pub struct BlockedInvertedIndex { - blocks: Vec, - term_to_block: HashMap, + blocks: Vec>, block_size: usize, } @@ -42,47 +40,21 @@ impl BlockedInvertedIndex { pub fn new(block_size: usize) -> Self { Self { blocks: Vec::new(), - term_to_block: HashMap::new(), block_size, } } /// Add a document to the index - pub fn add_doc(&mut self, term_id: IdType, doc_id: IdType) { - let block_idx = self.term_to_block.entry(term_id).or_insert_with(|| { - self.blocks.push(Block::new(term_id)); - self.blocks.len() - 1 - }); - - let block = &mut self.blocks[*block_idx]; - block.add_doc(doc_id); - - // If block is full, create a new one - if block.size() >= self.block_size { - self.blocks.push(Block::new(term_id)); - *block_idx = self.blocks.len() - 1; + pub fn insert(&mut self, id: IdType) { + if self.blocks.is_empty() || self.blocks.last().unwrap().len() >= self.block_size { + self.blocks.push(Vec::with_capacity(self.block_size)); } + self.blocks.last_mut().unwrap().push(id); } /// Get documents for a term - pub fn get_docs(&self, term_id: IdType) -> Vec { - let mut docs = Vec::new(); - - // Find all blocks for the term - let mut current_idx = self.term_to_block.get(&term_id).copied(); - while let Some(idx) = current_idx { - let block = 
&self.blocks[idx]; - docs.extend_from_slice(&block.docs); - - // Check if there's a next block for the same term - current_idx = if idx + 1 < self.blocks.len() && self.blocks[idx + 1].term_id == term_id { - Some(idx + 1) - } else { - None - }; - } - - docs + pub fn get(&self, block_id: usize) -> Option<&[IdType]> { + self.blocks.get(block_id).map(|v| v.as_slice()) } /// Get the number of blocks @@ -90,15 +62,9 @@ impl BlockedInvertedIndex { self.blocks.len() } - /// Get the total number of documents - pub fn num_docs(&self) -> usize { - self.blocks.iter().map(|b| b.size()).sum() - } - - /// Clear the index - pub fn clear(&mut self) { - self.blocks.clear(); - self.term_to_block.clear(); + /// Get the block size + pub fn block_size(&self) -> usize { + self.block_size } } diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index 4004de9..7c58280 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -7,6 +7,8 @@ pub mod trie; pub mod dictionary; pub mod index; pub mod autocomplete; +pub mod graphql; +pub mod server; pub use constants::*; pub use parameters::*; diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index ee606d7..4bc2099 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -7,7 +7,6 @@ mod server; mod string_pool; mod trie; mod types; -mod utils; /// Autocomplete service with gRPC and GraphQL support #[derive(Parser, Debug)] diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs index 733afef..d3eba26 100644 --- a/autocomplete-rs/src/server.rs +++ b/autocomplete-rs/src/server.rs @@ -1,15 +1,16 @@ -use std::net::SocketAddr; use tonic::{transport::Server as TonicServer, Request, Response, Status}; use axum::{ routing::{get, post}, Router, extract::State, response::IntoResponse, - Json, }; use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; -use crate::autocomplete::{Autocomplete, Autocomplete2}; +use crate::autocomplete::Autocomplete; use 
crate::graphql::{create_schema, AppSchema}; +use std::sync::Arc; +use tokio::sync::Mutex; +use hyper::Server; pub mod autocomplete_proto { tonic::include_proto!("autocomplete"); @@ -22,8 +23,9 @@ use autocomplete_proto::{ StatsRequest, StatsResponse, }; +#[derive(Clone)] pub struct AutocompleteServiceImpl { - autocomplete: Autocomplete, + autocomplete: Arc>, } #[tonic::async_trait] @@ -33,7 +35,8 @@ impl AutocompleteService for AutocompleteServiceImpl { request: Request, ) -> Result, Status> { let req = request.into_inner(); - let completions = self.autocomplete.complete(&req.prefix); + let autocomplete = self.autocomplete.lock().await; + let completions = autocomplete.complete(&req.prefix); let response = CompleteResponse { completions: completions.into_iter() @@ -57,7 +60,8 @@ impl AutocompleteService for AutocompleteServiceImpl { .map(|s| (s.text, s.score)) .collect(); - match self.autocomplete.init(&strings) { + let mut autocomplete = self.autocomplete.lock().await; + match autocomplete.init(&strings) { Ok(_) => Ok(Response::new(InitResponse { success: true, error: String::new(), @@ -73,9 +77,10 @@ impl AutocompleteService for AutocompleteServiceImpl { &self, _request: Request, ) -> Result, Status> { + let autocomplete = self.autocomplete.lock().await; let response = StatsResponse { - num_terms: self.autocomplete.num_terms() as i32, - memory_bytes: self.autocomplete.bytes() as i64, + num_terms: autocomplete.num_terms() as i32, + memory_bytes: autocomplete.bytes() as i64, }; Ok(Response::new(response)) @@ -96,7 +101,7 @@ async fn graphql_playground() -> impl IntoResponse { } pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box> { - let autocomplete = Autocomplete::new(); + let autocomplete = Arc::new(Mutex::new(Autocomplete::new())); let schema = create_schema(autocomplete.clone()); // Create gRPC service @@ -121,7 +126,7 @@ pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box, - offsets: Vec, data: Vec, + 
offsets: Vec, + scores: Vec, } impl ScoredStringPool { /// Create a new empty string pool pub fn new() -> Self { - let mut pool = Self { - scores: Vec::new(), - offsets: Vec::new(), + Self { data: Vec::new(), - }; - pool.init(); - pool + offsets: vec![0], + scores: Vec::new(), + } } /// Initialize the pool @@ -33,7 +31,7 @@ impl ScoredStringPool { /// Resize the pool pub fn resize(&mut self, num_bytes: usize, k: u32) { - self.scores.resize(k as usize, 0); + self.scores.resize(k as usize, 0.0); self.data.resize(num_bytes, 0); } @@ -50,7 +48,9 @@ impl ScoredStringPool { /// Get the total number of bytes used pub fn bytes(&self) -> usize { - self.offsets.last().copied().unwrap_or(0) + std::mem::size_of_val(&self.data) + + std::mem::size_of_val(&self.offsets) + + std::mem::size_of_val(&self.scores) } /// Get a mutable reference to the data @@ -64,25 +64,24 @@ impl ScoredStringPool { } /// Get a mutable reference to the scores - pub fn scores_mut(&mut self) -> &mut [IdType] { + pub fn scores_mut(&mut self) -> &mut [f32] { &mut self.scores } /// Get a reference to the scores - pub fn scores(&self) -> &[IdType] { + pub fn scores(&self) -> &[f32] { &self.scores } /// Get a scored byte range at the given index - pub fn get(&self, i: usize) -> ScoredByteRange { - assert!(i < self.size()); - ScoredByteRange { - string: ByteRange { - begin: unsafe { self.data.as_ptr().add(self.offsets[i]) }, - end: unsafe { self.data.as_ptr().add(self.offsets[i + 1]) }, - }, - score: self.scores[i], + pub fn get(&self, index: usize) -> ByteRange { + if index >= self.offsets.len() - 1 { + return ByteRange::new(0, 0); } + ByteRange::new( + self.offsets[index], + self.offsets[index + 1] + ) } /// Set the offsets vector @@ -91,7 +90,7 @@ impl ScoredStringPool { } /// Set the scores vector - pub fn set_scores(&mut self, scores: Vec) { + pub fn set_scores(&mut self, scores: Vec) { self.scores = scores; } @@ -99,6 +98,10 @@ impl ScoredStringPool { pub fn set_data(&mut self, data: Vec) { self.data = 
data; } + + pub fn get_score(&self, index: usize) -> f32 { + self.scores.get(index).copied().unwrap_or(0.0) + } } /// Iterator over scored strings in the pool @@ -134,7 +137,10 @@ impl<'a> Iterator for ScoredStringPoolIterator<'a> { fn next(&mut self) -> Option { if self.pos < self.pool.size() { - let item = self.pool.get(self.pos); + let item = ScoredByteRange { + string: self.pool.get(self.pos), + score: self.pool.get_score(self.pos) as IdType, + }; self.pos += 1; Some(item) } else { diff --git a/autocomplete-rs/src/trie.rs b/autocomplete-rs/src/trie.rs index 1b24c73..05f80e5 100644 --- a/autocomplete-rs/src/trie.rs +++ b/autocomplete-rs/src/trie.rs @@ -1,182 +1,147 @@ use std::collections::HashMap; -use crate::types::{IdType, CompletionType}; +use crate::types::IdType; -/// A node in the completion trie -pub struct TrieNode { - children: HashMap, - is_terminal: bool, - completion_ids: Vec, +#[derive(Default, Clone)] +struct TrieNode { + children: HashMap>, + id: Option, + score: f32, } impl TrieNode { - /// Create a new trie node - pub fn new() -> Self { + fn new() -> Self { Self { children: HashMap::new(), - is_terminal: false, - completion_ids: Vec::new(), + id: None, + score: 0.0, } } - /// Add a child node - pub fn add_child(&mut self, c: char) -> &mut TrieNode { - self.children.entry(c).or_insert_with(TrieNode::new) - } - - /// Get a child node - pub fn get_child(&self, c: char) -> Option<&TrieNode> { - self.children.get(&c) - } - - /// Check if this is a terminal node - pub fn is_terminal(&self) -> bool { - self.is_terminal - } - - /// Set this node as terminal - pub fn set_terminal(&mut self) { - self.is_terminal = true; - } - - /// Add a completion ID - pub fn add_completion_id(&mut self, id: IdType) { - self.completion_ids.push(id); - } - - /// Get completion IDs - pub fn completion_ids(&self) -> &[IdType] { - &self.completion_ids + fn is_terminal(&self) -> bool { + self.id.is_some() } } -/// A trie for prefix-based completion -pub struct CompletionTrie 
{ +#[derive(Clone)] +pub struct Trie { root: TrieNode, - num_nodes: usize, - num_completions: usize, } -impl CompletionTrie { - /// Create a new completion trie +impl Trie { pub fn new() -> Self { Self { root: TrieNode::new(), - num_nodes: 1, - num_completions: 0, - } - } - - /// Insert a completion string - pub fn insert(&mut self, completion: &str, id: IdType) { - let mut node = &mut self.root; - for c in completion.chars() { - node = node.add_child(c); - self.num_nodes += 1; - } - node.set_terminal(); - node.add_completion_id(id); - self.num_completions += 1; - } - - /// Find all completions for a prefix - pub fn complete(&self, prefix: &str) -> Vec { - let mut node = &self.root; - for c in prefix.chars() { - match node.get_child(c) { - Some(next) => node = next, - None => return Vec::new(), - } } - self.collect_completions(node) - } - - /// Collect all completion IDs from a node and its children - fn collect_completions(&self, node: &TrieNode) -> Vec { - let mut completions = Vec::new(); - self.collect_completions_recursive(node, &mut completions); - completions } - /// Recursive helper for collecting completions - fn collect_completions_recursive(&self, node: &TrieNode, completions: &mut Vec) { - if node.is_terminal() { - completions.extend_from_slice(node.completion_ids()); - } - for child in node.children.values() { - self.collect_completions_recursive(child, completions); + pub fn insert(&mut self, completion: &str, id: IdType, score: f32) { + let mut current = &mut self.root; + let chars: Vec = completion.chars().collect(); + + for &c in &chars { + current = current.children + .entry(c) + .or_insert_with(|| Box::new(TrieNode::new())); } + + current.id = Some(id); + current.score = score; } - /// Remove a completion string pub fn remove(&mut self, completion: &str) -> bool { - let mut chars: Vec = completion.chars().collect(); - if chars.is_empty() { - return false; - } - - // First, find if the completion exists and build the path let mut path = 
Vec::new(); - let mut current = &self.root; + let mut current = &mut self.root; - for &c in &chars { - match current.get_child(c) { - Some(next) => { - path.push(c); - current = next; - } - None => return false, + // First pass: find the path to the node + for c in completion.chars() { + if let Some(next) = current.children.get_mut(&c) { + path.push(c); + current = next; + } else { + return false; // String not found } } - + + // If the node is not a terminal, the string wasn't in the trie if !current.is_terminal() { return false; } - - // Now remove it by traversing the path again + + // Remove the terminal marker + current.id = None; + current.score = 0.0; + + // Second pass: remove empty nodes let mut current = &mut self.root; - let mut parent = None; + for &c in &path[..path.len()-1] { + current = current.children.get_mut(&c).unwrap(); + } - for &c in &path { - if let Some(next) = current.children.get_mut(&c) { - parent = Some((c, current)); - current = next; - } + // Remove the last node if it's empty + if current.children.is_empty() && !current.is_terminal() { + current.children.remove(&path[path.len()-1]); } + + true + } - // Remove the completion - current.completion_ids.clear(); - current.is_terminal = false; - self.num_completions -= 1; - - // Clean up empty nodes - while let Some((c, p)) = parent { - if current.children.is_empty() && !current.is_terminal() { - p.children.remove(&c); - self.num_nodes -= 1; - current = p; - parent = None; + pub fn complete(&self, prefix: &str) -> Vec<(IdType, f32)> { + let mut current = &self.root; + + // Navigate to the prefix node + for c in prefix.chars() { + if let Some(next) = current.children.get(&c) { + current = next; } else { - break; + return Vec::new(); // Prefix not found } } - - true + + // Collect all completions from this node + let mut results = Vec::new(); + self.collect_completions(current, &mut results); + results } - /// Clear the trie - pub fn clear(&mut self) { - self.root = TrieNode::new(); - 
self.num_nodes = 1; - self.num_completions = 0; + fn collect_completions(&self, node: &TrieNode, results: &mut Vec<(IdType, f32)>) { + if let Some(id) = node.id { + results.push((id, node.score)); + } + + for child in node.children.values() { + self.collect_completions(child, results); + } } +} - /// Get the number of nodes - pub fn num_nodes(&self) -> usize { - self.num_nodes +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_trie_insert_and_complete() { + let mut trie = Trie::new(); + trie.insert("hello", 1, 1.0); + trie.insert("help", 2, 0.8); + trie.insert("world", 3, 0.5); + + let completions = trie.complete("hel"); + assert_eq!(completions.len(), 2); + assert!(completions.contains(&(1, 1.0))); + assert!(completions.contains(&(2, 0.8))); } - /// Get the number of completions - pub fn num_completions(&self) -> usize { - self.num_completions + #[test] + fn test_trie_remove() { + let mut trie = Trie::new(); + trie.insert("hello", 1, 1.0); + trie.insert("help", 2, 0.8); + + assert!(trie.remove("hello")); + assert!(!trie.remove("hello")); // Already removed + assert!(trie.remove("help")); + + let completions = trie.complete("hel"); + assert_eq!(completions.len(), 0); } } \ No newline at end of file diff --git a/autocomplete-rs/src/types.rs b/autocomplete-rs/src/types.rs index 5490d59..cbd9316 100644 --- a/autocomplete-rs/src/types.rs +++ b/autocomplete-rs/src/types.rs @@ -1,11 +1,12 @@ -use std::ops::Range; - /// Type alias for document and term IDs pub type IdType = u32; /// Type alias for completion type (vector of term IDs) pub type CompletionType = Vec; +/// Type alias for score type +pub type ScoreType = f32; + /// Represents a range of values #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ValueRange { @@ -48,8 +49,18 @@ impl ScoredRange { /// Represents a byte range #[derive(Debug, Clone, Copy)] pub struct ByteRange { - pub begin: *const u8, - pub end: *const u8, + pub start: usize, + pub end: usize, +} + +impl ByteRange { + pub fn 
new(start: usize, end: usize) -> Self { + Self { start, end } + } + + pub fn len(&self) -> usize { + self.end - self.start + } } /// Represents a range of 32-bit integers @@ -78,9 +89,10 @@ pub mod global { /// Convert a string to a byte range pub fn string_to_byte_range(s: &str) -> ByteRange { - let begin = s.as_ptr(); - let end = unsafe { begin.add(s.len()) }; - ByteRange { begin, end } + ByteRange { + start: 0, + end: s.len(), + } } /// Convert a completion to a uint32 range From 0f1af5d8711a3cca879de497dbbec58091e736de Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Fri, 23 May 2025 17:21:07 -0400 Subject: [PATCH 099/102] server partially running --- autocomplete-rs/Cargo.toml | 2 +- autocomplete-rs/src/autocomplete.rs | 2 +- autocomplete-rs/src/index.rs | 52 ++++++++++++++++++----------- autocomplete-rs/src/lib.rs | 16 +++++---- autocomplete-rs/src/main.rs | 12 ++----- autocomplete-rs/src/server.rs | 22 +++++++++++- 6 files changed, 67 insertions(+), 39 deletions(-) diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index b799bbe..276f817 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -12,7 +12,7 @@ async-graphql = "6.0" async-graphql-axum = "6.0" axum = { version = "0.6", features = ["macros"] } tower = "0.4" -tower-http = { version = "0.4", features = ["trace"] } +tower-http = { version = "0.4", features = ["trace", "cors"] } hyper = { version = "0.14", features = ["full"] } clap = { version = "4.4", features = ["derive"] } diff --git a/autocomplete-rs/src/autocomplete.rs b/autocomplete-rs/src/autocomplete.rs index b910078..bb9ffa6 100644 --- a/autocomplete-rs/src/autocomplete.rs +++ b/autocomplete-rs/src/autocomplete.rs @@ -1,6 +1,6 @@ use crate::types::ScoreType; use crate::trie::Trie; -use crate::dictionary::Dictionary; +use super::dictionary::Dictionary; #[derive(Clone)] pub struct Autocomplete { diff --git a/autocomplete-rs/src/index.rs b/autocomplete-rs/src/index.rs index 2115e21..29da316 100644 --- 
a/autocomplete-rs/src/index.rs +++ b/autocomplete-rs/src/index.rs @@ -1,31 +1,43 @@ -use crate::types::IdType; - -/// Block in the inverted index -struct Block { - term_id: IdType, - num_docs: usize, - docs: Vec, +use crate::types::{IdType, ScoreType}; +use crate::trie::Trie; +use crate::dictionary::Dictionary; + +#[derive(Clone)] +pub struct Index { + trie: Trie, + dictionary: Dictionary, } -impl Block { - /// Create a new block - fn new(term_id: IdType) -> Self { +impl Index { + pub fn new() -> Self { Self { - term_id, - num_docs: 0, - docs: Vec::new(), + trie: Trie::new(), + dictionary: Dictionary::new(), } } - /// Add a document to the block - fn add_doc(&mut self, doc_id: IdType) { - self.docs.push(doc_id); - self.num_docs += 1; + pub fn add_doc(&mut self, _doc_id: IdType, text: &str, score: ScoreType) { + let id = self.dictionary.insert(text.to_string()); + self.trie.insert(text, id, score); + } + + pub fn search(&self, prefix: &str) -> Vec<(IdType, ScoreType)> { + let completions = self.trie.complete(prefix); + completions + .into_iter() + .filter_map(|(id, score)| { + self.dictionary.get(id).map(|_| (id, score)) + }) + .collect() } - /// Get the number of documents in the block - fn size(&self) -> usize { - self.num_docs + pub fn num_terms(&self) -> usize { + self.dictionary.len() + } + + pub fn bytes(&self) -> usize { + // TODO: Implement actual memory usage calculation + 0 } } diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index 7c58280..7d1cb44 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -1,21 +1,23 @@ +pub mod dictionary; +pub mod types; +pub mod trie; pub mod constants; pub mod parameters; pub mod probe; -pub mod types; pub mod string_pool; -pub mod trie; -pub mod dictionary; pub mod index; pub mod autocomplete; pub mod graphql; pub mod server; +pub use dictionary::Dictionary; +pub use types::*; +pub use trie::*; pub use constants::*; pub use parameters::*; pub use probe::*; -pub use types::*; pub 
use string_pool::*; -pub use trie::*; -pub use dictionary::*; pub use index::*; -pub use autocomplete::*; \ No newline at end of file +pub use autocomplete::*; +pub use graphql::*; +pub use server::*; \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index 4bc2099..c5214c2 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -1,23 +1,17 @@ use std::error::Error; use clap::Parser; - -mod autocomplete; -mod graphql; -mod server; -mod string_pool; -mod trie; -mod types; +use autocomplete_rs::server; /// Autocomplete service with gRPC and GraphQL support #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { /// gRPC server address - #[arg(short, long, default_value = "[::1]:50051")] + #[arg(short = 'r', long, default_value = "[::1]:50051")] grpc_addr: String, /// GraphQL server address - #[arg(short, long, default_value = "[::1]:8000")] + #[arg(short = 'g', long, default_value = "[::1]:8000")] graphql_addr: String, } diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs index d3eba26..358bbb2 100644 --- a/autocomplete-rs/src/server.rs +++ b/autocomplete-rs/src/server.rs @@ -4,6 +4,7 @@ use axum::{ Router, extract::State, response::IntoResponse, + http::HeaderValue, }; use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; use crate::autocomplete::Autocomplete; @@ -11,6 +12,7 @@ use crate::graphql::{create_schema, AppSchema}; use std::sync::Arc; use tokio::sync::Mutex; use hyper::Server; +use tower_http::cors::{CorsLayer, Any}; pub mod autocomplete_proto { tonic::include_proto!("autocomplete"); @@ -97,6 +99,7 @@ async fn graphql_handler( async fn graphql_playground() -> impl IntoResponse { async_graphql::http::playground_source( async_graphql::http::GraphQLPlaygroundConfig::new("/graphql") + .subscription_endpoint("/graphql") ) } @@ -109,10 +112,18 @@ pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), 
Box().unwrap()) + .allow_methods(Any) + .allow_headers(Any); + // Create GraphQL router let app = Router::new() .route("/graphql", post(graphql_handler)) + .route("/", get(graphql_playground)) .route("/playground", get(graphql_playground)) + .layer(cors) .with_state(schema); // Start both servers @@ -121,13 +132,22 @@ pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box Date: Sun, 25 May 2025 12:19:33 -0400 Subject: [PATCH 100/102] removed server until function works --- autocomplete-rs/.gitignore | 18 + autocomplete-rs/Cargo.lock | 1488 +--------------------- autocomplete-rs/Cargo.toml | 8 - autocomplete-rs/build.rs | 5 +- autocomplete-rs/proto/autocomplete.proto | 58 - autocomplete-rs/src/graphql.rs | 95 -- autocomplete-rs/src/main.rs | 26 +- autocomplete-rs/src/server.rs | 153 --- 8 files changed, 75 insertions(+), 1776 deletions(-) create mode 100644 autocomplete-rs/.gitignore delete mode 100644 autocomplete-rs/proto/autocomplete.proto delete mode 100644 autocomplete-rs/src/graphql.rs delete mode 100644 autocomplete-rs/src/server.rs diff --git a/autocomplete-rs/.gitignore b/autocomplete-rs/.gitignore new file mode 100644 index 0000000..da95885 --- /dev/null +++ b/autocomplete-rs/.gitignore @@ -0,0 +1,18 @@ +# Cargo +target/ + +# IDEs +.vscode/ +.idea/ + +# OS +.DS_Store + +# Rust + +# Build +build.rs + +# Cargo.lock +Cargo.lock + diff --git a/autocomplete-rs/Cargo.lock b/autocomplete-rs/Cargo.lock index bd45602..6222344 100644 --- a/autocomplete-rs/Cargo.lock +++ b/autocomplete-rs/Cargo.lock @@ -2,16 +2,6 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "Inflector" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" -dependencies = [ - "lazy_static", - "regex", -] - [[package]] name = "addr2line" version = "0.24.2" @@ -92,137 +82,6 @@ version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" -[[package]] -name = "ascii_utils" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71938f30533e4d95a6d17aa530939da3842c2ab6f4f84b9dae68447e4129f74a" - -[[package]] -name = "async-graphql" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298a5d587d6e6fdb271bf56af2dc325a80eb291fd0fc979146584b9a05494a8c" -dependencies = [ - "async-graphql-derive", - "async-graphql-parser", - "async-graphql-value", - "async-stream", - "async-trait", - "base64 0.13.1", - "bytes", - "fast_chemail", - "fnv", - "futures-util", - "handlebars", - "http", - "indexmap 2.9.0", - "mime", - "multer", - "num-traits", - "once_cell", - "pin-project-lite", - "regex", - "serde", - "serde_json", - "serde_urlencoded", - "static_assertions", - "tempfile", - "thiserror 1.0.69", -] - -[[package]] -name = "async-graphql-axum" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01a1c20a2059bffbc95130715b23435a05168c518fba9709c81fa2a38eed990c" -dependencies = [ - "async-graphql", - "async-trait", - "axum", - "bytes", - "futures-util", - "serde_json", - "tokio", - "tokio-stream", - "tokio-util", - "tower-service", -] - -[[package]] -name = "async-graphql-derive" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7f329c7eb9b646a72f70c9c4b516c70867d356ec46cb00dcac8ad343fd006b0" -dependencies = [ - "Inflector", - "async-graphql-parser", 
- "darling", - "proc-macro-crate", - "proc-macro2", - "quote", - "strum", - "syn", - "thiserror 1.0.69", -] - -[[package]] -name = "async-graphql-parser" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6139181845757fd6a73fbb8839f3d036d7150b798db0e9bb3c6e83cdd65bd53b" -dependencies = [ - "async-graphql-value", - "pest", - "serde", - "serde_json", -] - -[[package]] -name = "async-graphql-value" -version = "6.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "323a5143f5bdd2030f45e3f2e0c821c9b1d36e79cf382129c64299c50a7f3750" -dependencies = [ - "bytes", - "indexmap 2.9.0", - "serde", - "serde_json", -] - -[[package]] -name = "async-stream" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-trait" -version = "0.1.88" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "autocfg" version = "1.4.0" @@ -233,85 +92,11 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" name = "autocomplete-rs" version = "0.1.0" dependencies = [ - "async-graphql", - "async-graphql-axum", - "axum", "clap", "futures", - "hyper", - "prost", "tempfile", "tokio", - "tonic", "tonic-build", - "tower", - "tower-http", -] - -[[package]] -name = "axum" -version = "0.6.20" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" -dependencies = [ - "async-trait", - "axum-core", - "axum-macros", - "base64 0.21.7", - "bitflags 1.3.2", - "bytes", - "futures-util", - "headers", - "http", - "http-body", - "hyper", - "itoa", - "matchit", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sha1", - "sync_wrapper", - "tokio", - "tokio-tungstenite", - "tower", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-core" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http", - "http-body", - "mime", - "rustversion", - "tower-layer", - "tower-service", -] - -[[package]] -name = "axum-macros" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdca6a10ecad987bda04e95606ef85a5417dcaac1a78455242d72e031e2b6b62" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "syn", ] [[package]] @@ -329,53 +114,17 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "bytes" version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" -dependencies = [ - "serde", -] [[package]] name = "cfg-if" @@ -411,7 +160,7 @@ version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn", @@ -429,102 +178,12 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "darling" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = 
"0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn", -] - -[[package]] -name = "darling_macro" -version = "0.20.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" -dependencies = [ - "darling_core", - "quote", - "syn", -] - -[[package]] -name = "data-encoding" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", -] - -[[package]] -name = "displaydoc" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" -[[package]] -name = "encoding_rs" -version = "0.8.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" -dependencies = [ - "cfg-if", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -541,15 +200,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "fast_chemail" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "495a39d30d624c2caabe6312bfead73e7717692b44e0b32df168c275a2e8e9e4" -dependencies = [ - 
"ascii_utils", -] - [[package]] name = "fastrand" version = "2.3.0" @@ -562,21 +212,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - [[package]] name = "futures" version = "0.3.31" @@ -666,27 +301,6 @@ dependencies = [ "slab", ] -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - [[package]] name = "getrandom" version = "0.3.3" @@ -705,45 +319,6 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" -[[package]] -name = "h2" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap 2.9.0", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "handlebars" -version = "4.5.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "faa67bab9ff362228eb3d00bd024a4965d8231bbb7921167f0cfa66c6626b225" -dependencies = [ - "log", - "pest", - "pest_derive", - "serde", - "serde_json", - "thiserror 1.0.69", -] - -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hashbrown" version = "0.15.3" @@ -751,296 +326,48 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" [[package]] -name = "headers" -version = "0.3.9" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06683b93020a07e3dbcf5f8c0f6d40080d725bea7936fc01ad345c01b97dc270" -dependencies = [ - "base64 0.21.7", - "bytes", - "headers-core", - "http", - "httpdate", - "mime", - "sha1", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "headers-core" -version = "0.2.0" +name = "indexmap" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ - "http", + "equivalent", + "hashbrown", ] [[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "heck" -version = "0.5.0" +name = "is_terminal_polyfill" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] -name = "http" 
-version = "0.2.12" +name = "itertools" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ - "bytes", - "fnv", - "itoa", + "either", ] [[package]] -name = "http-body" -version = "0.4.6" +name = "libc" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http", - "pin-project-lite", -] +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] -name = "http-range-header" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" - -[[package]] -name = "httparse" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" - -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-timeout" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" -dependencies = [ - "hyper", - 
"pin-project-lite", - "tokio", - "tokio-io-timeout", -] - -[[package]] -name = "icu_collections" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" -dependencies = [ - "displaydoc", - "potential_utf", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locale_core" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" - -[[package]] -name = "icu_properties" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - "potential_utf", - "zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" - -[[package]] -name = "icu_provider" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" 
-dependencies = [ - "displaydoc", - "icu_locale_core", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - -[[package]] -name = "idna" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" -dependencies = [ - "idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg", - "hashbrown 0.12.3", -] - -[[package]] -name = "indexmap" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" -dependencies = [ - "equivalent", - "hashbrown 0.15.3", - "serde", -] - -[[package]] -name = "is_terminal_polyfill" -version = "1.70.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" - -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - -[[package]] -name = "libc" -version = "0.2.172" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" - -[[package]] -name = "linux-raw-sys" -version = "0.9.4" +name = "linux-raw-sys" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" -[[package]] -name = "litemap" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" - [[package]] name = "lock_api" version = "0.4.12" @@ -1057,24 +384,12 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" -[[package]] -name = "matchit" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" - [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - [[package]] name = "miniz_oxide" version = "0.8.8" @@ -1095,39 +410,12 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "multer" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"01acbdc23469fd8fe07ab135923371d5f5a422fbf9c522158677c8eb15bc51c2" -dependencies = [ - "bytes", - "encoding_rs", - "futures-util", - "http", - "httparse", - "log", - "memchr", - "mime", - "spin", - "version_check", -] - [[package]] name = "multimap" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", -] - [[package]] name = "object" version = "0.36.7" @@ -1172,57 +460,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pest" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" -dependencies = [ - "memchr", - "thiserror 2.0.12", - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d725d9cfd79e87dccc9341a2ef39d1b6f6353d68c4b33c177febbe1a402c97c5" -dependencies = [ - "pest", - "pest_generator", -] - -[[package]] -name = "pest_generator" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db7d01726be8ab66ab32f9df467ae8b1148906685bbe75c82d1e65d7f5b3f841" -dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pest_meta" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9f832470494906d1fca5329f8ab5791cc60beb230c74815dff541cbd2b5ca0" -dependencies = [ - "once_cell", - 
"pest", - "sha2", -] - [[package]] name = "petgraph" version = "0.6.5" @@ -1230,27 +467,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.9.0", -] - -[[package]] -name = "pin-project" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "indexmap", ] [[package]] @@ -1265,24 +482,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "potential_utf" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" -dependencies = [ - "zerovec", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - [[package]] name = "prettyplease" version = "0.2.32" @@ -1293,16 +492,6 @@ dependencies = [ "syn", ] -[[package]] -name = "proc-macro-crate" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" -dependencies = [ - "once_cell", - "toml_edit", -] - [[package]] name = "proc-macro2" version = "1.0.95" @@ -1329,7 +518,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", - "heck 0.5.0", + "heck", "itertools", "log", "multimap", @@ -1380,43 +569,13 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", -] - [[package]] name = "redox_syscall" version = "0.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" dependencies = [ - "bitflags 2.9.1", + "bitflags", ] [[package]] @@ -1460,25 +619,13 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags 2.9.1", + "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys 0.59.0", ] -[[package]] -name = "rustversion" -version = "1.0.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" - -[[package]] -name = "ryu" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" - [[package]] name = "scopeguard" version = "1.2.0" @@ -1486,161 +633,45 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "serde" -version = "1.0.219" +name = "signal-hook-registry" +version = "1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" dependencies = [ - "serde_derive", + "libc", ] [[package]] -name = "serde_derive" -version = "1.0.219" +name = "slab" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "proc-macro2", - "quote", - "syn", + "autocfg", ] [[package]] -name = "serde_json" -version = "1.0.140" +name = "smallvec" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", -] +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] -name = "serde_path_to_error" -version = "0.1.17" +name = "socket2" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a" +checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" dependencies = [ - "itoa", - "serde", + "libc", + "windows-sys 0.52.0", ] [[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "sha1" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" -dependencies = [ - "libc", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "smallvec" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" - -[[package]] -name = "socket2" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = 
"static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "strsim" -version = "0.11.1" +name = "strsim" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" -dependencies = [ - "strum_macros", -] - -[[package]] -name = "strum_macros" -version = "0.25.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn", -] - [[package]] name = "syn" version = "2.0.101" @@ -1652,23 +683,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" - -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "tempfile" version = "3.20.0" @@ -1676,62 +690,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom", "once_cell", "rustix", "windows-sys 0.59.0", ] -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - -[[package]] -name = "thiserror" -version = "2.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" -dependencies = [ - "thiserror-impl 2.0.12", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "thiserror-impl" -version = "2.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tinystr" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" -dependencies = [ - "displaydoc", - "zerovec", -] - [[package]] name = "tokio" version = "1.45.0" @@ -1750,16 +714,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-macros" version = "2.5.0" @@ -1771,87 +725,6 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-stream" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.20.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "212d5dcb2a1ce06d81107c3d0ffa3121fe974b73f068c8282cb1c32328113b6c" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite", -] - -[[package]] -name = "tokio-util" -version = "0.7.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" -dependencies = [ - "bytes", - "futures-core", - "futures-io", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "toml_datetime" -version = "0.6.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" - -[[package]] -name = "toml_edit" -version = "0.19.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" -dependencies = [ - "indexmap 2.9.0", - "toml_datetime", - "winnow", -] - -[[package]] -name = "tonic" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64 0.21.7", - "bytes", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "tokio", - "tokio-stream", - "tower", - "tower-layer", - "tower-service", - "tracing", -] - [[package]] name = "tonic-build" version = "0.10.2" @@ -1865,176 +738,18 @@ dependencies = [ "syn", ] -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - 
"tracing", -] - -[[package]] -name = "tower-http" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" -dependencies = [ - "bitflags 2.9.1", - "bytes", - "futures-core", - "futures-util", - "http", - "http-body", - "http-range-header", - "pin-project-lite", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - -[[package]] -name = "tower-service" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" - -[[package]] -name = "tracing" -version = "0.1.41" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" -dependencies = [ - "log", - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tracing-core" -version = "0.1.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" -dependencies = [ - "once_cell", -] - -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - -[[package]] -name = "tungstenite" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" -dependencies = [ - "byteorder", - "bytes", - "data-encoding", - "http", - "httparse", - "log", - "rand", - "sha1", - "thiserror 1.0.69", - "url", - "utf-8", -] - -[[package]] -name = "typenum" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" - -[[package]] -name = "ucd-trie" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" - [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" -[[package]] -name = "url" -version = "2.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2132,124 +847,11 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winnow" -version = "0.5.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" -dependencies = [ - "memchr", -] - [[package]] name = "wit-bindgen-rt" version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.1", -] - -[[package]] -name = "writeable" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" - -[[package]] -name = "yoke" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" -dependencies = [ - "serde", - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "zerocopy" -version = "0.8.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "zerofrom" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "synstructure", -] - -[[package]] -name = "zerotrie" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "bitflags", ] diff --git a/autocomplete-rs/Cargo.toml b/autocomplete-rs/Cargo.toml index 276f817..68ed87f 100644 --- a/autocomplete-rs/Cargo.toml +++ b/autocomplete-rs/Cargo.toml @@ -4,16 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] -tonic = { version = "0.10", features = ["transport"] } -prost = "0.12" tokio = { version = "1.0", features = ["full"] } futures = "0.3" -async-graphql = "6.0" -async-graphql-axum = "6.0" -axum = { version = "0.6", features = ["macros"] } -tower = "0.4" -tower-http = { version = "0.4", features = ["trace", "cors"] } 
-hyper = { version = "0.14", features = ["full"] } clap = { version = "4.4", features = ["derive"] } [dev-dependencies] diff --git a/autocomplete-rs/build.rs b/autocomplete-rs/build.rs index 7d082f1..ed0ba48 100644 --- a/autocomplete-rs/build.rs +++ b/autocomplete-rs/build.rs @@ -1,4 +1,3 @@ -fn main() -> Result<(), Box> { - tonic_build::compile_protos("proto/autocomplete.proto")?; - Ok(()) +fn main() { + // No build-time code generation needed } \ No newline at end of file diff --git a/autocomplete-rs/proto/autocomplete.proto b/autocomplete-rs/proto/autocomplete.proto deleted file mode 100644 index 12c2e74..0000000 --- a/autocomplete-rs/proto/autocomplete.proto +++ /dev/null @@ -1,58 +0,0 @@ -syntax = "proto3"; - -package autocomplete; - -// The autocomplete service definition -service AutocompleteService { - // Get completions for a prefix - rpc Complete (CompleteRequest) returns (CompleteResponse) {} - - // Initialize the autocomplete system with strings and scores - rpc Init (InitRequest) returns (InitResponse) {} - - // Get system statistics - rpc GetStats (StatsRequest) returns (StatsResponse) {} -} - -// Request message for completion -message CompleteRequest { - string prefix = 1; - int32 max_results = 2; // Optional: limit number of results -} - -// Response message containing completions -message CompleteResponse { - repeated Completion completions = 1; -} - -// A single completion result -message Completion { - string text = 1; - float score = 2; -} - -// Request message for initialization -message InitRequest { - repeated StringScore strings = 1; -} - -// A string with its score -message StringScore { - string text = 1; - float score = 2; -} - -// Response message for initialization -message InitResponse { - bool success = 1; - string error = 2; // Empty if success is true -} - -// Request message for stats -message StatsRequest {} - -// Response message containing system statistics -message StatsResponse { - int32 num_terms = 1; - int64 memory_bytes = 
2; -} \ No newline at end of file diff --git a/autocomplete-rs/src/graphql.rs b/autocomplete-rs/src/graphql.rs deleted file mode 100644 index daf52ab..0000000 --- a/autocomplete-rs/src/graphql.rs +++ /dev/null @@ -1,95 +0,0 @@ -use async_graphql::{Object, Schema, SimpleObject, InputObject, EmptySubscription}; -use crate::autocomplete::Autocomplete; -use std::sync::Arc; -use tokio::sync::Mutex; - -#[derive(SimpleObject)] -struct Completion { - text: String, - score: f32, -} - -#[derive(SimpleObject)] -struct CompleteResponse { - completions: Vec, -} - -#[derive(SimpleObject)] -struct Stats { - num_terms: i32, - memory_bytes: i64, -} - -#[derive(SimpleObject)] -struct InitResponse { - success: bool, - error: Option, -} - -#[derive(InputObject)] -struct StringScoreInput { - text: String, - score: f32, -} - -pub struct QueryRoot { - autocomplete: Arc>, -} - -#[Object] -impl QueryRoot { - async fn complete(&self, prefix: String, _max_results: Option) -> CompleteResponse { - let autocomplete = self.autocomplete.lock().await; - let completions = autocomplete.complete(&prefix); - let completions = completions.into_iter() - .map(|(text, score)| Completion { text, score }) - .collect(); - - CompleteResponse { completions } - } - - async fn stats(&self) -> Stats { - let autocomplete = self.autocomplete.lock().await; - Stats { - num_terms: autocomplete.num_terms() as i32, - memory_bytes: autocomplete.bytes() as i64, - } - } -} - -pub struct MutationRoot { - autocomplete: Arc>, -} - -#[Object] -impl MutationRoot { - async fn init(&self, strings: Vec) -> InitResponse { - let strings: Vec<(String, f32)> = strings - .into_iter() - .map(|s| (s.text, s.score)) - .collect(); - - let mut autocomplete = self.autocomplete.lock().await; - match autocomplete.init(&strings) { - Ok(_) => InitResponse { - success: true, - error: None, - }, - Err(e) => InitResponse { - success: false, - error: Some(e.to_string()), - }, - } - } -} - -pub type AppSchema = Schema; - -pub fn 
create_schema(autocomplete: Arc>) -> AppSchema { - Schema::build( - QueryRoot { autocomplete: autocomplete.clone() }, - MutationRoot { autocomplete }, - EmptySubscription, - ) - .finish() -} \ No newline at end of file diff --git a/autocomplete-rs/src/main.rs b/autocomplete-rs/src/main.rs index c5214c2..751189f 100644 --- a/autocomplete-rs/src/main.rs +++ b/autocomplete-rs/src/main.rs @@ -1,31 +1,25 @@ use std::error::Error; use clap::Parser; -use autocomplete_rs::server; -/// Autocomplete service with gRPC and GraphQL support +/// Autocomplete service #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { - /// gRPC server address - #[arg(short = 'r', long, default_value = "[::1]:50051")] - grpc_addr: String, - - /// GraphQL server address - #[arg(short = 'g', long, default_value = "[::1]:8000")] - graphql_addr: String, + /// Input file path + #[arg(short, long)] + input: Option, } #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); - println!("Starting Autocomplete Service..."); - println!("gRPC server will listen on: {}", args.grpc_addr); - println!("GraphQL server will listen on: {}", args.graphql_addr); - println!("GraphQL Playground available at: http://{}/playground", args.graphql_addr); - - // Start both servers - server::run_server(&args.grpc_addr, &args.graphql_addr).await?; + println!("Autocomplete Service"); + + if let Some(input) = args.input { + println!("Processing input file: {}", input); + // TODO: Implement file processing logic + } Ok(()) } diff --git a/autocomplete-rs/src/server.rs b/autocomplete-rs/src/server.rs deleted file mode 100644 index 358bbb2..0000000 --- a/autocomplete-rs/src/server.rs +++ /dev/null @@ -1,153 +0,0 @@ -use tonic::{transport::Server as TonicServer, Request, Response, Status}; -use axum::{ - routing::{get, post}, - Router, - extract::State, - response::IntoResponse, - http::HeaderValue, -}; -use async_graphql_axum::{GraphQLRequest, GraphQLResponse}; -use 
crate::autocomplete::Autocomplete; -use crate::graphql::{create_schema, AppSchema}; -use std::sync::Arc; -use tokio::sync::Mutex; -use hyper::Server; -use tower_http::cors::{CorsLayer, Any}; - -pub mod autocomplete_proto { - tonic::include_proto!("autocomplete"); -} - -use autocomplete_proto::{ - autocomplete_service_server::{AutocompleteService, AutocompleteServiceServer}, - CompleteRequest, CompleteResponse, Completion, - InitRequest, InitResponse, - StatsRequest, StatsResponse, -}; - -#[derive(Clone)] -pub struct AutocompleteServiceImpl { - autocomplete: Arc>, -} - -#[tonic::async_trait] -impl AutocompleteService for AutocompleteServiceImpl { - async fn complete( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let autocomplete = self.autocomplete.lock().await; - let completions = autocomplete.complete(&req.prefix); - - let response = CompleteResponse { - completions: completions.into_iter() - .map(|(text, score)| Completion { - text, - score, - }) - .collect(), - }; - - Ok(Response::new(response)) - } - - async fn init( - &self, - request: Request, - ) -> Result, Status> { - let req = request.into_inner(); - let strings: Vec<(String, f32)> = req.strings - .into_iter() - .map(|s| (s.text, s.score)) - .collect(); - - let mut autocomplete = self.autocomplete.lock().await; - match autocomplete.init(&strings) { - Ok(_) => Ok(Response::new(InitResponse { - success: true, - error: String::new(), - })), - Err(e) => Ok(Response::new(InitResponse { - success: false, - error: e.to_string(), - })), - } - } - - async fn get_stats( - &self, - _request: Request, - ) -> Result, Status> { - let autocomplete = self.autocomplete.lock().await; - let response = StatsResponse { - num_terms: autocomplete.num_terms() as i32, - memory_bytes: autocomplete.bytes() as i64, - }; - - Ok(Response::new(response)) - } -} - -async fn graphql_handler( - State(schema): State, - req: GraphQLRequest, -) -> GraphQLResponse { - 
schema.execute(req.into_inner()).await.into() -} - -async fn graphql_playground() -> impl IntoResponse { - async_graphql::http::playground_source( - async_graphql::http::GraphQLPlaygroundConfig::new("/graphql") - .subscription_endpoint("/graphql") - ) -} - -pub async fn run_server(grpc_addr: &str, graphql_addr: &str) -> Result<(), Box> { - let autocomplete = Arc::new(Mutex::new(Autocomplete::new())); - let schema = create_schema(autocomplete.clone()); - - // Create gRPC service - let grpc_service = AutocompleteServiceImpl { - autocomplete: autocomplete.clone(), - }; - - // Configure CORS - let cors = CorsLayer::new() - .allow_origin("*".parse::().unwrap()) - .allow_methods(Any) - .allow_headers(Any); - - // Create GraphQL router - let app = Router::new() - .route("/graphql", post(graphql_handler)) - .route("/", get(graphql_playground)) - .route("/playground", get(graphql_playground)) - .layer(cors) - .with_state(schema); - - // Start both servers - let grpc_addr = grpc_addr.parse()?; - let graphql_addr = graphql_addr.parse()?; - - println!("gRPC server listening on {}", grpc_addr); - println!("GraphQL server listening on {}", graphql_addr); - println!("GraphQL Playground available at: http://localhost:8000/playground"); - - let (grpc_result, graphql_result) = tokio::join!( - TonicServer::builder() - .add_service(AutocompleteServiceServer::new(grpc_service)) - .serve(grpc_addr), - Server::bind(&graphql_addr).serve(app.into_make_service()) - ); - - // Handle any errors from the servers - if let Err(e) = grpc_result { - return Err(Box::new(e)); - } - if let Err(e) = graphql_result { - return Err(Box::new(e)); - } - - Ok(()) -} \ No newline at end of file From 5923ff306334e85bc3d5246df4d41eabe358134d Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Sun, 25 May 2025 12:28:13 -0400 Subject: [PATCH 101/102] add unit testing --- autocomplete-rs/src/lib.rs | 6 +- autocomplete-rs/tests/dictionary_tests.rs | 119 ++++++++++++++++++++++ 2 files changed, 120 insertions(+), 5 
deletions(-) create mode 100644 autocomplete-rs/tests/dictionary_tests.rs diff --git a/autocomplete-rs/src/lib.rs b/autocomplete-rs/src/lib.rs index 7d1cb44..70048c6 100644 --- a/autocomplete-rs/src/lib.rs +++ b/autocomplete-rs/src/lib.rs @@ -7,8 +7,6 @@ pub mod probe; pub mod string_pool; pub mod index; pub mod autocomplete; -pub mod graphql; -pub mod server; pub use dictionary::Dictionary; pub use types::*; @@ -18,6 +16,4 @@ pub use parameters::*; pub use probe::*; pub use string_pool::*; pub use index::*; -pub use autocomplete::*; -pub use graphql::*; -pub use server::*; \ No newline at end of file +pub use autocomplete::*; \ No newline at end of file diff --git a/autocomplete-rs/tests/dictionary_tests.rs b/autocomplete-rs/tests/dictionary_tests.rs new file mode 100644 index 0000000..1aab6d8 --- /dev/null +++ b/autocomplete-rs/tests/dictionary_tests.rs @@ -0,0 +1,119 @@ +use autocomplete_rs::dictionary::Dictionary; +use autocomplete_rs::types::IdType; + +#[test] +fn test_dictionary_new() { + let dict = Dictionary::new(); + assert!(dict.is_empty()); + assert_eq!(dict.len(), 0); +} + +#[test] +fn test_dictionary_insert() { + let mut dict = Dictionary::new(); + + // Test first insertion + let id1 = dict.insert("hello".to_string()); + assert_eq!(id1, 0); + assert_eq!(dict.len(), 1); + + // Test duplicate insertion + let id2 = dict.insert("hello".to_string()); + assert_eq!(id2, id1); + assert_eq!(dict.len(), 1); + + // Test new insertion + let id3 = dict.insert("world".to_string()); + assert_eq!(id3, 1); + assert_eq!(dict.len(), 2); +} + +#[test] +fn test_dictionary_get() { + let mut dict = Dictionary::new(); + + // Insert test data + let id1 = dict.insert("hello".to_string()); + let id2 = dict.insert("world".to_string()); + + // Test valid gets + assert_eq!(dict.get(id1), Some("hello")); + assert_eq!(dict.get(id2), Some("world")); + + // Test invalid id + assert_eq!(dict.get(999), None); +} + +#[test] +fn test_dictionary_get_id() { + let mut dict = 
Dictionary::new(); + + // Insert test data + let id1 = dict.insert("hello".to_string()); + let id2 = dict.insert("world".to_string()); + + // Test valid gets + assert_eq!(dict.get_id("hello"), Some(id1)); + assert_eq!(dict.get_id("world"), Some(id2)); + + // Test non-existent string + assert_eq!(dict.get_id("nonexistent"), None); +} + +#[test] +fn test_dictionary_len_and_empty() { + let mut dict = Dictionary::new(); + + // Test empty state + assert!(dict.is_empty()); + assert_eq!(dict.len(), 0); + + // Test after insertions + dict.insert("hello".to_string()); + assert!(!dict.is_empty()); + assert_eq!(dict.len(), 1); + + dict.insert("world".to_string()); + assert!(!dict.is_empty()); + assert_eq!(dict.len(), 2); + + // Test duplicate insertion doesn't change length + dict.insert("hello".to_string()); + assert_eq!(dict.len(), 2); +} + +#[test] +fn test_dictionary_id_sequence() { + let mut dict = Dictionary::new(); + + // Test that IDs are assigned sequentially + let id1 = dict.insert("first".to_string()); + let id2 = dict.insert("second".to_string()); + let id3 = dict.insert("third".to_string()); + + assert_eq!(id1, 0); + assert_eq!(id2, 1); + assert_eq!(id3, 2); +} + +#[test] +fn test_dictionary_large_insertions() { + let mut dict = Dictionary::new(); + let num_insertions = 1000; + + // Insert many strings + for i in 0..num_insertions { + let s = format!("string_{}", i); + let id = dict.insert(s); + assert_eq!(id, i as IdType); + } + + assert_eq!(dict.len(), num_insertions); + + // Verify all strings can be retrieved + for i in 0..num_insertions { + let s = format!("string_{}", i); + assert_eq!(dict.get(i as IdType), Some(s.as_str())); + assert_eq!(dict.get_id(&s), Some(i as IdType)); + } +} \ No newline at end of file From ea4b22238019f029e1988fc6bab458464c06af3d Mon Sep 17 00:00:00 2001 From: Ahmed Awadallah Date: Sun, 25 May 2025 18:26:51 -0400 Subject: [PATCH 102/102] Add target directory to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff 
--git a/.gitignore b/.gitignore index 2d7573c..b884c82 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ Thumbs.db *.inverted *.forward *.bin +target/