diff --git a/corpus_transform.py b/corpus_transform.py index 975e7dfa55..84da6e7a3e 100644 --- a/corpus_transform.py +++ b/corpus_transform.py @@ -14,12 +14,22 @@ def transform(text): except ValueError: continue - if doc["url"] == "": + url = doc["url"] + if url == "": continue + filters = [] + id_hash = hash(doc["url"]) + if id_hash % 10 == 3: + filters.append("10%") + if id_hash % 100 == 42: + filters.append("1%") + doc_transformed = { - "id": doc["url"], + "id": url, "text": transform(doc["body"]) } + if len(filters) > 0: + doc_transformed["filters"] = filters print(json.dumps(doc_transformed)) diff --git a/engines/lucene-10.0.0-bp/src/main/java/BuildIndex.java b/engines/lucene-10.0.0-bp/src/main/java/BuildIndex.java index f5db1f18b2..73100ffe21 100644 --- a/engines/lucene-10.0.0-bp/src/main/java/BuildIndex.java +++ b/engines/lucene-10.0.0-bp/src/main/java/BuildIndex.java @@ -15,6 +15,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -26,7 +27,9 @@ import org.apache.lucene.util.ThreadInterruptedException; import com.eclipsesource.json.Json; +import com.eclipsesource.json.JsonArray; import com.eclipsesource.json.JsonObject; +import com.eclipsesource.json.JsonValue; public class BuildIndex { @@ -37,11 +40,11 @@ public static void main(String[] args) throws Exception { final IndexWriterConfig config = new IndexWriterConfig(standardAnalyzer) .setRAMBufferSizeMB(1024) .setOpenMode(OpenMode.CREATE); - final BPIndexReorderer reorderer = new BPIndexReorderer(); - reorderer.setRAMBudgetMB(256); - final BPReorderingMergePolicy mp = new BPReorderingMergePolicy(config.getMergePolicy(), reorderer); - mp.setMinNaturalMergeNumDocs(Integer.MAX_VALUE); // only reorder at force-merge time - config.setMergePolicy(mp); + final BPIndexReorderer reorderer = new BPIndexReorderer(); + reorderer.setRAMBudgetMB(256); + final BPReorderingMergePolicy mp = new BPReorderingMergePolicy(config.getMergePolicy(), reorderer); + mp.setMinNaturalMergeNumDocs(Integer.MAX_VALUE); // only reorder at force-merge time + config.setMergePolicy(mp); try (Directory dir = FSDirectory.open(outputPath); IndexWriter writer = new IndexWriter(dir, config); @@ -53,13 +56,9 @@ public static void main(String[] args) throws Exception { final AtomicInteger indexed = new AtomicInteger(); for (int i = 0; i < threads.length; ++i) { - final Document document = new Document(); StoredField idField = new StoredField("id", ""); TextField textField = new TextField("text", "", Field.Store.NO); - document.add(idField); - document.add(textField); - threads[i] = new Thread(() -> { while (true) { String line; @@ -83,13 +82,25 @@ public static void main(String[] args) throws Exception { final JsonObject parsed_doc = Json.parse(line).asObject(); final String id = parsed_doc.get("id").asString(); final String text = parsed_doc.get("text").asString(); + final JsonValue filters = parsed_doc.get("filters"); idField.setStringValue(id); textField.setStringValue(text); + + Document document = new Document(); + document.add(idField); + document.add(textField); + if (filters != null) { + JsonArray filterArray = filters.asArray(); + for (int j = 0; j < filterArray.size(); ++j) { + document.add(new StringField("filters", filterArray.get(j).asString(), Field.Store.NO)); + } + } + try { writer.addDocument(document); final int numIndexed = indexed.getAndIncrement(); if (numIndexed % 100_000 == 0) { - System.out.println("Indexed: " + numIndexed); + System.out.println("Indexed: " + numIndexed); } } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/engines/lucene-10.0.0-bp/src/main/java/DoQuery.java b/engines/lucene-10.0.0-bp/src/main/java/DoQuery.java index 57a6203538..73bf377bf2 100644 --- a/engines/lucene-10.0.0-bp/src/main/java/DoQuery.java +++ b/engines/lucene-10.0.0-bp/src/main/java/DoQuery.java @@ -8,15 +8,23 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopScoreDocCollectorManager; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; public class DoQuery { + + private static final String TEN_PERCENT_FILTER = "_FILTER_10%"; + private static final String ONE_PERCENT_FILTER = "_FILTER_1%"; + public static void main(String[] args) throws IOException, ParseException { final Path indexDir = Paths.get(args[0]); try (IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir)); @@ -29,9 +37,18 @@ public static void main(String[] args) throws IOException, ParseException { while ((line = bufferedReader.readLine()) != null) { final String[] fields = line.trim().split("\t"); assert fields.length == 2; - final String command = fields[0]; + String command = fields[0]; final String query_str = fields[1]; Query query = queryParser.parse(query_str); + if (command.endsWith(TEN_PERCENT_FILTER)) { + Query filter = new TermQuery(new Term("filters", "10%")); + command = command.substring(0, command.length() - TEN_PERCENT_FILTER.length()); + query = new BooleanQuery.Builder().add(query, Occur.MUST).add(filter, Occur.FILTER).build(); + } else if (command.endsWith(ONE_PERCENT_FILTER)) { + Query filter = new TermQuery(new Term("filters", "1%")); + command = command.substring(0, command.length() - ONE_PERCENT_FILTER.length()); + query = new BooleanQuery.Builder().add(query, Occur.MUST).add(filter, Occur.FILTER).build(); + } final long count; switch (command) { case "COUNT": diff --git a/engines/lucene-10.0.0/src/main/java/BuildIndex.java b/engines/lucene-10.0.0/src/main/java/BuildIndex.java index 2a7d7d4871..cd287ff3fb 100644 --- a/engines/lucene-10.0.0/src/main/java/BuildIndex.java +++ b/engines/lucene-10.0.0/src/main/java/BuildIndex.java @@ -15,6 +15,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -24,7 +25,9 @@ import org.apache.lucene.util.ThreadInterruptedException; import com.eclipsesource.json.Json; +import com.eclipsesource.json.JsonArray; import com.eclipsesource.json.JsonObject; +import com.eclipsesource.json.JsonValue; public class BuildIndex { @@ -46,13 +49,9 @@ public static void main(String[] args) throws Exception { final AtomicInteger indexed = new AtomicInteger(); for (int i = 0; i < threads.length; ++i) { - final Document document = new Document(); StoredField idField = new StoredField("id", ""); TextField textField = new TextField("text", "", Field.Store.NO); - document.add(idField); - document.add(textField); - threads[i] = new Thread(() -> { while (true) { String line; @@ -76,13 +75,25 @@ public static void main(String[] args) throws Exception { final JsonObject parsed_doc = Json.parse(line).asObject(); final String id = parsed_doc.get("id").asString(); final String text = parsed_doc.get("text").asString(); + final JsonValue filters = parsed_doc.get("filters"); idField.setStringValue(id); textField.setStringValue(text); + + Document document = new Document(); + document.add(idField); + document.add(textField); + if (filters != null) { + JsonArray filterArray = filters.asArray(); + for (int j = 0; j < filterArray.size(); ++j) { + document.add(new StringField("filters", filterArray.get(j).asString(), Field.Store.NO)); + } + } + try { writer.addDocument(document); final int numIndexed = indexed.getAndIncrement(); if (numIndexed % 100_000 == 0) { - System.out.println("Indexed: " + numIndexed); + System.out.println("Indexed: " + numIndexed); } } catch (IOException e) { throw new UncheckedIOException(e); diff --git a/engines/lucene-10.0.0/src/main/java/DoQuery.java b/engines/lucene-10.0.0/src/main/java/DoQuery.java index 57a6203538..73bf377bf2 100644 --- a/engines/lucene-10.0.0/src/main/java/DoQuery.java +++ b/engines/lucene-10.0.0/src/main/java/DoQuery.java @@ -8,15 +8,23 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopScoreDocCollectorManager; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.store.FSDirectory; public class DoQuery { + + private static final String TEN_PERCENT_FILTER = "_FILTER_10%"; + private static final String ONE_PERCENT_FILTER = "_FILTER_1%"; + public static void main(String[] args) throws IOException, ParseException { final Path indexDir = Paths.get(args[0]); try (IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir)); @@ -29,9 +37,18 @@ public static void main(String[] args) throws IOException, ParseException { while ((line = bufferedReader.readLine()) != null) { final String[] fields = line.trim().split("\t"); assert fields.length == 2; - final String command = fields[0]; + String command = fields[0]; final String query_str = fields[1]; Query query = queryParser.parse(query_str); + if (command.endsWith(TEN_PERCENT_FILTER)) { + Query filter = new TermQuery(new Term("filters", "10%")); + command = command.substring(0, command.length() - TEN_PERCENT_FILTER.length()); + query = new BooleanQuery.Builder().add(query, Occur.MUST).add(filter, Occur.FILTER).build(); + } else if (command.endsWith(ONE_PERCENT_FILTER)) { + Query filter = new TermQuery(new Term("filters", "1%")); + command = command.substring(0, command.length() - ONE_PERCENT_FILTER.length()); + query = new BooleanQuery.Builder().add(query, Occur.MUST).add(filter, Occur.FILTER).build(); + } final long count; switch (command) { case "COUNT":