This repository was archived by the owner on Jan 22, 2021. It is now read-only.

Implement optional expanding of compound words into separate tokens #5

Open · wants to merge 1 commit into master
21 changes: 11 additions & 10 deletions README.md
@@ -101,16 +101,17 @@ Include `finnish` tokenizer and `voikko` filter in your analyzer, for example:

You can use the following filter options to customize the behaviour of the filter:

| Parameter | Default value | Description |
|-------------------|------------------|----------------------------------------------------------------|
| language | fi_FI | Language to use |
| dictionaryPath | system dependent | path to voikko dictionaries |
| analyzeAll | false | Use all analysis possibilities or just the first |
| minimumWordSize | 3 | minimum length of words to analyze |
| maximumWordSize | 100 | maximum length of words to analyze |
| libraryPath | system dependent | path to directory containing libvoikko |
| poolMaxSize | 10 | maximum amount of Voikko-instances to pool |
| analysisCacheSize | 1024 | number of analysis results to cache |
| expandCompounds | false | whether to produce separate tokens for parts of compound words |
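
For example, compound expansion can be enabled alongside the existing options. A minimal sketch in the style of the plugin's test suite (the filter name myFilter and the use of Elasticsearch's Settings.builder() are illustrative assumptions, not part of this PR):

import org.elasticsearch.common.settings.Settings;

// Minimal sketch: a custom voikko filter with compound expansion enabled.
// "myFilter" mirrors the filter name used in the plugin's own tests.
Settings settings = Settings.builder()
        .put("index.analysis.filter.myFilter.type", "voikko")
        .put("index.analysis.filter.myFilter.expandCompounds", true)
        .put("index.analysis.filter.myFilter.minimumSubwordSize", 2)
        .put("index.analysis.filter.myFilter.maximumSubwordSize", 30)
        .build();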
Member:

The minimumSubwordSize and maximumSubwordSize options are not documented here. Is that intentional or accidental? If you consider them to be so rarely needed that they don't need to be documented, that's fine by me. I just want to make sure that they aren't forgotten by accident.
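
For reference, if they were documented, the rows could look like this (defaults taken from VoikkoTokenFilterConfiguration in this PR; the descriptions are illustrative):

| minimumSubwordSize | 2                | minimum length of compound parts to emit                       |
| maximumSubwordSize | 30               | maximum length of compound parts to emit                       |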


## Development

@@ -27,6 +27,8 @@
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Set;
import java.util.LinkedHashSet;
import java.util.Deque;
import java.util.List;
import java.util.regex.Pattern;
@@ -88,7 +90,7 @@ private void analyzeToken() {

        charTermAttribute.setEmpty().append(baseForms.get(0));

        if ((cfg.analyzeAll || cfg.expandCompounds) && baseForms.size() > 1) {
            current = captureState();

            for (String baseForm : baseForms.subList(1, baseForms.size()))
@@ -112,8 +114,17 @@ private List<String> analyzeUncached(String word) {

        for (Analysis result : results) {
            String baseForm = result.get("BASEFORM");
            if (baseForm != null) {
                baseForms.add(baseForm);
            }
        }
        if (cfg.expandCompounds) {
            for (String compound : expandCompounds(results)) {
                baseForms.add(compound);
            }
        }
Member:

This could be just:

if (cfg.expandCompounds)
    baseForms.addAll(expandCompounds(results));

        if (!cfg.analyzeAll) {
            return new ArrayList<String>(new LinkedHashSet<String>(baseForms));
Member:

Couple of notes:

First of all, this is Java 8, so you can leave out the explicit types and say:

return new ArrayList<>(new LinkedHashSet<>(baseForms));

Second of all, I'd probably add a method like CollectionUtils.withoutDuplicates and write just return withoutDuplicates(baseForms) here. There are two reasons for this: the first is communicating the intent more clearly, saying what the code is supposed to do instead of how it's implemented. And second, having a separate method would in fact allow nice little optimizations without complicating the calling code. For example:

public static <T> List<T> withoutDuplicates(List<T> xs) {
    if (xs.size() < 2) return xs;
    return new ArrayList<>(new LinkedHashSet<>(xs));
}

Or even something like:

public static <T> List<T> withoutDuplicates(List<T> xs) {
    if (xs.size() < 2) return xs;

    // for small collections (our usual case) the naive algorithm should be a win
    if (xs.size() < 16) {
        ArrayList<T> result = new ArrayList<>(xs.size());
        for (T x : xs)
            if (!result.contains(x))
                result.add(x);
        return result;
    }

    return new ArrayList<>(new LinkedHashSet<>(xs));
}

Not that I'd probably bother to optimize it, but having the method there makes it nice to write higher-level code on the calling side and know that I have a good place to perform optimizations if I have to get my hands dirty. This code is called a lot, and reducing extra allocations helps.

        }
        return baseForms;
    }
@@ -128,4 +139,82 @@ private void outputAlternative(String token) {
    private boolean isCandidateForAnalyzation(CharSequence word) {
        return word.length() >= cfg.minimumWordSize && word.length() <= cfg.maximumWordSize && VALID_WORD_PATTERN.matcher(word).matches();
    }

    private Set<String> expandCompounds(List<Analysis> analysisList) {
        // Contains code from the Voikko Filter for Solr
        // by the National Library of Finland.
        //
        // https://github.com/NatLibFi/SolrPlugins
        Set<String> compoundForms = new LinkedHashSet<String>();
Member:

Now duplicate removal is performed at two different levels: this method performs it and then the caller performs it again. It's probably best to just leave it to the caller and return a List from here.
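
A minimal sketch of that change (assuming everything else stays as-is, so deduplication happens only in analyzeUncached):

// Hypothetical List-returning variant: keeps insertion order and
// leaves duplicate removal entirely to the caller.
private List<String> expandCompounds(List<Analysis> analysisList) {
    List<String> compoundForms = new ArrayList<String>();
    // ... same parsing loop as below, ending in compoundForms.add(word) ...
    return compoundForms;
}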


        for (Analysis analysis : analysisList) {
            if (!analysis.containsKey("WORDBASES")) {
                continue;
            }
            String wordbases = analysis.get("WORDBASES");
Member:

This performs two lookups on the map: first with containsKey and then with get. It's better to just say:

String wordbases = analysis.get("WORDBASES");
if (wordbases == null) continue;

Almost the only reason to call containsKey is the obscure case where null is a valid value in the map and you need to distinguish between null returned because no mapping was found and null returned because null was stored. Even in that case it's best to call containsKey after the call to get, and only if necessary. So something like:

String value = map.get(key);
if (value != null || map.containsKey(key)) {
    // map contained entry for 'key', value could be null
} else {
    // map did not contain 'key'
}

That said, it's super-rare to encounter maps where null is a valid value.

            // Split by plus sign (unless right after an open parenthesis)
            String[] matches = wordbases.split("(?<!\\()\\+");
Member:

This ends up compiling the same regex over and over again. It's better to perform the expensive compilation of the regex just once:

private static final Pattern WORDBASE_SPLIT = Pattern.compile("(?<!\\()\\+");

and then say:

String[] matches = WORDBASE_SPLIT.split(wordbases);


            int currentPos = 0, lastPos = 0;
            String lastWordBody = "";
            assert matches.length > 1;
            // The string starts with a plus sign, so skip the first (empty) entry.
            for (int i = 1; i <= matches.length - 1; i++) {
Member:

The canonical way of writing this termination condition would be i < matches.length. At first glance I thought that this would do something other than iterate to the end, because the - 1 confused me.

                String wordAnalysis, wordBody, baseForm;

                // Get rid of equals sign in e.g. di=oksidi.
                wordAnalysis = matches[i].replaceAll("=", "");
Member:

Instead of performing this separately for each match, we could perform this normalization just once on the original string before splitting it.

Also worth noting that String.replace(target, replacement) will do the same thing (replace all occurrences of the given target), but won't go through the regex machinery, which is better in this case. (Yes, the methods are confusingly named.)
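
A minimal sketch of both suggestions combined, reusing the precompiled WORDBASE_SPLIT pattern proposed above (names are illustrative):

// Normalize once before splitting: String.replace performs a plain,
// non-regex replacement of every occurrence of "=".
String normalized = wordbases.replace("=", "");
String[] matches = WORDBASE_SPLIT.split(normalized);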

                int parenPos = wordAnalysis.indexOf('(');
                if (parenPos == -1) {
                    wordBody = baseForm = wordAnalysis;
                } else {
                    // Word body is before the parenthesis
                    wordBody = wordAnalysis.substring(0, parenPos);
                    // Base form or derivative is in parenthesis
                    baseForm = wordAnalysis.substring(parenPos + 1, wordAnalysis.length() - 1);
                }

                String word;
                int wordOffset, wordLen;
                boolean isDerivative = baseForm.startsWith("+");
                if (isDerivative) {
                    // Derivative suffix, merge with word body
                    word = lastWordBody + wordBody;
                    wordOffset = lastPos;
                    wordLen = word.length();
                } else {
                    word = baseForm;
                    wordOffset = currentPos;
                    wordLen = word.length();
                    lastWordBody = wordBody;
                    lastPos = currentPos;
                    currentPos += baseForm.length();
                }
Member:

I can't say that I understand the meaning of juggling all these variables, but just by looking at the code above, I can say that at least the assignment to wordLen can be moved outside of the branch, since it's the same in both cases:

int wordLen = word.length();


                // Make sure we don't exceed the length of the original term
                int termLen = charTermAttribute.toString().length();
Member:

Why must we not exceed the length of the original term? What input could cause that to happen and what problems would follow?

Also, the toString() call causes a string to be allocated needlessly; we can just call length() on the original char sequence:

int termLen = charTermAttribute.length();

                if (wordOffset + wordLen > termLen) {
                    if (wordOffset >= termLen) {
                        wordOffset = wordLen - termLen;
                        if (wordOffset < 0) {
                            wordOffset = 0;
                        }
                    } else {
                        wordLen = termLen - wordOffset;
                    }
                }
Member:

The wordOffset = 0; assignment is dead, because the variable is never used after that point on branches reachable from row 200. This means that the condition and the previous assignment are dead as well. So rows 196-205 can be simplified down to:

if (wordOffset < termLen && wordOffset + wordLen > termLen)
    wordLen = termLen - wordOffset;


                int maxSubwordSize = cfg.maximumSubwordSize;
                int minSubwordSize = cfg.minimumSubwordSize;
                if (wordLen > minSubwordSize) {
Member:

If I specify minimum size to be 2, I'd expect that words of length 2 will be included, but this excludes them. So perhaps this should be >= instead of >?

                    if (wordLen > maxSubwordSize) {
                        word = word.substring(0, maxSubwordSize);
Member:

Is it prudent to just cut words to maxSubwordSize, or should we rather ignore words that exceed the length? I could be persuaded that either behavior is correct, but it is somewhat confusing that the semantics of minSubwordSize and maxSubwordSize are so different even though the names are symmetrical.
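
A minimal sketch of a symmetric alternative (an assumption about the intended semantics, not what the PR currently does; it drops over-long subwords instead of truncating them, and makes the lower bound inclusive):

// Hypothetical symmetric check: keep only subwords whose length falls
// inside the inclusive [minimumSubwordSize, maximumSubwordSize] range.
private static boolean isAcceptableSubword(String word, int minSubwordSize, int maxSubwordSize) {
    return word.length() >= minSubwordSize && word.length() <= maxSubwordSize;
}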

                        wordLen = maxSubwordSize;
Member:

This assignment is also dead.

                    }
                    compoundForms.add(word);
                }
            }
        }
        return compoundForms;
Member:

This method is quite complex and it's hard to simply look at it and be assured that it's correct. Furthermore, it's hard to write tests for it because it's tied to running Voikko.

Perhaps the code could be structured along these lines:

private List<String> expandCompounds(List<Analysis> analysisList) {
    List<String> compoundForms = new ArrayList<String>();

    for (Analysis analysis : analysisList) {
        String wordBases = analysis.get("WORDBASES");
        if (wordBases != null)
            compoundForms.addAll(expandWordBases(wordBases));
    }

    return compoundForms;
}

private List<String> expandWordBases(String wordBases) {
    return CompoundWordParser.expandWordBases(wordBases, charTermAttribute.length(), cfg.maximumSubwordSize, cfg.minimumSubwordSize);
}

This way the complex logic would be in a static method for which it would be trivial to write test cases:

assertWordBases("+köyde(köysi)+n+veto(veto)", 
    "köysi", "veto");
assertWordBases("+alkio(Alkio)+-+opisto(opisto)",
    "Alkio", "opisto");
assertWordBases("+kansa(kansa)+llis(+llinen)+eepos(eepos)",
    "kansa", "kansallis", "eepos");

I copy-pasted your code into a scratch file, hacked it a bit to make it callable from a test, checked what it gives back for the example inputs above, and then wrote the expected results based on what the code gave me. Do those look right? I'm surprised that "kansallis" is in the results. Wouldn't the correct value be "kansallinen"?

Furthermore, I'm a bit confused about whether the logic can be right, because the only reason the input +köyde(köysi)+n+veto(veto) does not produce the nonsensical output köysi, n, veto is that n is rejected for being too short. I guess that "n" should be rejected because it does not have a specified root, not because it's too short. If this is fixed, perhaps the minimum and maximum lengths are not needed at all?

All that said, here's my attempt at this function:

private static final Pattern WORDBASE = Pattern.compile("\\+([^+(]+)(\\(([^)]*)\\))?");

static List<String> expandWordBases(String wordbases) {
    Matcher m = WORDBASE.matcher(wordbases.replace("=", ""));

    String lastBody = "";
    List<String> result = new ArrayList<>();
    while (m.find()) {
        String body = m.group(1);
        String root = m.group(3);

        if (root != null) {
            if (root.startsWith("+")) {
                result.add(lastBody + root.substring(1));

            } else {
                result.add(root);
                lastBody = body;
            }
        }
    }
    return result;
}

And some tests to go with it:

@Test
void voikkoExamples() {
    assertWordBases("+köyde(köysi)+n+veto(veto)", "köysi", "veto");
    assertWordBases("+alkio(Alkio)+-+opisto(opisto)", "Alkio", "opisto");
    assertWordBases("+kansa(kansa)+llis(+llinen)+eepos(eepos)", "kansa", "kansallinen", "eepos");
}

private static void assertWordBases(String input, String... expected) {
    assertEquals(asList(expected), Foo.expandWordBases(input));
}

    }
}
@@ -27,4 +27,13 @@ final class VoikkoTokenFilterConfiguration {
    /** Words longer than this threshold are ignored */
    int maximumWordSize = 100;

    /** If true, include parts of compound words as alternatives to the whole word */
    boolean expandCompounds = false;

    /** Subwords (parts of compound words) shorter than this threshold are ignored */
    int minimumSubwordSize = 2;

    /** Subwords longer than this threshold are ignored */
    int maximumSubwordSize = 30;

}
@@ -41,6 +41,9 @@ public VoikkoTokenFilterFactory(IndexSettings indexSettings,
        cfg.analyzeAll = settings.getAsBoolean("analyzeAll", cfg.analyzeAll);
        cfg.minimumWordSize = settings.getAsInt("minimumWordSize", cfg.minimumWordSize);
        cfg.maximumWordSize = settings.getAsInt("maximumWordSize", cfg.maximumWordSize);
        cfg.expandCompounds = settings.getAsBoolean("expandCompounds", cfg.expandCompounds);
        cfg.minimumSubwordSize = settings.getAsInt("minimumSubwordSize", cfg.minimumSubwordSize);
        cfg.maximumSubwordSize = settings.getAsInt("maximumSubwordSize", cfg.maximumSubwordSize);

        analysisCache = new AnalysisCache(settings.getAsInt("analysisCacheSize", 1024));

@@ -138,6 +138,24 @@ public void testCompoundWordsWithHyphens() {
assertTokens("rippi-isälle", token("rippi-isälle", "rippi-isä", 1));
}

    public void testExpandedCompoundWords() {
        settings.put("index.analysis.filter.myFilter.expandCompounds", true);
        assertTokens("isoisälle", token("isoisälle", "isoisä", 1));
        assertTokens("tekokuusta keinokuuhun",
                token("tekokuusta", "tekokuu", 1),
                token("tekokuusta", "tekokuusi", 0),
                token("tekokuusta", "teko", 0),
                token("tekokuusta", "kuu", 0),
                token("tekokuusta", "kuusi", 0),
                token("keinokuuhun", "keinokuu", 1),
                token("keinokuuhun", "keino", 0),
                token("keinokuuhun", "kuu", 0));
        assertTokens("hammaslääkäri",
                token("hammaslääkäri", "hammaslääkäri", 1),
                token("hammaslääkäri", "hammas", 0),
                token("hammaslääkäri", "lääkäri", 0));
    }

    private static TokenData token(String original, String token, int positionIncrement) {
        return new TokenData(original, token, positionIncrement);
    }