Implement optional expanding of compound words into separate tokens #5
base: master
@@ -27,6 +27,8 @@
 import java.io.IOException;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
+import java.util.Set;
+import java.util.LinkedHashSet;
 import java.util.Deque;
 import java.util.List;
 import java.util.regex.Pattern;
@@ -88,7 +90,7 @@ private void analyzeToken() {

        charTermAttribute.setEmpty().append(baseForms.get(0));

-       if (cfg.analyzeAll && baseForms.size() > 1) {
+       if ((cfg.analyzeAll || cfg.expandCompounds) && baseForms.size() > 1) {
            current = captureState();

            for (String baseForm : baseForms.subList(1, baseForms.size()))
@@ -112,8 +114,17 @@ private List<String> analyzeUncached(String word) {

        for (Analysis result : results) {
            String baseForm = result.get("BASEFORM");
-           if (baseForm != null)
+           if (baseForm != null) {
                baseForms.add(baseForm);
+           }
        }
+       if (cfg.expandCompounds) {
+           for (String compound : expandCompounds(results)) {
+               baseForms.add(compound);
+           }
+       }
Review comment: This could be just:

    if (cfg.expandCompounds)
        baseForms.addAll(expandCompounds(results));
+       if (!cfg.analyzeAll) {
+           return new ArrayList<String>(new LinkedHashSet<String>(baseForms));
Review comment: A couple of notes. First of all, this is Java 8, so you can leave out the explicit types and say:

    return new ArrayList<>(new LinkedHashSet<>(baseForms));

Second of all, I'd probably add a method like:

    public static <T> List<T> withoutDuplicates(List<T> xs) {
        if (xs.size() < 2) return xs;
        return new ArrayList<>(new LinkedHashSet<>(xs));
    }

Or even something like:

    public static <T> List<T> withoutDuplicates(List<T> xs) {
        if (xs.size() < 2) return xs;
        // for small collections (our usual case) the naive algorithm should be a win
        if (xs.size() < 16) {
            ArrayList<T> result = new ArrayList<>(xs.size());
            for (T x : xs)
                if (!result.contains(x))
                    result.add(x);
            return result;
        }
        return new ArrayList<>(new LinkedHashSet<>(xs));
    }

Not that I'd probably bother to optimize it, but having it makes it nice to write higher-level code on the calling side and know that there is a good place to perform optimization if I have to get my hands dirty. This code is called a lot, and reducing extra allocations helps.
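As a rough sketch (not part of the patch), with a withoutDuplicates helper along those lines the call site above could then shrink to:

    if (!cfg.analyzeAll)
        return withoutDuplicates(baseForms);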
+       }
        return baseForms;
    }
@@ -128,4 +139,82 @@ private void outputAlternative(String token) {
    private boolean isCandidateForAnalyzation(CharSequence word) {
        return word.length() >= cfg.minimumWordSize && word.length() <= cfg.maximumWordSize && VALID_WORD_PATTERN.matcher(word).matches();
    }
+
+   private Set<String> expandCompounds(List<Analysis> analysisList) {
+       // Contains code from the Voikko Filter for Solr
+       // by the National Library of Finland.
+       //
+       // https://github.com/NatLibFi/SolrPlugins
+       Set<String> compoundForms = new LinkedHashSet<String>();
Review comment: Now duplicate removal is performed at two different levels: this method performs it and the caller performs it again. It's probably best to just leave it to the caller and return a plain List from here.
+
+       for (Analysis analysis : analysisList) {
+           if (!analysis.containsKey("WORDBASES")) {
+               continue;
+           }
+           String wordbases = analysis.get("WORDBASES");
Review comment: This performs two lookups into the map: first with containsKey and then with get. A single get is enough:

    String wordbases = analysis.get("WORDBASES");
    if (wordbases == null) continue;

Almost the only reason to call containsKey before get is when the map may contain null values. That said, it's super-rare to work with maps where that is actually the case.
+
+           // Split by plus sign (unless right after an open parenthesis)
+           String matches[] = wordbases.split("(?<!\\()\\+");
Review comment: This ends up compiling the same regex over and over again. It's better to perform the expensive compilation of the regex just once:

    private static final Pattern WORDBASE_SPLIT = Pattern.compile("(?<!\\()\\+");

and then say:

    String[] matches = WORDBASE_SPLIT.split(wordbases);
+
+           int currentPos = 0, lastPos = 0;
+           String lastWordBody = "";
+           assert matches.length > 1;
+           // The string starts with a plus sign, so skip the first (empty) entry.
+           for (int i = 1; i <= matches.length - 1; i++) {
Review comment: The canonical way of writing this termination condition would be i < matches.length.
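For reference, a sketch of the loop header written that way (behaviour unchanged):

    for (int i = 1; i < matches.length; i++) {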
+
+               String wordAnalysis, wordBody, baseForm;
+
+               // Get rid of equals sign in e.g. di=oksidi.
+               wordAnalysis = matches[i].replaceAll("=", "");;
Review comment: Instead of performing this separately for each match, we could perform this normalization just once on the original string before splitting it. Also worth noting that replaceAll treats its first argument as a regex; a plain replace is enough for a literal "=".
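A minimal sketch of that suggestion, reusing the precompiled WORDBASE_SPLIT pattern proposed in the earlier comment; the per-match replaceAll inside the loop would then go away:

    // Strip the '=' signs once from the whole WORDBASES string before splitting.
    // String.replace performs a literal replacement, so no regex is compiled here.
    String[] matches = WORDBASE_SPLIT.split(wordbases.replace("=", ""));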
+
+               int parenPos = wordAnalysis.indexOf('(');
+               if (parenPos == -1) {
+                   wordBody = baseForm = wordAnalysis;
+               } else {
+                   // Word body is before the parenthesis
+                   wordBody = wordAnalysis.substring(0, parenPos);
+                   // Base form or derivative is in parenthesis
+                   baseForm = wordAnalysis.substring(parenPos + 1, wordAnalysis.length() - 1);
+               }
+
+               String word;
+               int wordOffset, wordLen;
+               boolean isDerivative = baseForm.startsWith("+");
+               if (isDerivative) {
+                   // Derivative suffix, merge with word body
+                   word = lastWordBody + wordBody;
+                   wordOffset = lastPos;
+                   wordLen = word.length();
+               } else {
+                   word = baseForm;
+                   wordOffset = currentPos;
+                   wordLen = word.length();
+                   lastWordBody = wordBody;
+                   lastPos = currentPos;
+                   currentPos += baseForm.length();
+               }
Review comment: I can't say that I understand the meaning of juggling all these variables, but just by looking at the code above, I can say that at least the assignment to wordLen is the same in both branches and could be moved after the if/else:

    int wordLen = word.length();
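Applied to the branch above, that hoisting might look roughly like this (a sketch, not part of the patch; the wordLen part of the earlier declaration would then be dropped):

    boolean isDerivative = baseForm.startsWith("+");
    if (isDerivative) {
        // Derivative suffix, merge with word body
        word = lastWordBody + wordBody;
        wordOffset = lastPos;
    } else {
        word = baseForm;
        wordOffset = currentPos;
        lastWordBody = wordBody;
        lastPos = currentPos;
        currentPos += baseForm.length();
    }
    // Identical in both branches, so computed once here.
    int wordLen = word.length();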
+
+               // Make sure we don't exceed the length of the original term
+               int termLen = charTermAttribute.toString().length();
Review comment: Why must we not exceed the length of the original term? What input could cause that to happen, and what problems would follow? Also, the toString() call is unnecessary here; the attribute can report its length directly:

    int termLen = charTermAttribute.length();
+
+               if (wordOffset + wordLen > termLen) {
+                   if (wordOffset >= termLen) {
+                       wordOffset = wordLen - termLen;
+                       if (wordOffset < 0) {
+                           wordOffset = 0;
+                       }
+                   } else {
+                       wordLen = termLen - wordOffset;
+                   }
+               }
Review comment: The wordOffset adjusted in the first branch is never read after this block, so the whole thing could be reduced to:

    if (wordOffset < termLen && wordOffset + wordLen > termLen)
        wordLen = termLen - wordOffset;
+
+               int maxSubwordSize = cfg.maximumSubwordSize;
+               int minSubwordSize = cfg.minimumSubwordSize;
+               if (wordLen > minSubwordSize) {
Review comment: If I specify the minimum size to be 2, I'd expect words of length 2 to be included, but this excludes them. So perhaps this should be wordLen >= minSubwordSize?
+
+                   if (wordLen > maxSubwordSize) {
+                       word = word.substring(0, maxSubwordSize);
Review comment: It is prudent to just cut words to maximumSubwordSize?
+                       wordLen = maxSubwordSize;
Review comment: This assignment is also dead.
+                   }
+                   compoundForms.add(word);
+               }
+           }
+       }
+       return compoundForms;
Review comment: This method is quite complex, and it's hard to simply look at it and be assured that it's correct. Furthermore, it's hard to write tests for it because it's tied to running Voikko. Perhaps the code could be structured along these lines:

    private List<String> expandCompounds(List<Analysis> analysisList) {
        List<String> compoundForms = new ArrayList<String>();
        for (Analysis analysis : analysisList) {
            String wordBases = analysis.get("WORDBASES");
            if (wordBases != null)
                compoundForms.addAll(expandWordBases(wordBases));
        }
        return compoundForms;
    }

    private List<String> expandWordBases(String wordBases) {
        return CompoundWordParser.expandWordBases(wordBases, charTermAttribute.length(), cfg.maximumSubwordSize, cfg.minimumSubwordSize);
    }

This way the complex logic would be in a static method for which it would be trivial to write test cases:

    assertWordBases("+köyde(köysi)+n+veto(veto)",
                    "köysi", "veto");
    assertWordBases("+alkio(Alkio)+-+opisto(opisto)",
                    "Alkio", "opisto");
    assertWordBases("+kansa(kansa)+llis(+llinen)+eepos(eepos)",
                    "kansa", "kansallis", "eepos");

I copy-pasted your code into a scratch file, hacked it a bit to make it callable from a test, checked what it gives back for the example inputs above, and then wrote the expected results based on what the code gave me. Do those look as expected? I'm surprised that "kansallis" is in the results. Wouldn't the correct value be "kansallinen"? Furthermore, I'm a bit confused whether the logic can be right, because the only reason why the input …

All that said, here's my attempt at this function:

    private static final Pattern WORDBASE = Pattern.compile("\\+([^+(]+)(\\(([^)]*)\\))?");

    static List<String> expandWordBases(String wordbases) {
        Matcher m = WORDBASE.matcher(wordbases.replace("=", ""));
        String lastBody = "";
        List<String> result = new ArrayList<>();
        while (m.find()) {
            String body = m.group(1);
            String root = m.group(3);
            if (root != null) {
                if (root.startsWith("+")) {
                    result.add(lastBody + root.substring(1));
                } else {
                    result.add(root);
                    lastBody = body;
                }
            }
        }
        return result;
    }

And some tests to go with it:

    @Test
    void voikkoExamples() {
        assertWordBases("+köyde(köysi)+n+veto(veto)", "köysi", "veto");
        assertWordBases("+alkio(Alkio)+-+opisto(opisto)", "Alkio", "opisto");
        assertWordBases("+kansa(kansa)+llis(+llinen)+eepos(eepos)", "kansa", "kansallinen", "eepos");
    }

    private static void assertWordBases(String input, String... expected) {
        assertEquals(asList(expected), Foo.expandWordBases(input));
    }
+   }
 }
Review comment: The minimumSubwordSize and maximumSubwordSize options are not documented here. Is that intentional or accidental? If you consider them to be so rarely needed that they don't need to be documented, that's fine by me. I just want to make sure they aren't forgotten by accident.