diff --git a/Cargo.lock b/Cargo.lock index ed98581..e09cf7a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -170,15 +170,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "autocfg" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dde43e75fd43e8a1bf86103336bc699aa8d17ad1be60c76c0bdfd4828e19b78" -dependencies = [ - "autocfg 1.4.0", -] - [[package]] name = "autocfg" version = "1.4.0" @@ -243,7 +234,7 @@ dependencies = [ "comde", "fastvlq", "log", - "memmap2 0.9.5", + "memmap2", "pathdiff", "relative-path", "serde_json", @@ -333,7 +324,7 @@ dependencies = [ "glob", "indicatif 0.16.2", "log", - "rand 0.8.5", + "rand", "reqwest", "serde", "serde_json", @@ -358,9 +349,8 @@ dependencies = [ [[package]] name = "cffi" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5e4ef6239aac8a2d645d60f08cc345889659f64c815ce204de14e6ffc2b52ed" +version = "0.2.0-dev" +source = "git+https://github.com/cffi-rs/cffi#ee4a9f5a5bcf72164831650b23d9dc0d5618a04e" dependencies = [ "cffi-impl", "libc", @@ -369,20 +359,19 @@ dependencies = [ [[package]] name = "cffi-impl" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5991ed1ca79f668096f267671e6035156f23871a8a2dbd88a38dd43a8c73c68" +version = "0.2.0-dev" +source = "git+https://github.com/cffi-rs/cffi#ee4a9f5a5bcf72164831650b23d9dc0d5618a04e" dependencies = [ "ctor", "darling", - "heck 0.3.3", + "heck 0.4.1", "log", "phf", "phf_codegen", "pretty_env_logger", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.101", ] [[package]] @@ -470,15 +459,6 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "colorchoice" version = "1.0.3" @@ -619,12 +599,12 @@ dependencies = [ [[package]] name = "ctor" -version = "0.1.26" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096" +checksum = "ad291aa74992b9b7a7e88c38acbbf6ad7e107f1d90ee8775b7bc1fc3394f485c" dependencies = [ "quote", - "syn 1.0.109", + "syn 2.0.101", ] [[package]] @@ -659,9 +639,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.10.2" +version = "0.20.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858" +checksum = "54e36fcd13ed84ffdfda6f5be89b31287cbb80c439841fe69e04841435464391" dependencies = [ "darling_core", "darling_macro", @@ -669,27 +649,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.10.2" +version = "0.20.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b" +checksum = "9c2cf1c23a687a1feeb728783b993c4e1ad83d99f351801977dd809b48d0a70f" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", - "strsim 0.9.3", - "syn 1.0.109", + "strsim 0.10.0", + "syn 2.0.101", ] [[package]] name = "darling_macro" -version = "0.10.2" +version = "0.20.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72" +checksum = "a668eda54683121533a393014d8692171709ff57a7d61f187b6e782719f8933f" dependencies = [ "darling_core", "quote", - "syn 1.0.109", + "syn 2.0.101", ] [[package]] @@ -768,17 +748,17 @@ dependencies = [ "byteorder", "cffi", "eieio", - "env_logger 0.9.3", + "env_logger 0.11.2", "flatbuffers", "fs_extra", "globwalk", "hashbrown 0.11.2", - "itertools", + "itertools 0.12.1", "language-tags", "libc", "lifeguard", "log", - "memmap2 0.5.10", + "memmap2", "parking_lot", "pathos", "rust-bert", @@ -787,7 +767,7 @@ dependencies = [ "serde-xml-rs", "serde_json", "smol_str", - "strsim 0.10.0", + "strsim 0.11.1", "tch", "tempfile", "thiserror", @@ -842,31 +822,41 @@ dependencies = [ ] [[package]] -name = "env_logger" -version = "0.7.1" +name = "env_filter" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" dependencies = [ - "atty", - "humantime 1.3.0", "log", "regex", - "termcolor", ] [[package]] name = "env_logger" -version = "0.9.3" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" dependencies = [ - "atty", - "humantime 2.2.0", + "humantime", + "is-terminal", "log", "regex", "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -984,12 +974,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - [[package]] name = "futures-channel" version = "0.3.31" @@ -1098,11 +1082,11 @@ dependencies = [ [[package]] name = "globwalk" -version = "0.8.1" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93e3af942408868f6934a7b85134a3230832b9977cf66125df2f9edcfce4ddcc" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.9.0", "ignore", "walkdir", ] @@ -1186,6 +1170,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -1207,6 +1197,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" + [[package]] name = "hmac" version = "0.12.1" @@ -1250,15 +1246,6 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" -[[package]] -name = "humantime" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" -dependencies = [ - "quick-error", -] - [[package]] name = "humantime" version = "2.2.0" @@ -1556,6 +1543,17 @@ dependencies = [ "smallvec", ] +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi 0.5.1", + "libc", + "windows-sys 0.59.0", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -1571,6 +1569,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -1683,7 +1690,7 @@ version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ - "autocfg 1.4.0", + "autocfg", "scopeguard", ] @@ -1711,7 +1718,7 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" dependencies = [ - "autocfg 1.4.0", + "autocfg", "rawpointer", ] @@ -1721,15 +1728,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "memmap2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] - [[package]] name = "memmap2" version = "0.9.5" @@ -1825,7 +1823,7 @@ version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ - "autocfg 1.4.0", + "autocfg", ] [[package]] @@ -1956,7 +1954,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" dependencies = [ "base64ct", - "rand_core 0.6.4", + "rand_core", "subtle", ] @@ -2013,18 +2011,18 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" -version = "0.7.24" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.7.24" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" dependencies = [ "phf_generator", "phf_shared", @@ -2032,19 +2030,19 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.7.24" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" dependencies = [ "phf_shared", - "rand 0.6.5", + "rand", ] [[package]] name = "phf_shared" -version = "0.7.24" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" dependencies = [ "siphasher", ] @@ -2084,11 +2082,11 @@ dependencies = [ [[package]] name = "pretty_env_logger" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "926d36b9553851b8b0005f1275891b392ee4d2d833852c417ed025477350fb9d" +checksum = "865724d4dbe39d9f3dd3b52b88d859d66bcb2d6a0acfd5ea68a65fb66d4bdc1c" dependencies = [ - "env_logger 0.7.1", + "env_logger 0.10.2", "log", ] @@ -2131,12 +2129,6 @@ version = "2.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf7e6d18738ecd0902d30d1ad232c9125985a3422929b16c65517b38adc14f96" -[[package]] -name = "quick-error" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" - [[package]] name = "quote" version = "1.0.40" @@ -2152,25 +2144,6 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -dependencies = [ - "autocfg 0.1.8", - "libc", - "rand_chacha 0.1.1", - "rand_core 0.4.2", - "rand_hc", - "rand_isaac", - "rand_jitter", - "rand_os", - "rand_pcg", - "rand_xorshift", - "winapi", -] - [[package]] name = "rand" version = "0.8.5" @@ -2178,18 +2151,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.3.1", + "rand_chacha", + "rand_core", ] [[package]] @@ -2199,24 +2162,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", + "rand_core", ] -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - [[package]] name = "rand_core" version = "0.6.4" @@ -2226,68 +2174,6 @@ dependencies = [ "getrandom 0.2.16", ] -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_jitter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" -dependencies = [ - "libc", - "rand_core 0.4.2", - "winapi", -] - -[[package]] -name = "rand_os" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" -dependencies = [ - "cloudabi", - "fuchsia-cprng", - "libc", - "rand_core 0.4.2", - "rdrand", - "winapi", -] - -[[package]] -name = "rand_pcg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" -dependencies = [ - "autocfg 0.1.8", - "rand_core 0.4.2", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "rawpointer" version = "0.2.1" @@ -2314,15 +2200,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "redox_syscall" version = "0.2.16" @@ -2463,7 +2340,7 @@ checksum = "196e3b77b07fd5bfcbc8187ecaef5d5931820d9abd6c3fe0a9dc6d3ddb035d72" dependencies = [ "csv", "hashbrown 0.12.3", - "itertools", + "itertools 0.10.5", "lazy_static", "protobuf", "rayon", @@ -2573,9 +2450,9 @@ dependencies = [ [[package]] name = "serde-xml-rs" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65162e9059be2f6a3421ebbb4fef3e74b7d9e7c60c50a0e292c6239f19f1edfa" +checksum = "fb3aa78ecda1ebc9ec9847d5d3aba7d618823446a049ba2491940506da6e2782" dependencies = [ "log", "serde", @@ -2648,9 +2525,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "siphasher" -version = "0.2.3" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" [[package]] name = "slab" @@ -2658,7 +2535,7 @@ version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" dependencies = [ - "autocfg 1.4.0", + "autocfg", ] [[package]] @@ -2669,9 +2546,9 @@ checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "smol_str" -version = "0.1.24" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fad6c857cbab2627dcf01ec85a623ca4e7dcb5691cbaa3d7fb7653671f0d09c9" +checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49" dependencies = [ "serde", ] @@ -2704,12 +2581,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" -[[package]] -name = "strsim" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c" - [[package]] name = "strsim" version = "0.10.0" @@ -2833,7 +2704,7 @@ dependencies = [ "lazy_static", "libc", "ndarray", - "rand 0.8.5", + "rand", "thiserror", "torch-sys", "zip 0.5.13", diff --git a/accuracy/Cargo.toml b/accuracy/Cargo.toml index e6d3de0..3180bc8 100644 --- a/accuracy/Cargo.toml +++ b/accuracy/Cargo.toml @@ -15,7 +15,7 @@ rayon = { version = "1.4.0" } indicatif = { version = "0.15", features = ["with_rayon"] } # box-format = { git = "https://github.com/bbqsrc/box", branch = "master" } # tempdir = "0.3.7" -pretty_env_logger = "0.4.0" +pretty_env_logger = "0.5.0" # ctor = "*" # gumdrop = "0.8.0" # thiserror = "1.0.20" diff --git a/accuracy/src/main.rs b/accuracy/src/main.rs index 03b2b4d..6fcc3f1 100644 --- a/accuracy/src/main.rs +++ b/accuracy/src/main.rs @@ -46,6 +46,7 @@ static CFG: SpellerConfig = SpellerConfig { beam: None, reweight: Some(ReweightingConfig::default_const()), node_pool_size: 128, + continuation_marker: None, recase: true, }; diff --git a/divvunspell-bin/Cargo.toml b/divvunspell-bin/Cargo.toml index 3606464..8a997ab 100644 --- a/divvunspell-bin/Cargo.toml +++ b/divvunspell-bin/Cargo.toml @@ -16,7 +16,7 @@ serde = { version = "1.0.116", features = ["derive"] } serde_json = "1.0.57" divvunspell = { version = "1.0.0-beta.5", features = ["internal_convert", "compression"], path = "../divvunspell" } box-format = { version = "0.3.2", features = ["reader"], default-features = false } -pretty_env_logger = "0.4.0" +pretty_env_logger = "0.5.0" gumdrop = "0.8.0" anyhow = "1.0.32" structopt = "0.3.17" diff --git a/divvunspell-bin/src/main.rs b/divvunspell-bin/src/main.rs index 77295cd..14a34dd 100644 --- a/divvunspell-bin/src/main.rs +++ b/divvunspell-bin/src/main.rs @@ -1,9 +1,14 @@ use std::io::{self, Read}; +use std::process; use std::{ path::{Path, PathBuf}, sync::Arc, }; +use divvunspell::speller::HfstSpeller; +use divvunspell::transducer::hfst::HfstTransducer; +use divvunspell::transducer::Transducer; +use divvunspell::vfs::Fs; use gumdrop::Options; use serde::Serialize; @@ -17,18 +22,22 @@ use divvunspell::{ boxf::ThfstBoxSpellerArchive, error::SpellerArchiveError, BoxSpellerArchive, SpellerArchive, ZipSpellerArchive, }, - speller::{suggestion::Suggestion, Speller, SpellerConfig}, + speller::{suggestion::Suggestion, Analyzer, SpellerConfig}, tokenizer::Tokenize, }; trait OutputWriter { fn write_correction(&mut self, word: &str, is_correct: bool); fn write_suggestions(&mut self, word: &str, suggestions: &[Suggestion]); + fn write_input_analyses(&mut self, word: &str, analyses: &[Suggestion]); + fn write_output_analyses(&mut self, word: &str, analyses: &[Suggestion]); fn write_predictions(&mut self, predictions: &[String]); fn finish(&mut self); } -struct StdoutWriter; +struct StdoutWriter { + has_continuation_marker: Option, +} impl OutputWriter for StdoutWriter { fn write_correction(&mut self, word: &str, is_correct: bool) { @@ -40,8 +49,18 @@ impl OutputWriter for StdoutWriter { } fn write_suggestions(&mut self, _word: &str, suggestions: &[Suggestion]) { - for sugg in suggestions { - println!("{}\t\t{}", sugg.value, sugg.weight); + if let Some(s) = &self.has_continuation_marker { + for sugg in suggestions { + print!("{}", sugg.value); + if sugg.completed == Some(true) { + print!("{s}"); + } + println!("\t\t{}", sugg.weight); + } + } else { + for sugg in suggestions { + println!("{}\t\t{}", sugg.value, sugg.weight); + } } println!(); } @@ -51,6 +70,22 @@ impl OutputWriter for StdoutWriter { println!("{}", predictions.join(" ")); } + fn write_input_analyses(&mut self, _word: &str, suggestions: &[Suggestion]) { + println!("Input analyses: "); + for sugg in suggestions { + println!("{}\t\t{}", sugg.value, sugg.weight); + } + println!(); + } + + fn write_output_analyses(&mut self, _word: &str, suggestions: &[Suggestion]) { + println!("Output analyses: "); + for sugg in suggestions { + println!("{}\t\t{}", sugg.value, sugg.weight); + } + println!(); + } + fn finish(&mut self) {} } @@ -62,18 +97,27 @@ struct SuggestionRequest { } #[derive(Serialize)] +struct AnalysisRequest { + word: String, + suggestions: Vec, +} + +#[derive(Default, Serialize)] #[serde(rename_all = "camelCase")] struct JsonWriter { + #[serde(skip_serializing_if = "Vec::is_empty")] suggest: Vec, - predict: Option>, + #[serde(skip_serializing_if = "Vec::is_empty")] + predict: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + input_analysis: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + output_analysis: Vec, } impl JsonWriter { pub fn new() -> JsonWriter { - JsonWriter { - suggest: vec![], - predict: None, - } + Self::default() } } @@ -92,7 +136,21 @@ impl OutputWriter for JsonWriter { } fn write_predictions(&mut self, predictions: &[String]) { - self.predict = Some(predictions.to_vec()); + self.predict = predictions.to_vec(); + } + + fn write_input_analyses(&mut self, word: &str, suggestions: &[Suggestion]) { + self.input_analysis.push(AnalysisRequest { + word: word.to_string(), + suggestions: suggestions.to_vec(), + }) + } + + fn write_output_analyses(&mut self, word: &str, suggestions: &[Suggestion]) { + self.output_analysis.push(AnalysisRequest { + word: word.to_string(), + suggestions: suggestions.to_vec(), + }) } fn finish(&mut self) { @@ -101,9 +159,10 @@ impl OutputWriter for JsonWriter { } fn run( - speller: Arc, + speller: Arc, words: Vec, writer: &mut dyn OutputWriter, + is_analyzing: bool, is_suggesting: bool, is_always_suggesting: bool, suggest_cfg: &SpellerConfig, @@ -116,6 +175,23 @@ fn run( let suggestions = speller.clone().suggest_with_config(&word, &suggest_cfg); writer.write_suggestions(&word, &suggestions); } + + if is_analyzing { + let input_analyses = speller + .clone() + .analyze_input_with_config(&word, &suggest_cfg); + writer.write_input_analyses(&word, &input_analyses); + + let output_analyses = speller + .clone() + .analyze_output_with_config(&word, &suggest_cfg); + writer.write_output_analyses(&word, &output_analyses); + + let final_suggs = speller + .clone() + .analyse_suggest_with_config(&word, &suggest_cfg); + writer.write_suggestions(&word, &final_suggs); + } } } #[derive(Debug, Options)] @@ -144,18 +220,30 @@ struct SuggestArgs { #[options(help = "print help message")] help: bool, - #[options(help = "BHFST or ZHFST archive to be used", required)] - archive: PathBuf, + #[options(short = "a", help = "BHFST or ZHFST archive to be used")] + archive_path: Option, + + #[options(long = "mutator", help = "mutator to use (if archive not provided)")] + mutator_path: Option, + + #[options(long = "lexicon", help = "lexicon to use (if archive not provided)")] + lexicon_path: Option, #[options(short = "S", help = "always show suggestions even if word is correct")] always_suggest: bool, + #[options(short = "A", help = "analyze words and suggestions")] + analyze: bool, + #[options(help = "maximum weight limit for suggestions")] weight: Option, #[options(help = "maximum number of results")] nbest: Option, + #[options(help = "character for incomplete predictions")] + continuation_marker: Option, + #[options( no_short, long = "no-reweighting", @@ -288,21 +376,42 @@ fn load_archive(path: &Path) -> Result, SpellerArchiveEr } fn suggest(args: SuggestArgs) -> anyhow::Result<()> { + // 1. default config let mut suggest_cfg = SpellerConfig::default(); + let speller = if let Some(archive_path) = args.archive_path { + let archive = load_archive(&archive_path)?; + // 2. config from metadata + if let Some(metadata) = archive.metadata() { + if let Some(continuation) = &metadata.acceptor.continuation { + suggest_cfg.continuation_marker = Some(continuation.clone()); + } + } + let speller = archive.analyser(); + speller + } else if let (Some(lexicon_path), Some(mutator_path)) = (args.lexicon_path, args.mutator_path) + { + let acceptor = HfstTransducer::from_path(&Fs, lexicon_path)?; + let errmodel = HfstTransducer::from_path(&Fs, mutator_path)?; + HfstSpeller::new(errmodel, acceptor) as _ + } else { + eprintln!("Either a BHFST or ZHFST archive must be provided, or a mutator and lexicon."); + process::exit(1); + }; + // 3. config from explicit config file if let Some(config_path) = args.config { let config_file = std::fs::File::open(config_path)?; let config: SpellerConfig = serde_json::from_reader(config_file)?; suggest_cfg = config; } - + // 4. config from other command line stuff if args.disable_reweight { suggest_cfg.reweight = None; } if args.disable_recase { suggest_cfg.recase = false; } - + suggest_cfg.continuation_marker = args.continuation_marker.clone(); if let Some(v) = args.nbest { if v == 0 { suggest_cfg.n_best = None; @@ -322,7 +431,9 @@ fn suggest(args: SuggestArgs) -> anyhow::Result<()> { let mut writer: Box = if args.use_json { Box::new(JsonWriter::new()) } else { - Box::new(StdoutWriter) + Box::new(StdoutWriter { + has_continuation_marker: args.continuation_marker, + }) }; let words = if args.inputs.is_empty() { @@ -340,12 +451,12 @@ fn suggest(args: SuggestArgs) -> anyhow::Result<()> { args.inputs.into_iter().collect() }; - let archive = load_archive(&args.archive)?; - let speller = archive.speller(); + run( speller, words, &mut *writer, + args.analyze, true, args.always_suggest, &suggest_cfg, diff --git a/divvunspell/Cargo.toml b/divvunspell/Cargo.toml index a50d6c4..3dbf2e3 100644 --- a/divvunspell/Cargo.toml +++ b/divvunspell/Cargo.toml @@ -13,11 +13,11 @@ crate-type = ["rlib", "staticlib", "cdylib"] [dependencies] libc = "0.2" -memmap2 = "0.5.0" +memmap2 = "0.9.4" byteorder = "1.3.4" serde = { version = "1.0.116", features = ["derive"] } serde_json = "1.0.57" -serde-xml-rs = { version = "0.5.0", default-features = false } +serde-xml-rs = { version = "0.6.0", default-features = false } zip = { version = "0.5", default-features = false } unic-segment = "0.9.0" unic-char-range = "0.9.0" @@ -27,15 +27,15 @@ unic-emoji-char = "0.9.0" parking_lot = "0.11.2" hashbrown = { version = "0.11", features = ["serde"] } lifeguard = "0.6.1" -smol_str = { version = "0.1.16", features = ["serde"] } +smol_str = { version = "0.2.1", features = ["serde"] } box-format = { version = "0.3.2", features = ["reader"], default-features = false } -itertools = "0.10" -strsim = "0.10.0" +itertools = "0.12.1" +strsim = "0.11.0" log = "0.4.11" -cffi = "0.1.6" +cffi = { git = "https://github.com/cffi-rs/cffi", optional = true } unic-ucd-common = "0.9.0" flatbuffers = { version = "0.6.1", optional = true } -env_logger = { version = "0.9", optional = true } +env_logger = { version = "0.11.2", optional = true } thiserror = "1.0.20" tch = { version = "0.6.1", optional = true } rust-bert = { version = "0.17.0", optional = true } @@ -45,7 +45,7 @@ fs_extra = "1.2.0" eieio = "1.0.0" pathos = "0.3.0" language-tags = "0.3.2" -globwalk = "0.8.1" +globwalk = "0.9.1" [features] compression = ["zip/deflate"] @@ -55,4 +55,4 @@ cargo-clippy = [] # Internal features: unstable, not for external use! internal_convert = [] -internal_ffi = ["flatbuffers", "logging"] +internal_ffi = ["flatbuffers", "logging", "cffi"] diff --git a/divvunspell/src/archive/boxf.rs b/divvunspell/src/archive/boxf.rs index f6b6c05..e2328e2 100644 --- a/divvunspell/src/archive/boxf.rs +++ b/divvunspell/src/archive/boxf.rs @@ -11,7 +11,7 @@ use super::{error::PredictorArchiveError, meta::PredictorMetadata, PredictorArch use super::error::SpellerArchiveError; use super::{meta::SpellerMetadata, SpellerArchive}; -use crate::speller::{HfstSpeller, Speller}; +use crate::speller::{HfstSpeller, Speller, Analyzer}; use crate::transducer::{ thfst::{MemmapThfstChunkedTransducer, MemmapThfstTransducer}, Transducer, @@ -97,6 +97,10 @@ where self.speller.clone() } + fn analyser(&self) -> Arc { + self.speller.clone() + } + fn metadata(&self) -> Option<&SpellerMetadata> { self.metadata.as_ref() } diff --git a/divvunspell/src/archive/meta.rs b/divvunspell/src/archive/meta.rs index e6588d9..afbe290 100644 --- a/divvunspell/src/archive/meta.rs +++ b/divvunspell/src/archive/meta.rs @@ -1,48 +1,78 @@ -//! Archive metadata handling +//! Data structures of speller metadata. +//! +//! These are usually read from the speller archives, in xml or json files or +//! such. XML format is described here and json format there. use serde::{Deserialize, Serialize}; use serde_xml_rs::{from_reader, Error, ParserConfig}; +/// Speller metadata #[derive(Serialize, Deserialize, Debug, Clone)] pub struct SpellerMetadata { + /// speller info pub info: SpellerMetadataInfo, + /// acceptor metadata pub acceptor: SpellerMetadataAcceptor, + /// error model metadata pub errmodel: SpellerMetadataErrmodel, } +/// Predictor metadata #[derive(Serialize, Deserialize, Debug, Default, Clone)] pub struct PredictorMetadata { + /// whether speller is #[serde(default)] pub speller: bool, } +/// localised speller title #[derive(Serialize, Deserialize, Debug, Clone)] pub struct SpellerTitle { + /// ISO 639 code of the title's content language pub lang: Option, + /// translated title #[serde(rename = "$value")] pub value: String, } +/// Speller metadata #[derive(Serialize, Deserialize, Debug, Clone)] pub struct SpellerMetadataInfo { + /// ISO-639 code of speller language pub locale: String, + /// localised, human readable titles of speller pub title: Vec, + /// human readable description of speller pub description: String, + /// creator and copyright owner of the speller pub producer: String, } +/// Acceptor metadata #[derive(Serialize, Deserialize, Debug, Clone)] pub struct SpellerMetadataAcceptor { + /// acceptor type: + /// - `blah` if normal dictionary automaton + /// - `foo` if analyser #[serde(rename = "type", default)] pub type_: String, + /// locally unique id for this acceptor pub id: String, + /// localised human readable titles of speller pub title: Vec, + /// human readable description of the acceptor pub description: String, + /// marker for incomplete strings + pub continuation: Option, } +/// Error model metadata #[derive(Serialize, Deserialize, Debug, Clone)] pub struct SpellerMetadataErrmodel { + /// locally unique id for the error model pub id: String, + /// localised human readable titles for the error model pub title: Vec, + /// human readable description of the error model pub description: String, } @@ -91,7 +121,7 @@ fn test_xml_parse() { se Giellatekno/Divvun/UiT fst-based speller for Northern Sami This is an fst-based speller for Northern Sami. It is based - on the normative subset of the morphological analyser for Northern Sami. + on the normative subset of the morphological analyzer for Northern Sami. The source code can be found at: https://victorio.uit.no/langtech/trunk/langs/sme/ License: GPL3+. diff --git a/divvunspell/src/archive/mod.rs b/divvunspell/src/archive/mod.rs index 99f1f5f..10d3631 100644 --- a/divvunspell/src/archive/mod.rs +++ b/divvunspell/src/archive/mod.rs @@ -16,7 +16,7 @@ use self::{ error::SpellerArchiveError, meta::{PredictorMetadata, SpellerMetadata}, }; -use crate::{predictor::Predictor, speller::Speller}; +use crate::{predictor::Predictor, speller::{Speller, Analyzer}}; pub(crate) struct TempMmap { mmap: Arc, @@ -48,6 +48,7 @@ pub trait SpellerArchive { /// retrieve spell-checker. fn speller(&self) -> Arc; + fn analyser(&self) -> Arc; /// retrieve metadata. fn metadata(&self) -> Option<&SpellerMetadata>; } @@ -89,7 +90,7 @@ pub(crate) mod ffi { use cffi::{FromForeign, ToForeign}; use std::error::Error; - #[cffi::marshal(return_marshaler = "cffi::ArcMarshaler::")] + #[cffi::marshal(return_marshaler = cffi::ArcMarshaler::)] pub extern "C" fn divvun_speller_archive_open( #[marshal(cffi::PathBufMarshaler)] path: std::path::PathBuf, ) -> Result, Box> { diff --git a/divvunspell/src/archive/zip.rs b/divvunspell/src/archive/zip.rs index 5c763cd..48d7cc9 100644 --- a/divvunspell/src/archive/zip.rs +++ b/divvunspell/src/archive/zip.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use super::error::SpellerArchiveError; use super::meta::SpellerMetadata; use super::{MmapRef, SpellerArchive, TempMmap}; -use crate::speller::{HfstSpeller, Speller}; +use crate::speller::{HfstSpeller, Speller, Analyzer}; use crate::transducer::hfst::HfstTransducer; pub type HfstZipSpeller = @@ -103,6 +103,10 @@ impl SpellerArchive for ZipSpellerArchive { self.speller.clone() } + fn analyser(&self) -> Arc { + self.speller.clone() + } + fn metadata(&self) -> Option<&SpellerMetadata> { Some(&self.metadata) } diff --git a/divvunspell/src/paths.rs b/divvunspell/src/paths.rs index ed84a3b..662cdde 100644 --- a/divvunspell/src/paths.rs +++ b/divvunspell/src/paths.rs @@ -3,11 +3,15 @@ use std::path::PathBuf; #[cfg(target_os = "windows")] use std::path::PathBuf; +#[cfg(target_os = "linux")] +use std::path::PathBuf; #[cfg(target_os = "macos")] use language_tags::LanguageTag; #[cfg(target_os = "windows")] use language_tags::LanguageTag; +#[cfg(target_os = "linux")] +use language_tags::LanguageTag; #[cfg(target_os = "macos")] pub fn find_speller_path(tag: LanguageTag) -> Option { @@ -48,3 +52,8 @@ pub fn find_speller_path(tag: LanguageTag) -> Option { .next() .map(|x| x.path().to_path_buf()) } + +#[cfg(target_os = "linux")] +pub fn find_speller_path(tag: LanguageTag) -> Option { + None +} diff --git a/divvunspell/src/speller/mod.rs b/divvunspell/src/speller/mod.rs index 55d7a20..6063719 100644 --- a/divvunspell/src/speller/mod.rs +++ b/divvunspell/src/speller/mod.rs @@ -77,6 +77,8 @@ pub struct SpellerConfig { /// some parallel stuff? #[serde(default = "default_node_pool_size")] pub node_pool_size: usize, + /// used when suggesting unfinished word parts + pub continuation_marker: Option, /// whether we try to recase mispelt word before other suggestions #[serde(default = "default_recase")] pub recase: bool, @@ -97,6 +99,7 @@ impl SpellerConfig { beam: default_beam(), reweight: default_reweight(), node_pool_size: default_node_pool_size(), + continuation_marker: None, recase: default_recase(), } } @@ -125,7 +128,6 @@ const fn default_node_pool_size() -> usize { const fn default_recase() -> bool { true } - /// can determine if string is a correct word or suggest corrections. /// Also with SpellerConfig. pub trait Speller { @@ -139,6 +141,34 @@ pub trait Speller { fn suggest_with_config(self: Arc, word: &str, config: &SpellerConfig) -> Vec; } +/// can provide in-depth analyses along with suggestions +pub trait Analyzer: Speller { + /// analyse the input word form + fn analyze_input(self: Arc, word: &str) -> Vec; + /// analyse input word form with recasing and stuff from configs + fn analyze_input_with_config( + self: Arc, + word: &str, + config: &SpellerConfig, + ) -> Vec; + /// analyse the suggested word forms + fn analyze_output(self: Arc, word: &str) -> Vec; + /// analyse the suggested word forms with recasing and stuff from configs + fn analyze_output_with_config( + self: Arc, + word: &str, + config: &SpellerConfig, + ) -> Vec; + /// create suggestion list and use their analyses for finetununt + fn analyse_suggest(self: Arc, word: &str) -> Vec; + /// create suggestion list and use analyses to finetune with config + fn analyse_suggest_with_config( + self: Arc, + word: &str, + config: &SpellerConfig, + ) -> Vec; +} + impl Speller for HfstSpeller where F: crate::vfs::File + Send, @@ -172,7 +202,8 @@ where config ); for word in std::iter::once(word.into()).chain(words.into_iter()) { - let worker = SpellerWorker::new(self.clone(), self.to_input_vec(&word), config.clone()); + let worker = SpellerWorker::new(self.clone(), + self.to_input_vec(&word), config.clone(), false); if worker.is_correct() { return true; @@ -209,6 +240,85 @@ where } } +impl Analyzer for HfstSpeller +where + F: crate::vfs::File + Send, + T: Transducer + Send, + U: Transducer + Send, +{ + #[allow(clippy::wrong_self_convention)] + fn analyze_input_with_config( + self: Arc, + word: &str, + config: &SpellerConfig, + ) -> Vec { + if word.len() == 0 { + return vec![]; + } + + let worker = SpellerWorker::new(self.clone(), + self.to_input_vec(&word), config.clone(), false); + + log::trace!("Beginning analyze with config in mod"); + worker.analyze() + } + + #[inline] + fn analyze_input(self: Arc, word: &str) -> Vec { + self.analyze_input_with_config(word, &SpellerConfig::default()) + } + + #[inline] + fn analyze_output(self: Arc, word: &str) -> Vec { + self.analyze_output_with_config(word, &SpellerConfig::default()) + } + + #[inline] + fn analyse_suggest(self: Arc, word: &str) -> Vec { + self.analyse_suggest_with_config(word, &SpellerConfig::default()) + } + + fn analyze_output_with_config( + self: Arc, + word: &str, + config: &SpellerConfig, + ) -> Vec { + if word.len() == 0 { + return vec![]; + } + log::trace!("Beginning analyze suggest with config in mod"); + let worker = SpellerWorker::new(self.clone(), + self.to_input_vec(word), config.clone(), false); + + worker.suggest() + } + + fn analyse_suggest_with_config( + self: Arc, + word: &str, + config: &SpellerConfig + ) -> Vec { + let mut suggs = self.clone().suggest_with_config(word, config); + suggs.retain(|sugg| { + log::trace!("suggestion {}", sugg.value); + let analyses = self.clone().analyze_input_with_config(sugg.value.as_str(), + config); + let mut all_filtered = true; + for analysis in analyses { + log::trace!("-> {}", analysis.value); + if !analysis.value.contains("+Spell/NoSugg") { + all_filtered = false; + } else { + log::trace!("filtering=?"); + } + } + !all_filtered + }); + suggs + } + +} + /// a speller consisting of two HFST automata #[derive(Debug)] pub struct HfstSpeller @@ -273,8 +383,10 @@ where } fn suggest_single(self: Arc, word: &str, config: &SpellerConfig) -> Vec { - let worker = SpellerWorker::new(self.clone(), self.to_input_vec(word), config.clone()); + let worker = SpellerWorker::new(self.clone(), self.to_input_vec(word), + config.clone(), true); + log::trace!("suggesting single {}", word); worker.suggest() } @@ -286,6 +398,7 @@ where ) -> Vec { use crate::tokenizer::case_handling::*; + log::trace!("suggesting cases..."); let CaseHandler { original_input, mutation, @@ -295,7 +408,9 @@ where let mut best: HashMap = HashMap::new(); for word in std::iter::once(&original_input).chain(words.iter()) { - let worker = SpellerWorker::new(self.clone(), self.to_input_vec(&word), config.clone()); + log::trace!("suggesting for word {}", word); + let worker = SpellerWorker::new(self.clone(), + self.to_input_vec(&word), config.clone(), true); let mut suggestions = worker.suggest(); match mutation { @@ -314,7 +429,9 @@ where match mode { CaseMode::MergeAll => { + log::trace!("Case merge all"); for sugg in suggestions.into_iter() { + log::trace!("for {}", sugg.value); let penalty_start = if !sugg.value().starts_with(word.chars().next().unwrap()) { reweight.start_penalty - reweight.mid_penalty @@ -374,14 +491,26 @@ where if best.is_empty() { return vec![]; } - - let mut out = best - .into_iter() - .map(|(k, v)| Suggestion { - value: k, - weight: v, - }) - .collect::>(); + let mut out: Vec; + if let Some(s) = &config.continuation_marker { + out = best + .into_iter() + .map(|(k, v)| Suggestion { + value: k.clone(), + weight: v, + completed: Some(!k.ends_with(s)), + }) + .collect::>(); + } else { + out = best + .into_iter() + .map(|(k, v)| Suggestion { + value: k, + weight: v, + completed: None, + }) + .collect::>(); + } out.sort(); if let Some(n_best) = config.n_best { out.truncate(n_best); @@ -498,6 +627,7 @@ pub(crate) mod ffi { }, reweight, node_pool_size: config.node_pool_size, + continuation_marker: None, recase: true, }; diff --git a/divvunspell/src/speller/suggestion.rs b/divvunspell/src/speller/suggestion.rs index a331f68..f9fff9d 100644 --- a/divvunspell/src/speller/suggestion.rs +++ b/divvunspell/src/speller/suggestion.rs @@ -6,23 +6,41 @@ use std::cmp::Ordering; use std::cmp::Ordering::Equal; #[derive(Clone, Debug, Serialize, Deserialize)] +/// Suggestion for a spelling correction pub struct Suggestion { + /// the suggested word-form pub value: SmolStr, + /// total penalty weight of the word-form pub weight: Weight, + /// whether the word is completed or partial + #[serde(skip_serializing_if = "Option::is_none")] + pub completed: Option, } impl Suggestion { - pub fn new(value: SmolStr, weight: Weight) -> Suggestion { - Suggestion { value, weight } + /// creates a spelling correction suggestion + pub fn new(value: SmolStr, weight: Weight, completed: Option) -> Suggestion { + Suggestion { + value, + weight, + completed, + } } + /// gets the suggested word-form pub fn value(&self) -> &str { &self.value } + /// gets the penalty weight of the suggestion pub fn weight(&self) -> Weight { self.weight } + + /// returns whether this suggestion is a full word or partial + pub fn completed(&self) -> Option { + self.completed + } } impl PartialOrd for Suggestion { diff --git a/divvunspell/src/speller/worker.rs b/divvunspell/src/speller/worker.rs index 9072f09..df6edd7 100644 --- a/divvunspell/src/speller/worker.rs +++ b/divvunspell/src/speller/worker.rs @@ -23,6 +23,7 @@ pub struct SpellerWorker, U: Transducer speller: Arc>, input: Vec, config: SpellerConfig, + mode_correcting: bool, } #[allow(clippy::too_many_arguments)] @@ -37,11 +38,13 @@ where speller: Arc>, input: Vec, config: SpellerConfig, + mode_correcting: bool, ) -> SpellerWorker { SpellerWorker { speller, input, config, + mode_correcting, } } @@ -70,7 +73,12 @@ where if self .is_under_weight_limit(max_weight, next_node.weight() + transition_weight) { - let new_node = next_node.update_lexicon(pool, transition); + let new_node = if self.mode_correcting { + next_node.update_lexicon(pool, + transition.clone_with_epsilon_symbol()) + } else { + next_node.update_lexicon(pool, transition) + }; output_nodes.push(new_node); } } else { @@ -220,15 +228,27 @@ where ); if is_under_weight_limit { - let new_node = next_node.update( + let new_node = if self.mode_correcting { + next_node.update( + pool, + input_sym, + Some(next_node.input_state + input_increment as + u32), + mutator_state, + noneps_trans.target().unwrap(), + noneps_trans.weight().unwrap() + mutator_weight, + ) + + } else { + next_node.update( pool, sym, Some(next_node.input_state + input_increment as u32), mutator_state, noneps_trans.target().unwrap(), noneps_trans.weight().unwrap() + mutator_weight, - ); - + ) + }; output_nodes.push(new_node); } } @@ -399,14 +419,14 @@ where let input_sym = alphabet_translator[self.input[input_state as usize] as usize]; let next_lexicon_state = next_node.lexicon_state + 1; - log::trace!( - "lexicon consuming {}: {}", - input_sym, - self.speller - .lexicon - .alphabet() - .string_from_symbols(&[input_sym]) - ); + // log::trace!( + // "lexicon consuming {}: {}", + // input_sym, + // self.speller + // .lexicon + // .alphabet() + // .string_from_symbols(&[input_sym]) + // ); if !lexicon.has_transitions(next_lexicon_state, Some(input_sym)) { // we have no regular transitions for this @@ -491,6 +511,7 @@ where } pub(crate) fn is_correct(&self) -> bool { + log::trace!("is_correct"); // let max_weight = speller_max_weight(&self.config); let pool = Pool::with_size_and_max(0, 0); let mut nodes = speller_start_node(&pool, self.state_size() as usize); @@ -509,6 +530,41 @@ where false } + pub(crate) fn analyze(&self) -> Vec { + log::trace!("Beginning analyze"); + let pool = Pool::with_size_and_max(0, 0); + let mut nodes = speller_start_node(&pool, self.state_size() as usize); + log::trace!("beginning analyze {:?}", self.input); + let mut lookups = HashMap::new(); + let mut analyses: Vec = vec![]; + while let Some(next_node) = nodes.pop() { + if next_node.input_state as usize == self.input.len() + && self.speller.lexicon().is_final(next_node.lexicon_state) + { + let string = self + .speller + .lexicon() + .alphabet() + .string_from_symbols(&next_node.string); + let weight = next_node.weight() + + self + .speller + .lexicon() + .final_weight(next_node.lexicon_state) + .unwrap(); + let entry = lookups.entry(string).or_insert(weight); + if *entry > weight { + *entry = weight; + } + } + self.lexicon_epsilons(&pool, f32::INFINITY, &next_node, &mut nodes); + self.lexicon_consume(&pool, f32::INFINITY, &next_node, &mut nodes); + analyses = self.generate_sorted_suggestions(&lookups); + } + analyses + + } + pub(crate) fn suggest(&self) -> Vec { log::trace!("Beginning suggest"); @@ -577,7 +633,7 @@ where .lexicon() .alphabet() .string_from_symbols(&next_node.string); - + // log::trace!("suggesting? {}::{}", string, weight); if weight < best_weight { best_weight = weight; } @@ -592,7 +648,6 @@ where suggestions = self.generate_sorted_suggestions(&corrections); } - suggestions } @@ -600,17 +655,24 @@ where &self, corrections: &HashMap, ) -> Vec { - let mut c: Vec = corrections - .into_iter() - .map(|x| Suggestion::new(x.0.clone(), *x.1)) - .collect(); - + //log::trace!("Generating sorted suggestions"); + let mut c: Vec; + if let Some(s) = &self.config.continuation_marker { + c = corrections + .into_iter() + .map(|x| Suggestion::new(x.0.clone(), *x.1, Some(x.0.ends_with(s)))) + .collect(); + } else { + c = corrections + .into_iter() + .map(|x| Suggestion::new(x.0.clone(), *x.1, None)) + .collect(); + } c.sort(); if let Some(n) = self.config.n_best { c.truncate(n); } - c } }