Skip to content

Commit

Permalink
Merge pull request #15 from MRoci/wordsegmentation
Browse files Browse the repository at this point in the history
Add word_segmentation support
  • Loading branch information
reneklacan authored Feb 3, 2020
2 parents d072a62 + 6d49a2f commit 29a360a
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 1 deletion.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ fn main() {
let sentence = "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixtgrade and ins pired him";
let compound_suggestions = symspell.lookup_compound(sentence, 2);
println!("{:?}", compound_suggestions);

let sentence = "whereisthelove";
let segmented = symspell.word_segmentation(sentence, 2);
println!("{:?}", segmented);
}
```

Expand Down
16 changes: 16 additions & 0 deletions src/composition.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/// Result of splitting a run-together string into words.
///
/// The two sums are accumulated over every word placed into
/// `segmented_string`, so they describe the segmentation as a whole.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Composition {
    /// The words found so far, joined by single spaces.
    pub segmented_string: String,
    /// Accumulated edit-distance cost of the segmentation.
    pub distance_sum: i64,
    /// Accumulated base-10 logarithm of the word probabilities.
    pub prob_log_sum: f64,
}

impl Composition {
    /// Returns a composition with an empty string and both sums set to zero.
    ///
    /// Equivalent to [`Composition::default`]; kept as a named constructor for
    /// existing callers.
    pub fn empty() -> Self {
        Self::default()
    }
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ let compound_suggestions = symspell.lookup_compound(sentence, 2);
println!("{:?}", compound_suggestions);
```
*/

extern crate strsim;
#[macro_use]
extern crate derive_builder;
Expand All @@ -36,6 +35,7 @@ extern crate serde_derive;
#[cfg(test)]
extern crate wasm_bindgen_test;

mod composition;
mod edit_distance;
mod string_strategy;
mod suggestion;
Expand Down
115 changes: 115 additions & 0 deletions src/symspell.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::i64;
use std::io::{BufRead, BufReader};
use std::path::Path;

use composition::Composition;
use edit_distance::{DistanceAlgorithm, EditDistance};
use string_strategy::StringStrategy;
use suggestion::Suggestion;
Expand Down Expand Up @@ -649,6 +650,97 @@ impl<T: StringStrategy> SymSpell<T> {
vec![suggestion]
}

/// Divides a string into words by inserting missing spaces at the appropriate positions
///
/// Returns the best composition found: the segmented text together with its
/// accumulated edit distance and log10 probability. An input that is empty
/// after preparation yields `Composition::empty()`.
///
/// # Arguments
///
/// * `input` - The word being segmented.
/// * `max_edit_distance` - The maximum edit distance between input and suggested words.
///
/// # Examples
///
/// ```
/// use symspell::{SymSpell, UnicodeStringStrategy, Verbosity};
///
/// let mut symspell: SymSpell<UnicodeStringStrategy> = SymSpell::default();
/// symspell.load_dictionary("data/frequency_dictionary_en_82_765.txt", 0, 1, " ");
/// symspell.word_segmentation("itwas", 2);
/// ```
pub fn word_segmentation(&self, input: &str, max_edit_distance: i64) -> Composition {
    let input = self.string_strategy.prepare(input);
    let asize = self.string_strategy.len(&input);

    // Nothing to segment. Without this guard `compositions` would be an empty
    // vector and the final `compositions[ci]` lookup would panic.
    if asize == 0 {
        return Composition::empty();
    }

    // `ci` is the slot holding the best composition that ends right before
    // position `j`; `compositions` has one slot per input position.
    let mut ci: usize = 0;
    let mut compositions: Vec<Composition> = vec![Composition::empty(); asize];

    // `j` is the start of the candidate part, `i` its length.
    for j in 0..asize {
        let imax = cmp::min(asize - j, self.max_length as usize);
        for i in 1..=imax {
            let top_prob_log: f64;

            let mut part = self.string_strategy.slice(&input, j, j + i);

            let mut sep_len = 0;
            let mut top_ed: i64 = 0;

            // `i >= 1`, so the slice is expected to hold at least one character.
            let first_char = self.string_strategy.at(&part, 0).unwrap();
            if first_char.is_whitespace() {
                // The input already has a separator here: drop it at no cost.
                part = self.string_strategy.remove(&part, 0);
            } else {
                // A space has to be inserted before this part.
                sep_len = 1;
            }

            // Charge one edit for every space removed from inside the part
            // (length before minus length after the removal).
            top_ed += part.len() as i64;

            part = part.replace(" ", "");

            top_ed -= part.len() as i64;

            let results = self.lookup(&part, Verbosity::Top, max_edit_distance);

            if !results.is_empty() && results[0].distance == 0 {
                // Exact dictionary hit: use its relative frequency.
                top_prob_log =
                    (results[0].count as f64 / self.corpus_word_count as f64).log10();
            } else {
                // No exact hit: charge the whole part length as edit distance and
                // use a probability estimate that shrinks with the part length.
                top_ed += part.len() as i64;
                top_prob_log = (10.0
                    / (self.corpus_word_count as f64 * 10.0f64.powf(part.len() as f64)))
                .log10();
            }

            // Slot for compositions ending after this part.
            let di = (i + ci) % asize;
            // set values in first loop
            if j == 0 {
                compositions[i - 1] = Composition {
                    segmented_string: part.to_owned(),
                    distance_sum: top_ed,
                    prob_log_sum: top_prob_log,
                };
            } else if i as i64 == self.max_length
                // Same distance sum (with or without the separator) but more probable...
                || (((compositions[ci].distance_sum + top_ed == compositions[di].distance_sum)
                    || (compositions[ci].distance_sum + sep_len + top_ed
                        == compositions[di].distance_sum))
                    && (compositions[di].prob_log_sum
                        < compositions[ci].prob_log_sum + top_prob_log))
                // ...or a strictly smaller distance sum.
                || (compositions[ci].distance_sum + sep_len + top_ed
                    < compositions[di].distance_sum)
            {
                compositions[di] = Composition {
                    segmented_string: format!("{} {}", compositions[ci].segmented_string, part),
                    distance_sum: compositions[ci].distance_sum + sep_len + top_ed,
                    prob_log_sum: compositions[ci].prob_log_sum + top_prob_log,
                };
            }
        }
        // `ci` stays at 0 for the first two start positions, then trails `j` by one.
        if j != 0 {
            ci += 1;
        }
        ci = if ci == asize { 0 } else { ci };
    }
    compositions[ci].to_owned()
}

fn delete_in_suggestion_prefix(
&self,
delete: &str,
Expand Down Expand Up @@ -906,4 +998,27 @@ mod tests {
assert_eq!(3, results[0].distance);
assert_eq!(1366, results[0].count);
}

#[test]
fn test_word_segmentation() {
    // Pairs of (run-together input, expected segmentation), checked in order.
    let cases = [
        (
            "thequickbrownfoxjumpsoverthelazydog",
            "the quick brown fox jumps over the lazy dog",
        ),
        (
            "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen",
            "it was a bright cold day in april and the clocks were striking thirteen",
        ),
        (
            "itwasthebestoftimesitwastheworstoftimesitwastheageofwisdomitwastheageoffoolishness",
            "it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness",
        ),
    ];

    let mut sym_spell = SymSpell::<UnicodeStringStrategy>::default();
    sym_spell.load_dictionary("./data/frequency_dictionary_en_82_765.txt", 0, 1, " ");

    let edit_distance_max = 2;
    for &(typo, correction) in cases.iter() {
        let result = sym_spell.word_segmentation(typo, edit_distance_max);
        assert_eq!(correction, result.segmented_string);
    }
}
}
33 changes: 33 additions & 0 deletions src/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ pub struct JSSuggestion {
count: i32,
}

/// Serializable form of a word-segmentation result, built by
/// `JSSymSpell::word_segmentation` and handed to JavaScript via serde.
#[derive(Serialize, Deserialize)]
pub struct JSComposition {
/// The segmented text.
segmented_string: String,
/// Accumulated distance of the segmentation, narrowed from the library's `i64`.
distance_sum: i32,
/// Accumulated log-probability of the segmentation, narrowed from the library's `f64`.
prob_log_sum: f32,
}

#[derive(Serialize, Deserialize)]
pub struct InitParams {
max_edit_distance: i32,
Expand Down Expand Up @@ -154,6 +161,23 @@ impl JSSymSpell {
})
.collect())
}

/// Segments `input` into words and returns the result as a serialized
/// `JSComposition`.
///
/// # Arguments
///
/// * `input` - The text to segment.
/// * `max_edit_distance` - Forwarded to the underlying `word_segmentation` call.
///
/// # Errors
///
/// Returns the serializer's error message as a JS string if the result cannot
/// be converted to a `JsValue`, instead of panicking.
pub fn word_segmentation(
    &self,
    input: &str,
    max_edit_distance: i32,
) -> Result<JsValue, JsValue> {
    let seg = self
        .symspell
        .word_segmentation(input, i64::from(max_edit_distance));
    let res = JSComposition {
        segmented_string: seg.segmented_string,
        // The JS-facing struct uses 32-bit fields, so both sums are narrowed here.
        distance_sum: seg.distance_sum as i32,
        prob_log_sum: seg.prob_log_sum as f32,
    };

    JsValue::from_serde(&res).map_err(|err| JsValue::from_str(&err.to_string()))
}
}

#[cfg(test)]
Expand Down Expand Up @@ -197,5 +221,14 @@ mod tests {
.into_serde()
.unwrap();
assert_eq!(result.term, expected);

let sentence = "whereinfo";
let expected = "where info";
let result: JSComposition = speller
.word_segmentation(sentence, 2)
.unwrap()
.into_serde()
.unwrap();
assert_eq!(result.segmented_string, expected);
}
}

0 comments on commit 29a360a

Please sign in to comment.