allow query time resolution of wildcard

This commit is contained in:
Jean-Marie Mineau 2023-02-24 20:32:40 +01:00
parent 1c832b747a
commit bfe039c6d7
2 changed files with 67 additions and 15 deletions

View file

@ -1,4 +1,4 @@
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
use std::fs::File; use std::fs::File;
use std::hash::{Hash, Hasher}; use std::hash::{Hash, Hasher};
@ -20,13 +20,13 @@ impl Dict {
let words: std::io::Result<Vec<_>> = file let words: std::io::Result<Vec<_>> = file
.lines() .lines()
.enumerate() .enumerate()
.filter_map(|(i, e)| (i != 0).then(|| e)) // Skip the first line (size of the dict) .filter_map(|(i, e)| (i != 0).then(|| e)) // Skip the first line ( = to size of the dict)
.into_iter() .into_iter()
.collect(); .collect();
let words = words?; let words = words?;
Ok(Self { Ok(Self {
words, words,
nb_wild_cards: 1, nb_wild_cards: 2,
wild_card_char: '?', wild_card_char: '?',
}) })
} }
@ -36,7 +36,7 @@ impl Dict {
words: string words: string
.lines() .lines()
.enumerate() .enumerate()
.filter_map(|(i, e)| (i != 0).then(|| e.into())) .filter_map(|(i, e)| (i != 0).then(|| e.into())) // Skip the first line ( = to size of the dict)
.into_iter() .into_iter()
.collect(), .collect(),
nb_wild_cards, nb_wild_cards,
@ -58,12 +58,28 @@ impl Dict {
/// The precomputed hash map. The words are regrouped in vector, indexed by there /// The precomputed hash map. The words are regrouped in vector, indexed by there
/// [`FrequencyHash`]. /// [`FrequencyHash`].
/// ///
/// Hash maps have an average access complexity in O(1) /// Hash maps have an average access complexity in O(|letters|^{max(0, l-nb_wild_cards)}) where l
pub struct AnagramDict(HashMap<FrequencyHash, Vec<String>>); /// is the number of wild cards in the query
pub struct AnagramDict {
/// The precomputed anagram equivalence classes, indexed by their character
/// frequency ([`FrequencyHash`]).
map: HashMap<FrequencyHash, Vec<String>>,
/// Number of wild cards precomputed per word. Queries holding more wild cards
/// than this are resolved at query time.
nb_wild_cards: u8,
/// The character used as a wild card.
wild_card_char: char,
/// The set of characters used in the dict (the wild card character is never
/// inserted here).
letters: HashSet<char>,
}
impl From<&Dict> for AnagramDict { impl From<&Dict> for AnagramDict {
fn from(dict: &Dict) -> Self { fn from(dict: &Dict) -> Self {
let mut map = Self(HashMap::<FrequencyHash, Vec<String>>::new()); let mut map = Self {
map: HashMap::<FrequencyHash, Vec<String>>::new(),
nb_wild_cards: dict.nb_wild_cards.clone(),
wild_card_char: dict.wild_card_char.clone(),
letters: HashSet::<char>::new(),
};
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
let mut i = 0; let mut i = 0;
@ -71,6 +87,12 @@ impl From<&Dict> for AnagramDict {
let len = dict.words.len(); let len = dict.words.len();
for word in dict.words.iter() { for word in dict.words.iter() {
let freq = FrequencyHash::compute(word); let freq = FrequencyHash::compute(word);
for char_ in word.chars() {
if char_ != map.wild_card_char {
map.letters.insert(char_);
}
}
map.add_word_with_wild_card(freq, word, dict.nb_wild_cards, dict.wild_card_char); map.add_word_with_wild_card(freq, word, dict.nb_wild_cards, dict.wild_card_char);
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
{ {
@ -86,8 +108,33 @@ impl AnagramDict {
/// Find all anagrams to a word present in this dict /// Find all anagrams to a word present in this dict
pub fn find(&self, word: &str) -> Option<Vec<String>> { pub fn find(&self, word: &str) -> Option<Vec<String>> {
let freq = FrequencyHash::compute(word); let freq = FrequencyHash::compute(word);
self.0.get(&freq).cloned() self.find_freq(freq)
} }
/// Recursively resolve wild cards until the query holds at most
/// `self.nb_wild_cards` of them, then look it up in the precomputed map.
///
/// While `freq` contains more wild cards than were precomputed, one wild card
/// is removed and substituted by every character of `self.letters` in turn;
/// each substituted frequency is then resolved recursively.
///
/// Returns `None` when no anagram matches. When wild cards had to be resolved
/// at query time, the returned words are sorted and free of duplicates.
fn find_freq(&self, mut freq: FrequencyHash) -> Option<Vec<String>> {
    // Few enough wild cards: the answer is already precomputed.
    if freq.get_freq(self.wild_card_char) <= self.nb_wild_cards {
        return self.map.get(&freq).cloned();
    }

    freq.remove_one_char(self.wild_card_char);
    let mut result: Vec<String> = Vec::new();
    for &letter in &self.letters {
        let mut candidate = freq.clone();
        candidate.add_one_char(letter);
        if let Some(anagrams) = self.find_freq(candidate) {
            result.extend(anagrams);
        }
    }

    // A given word is reached once for every letter that can fill the removed
    // wild card, so the merged list contains repeats. Sorting also makes the
    // output independent of the iteration order of the `letters` set.
    result.sort_unstable();
    result.dedup();

    if result.is_empty() {
        None
    } else {
        Some(result)
    }
}
fn add_word_with_wild_card( fn add_word_with_wild_card(
&mut self, &mut self,
freq: FrequencyHash, freq: FrequencyHash,
@ -95,7 +142,7 @@ impl AnagramDict {
nb_wild_card: u8, nb_wild_card: u8,
wild_card_symbole: char, wild_card_symbole: char,
) { ) {
self.0 self.map
.entry(freq.clone()) .entry(freq.clone())
.and_modify(|anagrams| anagrams.push(word.to_string())) .and_modify(|anagrams| anagrams.push(word.to_string()))
.or_insert(vec![word.to_string()]); .or_insert(vec![word.to_string()]);
@ -158,6 +205,15 @@ impl FrequencyHash {
.and_modify(|counter| *counter += 1) .and_modify(|counter| *counter += 1)
.or_insert(1); .or_insert(1);
} }
/// Return the number of occurrences of `char_` in the word, or 0 when the
/// character is absent.
fn get_freq(&self, char_: char) -> u8 {
    self.0.get(&char_).copied().unwrap_or(0)
}
} }
impl Hash for FrequencyHash { impl Hash for FrequencyHash {

View file

@ -88,14 +88,10 @@
<body> <body>
<div id="main_div"> <div id="main_div">
<p> Load <a href="./dict.dat">dict.dat</a> before starting </p> <p> Load <a href="./dict.dat">dict.dat</a> before starting </p>
<p> Wildcards are implemented but increase a lot the precomputing complexity</p> <p> The precomputation has a time and space complexity of \(\mathcal{O}(\sum_{i=0}^W \binom{S}{i} . n)\) where \(S\) is the maximum size of the words, \(W\) the maximum number of precomputed wildcards per word, and \(n\) the size of the dictionary. For \(S = 15\) and \(W = 3\), this is more or less \(500 . n\), 2 precomputed wildcards is a good default setting.</p>
<p> The precomputation time has a complexity if \(\mathcal{O}(\sum_{i=0}^W \binom{S}{i} . n)\) where \(S\) is the maximum size of the words, \(W\) the maximum number of wild card, and \(n\) the size of the dictionary. For \(S = 15\) and \(W = 3\), this is more or less \(500 . n\)</p> <p> The query has a time complexity of \(\mathcal{O}(|L|^{\max(0, W'-W)})\) where \(L\) is the set of letters used in the dictionary and \(W'\) is the number of wildcards in the query. </p>
<p> The wildcard caracter is '?'</p> <p> The wildcard caracter is '?'</p>
<p> Sources in rust are available <a href="./lib.rs">here</a> </p> <p> Sources in rust are available <a href="./lib.rs">here</a> </p>
<h3> TODO: </h3>
<ul>
<li> Allow query-time resolution of wild cards when there are more wildcards than the number of precomputed wild cards</li>
</ul>
<div id="upload_dict_div"> <div id="upload_dict_div">
<label id="upload_dict_label" for="upload_dict">Select the dictionary</label> <label id="upload_dict_label" for="upload_dict">Select the dictionary</label>
<input type="file" id="upload_dict" style="opacity:0"> <input type="file" id="upload_dict" style="opacity:0">