allow query-time resolution of wildcards
This commit is contained in:
parent
1c832b747a
commit
bfe039c6d7
|
@ -1,4 +1,4 @@
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
#[cfg(not(target_arch = "wasm32"))]
|
#[cfg(not(target_arch = "wasm32"))]
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::hash::{Hash, Hasher};
|
use std::hash::{Hash, Hasher};
|
||||||
|
@ -20,13 +20,13 @@ impl Dict {
|
||||||
let words: std::io::Result<Vec<_>> = file
|
let words: std::io::Result<Vec<_>> = file
|
||||||
.lines()
|
.lines()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.filter_map(|(i, e)| (i != 0).then(|| e)) // Skip the first line (size of the dict)
|
.filter_map(|(i, e)| (i != 0).then(|| e)) // Skip the first line ( = to size of the dict)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect();
|
.collect();
|
||||||
let words = words?;
|
let words = words?;
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
words,
|
words,
|
||||||
nb_wild_cards: 1,
|
nb_wild_cards: 2,
|
||||||
wild_card_char: '?',
|
wild_card_char: '?',
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -36,7 +36,7 @@ impl Dict {
|
||||||
words: string
|
words: string
|
||||||
.lines()
|
.lines()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.filter_map(|(i, e)| (i != 0).then(|| e.into()))
|
.filter_map(|(i, e)| (i != 0).then(|| e.into())) // Skip the first line ( = to size of the dict)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect(),
|
.collect(),
|
||||||
nb_wild_cards,
|
nb_wild_cards,
|
||||||
|
@ -58,12 +58,28 @@ impl Dict {
|
||||||
/// The precomputed hash map. The words are regrouped in vector, indexed by there
|
/// The precomputed hash map. The words are regrouped in vector, indexed by there
|
||||||
/// [`FrequencyHash`].
|
/// [`FrequencyHash`].
|
||||||
///
|
///
|
||||||
/// Hash maps have an average access complexity in O(1)
|
/// Hash maps have an average access complexity in O(|letters|^{max(0, l-nb_wild_cards)}) where l
|
||||||
pub struct AnagramDict(HashMap<FrequencyHash, Vec<String>>);
|
/// is the number of wild cards in the query
|
||||||
|
pub struct AnagramDict {
|
||||||
|
/// The precomputed anagram equivalence classes, indexed by their character
|
||||||
|
/// frequency
|
||||||
|
map: HashMap<FrequencyHash, Vec<String>>,
|
||||||
|
/// Number of precomputed wild cards per word
|
||||||
|
nb_wild_cards: u8,
|
||||||
|
/// The character used as a wild card
|
||||||
|
wild_card_char: char,
|
||||||
|
/// The set of characters used in the dict
|
||||||
|
letters: HashSet<char>,
|
||||||
|
}
|
||||||
|
|
||||||
impl From<&Dict> for AnagramDict {
|
impl From<&Dict> for AnagramDict {
|
||||||
fn from(dict: &Dict) -> Self {
|
fn from(dict: &Dict) -> Self {
|
||||||
let mut map = Self(HashMap::<FrequencyHash, Vec<String>>::new());
|
let mut map = Self {
|
||||||
|
map: HashMap::<FrequencyHash, Vec<String>>::new(),
|
||||||
|
nb_wild_cards: dict.nb_wild_cards.clone(),
|
||||||
|
wild_card_char: dict.wild_card_char.clone(),
|
||||||
|
letters: HashSet::<char>::new(),
|
||||||
|
};
|
||||||
|
|
||||||
#[cfg(not(target_arch = "wasm32"))]
|
#[cfg(not(target_arch = "wasm32"))]
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
|
@ -71,6 +87,12 @@ impl From<&Dict> for AnagramDict {
|
||||||
let len = dict.words.len();
|
let len = dict.words.len();
|
||||||
for word in dict.words.iter() {
|
for word in dict.words.iter() {
|
||||||
let freq = FrequencyHash::compute(word);
|
let freq = FrequencyHash::compute(word);
|
||||||
|
for char_ in word.chars() {
|
||||||
|
if char_ != map.wild_card_char {
|
||||||
|
map.letters.insert(char_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
map.add_word_with_wild_card(freq, word, dict.nb_wild_cards, dict.wild_card_char);
|
map.add_word_with_wild_card(freq, word, dict.nb_wild_cards, dict.wild_card_char);
|
||||||
#[cfg(not(target_arch = "wasm32"))]
|
#[cfg(not(target_arch = "wasm32"))]
|
||||||
{
|
{
|
||||||
|
@ -86,8 +108,33 @@ impl AnagramDict {
|
||||||
/// Find all anagrams to a word present in this dict
|
/// Find all anagrams to a word present in this dict
|
||||||
pub fn find(&self, word: &str) -> Option<Vec<String>> {
|
pub fn find(&self, word: &str) -> Option<Vec<String>> {
|
||||||
let freq = FrequencyHash::compute(word);
|
let freq = FrequencyHash::compute(word);
|
||||||
self.0.get(&freq).cloned()
|
self.find_freq(freq)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Recursivelly resolve wild cards until there is less wildcards than [`self.nb_wild_cards`]
|
||||||
|
fn find_freq(&self, mut freq: FrequencyHash) -> Option<Vec<String>> {
|
||||||
|
if freq.get_freq(self.wild_card_char) > self.nb_wild_cards {
|
||||||
|
let mut result = vec![];
|
||||||
|
freq.remove_one_char(self.wild_card_char);
|
||||||
|
for char_ in self.letters.iter() {
|
||||||
|
let mut freq = freq.clone();
|
||||||
|
freq.add_one_char(char_.clone());
|
||||||
|
if let Some(anagrams) = self.find_freq(freq) {
|
||||||
|
for anagram in anagrams {
|
||||||
|
result.push(anagram);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if result.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(result)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
self.map.get(&freq).cloned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn add_word_with_wild_card(
|
fn add_word_with_wild_card(
|
||||||
&mut self,
|
&mut self,
|
||||||
freq: FrequencyHash,
|
freq: FrequencyHash,
|
||||||
|
@ -95,7 +142,7 @@ impl AnagramDict {
|
||||||
nb_wild_card: u8,
|
nb_wild_card: u8,
|
||||||
wild_card_symbole: char,
|
wild_card_symbole: char,
|
||||||
) {
|
) {
|
||||||
self.0
|
self.map
|
||||||
.entry(freq.clone())
|
.entry(freq.clone())
|
||||||
.and_modify(|anagrams| anagrams.push(word.to_string()))
|
.and_modify(|anagrams| anagrams.push(word.to_string()))
|
||||||
.or_insert(vec![word.to_string()]);
|
.or_insert(vec![word.to_string()]);
|
||||||
|
@ -158,6 +205,15 @@ impl FrequencyHash {
|
||||||
.and_modify(|counter| *counter += 1)
|
.and_modify(|counter| *counter += 1)
|
||||||
.or_insert(1);
|
.or_insert(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return the number of occurence of a char in the word
|
||||||
|
fn get_freq(&self, char_: char) -> u8 {
|
||||||
|
if let Some(freq) = self.0.get(&char_) {
|
||||||
|
freq.clone()
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Hash for FrequencyHash {
|
impl Hash for FrequencyHash {
|
||||||
|
|
|
@ -88,14 +88,10 @@
|
||||||
<body>
|
<body>
|
||||||
<div id="main_div">
|
<div id="main_div">
|
||||||
<p> Load <a href="./dict.dat">dict.dat</a> before starting </p>
|
<p> Load <a href="./dict.dat">dict.dat</a> before starting </p>
|
||||||
<p> Wildcards are implemented but increase a lot the precomputing complexity</p>
|
<p> The precomputation has a time and space complexity of \(\mathcal{O}(\sum_{i=0}^W \binom{S}{i} \cdot n)\) where \(S\) is the maximum size of the words, \(W\) the maximum number of precomputed wildcards per word, and \(n\) the size of the dictionary. For \(S = 15\) and \(W = 3\), this is more or less \(500 \cdot n\); 2 precomputed wildcards is a good default setting.</p>
|
||||||
<p> The precomputation time has a complexity if \(\mathcal{O}(\sum_{i=0}^W \binom{S}{i} . n)\) where \(S\) is the maximum size of the words, \(W\) the maximum number of wild card, and \(n\) the size of the dictionary. For \(S = 15\) and \(W = 3\), this is more or less \(500 . n\)</p>
|
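<p> The query has a time complexity of \(\mathcal{O}(|\Sigma|^{\max(0, W'-W)})\) where \(\Sigma\) is the set of letters used in the dictionary, \(W'\) is the number of wildcards in the query, and \(W\) is the number of precomputed wildcards. </p>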
|
||||||
<p> The wildcard character is '?'</p>
|
<p> The wildcard character is '?'</p>
|
||||||
<p> Sources in rust are available <a href="./lib.rs">here</a> </p>
|
<p> Sources in rust are available <a href="./lib.rs">here</a> </p>
|
||||||
<h3> TODO: </h3>
|
|
||||||
<ul>
|
|
||||||
<li> Allow query-time resolution of wild cards when there are more wildcards than the number of precomputed wild cards</li>
|
|
||||||
</ul>
|
|
||||||
<div id="upload_dict_div">
|
<div id="upload_dict_div">
|
||||||
<label id="upload_dict_label" for="upload_dict">Select the dictionary</label>
|
<label id="upload_dict_label" for="upload_dict">Select the dictionary</label>
|
||||||
<input type="file" id="upload_dict" style="opacity:0">
|
<input type="file" id="upload_dict" style="opacity:0">
|
||||||
|
|
Loading…
Reference in a new issue