diff --git a/languedoc.py b/languedoc.py --- a/languedoc.py +++ b/languedoc.py @@ -46,24 +46,21 @@ def merge_ngram_freqs(freqs): class Sample: def __init__(self, language="??", text=""): self.language = language - self.frequencies = [dict(), dict(), dict()] + self.frequencies = dict() if text: self._extract(text) def _extract(self, text): for k in range(1, 4): - self.frequencies[k-1] = extract_ngram_freqs(text, k) + self.frequencies.update(extract_ngram_freqs(text, k)) @staticmethod def merge(samples): assert len({x.language for x in samples}) == 1 res = Sample(samples[0].language) - res.frequencies = [] - - for freqs in zip(*(x.frequencies for x in samples)): - res.frequencies.append(merge_ngram_freqs(freqs)) + res.frequencies = merge_ngram_freqs([x.frequencies for x in samples]) return res @@ -72,16 +69,13 @@ class Sample: use frequencies x order use letter, digrams, trigrams use absolute x square""" - own_frequencies = self.frequencies[0] | self.frequencies[1] | self.frequencies[2] - other_frequencies = other.frequencies[0] | other.frequencies[1] | other.frequencies[2] + ordered_own_ngrams = sorted(self.frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] + ordered_other_ngrams = sorted(other.frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] + ranked_own_ngrams = dict(zip([key for (key, freq) in ordered_own_ngrams], itertools.count(0))) + ranked_other_ngrams = dict(zip([key for (key, freq) in ordered_other_ngrams], itertools.count(0))) - ordered_own_trigrams = sorted(own_frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] - ordered_other_trigrams = sorted(other_frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] - ranked_own_trigrams = dict(zip([key for (key, freq) in ordered_own_trigrams], itertools.count(0))) - ranked_other_trigrams = dict(zip([key for (key, freq) in ordered_other_trigrams], itertools.count(0))) - - res = sum(abs(v-ranked_other_trigrams.get(k, TOP_NGRAM_COUNT)) for (k, v) in ranked_own_trigrams.items()) + \ - sum(abs(v-ranked_own_trigrams.get(k, TOP_NGRAM_COUNT)) for (k, v) in ranked_other_trigrams.items()) + res = sum(abs(v-ranked_other_ngrams.get(k, TOP_NGRAM_COUNT)) for (k, v) in ranked_own_ngrams.items()) + \ + sum(abs(v-ranked_own_ngrams.get(k, TOP_NGRAM_COUNT)) for (k, v) in ranked_other_ngrams.items()) return res