diff --git a/languedoc.py b/languedoc.py --- a/languedoc.py +++ b/languedoc.py @@ -6,6 +6,7 @@ import itertools random.seed(19181028) TEST_LENS = [8, 16, 32, 64] +TOP_TRIGRAM_COUNT = 6000 def preprocess(text): @@ -70,14 +71,14 @@ class Sample: use frequencies x order use letter, digrams, trigrams use absolute x square""" - ordered_own_trigrams = sorted(self.frequencies[2].items(), key=lambda kv: -kv[1])[:400] - ordered_other_trigrams = sorted(other.frequencies[2].items(), key=lambda kv: -kv[1])[:400] + ordered_own_trigrams = sorted(self.frequencies[2].items(), key=lambda kv: -kv[1])[:TOP_TRIGRAM_COUNT] + ordered_other_trigrams = sorted(other.frequencies[2].items(), key=lambda kv: -kv[1])[:TOP_TRIGRAM_COUNT] ranked_own_trigrams = dict(zip([key for (key, freq) in ordered_own_trigrams], itertools.count(0))) ranked_other_trigrams = dict(zip([key for (key, freq) in ordered_other_trigrams], itertools.count(0))) - res = sum(abs(v-ranked_other_trigrams.get(k, 400)) for (k, v) in ranked_own_trigrams.items()) + \ - sum(abs(v-ranked_own_trigrams.get(k, 400)) for (k, v) in ranked_other_trigrams.items()) - print(">", self.language, res) + res = sum(abs(v-ranked_other_trigrams.get(k, TOP_TRIGRAM_COUNT)) for (k, v) in ranked_own_trigrams.items()) + \ + sum(abs(v-ranked_own_trigrams.get(k, TOP_TRIGRAM_COUNT)) for (k, v) in ranked_other_trigrams.items()) + return res def print_overview(self):