# HG changeset patch # User Laman # Date 2022-09-29 21:11:24 # Node ID 9b4582354d0c5a1ee785bd5dc1410b5000e86380 # Parent 2de09682747ec747ceb4c6bcbced084fea9d3a69 merged ngram frequencies together diff --git a/languedoc.py b/languedoc.py --- a/languedoc.py +++ b/languedoc.py @@ -7,7 +7,7 @@ random.seed(19181028) CROSSVALIDATION_SOURCE_COUNT = 5 TEST_LENS = [8, 16, 32, 64] -TOP_TRIGRAM_COUNT = 6000 +TOP_NGRAM_COUNT = 4000 def preprocess(text): @@ -72,13 +72,16 @@ class Sample: use frequencies x order use letter, digrams, trigrams use absolute x square""" - ordered_own_trigrams = sorted(self.frequencies[2].items(), key=lambda kv: -kv[1])[:TOP_TRIGRAM_COUNT] - ordered_other_trigrams = sorted(other.frequencies[2].items(), key=lambda kv: -kv[1])[:TOP_TRIGRAM_COUNT] + own_frequencies = self.frequencies[0] | self.frequencies[1] | self.frequencies[2] + other_frequencies = other.frequencies[0] | other.frequencies[1] | other.frequencies[2] + + ordered_own_trigrams = sorted(own_frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] + ordered_other_trigrams = sorted(other_frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] ranked_own_trigrams = dict(zip([key for (key, freq) in ordered_own_trigrams], itertools.count(0))) ranked_other_trigrams = dict(zip([key for (key, freq) in ordered_other_trigrams], itertools.count(0))) - res = sum(abs(v-ranked_other_trigrams.get(k, TOP_TRIGRAM_COUNT)) for (k, v) in ranked_own_trigrams.items()) + \ - sum(abs(v-ranked_own_trigrams.get(k, TOP_TRIGRAM_COUNT)) for (k, v) in ranked_other_trigrams.items()) + res = sum(abs(v-ranked_other_trigrams.get(k, TOP_NGRAM_COUNT)) for (k, v) in ranked_own_trigrams.items()) + \ + sum(abs(v-ranked_own_trigrams.get(k, TOP_NGRAM_COUNT)) for (k, v) in ranked_other_trigrams.items()) return res