def compare(self, other, k=400):
    """Return a rank-based distance between this sample and *other*.

    For each sample, the n-gram table at ``frequencies[2]`` (the trigram
    table, going by the names used in the original code) is sorted by
    descending count and cut to its ``k`` most common entries. Each kept
    n-gram is assigned its 0-based position in that ordering as its rank.

    The distance is the sum of absolute rank differences, accumulated once
    from each sample's point of view. An n-gram that is absent from the
    other sample's top-``k`` list is charged as if it had rank ``k`` there.
    Because both directions are summed, an n-gram present in both lists
    contributes its rank difference twice. Identical rankings yield 0.

    Args:
        other: Object exposing ``frequencies`` with the same layout as
            ``self.frequencies``.
        k: Number of most common n-grams kept per sample; also the rank
            charged for an n-gram missing from the other list. Defaults to
            400, the value previously hard-coded.

    Returns:
        The accumulated rank distance as an integer. The value is also
        printed together with ``self.language``.
    """
    # Design notes carried over from the original docstring (variants that
    # were being considered, not all implemented here):
    #   - take k most common
    #   - use frequencies x order
    #   - use letter, digrams, trigrams
    #   - use absolute x square

    def rank_top(freqs):
        # sorted() is stable, so n-grams with equal counts keep the
        # insertion order of the underlying dict.
        top = sorted(freqs.items(), key=lambda item: -item[1])[:k]
        return {ngram: rank for rank, (ngram, _) in enumerate(top)}

    own_ranks = rank_top(self.frequencies[2])
    other_ranks = rank_top(other.frequencies[2])

    # Symmetric sum: one pass per side so n-grams unique to either sample
    # are penalised.
    res = sum(
        abs(rank - other_ranks.get(ngram, k))
        for ngram, rank in own_ranks.items()
    )
    res += sum(
        abs(rank - own_ranks.get(ngram, k))
        for ngram, rank in other_ranks.items()
    )
    print(">", self.language, res)
    return res