# HG changeset patch
# User Laman
# Date 2022-09-28 22:15:07
# Node ID 5ab4acb6f2936842deae4df07fa889461aa5f7c8
# Parent  1cae4ecc8978965c7fd461a35a680ba459fb6c9b

implemented Sample.compare method

diff --git a/languedoc.py b/languedoc.py
--- a/languedoc.py
+++ b/languedoc.py
@@ -1,6 +1,7 @@
 import os
 import re
 import random
+import itertools
 
 random.seed(19181028)
 
@@ -14,7 +15,7 @@ def extract_ngram_freqs(text, k):
 	n = len(text)
 	d = dict()
 
-	for i in range(0, n-k):
+	for i in range(0, n-k+1):
 		key = text[i:i+k]
 		if key.isspace():
 			continue
@@ -63,7 +64,19 @@ class Sample:
 		return res
 
 	def compare(self, other):
-		pass
+		"""Return a rank distance between the top 400 keys of self.frequencies[2] and other.frequencies[2].
+		Keys (named trigrams here) are ranked by descending frequency; each ranked key adds the absolute
+		difference of its two ranks, in both directions, with a key missing on one side counted as rank 400.
+		The score is printed, then returned. Design notes: take k most common; frequencies x order; letter, digrams, trigrams; absolute x square."""
+		ordered_own_trigrams = sorted(self.frequencies[2].items(), key=lambda kv: -kv[1])[:400]
+		ordered_other_trigrams = sorted(other.frequencies[2].items(), key=lambda kv: -kv[1])[:400]
+		ranked_own_trigrams = dict(zip([key for (key, freq) in ordered_own_trigrams], itertools.count(0)))
+		ranked_other_trigrams = dict(zip([key for (key, freq) in ordered_other_trigrams], itertools.count(0)))
+
+		res = sum(abs(v-ranked_other_trigrams.get(k, 400)) for (k, v) in ranked_own_trigrams.items()) + \
+					sum(abs(v-ranked_own_trigrams.get(k, 400)) for (k, v) in ranked_other_trigrams.items())
+		print(">", self.language, res)
+		return res
 
 	def print_overview(self):
 		print(f"Sample({self.language}):")