# HG changeset patch
# User Laman
# Date 2022-09-18 11:35:09
# Node ID 1c7a7c3926e60935690497097c386a9d9be6e842
init commit

diff --git a/languedoc.py b/languedoc.py
new file mode 100644
--- /dev/null
+++ b/languedoc.py
@@ -0,0 +1,60 @@
+import os
+import re
+import random
+
+random.seed(19181028)
+
+
+def preprocess(text):
+	text = re.sub(r"[\W\d]+", " ", text)
+	return text.lower()
+
+
+def extract_ngram_freqs(text, k):
+	n = len(text)
+	d = dict()
+
+	for i in range(0, n-k):
+		key = text[i:i+k]
+		if key.isspace():
+			continue
+
+		d[key] = d.get(key, 0) + 1
+
+	count = sum(d.values())
+
+	return {key: val/count for (key, val) in d.items()}
+
+
+def merge_ngram_freqs(freqs):
+	n = len(freqs)
+	res = dict()
+
+	for d in freqs:
+		for (key, val) in d.items():
+			res.setdefault(key, 0)
+			res[key] += val/n
+
+	return res
+
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+LANG_DIRS = [x.path for x in os.scandir(DATA_DIR)]
+
+for d in LANG_DIRS:
+	models = [[], [], []]
+
+	for file in os.scandir(d):
+		with open(file) as f:
+			text = f.read()
+			text = preprocess(text)
+			print(f"{file.name} ({len(text)})")
+			print(text[:256])
+			print()
+
+			for k in range(1, 4):
+				models[k-1].append(extract_ngram_freqs(text, k))
+
+	models = [merge_ngram_freqs(sources) for sources in models]
+	print(sorted(((key, round(val, 3)) for (key, val) in models[0].items()), key=lambda kv: -kv[1]))
+	print(sorted(((key, round(val, 3)) for (key, val) in models[1].items()), key=lambda kv: -kv[1]))
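
The helpers in this patch compose into a per-language model: preprocess() normalizes a document, extract_ngram_freqs() turns it into relative n-gram frequencies, and merge_ngram_freqs() averages several such tables. Below is a minimal sketch of that flow, not part of the patch; it assumes the three functions are importable from languedoc (as committed, the module-level loop runs on import and expects a data/ directory next to languedoc.py), and the two sample strings stand in for training files.

    from languedoc import preprocess, extract_ngram_freqs, merge_ngram_freqs

    # Two toy "documents" standing in for files under data/<language>/.
    samples = ["Hello, world!", "Hello again, world."]

    # Per-document trigram frequencies, computed on the normalized text.
    per_document = [extract_ngram_freqs(preprocess(s), 3) for s in samples]

    # One averaged frequency table for the whole "language".
    model = merge_ngram_freqs(per_document)

    # Most frequent trigrams first.
    print(sorted(model.items(), key=lambda kv: -kv[1])[:5])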