"""Build character n-gram frequency models from the text files under ``data/``.

Every sub-directory of ``data/`` (see ``LANG_DIRS``) is processed as one
corpus: each file in it is read, normalised with :func:`preprocess`, and turned
into 1-, 2- and 3-gram relative-frequency tables.  The per-file tables are then
averaged, and the merged 1-gram and 2-gram tables are printed, most frequent
first.
"""
import os
import random
import re
import sys

random.seed(19181028)


def preprocess(text):
    """Return *text* lower-cased with every run of non-word/digit chars collapsed.

    Each maximal run of characters matching ``[\\W\\d]`` (anything that is not a
    word character, plus digits) is replaced by a single space.
    """
    # NOTE(review): ``_`` is a word character, so underscores survive this
    # substitution -- confirm whether that is intended.
    text = re.sub(r"[\W\d]+", " ", text)
    return text.lower()


def extract_ngram_freqs(text, k):
    """Return the relative frequency of every length-*k* substring of *text*.

    Windows that consist solely of whitespace are not counted.  The result
    maps each remaining n-gram to ``occurrences / total_counted``; it is empty
    when *text* has no countable window (e.g. it is shorter than *k*).
    """
    n = len(text)
    counts = {}
    # There are n - k + 1 windows of length k; the last one starts at n - k.
    # (Stopping at n - k, exclusive, would silently drop the final n-gram.)
    for start in range(n - k + 1):
        ngram = text[start:start + k]
        if ngram.isspace():
            continue
        counts[ngram] = counts.get(ngram, 0) + 1
    total = sum(counts.values())
    # With no counted n-grams the comprehension is empty, so no division by
    # zero can occur here.
    return {ngram: hits / total for (ngram, hits) in counts.items()}


def merge_ngram_freqs(freqs):
    """Average a list of frequency tables into a single table.

    Every table contributes with weight ``1 / len(freqs)``; an n-gram that is
    missing from a table contributes 0 for that table.  An empty list yields
    an empty dict.
    """
    n = len(freqs)
    merged = {}
    for table in freqs:
        for (ngram, freq) in table.items():
            merged[ngram] = merged.get(ngram, 0) + freq / n
    return merged


DATA_DIR = os.path.join(os.path.dirname(__file__), "data")

if os.path.isdir(DATA_DIR):
    # Only directories are corpora; stray files next to them (README,
    # .DS_Store, ...) would otherwise make the inner scandir() call fail.
    LANG_DIRS = [x.path for x in os.scandir(DATA_DIR) if x.is_dir()]
else:
    # Report the missing corpus directory instead of failing with a traceback.
    print(f"warning: data directory not found: {DATA_DIR}", file=sys.stderr)
    LANG_DIRS = []

for lang_dir in LANG_DIRS:
    # models[k - 1] collects one k-gram table per source file.
    models = [[], [], []]
    for file in os.scandir(lang_dir):
        if not file.is_file():
            continue
        # Explicit encoding so decoding does not depend on the platform locale.
        # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
        with open(file, encoding="utf-8") as f:
            text = preprocess(f.read())
        print(f"{file.name} ({len(text)})")
        print(text[:256])
        print()
        for k in range(1, 4):
            models[k - 1].append(extract_ngram_freqs(text, k))
    models = [merge_ngram_freqs(sources) for sources in models]
    # Show the merged 1-gram and 2-gram tables, highest (rounded) frequency first.
    for model in models[:2]:
        print(sorted(((key, round(val, 3)) for (key, val) in model.items()), key=lambda kv: -kv[1]))