Files
@ 1c7a7c3926e6
Branch filter:
Location: Languedoc/languedoc.py - annotation
1c7a7c3926e6
1.2 KiB
text/x-python
init commit
1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 | import os
import re
import random
# Seed the global RNG with a fixed value so any random draws are repeatable.
# NOTE(review): nothing in the visible code draws from `random` — confirm the
# seed is still needed.
random.seed(19181028)
def preprocess(text):
    """Normalise raw text before character n-gram counting.

    Every run of non-word characters, digits and underscores is collapsed
    into a single space, and the result is lower-cased.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lower-cased string.  Leading/trailing separators are
        kept as a single space rather than stripped.
    """
    # `\W` does not cover "_" (the regex engine treats it as a word
    # character), so it is listed explicitly; otherwise underscores would
    # leak into the n-gram models as if they were letters.
    text = re.sub(r"[\W\d_]+", " ", text)
    return text.lower()
def extract_ngram_freqs(text, k):
    """Return the relative frequency of each character k-gram in *text*.

    A window of width ``k`` is slid over ``text`` one character at a time.
    Windows consisting solely of whitespace are ignored.  Each remaining
    k-gram's count is divided by the total number of counted windows.

    Args:
        text: String to analyse.
        k: Window width in characters.

    Returns:
        Dict mapping each k-gram to its share of all counted k-grams.  Empty
        when no window was counted (e.g. ``text`` is shorter than ``k``).
    """
    counts = {}
    # A string of length n holds n - k + 1 windows of width k; the bound has
    # to include the window that ends on the last character.
    for start in range(len(text) - k + 1):
        gram = text[start:start + k]
        if gram.isspace():
            continue
        counts[gram] = counts.get(gram, 0) + 1
    total = sum(counts.values())
    # With no counted windows the comprehension is empty, so no division
    # by zero can occur.
    return {gram: hits / total for gram, hits in counts.items()}
def merge_ngram_freqs(freqs):
    """Average several n-gram frequency tables into a single table.

    Args:
        freqs: Sized collection of dicts mapping n-gram -> frequency.

    Returns:
        Dict containing every n-gram seen in any table, mapped to the sum of
        its frequencies divided by the number of tables (a table that lacks
        the n-gram contributes nothing).  Empty when ``freqs`` is empty.
    """
    source_count = len(freqs)
    merged = {}
    for table in freqs:
        for gram in table:
            # Each table contributes an equal 1/source_count share.
            merged[gram] = merged.get(gram, 0) + table[gram] / source_count
    return merged
# The corpus lives in a "data" directory next to this file; every entry in it
# is scanned in turn (presumably one sub-directory per language — confirm).
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
LANG_DIRS = [x.path for x in os.scandir(DATA_DIR)]

# NOTE(review): nesting below is reconstructed from a flattened listing —
# confirm that the merge step and the two report prints belong inside the
# per-directory loop.
for lang_dir in LANG_DIRS:
    # One bucket per n-gram order: models[0] -> 1-grams, [1] -> 2-grams,
    # [2] -> 3-grams.  Each bucket collects one frequency table per file.
    models = [[], [], []]
    for file in os.scandir(lang_dir):
        with open(file) as handle:
            text = preprocess(handle.read())

        # Progress output: file name, cleaned length and a short preview.
        print(f"{file.name} ({len(text)})")
        print(text[:256])
        print()

        for order, bucket in enumerate(models, start=1):
            bucket.append(extract_ngram_freqs(text, order))

    # Collapse the per-file tables of each order into one averaged table.
    models = [merge_ngram_freqs(sources) for sources in models]

    # Report the 1-gram and 2-gram tables, most frequent first, with
    # frequencies rounded to three decimals.
    for model in models[:2]:
        ranked = sorted(
            ((key, round(val, 3)) for (key, val) in model.items()),
            key=lambda kv: -kv[1],
        )
        print(ranked)
|