Files
@ 5ab4acb6f293
Branch filter:
Location: Languedoc/languedoc.py - annotation
5ab4acb6f293
2.6 KiB
text/x-python
implemented Sample.compare method
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 5ab4acb6f293 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1cae4ecc8978 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 5ab4acb6f293 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 5ab4acb6f293 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1cae4ecc8978 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1cae4ecc8978 1cae4ecc8978 1c7a7c3926e6 1cae4ecc8978 1cae4ecc8978 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1c7a7c3926e6 1cae4ecc8978 1cae4ecc8978 1c7a7c3926e6 1cae4ecc8978 1cae4ecc8978 | import os
import re
import random
import itertools
# Seed the global `random` generator with a fixed constant so that anything
# drawing from it behaves the same on every run. (No use of `random` is visible
# in this part of the file.)
random.seed(19181028)
def preprocess(text):
    """Normalise raw text into lower-case words separated by single spaces.

    The text is padded with one space on each side, then every run of
    non-word characters, digits and underscores is collapsed into a single
    space. The result therefore always begins and ends with a space, which
    lets n-grams capture word boundaries.
    """
    padded = " " + text + " "
    collapsed = re.sub(r"[\W\d_]+", " ", padded)
    return collapsed.lower()
def extract_ngram_freqs(text, k):
    """Return the relative frequency of every k-character substring of *text*.

    Substrings consisting only of whitespace are ignored. The frequencies of
    the remaining n-grams sum to 1; an empty dict is returned when no n-gram
    was counted (e.g. *text* is shorter than *k*).
    """
    counts = {}
    for start in range(len(text) - k + 1):
        gram = text[start:start + k]
        if not gram.isspace():
            counts[gram] = counts.get(gram, 0) + 1

    total = sum(counts.values())
    # With no counted n-grams the comprehension is empty, so `total == 0`
    # never reaches the division.
    return {gram: occurrences / total for gram, occurrences in counts.items()}
def merge_ngram_freqs(freqs):
    """Average several n-gram frequency tables into a single table.

    *freqs* is a sized collection of dicts mapping n-gram -> frequency. Each
    table contributes with equal weight ``1 / len(freqs)``; an n-gram missing
    from a table counts as frequency 0 there. An empty collection yields an
    empty dict.
    """
    table_count = len(freqs)
    merged = {}
    for table in freqs:
        for gram in table:
            merged[gram] = merged.get(gram, 0) + table[gram] / table_count
    return merged
class Sample:
    """N-gram frequency profile of one text, or of several merged texts.

    A sample carries a language label and three frequency tables (for single
    characters, digrams and trigrams).
    """

    def __init__(self, language="??", text=""):
        """Create a profile for *language*.

        When *text* is non-empty its 1-, 2- and 3-gram frequencies are
        extracted immediately; otherwise the three tables start out empty.
        """
        self.language = language
        # frequencies[k - 1] maps each k-gram (k = 1, 2, 3) to its relative
        # frequency.
        self.frequencies = [dict(), dict(), dict()]
        if text:
            self._extract(text)

    def _extract(self, text):
        """Fill the three frequency tables from *text*."""
        for k in range(1, 4):
            self.frequencies[k-1] = extract_ngram_freqs(text, k)

    @staticmethod
    def merge(samples):
        """Combine samples of one language into a single averaged sample.

        All *samples* must carry the same language label (checked with an
        assertion). Tables of the same n-gram size are averaged with
        ``merge_ngram_freqs``.
        """
        assert len({x.language for x in samples}) == 1
        res = Sample(samples[0].language)
        res.frequencies = []
        for freqs in zip(*(x.frequencies for x in samples)):
            res.frequencies.append(merge_ngram_freqs(freqs))
        return res

    @staticmethod
    def _rank_trigrams(trigram_freqs, top_k):
        """Map the *top_k* most frequent trigrams to their rank (0 = most frequent)."""
        # The sort is stable, so trigrams with equal frequency keep the
        # insertion order of the table.
        ordered = sorted(trigram_freqs.items(), key=lambda kv: -kv[1])[:top_k]
        return {key: rank for rank, (key, _freq) in enumerate(ordered)}

    def compare(self, other, top_k=400):
        """Return a rank-based distance between two trigram profiles.

        Only the *top_k* most frequent trigrams of each sample take part.
        For every trigram in either ranking, the absolute difference between
        its rank in both rankings is added up; a trigram missing from the
        opposite ranking is treated as having rank *top_k* there. The sum
        runs over both rankings, so a shared trigram contributes its rank
        difference twice. Lower values mean more similar profiles; identical
        rankings give 0.

        *other* only needs a ``frequencies`` attribute whose third element is
        the trigram table. The result is also printed together with this
        sample's language.

        Raises ValueError if *top_k* is negative.
        """
        # Design notes kept from the original docstring (they read as options
        # under consideration, not all of which are implemented here):
        #   take k most common / use frequencies x order /
        #   use letter, digrams, trigrams / use absolute x square
        if top_k < 0:
            raise ValueError("top_k must be non-negative")
        own_ranks = self._rank_trigrams(self.frequencies[2], top_k)
        other_ranks = self._rank_trigrams(other.frequencies[2], top_k)
        res = (
            sum(abs(rank - other_ranks.get(key, top_k)) for (key, rank) in own_ranks.items())
            + sum(abs(rank - own_ranks.get(key, top_k)) for (key, rank) in other_ranks.items())
        )
        print(">", self.language, res)
        return res

    def print_overview(self):
        """Print, per n-gram size, the 8 most and 8 least frequent n-grams."""
        print(f"Sample({self.language}):")
        for freqs in self.frequencies:
            # Most frequent first; frequencies rounded to 3 decimals for display.
            x = [
                (k, round(v, 3))
                for (k, v) in sorted(freqs.items(), key=lambda kv: -kv[1])
            ]
            print(" ", x[:8], "...", x[-8:])
        print()
# Build one merged model per language. The data directory sits next to this
# file and holds one sub-directory per language, each containing sample texts.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
LANG_DIRS = [entry.path for entry in os.scandir(DATA_DIR)]

# Maps language name (the sub-directory's base name) -> merged Sample.
models = {}
for d in LANG_DIRS:
    lang = os.path.basename(d)
    samples = []
    for file in os.scandir(d):
        with open(file) as f:
            text = preprocess(f.read())
        # Show the file name, the preprocessed length and a short excerpt.
        print(f"{file.name} ({len(text)})")
        print(text[:256])
        print()
        samples.append(Sample(lang, text))
        samples[-1].print_overview()
    # Average all samples of this language into its model.
    models[lang] = Sample.merge(samples)
    models[lang].print_overview()
|