Languedoc/shared.py

import os
import re
import itertools
import json
import gzip

# Models keep only the most frequent n-grams, ranked by decreasing frequency.
TOP_NGRAM_COUNT = 3000
# Default location of the gzipped JSON file holding the trained language models.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")


def preprocess(text):
    # Pad with spaces, collapse every run of non-letter characters (digits,
    # punctuation, underscores) into a single space, and lowercase the result.
    text = re.sub(r"[\W\d_]+", " ", " " + text + " ")
    return text.lower()


def extract_kgram_freqs(text, k):
    """Return the relative frequency of every k-character substring of text,
    skipping substrings that consist only of whitespace."""
    n = len(text)
    d = dict()
    for i in range(0, n - k + 1):
        key = text[i:i + k]
        if key.isspace():
            continue
        d[key] = d.get(key, 0) + 1
    count = sum(d.values())
    return {key: val / count for (key, val) in d.items()}


def extract_ngram_freqs(text):
    # Merge the frequency tables for 1-, 2- and 3-character n-grams.
    frequencies = {}
    for k in range(1, 4):
        frequencies.update(extract_kgram_freqs(text, k))
    return frequencies


def rank_ngram_freqs(frequencies):
    # Keep the TOP_NGRAM_COUNT most frequent n-grams and map each one to its
    # rank (0 for the most frequent); samples are compared on these ranks.
    ordered_ngrams = sorted(frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT]
    return dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0)))


def extract_ranked_ngrams(text):
    frequencies = extract_ngram_freqs(text)
    return rank_ngram_freqs(frequencies)
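
# Illustrative note (toy data, not from the project): for the preprocessed text
# " ab ab ", extract_ngram_freqs() yields relative frequencies for the 1- to
# 3-character n-grams "a", "b", " a", "ab", "b ", " ab", "ab " and "b a"
# (purely whitespace substrings are skipped), and rank_ngram_freqs() then maps
# them to ranks 0, 1, 2, ... by decreasing frequency, keeping at most
# TOP_NGRAM_COUNT entries.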


class Sample:
    """A language profile: a language tag plus a mapping from n-gram to rank."""

    def __init__(self, language, ranked_ngrams):
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text, language="??"):
        # Build a profile directly from raw text ("??" marks an unknown language).
        return cls(language, extract_ranked_ngrams(preprocess(text)))

    @classmethod
    def load(cls, exported):
        # Rebuild the n-gram -> rank mapping from the rank-ordered list of n-grams.
        ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
        return cls(exported["language"], ranked_ngrams)

    def export(self):
        # Inverse of load(): serialise the profile as JSON-friendly primitives.
        return {
            "language": self.language,
            "ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(),
                                                      key=lambda key_order: key_order[1])]
        }

    def compare(self, other):
        # Rank-distance between two profiles: for each n-gram in this sample, add
        # the absolute difference of its ranks in the two profiles, or the maximum
        # penalty m when the other profile does not contain it. Lower means more
        # similar.
        m = len(other.ranked_ngrams)
        res = sum(
            (abs(v - other.ranked_ngrams[k]) if k in other.ranked_ngrams else m)
            for (k, v) in self.ranked_ngrams.items()
        )
        return res
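

# Note on the model file format (as implied by Sample.export() and load_models()
# below): the file at MODEL_PATH is a gzipped JSON array of Sample.export()
# dicts, typically one per language. The usage sketch at the end of this file
# shows one hypothetical way such a file can be produced.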


def load_models(model_path):
    # Read the gzipped JSON list of exported samples and rebuild Sample objects.
    with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
        return [Sample.load(obj) for obj in json.load(f)]


def identify(text, models=None):
    # Identify the language of text by picking the model with the smallest
    # rank-distance to its profile; falls back to the models shipped at MODEL_PATH.
    if not models:
        models = load_models(MODEL_PATH)
    sample = Sample.extract(text)
    return sorted(models, key=lambda m: sample.compare(m))[0].language
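

# Usage sketch (assumptions, not part of the module's public interface): the two
# training snippets and the temporary output path below are made-up placeholders.
# The block builds two toy models, writes them in the gzipped-JSON format that
# load_models() expects, reloads them and identifies a short text.
if __name__ == "__main__":
    import tempfile

    training = {
        "en": "the quick brown fox jumps over the lazy dog " * 20,
        "fr": "portez ce vieux whisky au juge blond qui fume " * 20,
    }
    models = [Sample.extract(text, language) for (language, text) in training.items()]

    demo_path = os.path.join(tempfile.gettempdir(), "languedoc_demo_models.json.gz")
    with gzip.open(demo_path, mode="wt", encoding="utf-8") as f:
        json.dump([model.export() for model in models], f)

    # With this toy training data the French pangram should be the closer model.
    print(identify("portez ce whisky au juge", models=load_models(demo_path)))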