import os
import re
import itertools
import json
import gzip
TOP_NGRAM_COUNT = 3000
MODEL_PATH = os.path.join(os.path.dirname(__file__), "../../models.json.gz")
def preprocess(text):
text = re.sub(r"[\W\d_]+", " ", " "+text+" ")
return text.lower()
def extract_kgram_freqs(text, k):
n = len(text)
d = dict()
for i in range(0, n-k+1):
key = text[i:i+k]
if key.isspace():
continue
d[key] = d.get(key, 0) + 1
count = sum(d.values())
return {key: val/count for (key, val) in d.items()}
def extract_ngram_freqs(text):
frequencies = {}
for k in range(1, 4):
frequencies.update(extract_kgram_freqs(text, k))
return frequencies
def rank_ngram_freqs(frequencies):
ordered_ngrams = sorted(frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT]
return dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0)))
def extract_ranked_ngrams(text):
frequencies = extract_ngram_freqs(text)
return rank_ngram_freqs(frequencies)
class Sample:
def __init__(self, language, ranked_ngrams):
self.language = language
self.ranked_ngrams = ranked_ngrams
@classmethod
def extract(cls, text, language="??"):
return cls(language, extract_ranked_ngrams(preprocess(text)))
@classmethod
def load(cls, exported):
ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
return cls(exported["language"], ranked_ngrams)
def export(self):
return {
"language": self.language,
"ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])]
}
def compare(self, other):
m = len(other.ranked_ngrams)
res = sum(
(abs(v - other.ranked_ngrams[k]) if k in other.ranked_ngrams else m)
for (k, v) in self.ranked_ngrams.items()
)
return res
def load_models(model_path):
with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
return [Sample.load(obj) for obj in json.load(f)]
def identify(text, models=[]):
if not models:
models = load_models(MODEL_PATH)
sample = Sample.extract(text)
return sorted(models, key=lambda m: sample.compare(m))[0].language