"""Language identification based on ranked character n-gram profiles.

A text is reduced to a mapping from its most frequent 1-, 2- and 3-character
n-grams to their rank (0 = most frequent).  Two profiles are compared with a
rank-difference ("out-of-place" style) distance, and ``identify`` returns the
language of the closest known profile.
"""

import functools
import gzip
import itertools
import json
import os
import re
from collections import Counter
from typing import Dict, Iterable, List, Optional, Tuple

# Maximum number of n-grams kept in a ranked profile.
TOP_NGRAM_COUNT = 3000

# Location of the bundled gzip-compressed JSON model file, relative to this
# module.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "../../models.json.gz")

# Runs of non-word characters, digits and underscores.  For ``str`` patterns
# ``\W`` is Unicode-aware, so letters of any script are preserved.
_NON_LETTER_RE = re.compile(r"[\W\d_]+")


def preprocess(text: str) -> str:
    """Normalize *text* for n-gram extraction.

    The text is padded with one space on each side, every run of non-word
    characters, digits and underscores is collapsed into a single space, and
    the result is lower-cased.
    """
    return _NON_LETTER_RE.sub(" ", " " + text + " ").lower()


def extract_kgram_freqs(text: str, k: int) -> Dict[str, float]:
    """Return the relative frequency of every length-*k* substring of *text*.

    Substrings consisting only of whitespace are ignored.  Frequencies are
    relative to the number of counted substrings, so they sum to 1 whenever
    the result is non-empty.  Keys appear in first-occurrence order; an empty
    dict is returned when *text* is shorter than *k*.
    """
    grams = (text[i:i + k] for i in range(len(text) - k + 1))
    # Counter keeps first-seen insertion order, which later determines how
    # equally frequent n-grams are ranked (the ranking sort is stable).
    counts = Counter(gram for gram in grams if not gram.isspace())
    total = sum(counts.values())
    return {gram: occurrences / total for gram, occurrences in counts.items()}


def extract_ngram_freqs(text: str) -> Dict[str, float]:
    """Return the combined relative frequencies of all 1-, 2- and 3-grams.

    Frequencies are normalized separately for each n-gram length; keys of
    different lengths never collide, so the per-length tables are merged.
    """
    frequencies: Dict[str, float] = {}
    for k in range(1, 4):
        frequencies.update(extract_kgram_freqs(text, k))
    return frequencies


def rank_ngram_freqs(frequencies: Dict[str, float]) -> Dict[str, int]:
    """Map the ``TOP_NGRAM_COUNT`` most frequent n-grams to their rank.

    Rank 0 is the most frequent n-gram.  The sort is stable, so n-grams with
    equal frequency keep the order they have in *frequencies*.
    """
    ordered = sorted(frequencies.items(), key=lambda item: -item[1])
    top_keys = [key for key, _ in ordered[:TOP_NGRAM_COUNT]]
    return dict(zip(top_keys, itertools.count(0)))


def extract_ranked_ngrams(text: str) -> Dict[str, int]:
    """Return the ranked n-gram profile of (already preprocessed) *text*."""
    return rank_ngram_freqs(extract_ngram_freqs(text))


class Sample:
    """A ranked n-gram profile labelled with a language.

    Attributes:
        language: Label of the language the profile was built from.
        ranked_ngrams: Mapping from n-gram to rank (0 = most frequent).
    """

    def __init__(self, language: str, ranked_ngrams: Dict[str, int]) -> None:
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text: str, language: str = "??") -> "Sample":
        """Build a sample from raw *text*, preprocessing it first."""
        return cls(language, extract_ranked_ngrams(preprocess(text)))

    @classmethod
    def load(cls, exported: dict) -> "Sample":
        """Rebuild a sample from the dict produced by :meth:`export`.

        ``exported["ngrams"]`` is a list of n-grams ordered by rank and
        ``exported["language"]`` is the language label.
        """
        ranked_ngrams = {
            key: order for order, key in enumerate(exported["ngrams"])
        }
        return cls(exported["language"], ranked_ngrams)

    def export(self) -> dict:
        """Return a JSON-serializable dict with the n-grams ordered by rank."""
        by_rank = sorted(self.ranked_ngrams.items(), key=lambda item: item[1])
        return {
            "language": self.language,
            "ngrams": [key for key, _ in by_rank],
        }

    def compare(self, other: "Sample") -> int:
        """Return the rank distance from this sample to *other*.

        For every n-gram of this sample the absolute rank difference is
        added; an n-gram missing from *other* costs ``len(other.ranked_ngrams)``.
        Lower values mean more similar profiles.  The measure is not
        symmetric: only this sample's n-grams are iterated.
        """
        other_ranks = other.ranked_ngrams
        missing_penalty = len(other_ranks)
        return sum(
            abs(rank - other_ranks[ngram])
            if ngram in other_ranks
            else missing_penalty
            for ngram, rank in self.ranked_ngrams.items()
        )


def load_models(model_path: str) -> List[Sample]:
    """Load samples from a gzip-compressed UTF-8 JSON file at *model_path*.

    The file must contain a JSON list of objects in the :meth:`Sample.export`
    format.
    """
    with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
        return [Sample.load(obj) for obj in json.load(f)]


@functools.lru_cache(maxsize=4)
def _load_models_cached(model_path: str) -> Tuple[Sample, ...]:
    """Load models once per path; a tuple is returned so the cache is not mutated."""
    return tuple(load_models(model_path))


def identify(text: str, models: Optional[Iterable[Sample]] = None) -> str:
    """Return the language of the model whose profile is closest to *text*.

    Args:
        text: Raw text to classify.
        models: Candidate samples.  When omitted or empty, the bundled models
            at ``MODEL_PATH`` are used; they are read from disk only once per
            path and cached for later calls.

    Returns:
        The ``language`` of the closest model.  On ties the first such model
        in *models* wins.

    Raises:
        IndexError: If no models are available.
    """
    if not models:
        models = _load_models_cached(MODEL_PATH)
    sample = Sample.extract(text)
    # min() returns the first minimal element, matching sorted(...)[0]
    # without sorting the whole list.
    best = min(models, key=sample.compare, default=None)
    if best is None:
        # IndexError is what indexing the empty sorted list used to raise.
        raise IndexError("no language models available")
    return best.language