diff --git a/shared.py b/shared.py
--- a/shared.py
+++ b/shared.py
@@ -1,7 +1,11 @@
+import os
 import re
 import itertools
+import json
+import gzip
 
 TOP_NGRAM_COUNT = 3000
+MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
 
 
 def preprocess(text):
@@ -65,10 +69,6 @@ class Sample:
         }
 
     def compare(self, other):
-        """take k most common
-        use frequencies x order
-        use letter, digrams, trigrams
-        use absolute x square"""
         m = len(other.ranked_ngrams)
 
         res = sum(
@@ -79,7 +79,27 @@ class Sample:
         return res
 
 
-def identify(text, models):
+def load_models(model_path):
+    """Load language models from a gzip-compressed JSON file.
+
+    Each item of the decoded JSON document is passed to ``Sample.load``;
+    the resulting objects are returned as a list.
+    """
+    with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
+        return [Sample.load(obj) for obj in json.load(f)]
+
+
+def identify(text, models=None):
+    """Return the ``language`` of the model with the lowest compare score.
+
+    When *models* is ``None`` or empty, the models are loaded from
+    ``MODEL_PATH`` on each such call.
+    """
+    # ``None`` rather than ``[]`` as the default: a mutable default object
+    # is created once and shared between calls.
+    if not models:
+        models = load_models(MODEL_PATH)
+
     sample = Sample.extract(text)
 
     return sorted(models, key=lambda m: sample.compare(m))[0].language