diff --git a/languedoc.py b/languedoc.py --- a/languedoc.py +++ b/languedoc.py @@ -1,6 +1,8 @@ import os import random import itertools +import json +import gzip from shared import preprocess, identify, extract_ngram_freqs, rank_ngram_freqs, Sample @@ -72,6 +74,7 @@ def cross_validate(sample_sets): DATA_DIR = os.path.join(os.path.dirname(__file__), "data") LANG_DIRS = sorted([x.path for x in os.scandir(DATA_DIR)]) +MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz") if __name__ == "__main__": samples = [] @@ -89,4 +92,7 @@ if __name__ == "__main__": lang_samples.add(text) + with gzip.open(MODEL_PATH, mode="wt", encoding="utf-8") as f: + json.dump([sample_set.create_model().export() for sample_set in samples], f, ensure_ascii=False) + print(cross_validate(samples))