diff --git a/src/languedoc/train.py b/src/languedoc/train.py
--- a/src/languedoc/train.py
+++ b/src/languedoc/train.py
@@ -72,6 +72,7 @@ def cross_validate(sample_sets: list[Sam
     score = 0
     max_score = 0
 
+    print("# Crossvalidation:")
     for s in sample_sets:
         for (test_text, partial_model) in s.generate_tests(CROSSVALIDATION_SOURCE_COUNT):
             real_lang = partial_model.language
@@ -85,7 +86,7 @@ def cross_validate(sample_sets: list[Sam
             if predicted_lang == real_lang:
                 score += 1
             else:
-                print(real_lang, predicted_lang, t)
+                print(f"{real_lang} misidentified as {predicted_lang}: {t}")
             max_score += 1
 
     return score/max_score, score, max_score
@@ -100,6 +101,7 @@ def train(data_dir: str, model_path: str
     samples = []
     lang_dirs = sorted([x.path for x in os.scandir(data_dir)])
 
+    print("# Source texts:")
     for d in lang_dirs:
         lang = os.path.basename(d)
         lang_samples = SampleSet(lang)
@@ -109,7 +111,7 @@ def train(data_dir: str, model_path: str
             with open(file) as f:
                 text = f.read()
                 text = preprocess(text)
-                print(f"{lang}: {file.name} ({len(text)})")
+                print(f"{lang}: {file.name} ({len(text)} chars)")
                 lang_samples.add(text)
 
@@ -117,7 +119,9 @@ def train(data_dir: str, model_path: str
         s = json.dumps([sample_set.create_model().export() for sample_set in samples], ensure_ascii=False, sort_keys=True)
         f.write(s.encode("utf-8"))
 
-    print(cross_validate(samples))
+    print()
+    (acc, success, count) = cross_validate(samples)
+    print(f"Accuracy: {acc*100:.4f}%, ({success}/{count} tests during crossvalidation)")
 
 
 DATA_DIR = os.path.join(os.path.dirname(__file__), "../../data")