# HG changeset patch # User Laman # Date 2024-09-30 22:18:20 # Node ID f1db051d658e9aba396cf7f1b5edc2564357c144 # Parent f82e9a5b1c2ca6b57e5c97eaebf43971718d302d expanded README, added more documentation diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -3,4 +3,131 @@ Languedoc Language identification library based on ["N-Gram-Based Text Categorization"](https://www.cis.lmu.de/~stef/seminare/sprachenidentifizierung/cavnar_trenkle.pdf) by Cavnar and Trenkle. +## Usage +1. Create a directory `data`, with a subdirectory for each target language. Fill with your training data. + +2. Run `PYTHONPATH=src/ python3 train.py`. It will create a `models.json.gz` file. + +3. You can now use: +```python +import languedoc + +language = languedoc.identify("A text you want to identify.") +``` +It will output the identifier that you used as the subdirectory name in step 1, based on the closest match between n-gram frequencies. + +## Accuracy + +Below is the training script output from my training data, from seven major European languages. It is worth noting that the crossvalidation iterates through all languages, for each creates five models and for each creates ten tests of every length of 8, 16, 32 and 64 characters. If we count the misidentified samples, we can see that the tiny 8 character samples have 84% success rate, 16 chars rise to 96% and for 32 and longer there are no errors at all. 
+ +``` +PYTHONPATH=src/ python src/languedoc/train.py +# Source texts: +cs: dyk - krysař.txt (94122 chars) +cs: hašek - švejk.txt (679701 chars) +cs: poláček - hostinec u kamenného stolu.txt (434082 chars) +cs: vančura - konec starých časů.txt (418857 chars) +cs: čapek - apokryfy.txt (180550 chars) +de: 2188-8.txt (351444 chars) +de: 23396-0.txt (187138 chars) +de: 46896-8.txt (248309 chars) +de: pg10917.txt (384020 chars) +de: pg67409.txt (333959 chars) +en: fitzgerald - the great gatsby.txt (274897 chars) +en: joyce - ulysses.txt (1469511 chars) +en: lovecraft - the dunwich horror.txt (117020 chars) +en: orwell - 1984.txt (569311 chars) +en: woolf - mrs dalloway.txt (346294 chars) +es: 14307-8.txt (67263 chars) +es: 16670-8.txt (555534 chars) +es: 51019-0.txt (420431 chars) +es: 58484-8.txt (243137 chars) +es: 61189-8.txt (438149 chars) +fr: 44468-0.txt (360878 chars) +fr: 45176-0.txt (331838 chars) +fr: 64274-0.txt (108402 chars) +fr: pg68138.txt (380198 chars) +fr: pg68265.txt (368201 chars) +it: 22642-8.txt (467018 chars) +it: 28144-8.txt (285603 chars) +it: 39289-0.txt (472982 chars) +it: 49310-0.txt (295664 chars) +it: 57040-0.txt (382809 chars) +ru: Full text of История России Кириллов В.В Уч Пос 2007 661с ( 1) (1008316 chars) +ru: Full text of Каменев П. H. Сканави A. H. Богословский В. H. И Др. Часть 1. Отопление. 
1975 (985388 chars) +ru: molier.txt (387588 chars) + +# Crossvalidation: +cs misidentified as fr: trumm té +cs misidentified as fr: cit a l +cs misidentified as fr: en je ta +cs misidentified as en: l hostin +cs misidentified as fr: t a lidé +cs misidentified as it: nazaret +de misidentified as en: ch war f +de misidentified as it: r professor an e +de misidentified as en: en im hotel sond +de misidentified as es: so viel +en misidentified as de: j eckle +en misidentified as es: or prete +en misidentified as it: e stale +en misidentified as es: es agita +en misidentified as fr: connect +en misidentified as de: ust be after eig +en misidentified as fr: man pau +en misidentified as it: a puzzle +en misidentified as de: gs matte +en misidentified as de: ism and +es misidentified as en: se alarm +es misidentified as it: reserva +es misidentified as fr: ues de l +es misidentified as de: deber se +es misidentified as it: ronto la +es misidentified as fr: ue de le +es misidentified as it: no volv +es misidentified as it: sa perdi +es misidentified as fr: ailes de +es misidentified as it: e no lle +es misidentified as fr: luis sube un ra +es misidentified as it: ase a la iglesia +es misidentified as fr: t el rec +es misidentified as it: antonio +es misidentified as it: lama baltasar ti +es misidentified as fr: un galope que se +es misidentified as it: viduo un +es misidentified as fr: les con +es misidentified as it: escote ni desnud +es misidentified as fr: a encont +es misidentified as fr: vez vinu +es misidentified as en: se han +es misidentified as it: erpo de +es misidentified as fr: la caus +es misidentified as en: os bigot +es misidentified as fr: héroes balzaqui +es misidentified as fr: ue me marchara d +es misidentified as fr: va de génova voy +es misidentified as en: ol madrid me int +fr misidentified as es: le regar +fr misidentified as en: s inter +fr misidentified as it: de costa +fr misidentified as es: garde co +fr misidentified as es: recueill +fr 
misidentified as es: ale quel +fr misidentified as en: les offi +fr misidentified as it: e compar +fr misidentified as de: öhrenbac +fr misidentified as it: promena +fr misidentified as es: horizon +it misidentified as es: nica per +it misidentified as es: lla stan +it misidentified as es: uesto chablis ti +it misidentified as fr: onna cla +it misidentified as fr: va vent +it misidentified as fr: n si des +it misidentified as fr: endere le import +it misidentified as es: el tempo +it misidentified as es: a lo vid +Accuracy: 95.0714%, (1331/1400 tests during crossvalidation) +``` + Licensed under GNU GPLv3. diff --git a/src/languedoc/predict.py b/src/languedoc/predict.py --- a/src/languedoc/predict.py +++ b/src/languedoc/predict.py @@ -19,9 +19,17 @@ def preprocess(text: str) -> str: def sample_text(text: str, segment_length: int): + """Extract a reasonably and uniformly long sample from a long text. + + :param text: the input text + :param segment_length: a text segment length. The sample is going to be 3-4 times longer. + :return: a text sample cut from the original text, consisting of three segments + """ n = len(text) + # a text too short to sample if n < 4*segment_length: return text + # take a segment from the 1st, 2nd and 3rd quarter of the text, to get a representative sample else: f = lambda i: n*i//4 - segment_length//2 regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s") @@ -89,7 +97,7 @@ class Sample: @classmethod def extract(cls, text: str, language="??") -> "Sample": - """Create a new Sample by extracting it from text. + """Create a new Sample by extracting it from the text. :param text: a string, from which to extract the ngrams into a Sample :param language: a two letter language code if it is known (cs|de|en|...)""" @@ -120,7 +128,7 @@ class Sample: The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample). 
- :param other: a reference model in known language""" + :param other: a reference model in a known language""" m = len(other.ranked_ngrams) res = sum( diff --git a/src/languedoc/train.py b/src/languedoc/train.py --- a/src/languedoc/train.py +++ b/src/languedoc/train.py @@ -72,6 +72,7 @@ def cross_validate(sample_sets: list[Sam score = 0 max_score = 0 + print("# Crossvalidation:") for s in sample_sets: for (test_text, partial_model) in s.generate_tests(CROSSVALIDATION_SOURCE_COUNT): real_lang = partial_model.language @@ -85,7 +86,7 @@ def cross_validate(sample_sets: list[Sam if predicted_lang == real_lang: score += 1 else: - print(real_lang, predicted_lang, t) + print(f"{real_lang} misidentified as {predicted_lang}: {t}") max_score += 1 return score/max_score, score, max_score @@ -100,6 +101,7 @@ def train(data_dir: str, model_path: str samples = [] lang_dirs = sorted([x.path for x in os.scandir(data_dir)]) + print("# Source texts:") for d in lang_dirs: lang = os.path.basename(d) lang_samples = SampleSet(lang) @@ -109,7 +111,7 @@ def train(data_dir: str, model_path: str with open(file) as f: text = f.read() text = preprocess(text) - print(f"{lang}: {file.name} ({len(text)})") + print(f"{lang}: {file.name} ({len(text)} chars)") lang_samples.add(text) @@ -117,7 +119,9 @@ def train(data_dir: str, model_path: str s = json.dumps([sample_set.create_model().export() for sample_set in samples], ensure_ascii=False, sort_keys=True) f.write(s.encode("utf-8")) - print(cross_validate(samples)) + print() + (acc, success, count) = cross_validate(samples) + print(f"Accuracy: {acc*100:.4f}%, ({success}/{count} tests during crossvalidation)") DATA_DIR = os.path.join(os.path.dirname(__file__), "../../data")