Changeset - 4ea2a5eb6cf4
[Not reviewed]
Merge 1.1 release
Laman - 2024-09-30 22:22:23

merge default
6 files changed with 215 insertions and 9 deletions:
README.md
 
@@ -3,4 +3,131 @@ Languedoc
 

	
 
Language identification library based on ["N-Gram-Based Text Categorization"](https://www.cis.lmu.de/~stef/seminare/sprachenidentifizierung/cavnar_trenkle.pdf) by Cavnar and Trenkle.
 

	
 
## Usage
 
1. Create a directory `data` with a subdirectory for each target language, and fill each subdirectory with training texts for that language.
 

	
 
2. Run `PYTHONPATH=src/ python3 train.py`. It will create a `models.json.gz` file.
 

	
 
3. You can now use:
 
```python
 
import languedoc
 

	
 
language = languedoc.identify("A text you want to identify.")
 
```
 
It returns the identifier that you used as the subdirectory name in step 1, chosen by the closest match of n-gram frequencies.
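For instance, a minimal sketch of batch identification (assuming models were trained in step 2 with subdirectories such as `data/cs` and `data/de`; the sample sentences are invented):

```python
import languedoc

texts = [
    "Dobrý den, přeji hezký víkend.",    # expected "cs" if a data/cs subdirectory was used
    "Guten Morgen, wie geht es Ihnen?",  # expected "de" if a data/de subdirectory was used
]
for text in texts:
    print(languedoc.identify(text))
```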
 

	
 
## Accuracy
 

	
 
Below is the training script output on my training data, covering seven major European languages. Note that the cross-validation iterates over all seven languages, creates five partial models for each language, and for each model generates ten tests at each of the lengths 8, 16, 32 and 64 characters, i.e. 7 × 5 × 10 × 4 = 1400 tests in total. Counting the misidentified samples (see the short tally script after the log), the tiny 8-character samples reach roughly an 84% success rate, 16-character samples rise to about 96%, and samples of 32 characters and longer produce no errors at all.
 

	
 
```
 
PYTHONPATH=src/ python src/languedoc/train.py 
 
# Source texts:
 
cs: dyk - krysař.txt (94122 chars)
 
cs: hašek - švejk.txt (679701 chars)
 
cs: poláček - hostinec u kamenného stolu.txt (434082 chars)
 
cs: vančura - konec starých časů.txt (418857 chars)
 
cs: čapek - apokryfy.txt (180550 chars)
 
de: 2188-8.txt (351444 chars)
 
de: 23396-0.txt (187138 chars)
 
de: 46896-8.txt (248309 chars)
 
de: pg10917.txt (384020 chars)
 
de: pg67409.txt (333959 chars)
 
en: fitzgerald - the great gatsby.txt (274897 chars)
 
en: joyce - ulysses.txt (1469511 chars)
 
en: lovecraft - the dunwich horror.txt (117020 chars)
 
en: orwell - 1984.txt (569311 chars)
 
en: woolf - mrs dalloway.txt (346294 chars)
 
es: 14307-8.txt (67263 chars)
 
es: 16670-8.txt (555534 chars)
 
es: 51019-0.txt (420431 chars)
 
es: 58484-8.txt (243137 chars)
 
es: 61189-8.txt (438149 chars)
 
fr: 44468-0.txt (360878 chars)
 
fr: 45176-0.txt (331838 chars)
 
fr: 64274-0.txt (108402 chars)
 
fr: pg68138.txt (380198 chars)
 
fr: pg68265.txt (368201 chars)
 
it: 22642-8.txt (467018 chars)
 
it: 28144-8.txt (285603 chars)
 
it: 39289-0.txt (472982 chars)
 
it: 49310-0.txt (295664 chars)
 
it: 57040-0.txt (382809 chars)
 
ru: Full text of История России Кириллов В.В Уч Пос 2007 661с ( 1) (1008316 chars)
 
ru: Full text of Каменев П. H. Сканави A. H. Богословский В. H. И Др. Часть 1. Отопление. 1975 (985388 chars)
 
ru: molier.txt (387588 chars)
 

	
 
# Crossvalidation:
 
cs misidentified as fr: trumm té
 
cs misidentified as fr:  cit a l
 
cs misidentified as fr: en je ta
 
cs misidentified as en: l hostin
 
cs misidentified as fr: t a lidé
 
cs misidentified as it:  nazaret
 
de misidentified as en: ch war f
 
de misidentified as it: r professor an e
 
de misidentified as en: en im hotel sond
 
de misidentified as es:  so viel
 
en misidentified as de:  j eckle
 
en misidentified as es: or prete
 
en misidentified as it: e stale 
 
en misidentified as es: es agita
 
en misidentified as fr:  connect
 
en misidentified as de: ust be after eig
 
en misidentified as fr:  man pau
 
en misidentified as it: a puzzle
 
en misidentified as de: gs matte
 
en misidentified as de: ism and 
 
es misidentified as en: se alarm
 
es misidentified as it: reserva 
 
es misidentified as fr: ues de l
 
es misidentified as de: deber se
 
es misidentified as it: ronto la
 
es misidentified as fr: ue de le
 
es misidentified as it:  no volv
 
es misidentified as it: sa perdi
 
es misidentified as fr: ailes de
 
es misidentified as it: e no lle
 
es misidentified as fr:  luis sube un ra
 
es misidentified as it: ase a la iglesia
 
es misidentified as fr: t el rec
 
es misidentified as it: antonio 
 
es misidentified as it: lama baltasar ti
 
es misidentified as fr: un galope que se
 
es misidentified as it: viduo un
 
es misidentified as fr:  les con
 
es misidentified as it: escote ni desnud
 
es misidentified as fr: a encont
 
es misidentified as fr: vez vinu
 
es misidentified as en:  se han 
 
es misidentified as it: erpo de 
 
es misidentified as fr:  la caus
 
es misidentified as en: os bigot
 
es misidentified as fr:  héroes balzaqui
 
es misidentified as fr: ue me marchara d
 
es misidentified as fr: va de génova voy
 
es misidentified as en: ol madrid me int
 
fr misidentified as es: le regar
 
fr misidentified as en:  s inter
 
fr misidentified as it: de costa
 
fr misidentified as es: garde co
 
fr misidentified as es: recueill
 
fr misidentified as es: ale quel
 
fr misidentified as en: les offi
 
fr misidentified as it: e compar
 
fr misidentified as de: öhrenbac
 
fr misidentified as it:  promena
 
fr misidentified as es:  horizon
 
it misidentified as es: nica per
 
it misidentified as es: lla stan
 
it misidentified as es: uesto chablis ti
 
it misidentified as fr: onna cla
 
it misidentified as fr: va vent 
 
it misidentified as fr: n si des
 
it misidentified as fr: endere le import
 
it misidentified as es: el tempo
 
it misidentified as es: a lo vid
 
Accuracy: 95.0714%, (1331/1400 tests during crossvalidation)
 
```
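The per-length success rates quoted above were counted from the log itself. The following throwaway sketch (not part of the library; it assumes the log above is saved to a hypothetical `crossvalidation.log`, and that every length gets 350 tests in total, i.e. 7 languages × 5 models × 10 tests) reproduces that tally:

```python
import re
from collections import Counter

# count misidentifications by the length of the printed test snippet (8, 16, 32 or 64 chars)
errors = Counter()
with open("crossvalidation.log") as f:
    for line in f:
        m = re.match(r"\w+ misidentified as \w+: (.*)$", line.rstrip("\n"))
        if m:
            errors[len(m.group(1))] += 1

for length in (8, 16, 32, 64):
    correct = 350 - errors[length]
    print(f"{length:>2} chars: {correct}/350 correct ({correct / 350:.0%})")
```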
 

	
 
Licensed under GNU GPLv3.
setup.cfg
 
[metadata]
 
name = languedoc
 
version = 1.0
 
version = 1.1
 
license = GPLv3
 
description = A simple language identification library.
 

	
src/languedoc/predict.py
 
@@ -6,7 +6,10 @@ import gzip
 
from typing import Union
 

	
 
TOP_NGRAM_COUNT = 3000
 
SINGLE_SEGMENT_LENGTH = 1000
 
SINGLE_SAMPLE_LENGTH = 32
 
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
 
MODEL = []
 

	
 

	
 
def preprocess(text: str) -> str:
 
@@ -15,6 +18,25 @@ def preprocess(text: str) -> str:
 
	return text.lower()
 

	
 

	
 
def sample_text(text: str, segment_length: int):
 
	"""Extract a reasonably and uniformly long sample from a long text.
 

	
 
	:param text: the input text
 
	:param segment_length: a text segment length. The sample is going to be 3-4 times longer.
 
	:return: a text sample cut from the original text, consisting of three segments
 
	"""
 
	n = len(text)
 
	# a text too short to sample
 
	if n < 4*segment_length:
 
		return text
 
	# take a segment from the 1st, 2nd and 3rd quarter of the text, to get a representative sample
 
	else:
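		# f(i): search position for the i-th segment, half a segment before the i/4 point of the text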
 
		f = lambda i: n*i//4 - segment_length//2
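		# match a whitespace, then capture segment_length characters extended lazily to the next whitespace,
		# so each segment starts and ends on word boundaries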
 
		regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s")
 
		matches = [regexp.search(text, f(i)) for i in range(1, 4)]
 
		return " ".join(m.group(1) for m in matches if m)
 

	
 

	
 
def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
 
	"""Extract k-gram counts from the text for a provided k.
 

	
 
@@ -75,11 +97,14 @@ class Sample:
 

	
 
	@classmethod
 
	def extract(cls, text: str, language="??") -> "Sample":
 
		"""Create a new Sample by extracting it from text.
 
		"""Create a new Sample by extracting it from the text.
 

	
 
	:param text: a string from which to extract the n-grams into a Sample
 
		:param language: a two letter language code if it is known (cs|de|en|...)"""
 
		return cls(language, extract_ranked_ngrams(preprocess(text)))
 
		preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH))
 
		sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH)
 

	
 
		return cls(language, extract_ranked_ngrams(sample))
 

	
 
	@classmethod
 
	def load(cls, exported: dict) -> "Sample":
 
@@ -103,7 +128,7 @@ class Sample:
 

	
 
		The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample).
 

	
 
		:param other: a reference model in known language"""
 
		:param other: a reference model in a known language"""
 
		m = len(other.ranked_ngrams)
 

	
 
		res = sum(
 
@@ -126,8 +151,11 @@ def identify(text: str, models=[]) -> st
 
	:param text: the text to identify
 
	:param models: list of models to choose from. The default is loaded from MODEL_PATH
 
	:return: best matching language (cs, de, en, ...)"""
 
	global MODEL
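	# load the packaged models once and cache them in the module-level MODEL for later calls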
 
	if not MODEL and not models:
 
		MODEL = load_models(MODEL_PATH)
 
	if not models:
 
		models = load_models(MODEL_PATH)
 
		models = MODEL
 

	
 
	sample = Sample.extract(text)
 

	
src/languedoc/train.py
 
@@ -72,6 +72,7 @@ def cross_validate(sample_sets: list[Sam
 
	score = 0
 
	max_score = 0
 

	
 
	print("# Crossvalidation:")
 
	for s in sample_sets:
 
		for (test_text, partial_model) in s.generate_tests(CROSSVALIDATION_SOURCE_COUNT):
 
			real_lang = partial_model.language
 
@@ -85,7 +86,7 @@ def cross_validate(sample_sets: list[Sam
 
					if predicted_lang == real_lang:
 
						score += 1
 
					else:
 
						print(real_lang, predicted_lang, t)
 
						print(f"{real_lang} misidentified as {predicted_lang}: {t}")
 
					max_score += 1
 

	
 
	return score/max_score, score, max_score
 
@@ -100,6 +101,7 @@ def train(data_dir: str, model_path: str
 
	samples = []
 
	lang_dirs = sorted([x.path for x in os.scandir(data_dir)])
 

	
 
	print("# Source texts:")
 
	for d in lang_dirs:
 
		lang = os.path.basename(d)
 
		lang_samples = SampleSet(lang)
 
@@ -109,7 +111,7 @@ def train(data_dir: str, model_path: str
 
			with open(file) as f:
 
				text = f.read()
 
				text = preprocess(text)
 
				print(f"{lang}: {file.name} ({len(text)})")
 
				print(f"{lang}: {file.name} ({len(text)} chars)")
 

	
 
				lang_samples.add(text)
 

	
 
@@ -117,7 +119,9 @@ def train(data_dir: str, model_path: str
 
		s = json.dumps([sample_set.create_model().export() for sample_set in samples], ensure_ascii=False, sort_keys=True)
 
		f.write(s.encode("utf-8"))
 

	
 
	print(cross_validate(samples))
 
	print()
 
	(acc, success, count) = cross_validate(samples)
 
	print(f"Accuracy: {acc*100:.4f}%, ({success}/{count} tests during crossvalidation)")
 

	
 

	
 
DATA_DIR = os.path.join(os.path.dirname(__file__), "../../data")
tests/test_predict.py
 
import unittest
 
from unittest import TestCase
 

	
 
from languedoc.predict import preprocess, extract_kgram_counts, extract_ngram_counts, rank_ngram_counts, \
 
	extract_ranked_ngrams, Sample, identify
 
	extract_ranked_ngrams, Sample, identify, sample_text
 

	
 

	
 
class TestPredict(TestCase):
 
@@ -11,6 +12,10 @@ class TestPredict(TestCase):
 
		self.assertEqual(preprocess("1% "), " ")
 
		self.assertEqual(preprocess("Глава ĚŠČŘŽ"), " глава ěščřž ")
 

	
 
	@unittest.skip
 
	def test_sample_text(self):
 
		pass
 

	
 
	def test_extract_kgram_counts(self):
 
		text = "abbbabb"
 
		self.assertEqual(extract_kgram_counts(text, 1), {"a": 2, "b": 5})
tests/test_train.py
 
new file 100644
 
import unittest
 
from unittest import TestCase
 

	
 
from languedoc.train import merge_ngram_freqs, cross_validate, SampleSet
 

	
 

	
 
class TestTrain(TestCase):
 
	def test_merge_ngram_freqs(self):
 
		a = {"a": 3, "b": 1, "c": 4}
 
		b = {"b": 1, "c": 5, "d": 9}
 
		c = merge_ngram_freqs([a, b])
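		# each source dict is first normalized to relative frequencies (totals 8 and 15), then the two are averaged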
 
		self.assertEqual(c, {"a": 3/8/2, "b": (1/8+1/15)/2, "c": (4/8+5/15)/2, "d": 9/15/2})
 
		self.assertEqual(sum(c.values()), 1)
 

	
 
	@unittest.skip
 
	def test_crossvalidate(self):
 
		pass
 

	
 

	
 
class TestSampleSet(TestCase):
 
	def test_add(self):
 
		sample_set = SampleSet("xy")
 
		self.assertEqual(sample_set.texts, [])
 
		self.assertEqual(sample_set.counts, [])
 

	
 
		sample_set.add("aaab")
 
		self.assertEqual(sample_set.texts, ["aaab"])
 
		self.assertEqual(sample_set.counts, [{"a": 3, "b": 1, "aa": 2, "ab": 1, "aaa": 1, "aab": 1}])
 

	
 
	def test_create_model(self):
 
		sample_set = SampleSet("xy")
 
		sample_set.add("aaab")
 
		sample_set.add("aab")
 

	
 
		res = sample_set.create_model()
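		# the model ranks the merged n-grams by frequency, the most frequent getting rank 0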
 

	
 
		self.assertEqual(res.language, "xy")
 
		self.assertEqual(res.ranked_ngrams, {"a": 0, "aa": 1, "b": 2, "ab": 3, "aab": 4, "aaa": 5})
 

	
 
	@unittest.skip
 
	def test_generate_tests(self):
 
		pass