Changeset - f82e9a5b1c2c
[Not reviewed]
default
0 1 1
Laman - 7 months ago 2024-09-30 22:17:00

added more tests
2 files changed with 48 insertions and 1 deletion:
0 comments (0 inline, 0 general)
tests/test_predict.py
Show inline comments
 
import unittest
 
from unittest import TestCase
 

	
 
from languedoc.predict import preprocess, extract_kgram_counts, extract_ngram_counts, rank_ngram_counts, \
 
	extract_ranked_ngrams, Sample, identify
 
	extract_ranked_ngrams, Sample, identify, sample_text
 

	
 

	
 
class TestPredict(TestCase):
	"""Unit tests for the preprocessing and n-gram helper functions imported from languedoc.predict."""

	def test_preprocess(self):
		"""preprocess() is expected to lowercase the text, pad it with single spaces
		and reduce digits, punctuation and repeated whitespace to one space."""
		self.assertEqual(preprocess("abc"), " abc ")
		self.assertEqual(preprocess("A  b.c"), " a b c ")
		self.assertEqual(preprocess("1% "), " ")
		# non-ASCII letters (Cyrillic, Czech diacritics) are kept and lowercased
		self.assertEqual(preprocess("Глава ĚŠČŘŽ"), " глава ěščřž ")

	# An explicit reason keeps the skip report informative; a bare @unittest.skip records an empty reason.
	@unittest.skip("test for sample_text is not written yet")
	def test_sample_text(self):
		"""Placeholder for a future sample_text() test."""
		pass

	def test_extract_kgram_counts(self):
		"""extract_kgram_counts(text, k) is expected to count every substring of length k."""
		text = "abbbabb"
		self.assertEqual(extract_kgram_counts(text, 1), {"a": 2, "b": 5})
		self.assertEqual(extract_kgram_counts(text, 2), {"ab": 2, "bb": 3, "ba": 1})

	def test_extract_ngram_counts(self):
		"""extract_ngram_counts(text) is expected to count all n-grams of lengths 1 to 3."""
		text = "aab"
		self.assertEqual(extract_ngram_counts(text), {"a": 2, "b": 1, "aa": 1, "ab": 1, "aab": 1})

		# longer input: no 4-grams or longer appear in the expected counts
		text = "abbbabb"
		self.assertEqual(
			extract_ngram_counts(text),
			{"a": 2, "b": 5, "ab": 2, "bb": 3, "ba": 1, "abb": 2, "bbb": 1, "bba": 1, "bab": 1}
		)

	def test_rank_ngram_counts(self):
		"""rank_ngram_counts() is expected to map n-grams to ranks, the highest count getting rank 0.

		In this fixture the n-grams tied at count 1 are expected in the order c, aa, bb.
		"""
		freqs = {"a": 3, "aa": 1, "b": 4, "bb": 1, "c": 1}
		expected = {"b": 0, "a": 1, "c": 2, "aa": 3, "bb": 4}
		self.assertEqual(rank_ngram_counts(freqs), expected)

	def test_extract_ranked_ngrams(self):
		"""extract_ranked_ngrams(text) is expected to return the n-grams of the text mapped to their ranks."""
		text = "aab"
		self.assertEqual(extract_ranked_ngrams(text), {"a": 0, "b": 1, "aa": 2, "ab": 3, "aab": 4})

		text = "abbbabb"
		self.assertEqual(
			extract_ranked_ngrams(text),
			{"b": 0, "bb": 1, "a": 2, "ab": 3, "abb": 4, "ba": 5, "bab": 6, "bba": 7, "bbb": 8}
		)
 

	
 

	
 
class TestSample(TestCase):
	"""Tests of the Sample class: extraction from text, load, export and comparison."""

	def test_extract(self):
		"""Sample.extract() should set the language and the ranked n-grams of the given text."""
		single_word = Sample.extract("aaaaaa", "a")
		self.assertEqual(single_word.language, "a")
		self.assertEqual(
			single_word.ranked_ngrams,
			{"a": 0, "aa": 1, "aaa": 2, " a": 3, "a ": 4, " aa": 5, "aa ": 6}
		)

		# several short words: the n-grams touching a space rank higher here
		three_words = Sample.extract("aa aa aa", "b")
		self.assertEqual(
			three_words.ranked_ngrams,
			{"a": 0, " a": 1, "a ": 2, "aa": 3, " aa": 4, "aa ": 5, "a a": 6}
		)

		# without an explicit language the sample is expected to be labelled "??"
		unlabelled = Sample.extract("aa")
		self.assertEqual(unlabelled.language, "??")
		self.assertEqual(
			unlabelled.ranked_ngrams,
			{"a": 0, " a": 1, "a ": 2, "aa": 3, " aa": 4, "aa ": 5}
		)

	def test_load(self):
		"""Sample.load() should turn an exported n-gram list into ranks following the list order."""
		loaded = Sample.load({"language": "en", "ngrams": list("abcdefgh")})
		self.assertEqual(loaded.language, "en")
		# "h" -> 7 down to "a" -> 0
		self.assertEqual(loaded.ranked_ngrams, dict(zip("hgfedcba", range(7, -1, -1))))

	def test_export(self):
		"""Sample.export() should list the n-grams by ascending rank next to the language."""
		# "h" -> 7 down to "a" -> 0, deliberately inserted in descending rank order
		ranks = dict(zip("hgfedcba", range(7, -1, -1)))
		sample = Sample("en", ranks)
		self.assertEqual(sample.export(), {"language": "en", "ngrams": list("abcdefgh")})

	def test_compare(self):
		"""Sample.compare() is expected to give different results depending on the direction."""
		first = Sample("ab", {"a": 0, "b": 1, "ab": 2})
		second = Sample("bc", {"b": 0, "ba": 1, "a": 2, "ab": 3})

		self.assertEqual(first.compare(second), 4)
		self.assertEqual(second.compare(first), 7)
 

	
 

	
 
class TestIdentify(TestCase):
	"""End-to-end test of identify() on one short sentence per language."""

	def test_identify(self):
		"""identify() should return the expected language code for every sample sentence."""
		samples = [
			("cs", "Severní ledový oceán je nejmenší světový oceán."),
			("de", "Der Arktische Ozean ist der kleinste Ozean der Erde."),
			("en", "The Arctic Ocean is the smallest of the world's oceans."),
			("es", "Océano Ártico más pequeña y más septentrional del planeta"),
			("fr", "L'océan Arctique ce qui en fait le plus petit des océans."),
			("it", "Il Mar Glaciale Artico è una massa d'acqua..."),
			("ru", "Се́верный Ледови́тый океа́н — наименьший по площади океан Земли")
		]

		for (lang, sample) in samples:
			# subTest labels each language and lets the remaining ones run after a failure
			with self.subTest(lang=lang):
				self.assertEqual(lang, identify(sample))
tests/test_train.py
Show inline comments
 
new file 100644
 
import unittest
 
from unittest import TestCase
 

	
 
from languedoc.train import merge_ngram_freqs, cross_validate, SampleSet
 

	
 

	
 
class TestTrain(TestCase):
	"""Unit tests for the module-level functions imported from languedoc.train."""

	def test_merge_ngram_freqs(self):
		"""merge_ngram_freqs() is expected to normalize each count dict by its own total
		(8 and 15 here) and average the resulting relative frequencies over the dicts."""
		a = {"a": 3, "b": 1, "c": 4}
		b = {"b": 1, "c": 5, "d": 9}
		c = merge_ngram_freqs([a, b])
		expected = {"a": 3/8/2, "b": (1/8+1/15)/2, "c": (4/8+5/15)/2, "d": 9/15/2}

		# The values are floats: check the key set exactly, but the values only approximately,
		# so the test does not depend on the exact order of floating point operations.
		self.assertEqual(set(c), set(expected))
		for (ngram, freq) in expected.items():
			self.assertAlmostEqual(c[ngram], freq, msg=ngram)
		# the merged frequencies should still sum up to one (within rounding error)
		self.assertAlmostEqual(sum(c.values()), 1)

	# An explicit reason keeps the skip report informative; a bare @unittest.skip records an empty reason.
	@unittest.skip("test for cross_validate is not written yet")
	def test_crossvalidate(self):
		"""Placeholder for a future cross_validate() test."""
		pass
 

	
 

	
 
class TestSampleSet(TestCase):
	"""Unit tests for the SampleSet class imported from languedoc.train."""

	def test_add(self):
		"""SampleSet.add() should store the text together with its n-gram counts."""
		sample_set = SampleSet("xy")
		# a fresh set starts empty
		self.assertEqual(sample_set.texts, [])
		self.assertEqual(sample_set.counts, [])

		sample_set.add("aaab")
		self.assertEqual(sample_set.texts, ["aaab"])
		# the expected counts hold 1- to 3-grams only
		self.assertEqual(sample_set.counts, [{"a": 3, "b": 1, "aa": 2, "ab": 1, "aaa": 1, "aab": 1}])

	def test_create_model(self):
		"""SampleSet.create_model() should return a model with the set's language
		and n-gram ranks derived from all the added texts."""
		sample_set = SampleSet("xy")
		sample_set.add("aaab")
		sample_set.add("aab")

		res = sample_set.create_model()

		self.assertEqual(res.language, "xy")
		self.assertEqual(res.ranked_ngrams, {"a": 0, "aa": 1, "b": 2, "ab": 3, "aab": 4, "aaa": 5})

	# An explicit reason keeps the skip report informative; a bare @unittest.skip records an empty reason.
	@unittest.skip("test for generate_tests is not written yet")
	def test_generate_tests(self):
		"""Placeholder for a future generate_tests() test."""
		pass
0 comments (0 inline, 0 general)