"""Unit tests for ``languedoc.predict`` (``tests/test_predict.py``).

Exercises the text preprocessing helper, n-gram frequency ranking,
``Sample.extract`` and the top-level ``identify`` entry point.

NOTE(review): this module was recovered from a whitespace-collapsed diff.
String literals are reproduced exactly as they appeared there; any literal
that originally contained runs of consecutive spaces should be confirmed
against the upstream file.
"""

from unittest import TestCase

from languedoc.predict import preprocess, rank_ngram_freqs, Sample, identify


class TestPredict(TestCase):
    """Tests for the module-level helpers of ``languedoc.predict``."""

    def test_preprocess(self):
        """``preprocess`` returns lower-cased, space-delimited text."""
        self.assertEqual(preprocess("abc"), " abc ")
        # Upper case is folded and the "." between letters becomes a space.
        self.assertEqual(preprocess("A b.c"), " a b c ")
        # Input made only of a digit, "%" and a space yields a blank result.
        # NOTE(review): confirm the exact number of spaces expected here.
        self.assertEqual(preprocess("1% "), " ")
        # Non-ASCII letters (Cyrillic, Czech diacritics) are kept and lower-cased.
        self.assertEqual(preprocess("Глава ĚŠČŘŽ"), " глава ěščřž ")

    def test_rank_ngram_freqs(self):
        """``rank_ngram_freqs`` maps each n-gram to its zero-based rank.

        The expected ranking puts the highest counts first; among the
        n-grams that share a count of 1 the expected order is
        ``c``, ``aa``, ``bb``.
        """
        freqs = {"a": 3, "aa": 1, "b": 4, "bb": 1, "c": 1}
        expected = {"b": 0, "a": 1, "c": 2, "aa": 3, "bb": 4}
        self.assertEqual(rank_ngram_freqs(freqs), expected)


class TestSample(TestCase):
    """Tests for ``Sample.extract``."""

    def test_extract(self):
        """``Sample.extract`` records the language tag and ranked n-grams."""
        a = Sample.extract("aaaaaa", "a")
        self.assertEqual(a.language, "a")
        self.assertEqual(
            a.ranked_ngrams,
            {"a": 0, "aa": 1, "aaa": 2, " aa": 3, "aa ": 4, " a": 5, "a ": 6},
        )

        b = Sample.extract("aa aa aa", "b")
        self.assertEqual(
            b.ranked_ngrams,
            {"a": 0, " aa": 1, "aa ": 2, " a": 3, "a ": 4, "aa": 5, "a a": 6},
        )

        # Without an explicit language the sample is tagged "??".
        c = Sample.extract("aa")
        self.assertEqual(c.language, "??")
        self.assertEqual(
            c.ranked_ngrams,
            {"a": 0, " aa": 1, "aa ": 2, " a": 3, "a ": 4, "aa": 5},
        )


class TestIdentify(TestCase):
    """Tests for the ``identify`` entry point."""

    def test_identify(self):
        """``identify`` returns the expected language code for each sentence."""
        samples = [
            ("cs", "Severní ledový oceán je nejmenší světový oceán."),
            ("de", "Der Arktische Ozean ist der kleinste Ozean der Erde."),
            ("en", "The Arctic Ocean is the smallest of the world's oceans."),
            ("es", "Océano Ártico más pequeña y más septentrional del planeta"),
            ("fr", "L'océan Arctique ce qui en fait le plus petit des océans."),
            ("it", "Il Mar Glaciale Artico è una massa d'acqua..."),
            ("ru", "Се́верный Ледови́тый океа́н — наименьший по площади океан Земли"),
        ]

        for lang, text in samples:
            # subTest keeps the loop running after a failure and labels each
            # failure with the language that was misidentified.
            with self.subTest(lang=lang):
                self.assertEqual(lang, identify(text))