Files
@ f82e9a5b1c2c
Branch filter:
Location: Languedoc/tests/test_predict.py - annotation
f82e9a5b1c2c
3.3 KiB
text/x-python
added more tests
f82e9a5b1c2c 781dc476bf41 781dc476bf41 8e4769dd4ca6 f82e9a5b1c2c 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 f82e9a5b1c2c f82e9a5b1c2c f82e9a5b1c2c f82e9a5b1c2c 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 dbaf68186bdf 781dc476bf41 781dc476bf41 dbaf68186bdf 781dc476bf41 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 dbaf68186bdf 781dc476bf41 781dc476bf41 dbaf68186bdf 781dc476bf41 781dc476bf41 781dc476bf41 dbaf68186bdf 781dc476bf41 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 d76af898e537 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 | import unittest
from unittest import TestCase
from languedoc.predict import preprocess, extract_kgram_counts, extract_ngram_counts, rank_ngram_counts, \
extract_ranked_ngrams, Sample, identify, sample_text
class TestPredict(TestCase):
    """Unit tests for the preprocessing and n-gram helper functions of languedoc.predict."""

    def test_preprocess(self):
        """Check the normalised form that preprocess produces for a few inputs.

        The expectations pin that the result is lower-cased and wrapped in
        single spaces, that "." is turned into a space, and that an input
        made only of digits/symbols/whitespace collapses to a single space.
        """
        self.assertEqual(preprocess("abc"), " abc ")
        self.assertEqual(preprocess("A b.c"), " a b c ")
        self.assertEqual(preprocess("1% "), " ")
        # Non-ASCII letters (Cyrillic, Czech diacritics) must be kept and lower-cased.
        self.assertEqual(preprocess("Глава ĚŠČŘŽ"), " глава ěščřž ")

    # unittest.skip is documented as skip(reason); give an explicit reason
    # instead of relying on the bare-decorator form.
    @unittest.skip("no test written for sample_text yet")
    def test_sample_text(self):
        """Placeholder for a future sample_text test."""
        pass

    def test_extract_kgram_counts(self):
        """extract_kgram_counts(text, k) maps each substring of length k to its count."""
        text = "abbbabb"
        self.assertEqual(extract_kgram_counts(text, 1), {"a": 2, "b": 5})
        self.assertEqual(extract_kgram_counts(text, 2), {"ab": 2, "bb": 3, "ba": 1})

    def test_extract_ngram_counts(self):
        """extract_ngram_counts merges the counts of all 1-, 2- and 3-grams of the text."""
        text = "aab"
        self.assertEqual(
            extract_ngram_counts(text),
            {"a": 2, "b": 1, "aa": 1, "ab": 1, "aab": 1},
        )

        text = "abbbabb"
        self.assertEqual(
            extract_ngram_counts(text),
            {"a": 2, "b": 5, "ab": 2, "bb": 3, "ba": 1, "abb": 2, "bbb": 1, "bba": 1, "bab": 1},
        )

    def test_rank_ngram_counts(self):
        """rank_ngram_counts turns counts into 0-based ranks, most frequent first.

        NOTE(review): the expected order of the count-1 entries ("c", "aa",
        "bb") suggests ties are broken by length and then alphabetically --
        confirm against the implementation.
        """
        freqs = {"a": 3, "aa": 1, "b": 4, "bb": 1, "c": 1}
        expected = {"b": 0, "a": 1, "c": 2, "aa": 3, "bb": 4}
        self.assertEqual(rank_ngram_counts(freqs), expected)

    def test_extract_ranked_ngrams(self):
        """extract_ranked_ngrams goes straight from text to an n-gram -> rank mapping."""
        text = "aab"
        self.assertEqual(
            extract_ranked_ngrams(text),
            {"a": 0, "b": 1, "aa": 2, "ab": 3, "aab": 4},
        )

        text = "abbbabb"
        self.assertEqual(
            extract_ranked_ngrams(text),
            {"b": 0, "bb": 1, "a": 2, "ab": 3, "abb": 4, "ba": 5, "bab": 6, "bba": 7, "bbb": 8},
        )
class TestSample(TestCase):
    """Tests for building, loading, exporting and comparing Sample objects."""

    def test_extract(self):
        """Sample.extract builds ranked n-grams from raw text.

        The third case omits the language argument and expects the
        language attribute to be "??".
        """
        repeated = Sample.extract("aaaaaa", "a")
        self.assertEqual(repeated.language, "a")
        expected_repeated = {'a': 0, 'aa': 1, 'aaa': 2, ' a': 3, 'a ': 4, ' aa': 5, 'aa ': 6}
        self.assertEqual(repeated.ranked_ngrams, expected_repeated)

        spaced = Sample.extract("aa aa aa", "b")
        expected_spaced = {'a': 0, ' a': 1, 'a ': 2, 'aa': 3, ' aa': 4, 'aa ': 5, 'a a': 6}
        self.assertEqual(spaced.ranked_ngrams, expected_spaced)

        unlabeled = Sample.extract("aa")
        self.assertEqual(unlabeled.language, "??")
        expected_unlabeled = {'a': 0, ' a': 1, 'a ': 2, 'aa': 3, ' aa': 4, 'aa ': 5}
        self.assertEqual(unlabeled.ranked_ngrams, expected_unlabeled)

    def test_load(self):
        """Sample.load turns an exported dict into a Sample; list position becomes the rank."""
        payload = {"language": "en", "ngrams": list("abcdefgh")}
        loaded = Sample.load(payload)
        self.assertEqual(loaded.language, "en")
        # "h" -> 7, "g" -> 6, ..., "a" -> 0
        expected_ranks = dict(zip("hgfedcba", range(7, -1, -1)))
        self.assertEqual(loaded.ranked_ngrams, expected_ranks)

    def test_export(self):
        """Sample.export lists the n-grams by ascending rank, whatever the dict order."""
        # Built in reverse rank order on purpose: "h" -> 7 first, "a" -> 0 last.
        ranks = dict(zip("hgfedcba", range(7, -1, -1)))
        sample = Sample("en", ranks)
        self.assertEqual(sample.export(), {"language": "en", "ngrams": list("abcdefgh")})

    def test_compare(self):
        """Sample.compare is direction-dependent: the two orders give 4 and 7 here."""
        left = Sample("ab", {"a": 0, "b": 1, "ab": 2})
        right = Sample("bc", {"b": 0, "ba": 1, "a": 2, "ab": 3})
        self.assertEqual(left.compare(right), 4)
        self.assertEqual(right.compare(left), 7)
class TestIdentify(TestCase):
    """End-to-end test of identify() using one short sentence per language."""

    def test_identify(self):
        """identify returns the expected language code for every sample sentence."""
        samples = [
            ("cs", "Severní ledový oceán je nejmenší světový oceán."),
            ("de", "Der Arktische Ozean ist der kleinste Ozean der Erde."),
            ("en", "The Arctic Ocean is the smallest of the world's oceans."),
            ("es", "Océano Ártico más pequeña y más septentrional del planeta"),
            ("fr", "L'océan Arctique ce qui en fait le plus petit des océans."),
            ("it", "Il Mar Glaciale Artico è una massa d'acqua..."),
            ("ru", "Се́верный Ледови́тый океа́н — наименьший по площади океан Земли"),
        ]
        for lang, text in samples:
            # subTest keeps going after a failure, so every misidentified
            # language is reported, labelled with its expected code.
            with self.subTest(lang=lang):
                self.assertEqual(lang, identify(text))
|