# HG changeset patch
# User Laman
# Date 2022-10-20 23:16:46
# Node ID 8e4769dd4ca6e0f8118248665fedc883dfa34ff0
# Parent dbaf68186bdfd86e8f80ea725e3f6642b61f0dba
tests for ngram extraction

diff --git a/src/languedoc/predict.py b/src/languedoc/predict.py
--- a/src/languedoc/predict.py
+++ b/src/languedoc/predict.py
@@ -14,7 +14,12 @@ def preprocess(text: str) -> str:
     return text.lower()
 
 
-def extract_kgram_counts(text, k):
+def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
+    """Extract k-gram counts from the text for a provided k.
+
+    :param text: the source text
+    :param k: length of the kgrams to extract. 1 for letters, 2 for bigrams, ...
+    :return: a dict mapping kgrams to their counts in the text"""
     n = len(text)
     counts = dict()
 
@@ -28,7 +33,11 @@ def extract_kgram_counts(text, k):
     return counts
 
 
-def extract_ngram_counts(text):
+def extract_ngram_counts(text: str) -> dict[str, int]:
+    """Extract counts of 1- to 3-grams from the text.
+
+    :param text: the source text
+    :return: a dict mapping ngrams to their counts in the text"""
     counts = dict()
 
     for k in range(1, 4):
@@ -37,12 +46,20 @@ def extract_ngram_counts(text):
     return counts
 
 
-def rank_ngram_counts(counts):
+def rank_ngram_counts(counts: dict[str, int]) -> dict[str, int]:
+    """Order supplied ngrams by their counts (then length, then alphabetically) and return their ranking.
+
+    :param counts: a dict mapping ngrams to their counts
+    :return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""
     ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT]
-    return dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0)))
+    return dict(zip([key for (key, count) in ordered_ngrams], itertools.count(0)))
 
 
-def extract_ranked_ngrams(text):
+def extract_ranked_ngrams(text: str) -> dict[str, int]:
+    """Extract ngrams from the text and rank them from the most common.
+
+    :param text: the source text
+    :return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
     counts = extract_ngram_counts(text)
     return rank_ngram_counts(counts)
 
diff --git a/tests/test_predict.py b/tests/test_predict.py
--- a/tests/test_predict.py
+++ b/tests/test_predict.py
@@ -1,6 +1,7 @@
 from unittest import TestCase
 
-from languedoc.predict import preprocess, rank_ngram_counts, Sample, identify
+from languedoc.predict import preprocess, extract_kgram_counts, extract_ngram_counts, rank_ngram_counts, \
+    extract_ranked_ngrams, Sample, identify
 
 
 class TestPredict(TestCase):
@@ -10,11 +11,36 @@ class TestPredict(TestCase):
         self.assertEqual(preprocess("1% "), " ")
         self.assertEqual(preprocess("Глава ĚŠČŘŽ"), " глава ěščřž ")
 
+    def test_extract_kgram_counts(self):
+        text = "abbbabb"
+        self.assertEqual(extract_kgram_counts(text, 1), {"a": 2, "b": 5})
+        self.assertEqual(extract_kgram_counts(text, 2), {"ab": 2, "bb": 3, "ba": 1})
+
+    def test_extract_ngram_counts(self):
+        text = "aab"
+        self.assertEqual(extract_ngram_counts(text), {"a": 2, "b": 1, "aa": 1, "ab": 1, "aab": 1})
+
+        text = "abbbabb"
+        self.assertEqual(
+            extract_ngram_counts(text),
+            {"a": 2, "b": 5, "ab": 2, "bb": 3, "ba": 1, "abb": 2, "bbb": 1, "bba": 1, "bab": 1}
+        )
+
     def test_rank_ngram_counts(self):
         freqs = {"a": 3, "aa": 1, "b": 4, "bb": 1, "c": 1}
         expected = {"b": 0, "a": 1, "c": 2, "aa": 3, "bb": 4}
         self.assertEqual(rank_ngram_counts(freqs), expected)
 
+    def test_extract_ranked_ngrams(self):
+        text = "aab"
+        self.assertEqual(extract_ranked_ngrams(text), {"a": 0, "b": 1, "aa": 2, "ab": 3, "aab": 4})
+
+        text = "abbbabb"
+        self.assertEqual(
+            extract_ranked_ngrams(text),
+            {"b": 0, "bb": 1, "a": 2, "ab": 3, "abb": 4, "ba": 5, "bab": 6, "bba": 7, "bbb": 8}
+        )
+
 
 class TestSample(TestCase):
     def test_extract(self):