Changeset - 8e4769dd4ca6
[Not reviewed]
default
0 2 0
Laman - 2 years ago 2022-10-20 23:16:46

tests for ngram extraction
2 files changed with 49 insertions and 6 deletions:
0 comments (0 inline, 0 general)
src/languedoc/predict.py
Show inline comments
 
@@ -14,7 +14,12 @@ def preprocess(text: str) -> str:
 
	return text.lower()
 

	
 

	
 
def extract_kgram_counts(text, k):
 
def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
 
	"""Extract k-gram counts from the text for a provided k.
 

	
 
	:param text: the source text
 
	:param k: length of the kgrams to extract. 1 for letters, 2 for bigrams, ...
 
	:return: a dict mapping kgrams to their counts in the text"""
 
	n = len(text)
 
	counts = dict()
 

	
 
@@ -28,7 +33,11 @@ def extract_kgram_counts(text, k):
 
	return counts
 

	
 

	
 
def extract_ngram_counts(text):
 
def extract_ngram_counts(text: str) -> dict[str, int]:
 
	"""Extract counts of 1- to 3-grams from the text.
 

	
 
	:param text: the source text
 
	:return: a dict mapping ngrams to their counts in the text"""
 
	counts = dict()
 

	
 
	for k in range(1, 4):
 
@@ -37,12 +46,20 @@ def extract_ngram_counts(text):
 
	return counts
 

	
 

	
 
def rank_ngram_counts(counts: dict[str, int]) -> dict[str, int]:
	"""Order supplied ngrams by their counts (then length, then alphabetically) and return their ranking.

	Only the first TOP_NGRAM_COUNT ngrams of that ordering are kept.

	:param counts: a dict mapping ngrams to their counts
	:return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""
	# The negated count puts the most frequent ngram first; ties are broken
	# by the shorter ngram and then by plain string comparison.
	ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT]
	return {ngram: rank for (rank, (ngram, _count)) in enumerate(ordered_ngrams)}
 

	
 

	
 
def extract_ranked_ngrams(text: str) -> dict[str, int]:
	"""Extract ngrams from the text and rank them from the most common.

	:param text: the source text
	:return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
	# Count first, then turn the counts into ranks.
	counts = extract_ngram_counts(text)
	return rank_ngram_counts(counts)
 

	
tests/test_predict.py
Show inline comments
 
from unittest import TestCase
 

	
 
from languedoc.predict import preprocess, rank_ngram_counts, Sample, identify
 
from languedoc.predict import preprocess, extract_kgram_counts, extract_ngram_counts, rank_ngram_counts, \
 
	extract_ranked_ngrams, Sample, identify
 

	
 

	
 
class TestPredict(TestCase):
 
@@ -10,11 +11,36 @@ class TestPredict(TestCase):
 
		self.assertEqual(preprocess("1% "), " ")
 
		self.assertEqual(preprocess("Глава ĚŠČŘŽ"), " глава ěščřž ")
 

	
 
	def test_extract_kgram_counts(self):
		"""Check single-letter and bigram counts on a short sample."""
		sample = "abbbabb"
		expected_letters = {"a": 2, "b": 5}
		expected_bigrams = {"ab": 2, "bb": 3, "ba": 1}

		self.assertEqual(extract_kgram_counts(sample, 1), expected_letters)
		self.assertEqual(extract_kgram_counts(sample, 2), expected_bigrams)
 

	
 
	def test_extract_ngram_counts(self):
		"""Check the combined 1- to 3-gram counts on two short samples."""
		short_expected = {"a": 2, "b": 1, "aa": 1, "ab": 1, "aab": 1}
		self.assertEqual(extract_ngram_counts("aab"), short_expected)

		longer_expected = {
			"a": 2, "b": 5,
			"ab": 2, "bb": 3, "ba": 1,
			"abb": 2, "bbb": 1, "bba": 1, "bab": 1,
		}
		self.assertEqual(extract_ngram_counts("abbbabb"), longer_expected)
 

	
 
	def test_rank_ngram_counts(self):
		"""Check that ranks follow count, then length, then alphabetical order."""
		counts = {"a": 3, "aa": 1, "b": 4, "bb": 1, "c": 1}
		ranks = rank_ngram_counts(counts)
		self.assertEqual(ranks, {"b": 0, "a": 1, "c": 2, "aa": 3, "bb": 4})
 

	
 
	def test_extract_ranked_ngrams(self):
		"""Check the end-to-end ranking of ngrams extracted from two short samples."""
		short_expected = {"a": 0, "b": 1, "aa": 2, "ab": 3, "aab": 4}
		self.assertEqual(extract_ranked_ngrams("aab"), short_expected)

		longer_expected = {
			"b": 0, "bb": 1, "a": 2, "ab": 3, "abb": 4,
			"ba": 5, "bab": 6, "bba": 7, "bbb": 8,
		}
		self.assertEqual(extract_ranked_ngrams("abbbabb"), longer_expected)
 

	
 

	
 
class TestSample(TestCase):
 
	def test_extract(self):
0 comments (0 inline, 0 general)