Changeset - 8e4769dd4ca6
[Not reviewed]
default
0 2 0
Laman - 2 years ago 2022-10-20 23:16:46

tests for ngram extraction
2 files changed with 49 insertions and 6 deletions:
0 comments (0 inline, 0 general)
src/languedoc/predict.py
Show inline comments
 
import os
 
import re
 
import itertools
 
import json
 
import gzip
 

	
 
TOP_NGRAM_COUNT = 3000
 
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
 

	
 

	
 
def preprocess(text: str) -> str:
	"""Normalize text before ngram extraction.

	The text is padded with a space on both sides, every run of non-letter characters
	(whitespace, punctuation, digits, underscores) is replaced by a single space
	and the result is converted to lowercase.

	:param text: the raw text
	:return: the normalized text"""
	padded = " " + text + " "
	collapsed = re.sub(r"[\W\d_]+", " ", padded)
	return collapsed.lower()
 

	
 

	
 
def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
	"""Extract k-gram counts from the text for a provided k.

	Every substring of length k is counted, overlapping occurrences included.
	Kgrams consisting only of whitespace are skipped.

	:param text: the source text
	:param k: length of the kgrams to extract. 1 for letters, 2 for bigrams, ...
	:return: a dict mapping kgrams to their counts in the text"""
	counts = dict()

	# the last full kgram starts at index len(text)-k
	for start in range(len(text) - k + 1):
		kgram = text[start:start+k]
		if kgram.isspace():
			continue

		counts[kgram] = counts.get(kgram, 0) + 1

	return counts
 

	
 

	
 
def extract_ngram_counts(text: str) -> dict[str, int]:
	"""Extract counts of 1- to 3-grams from the text.

	:param text: the source text
	:return: a dict mapping ngrams to their counts in the text"""
	counts = dict()

	# keys produced for different k have different lengths, so the updates never overwrite each other
	for k in range(1, 4):
		counts.update(extract_kgram_counts(text, k))

	return counts
 

	
 

	
 
def rank_ngram_counts(counts: dict[str, int]) -> dict[str, int]:
	"""Order supplied ngrams by their counts (then length, then alphabetically) and return their ranking.

	Only the first TOP_NGRAM_COUNT ngrams of that ordering are kept.

	:param counts: a dict mapping ngrams to their counts
	:return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""
	# descending count; ties are broken by the shorter ngram first, then by the ngram string itself
	ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT]
	return {ngram: rank for (rank, (ngram, count)) in enumerate(ordered_ngrams)}
 

	
 

	
 
def extract_ranked_ngrams(text: str) -> dict[str, int]:
	"""Extract ngrams from the text and rank them from the most common.

	The text is used as supplied; no preprocessing is applied here.

	:param text: the source text
	:return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
	counts = extract_ngram_counts(text)
	return rank_ngram_counts(counts)
 

	
 

	
 
class Sample:
	"""Ranked ngrams together with the language they belong to."""

	def __init__(self, language: str, ranked_ngrams: dict[str, int]):
		"""Create a new Sample from language and ngrams.

		This is usually impractical and Sample.extract or Sample.load are preferred.

		:param language: the language code of the sample
		:param ranked_ngrams: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
		self.language = language
		self.ranked_ngrams = ranked_ngrams

	@classmethod
	def extract(cls, text: str, language: str = "??") -> "Sample":
		"""Create a new Sample by extracting it from text.

		:param text: a string, from which to extract the ngrams into a Sample
		:param language: a two letter language code if it is known (cs|de|en|...)
		:return: the new Sample"""
		return cls(language, extract_ranked_ngrams(preprocess(text)))

	@classmethod
	def load(cls, exported: dict) -> "Sample":
		"""Load a previously exported dict and create a new Sample.

		:param exported: {"language": str, "ngrams": [str, ...]}
		:return: the new Sample, each ngram ranked by its position in the list"""
		ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
		return cls(exported["language"], ranked_ngrams)

	def export(self) -> dict:
		"""Export to a dict. Complement to Sample.load()

		:return: {"language": str, "ngrams": [str, ...]}, the ngrams ordered by their rank"""
		return {
			"language": self.language,
			"ngrams": sorted(self.ranked_ngrams, key=self.ranked_ngrams.get)
		}

	def compare(self, other: "Sample") -> int:
		"""Compute a distance score between self and other. Lower means more similar, 0 is a perfect match.

		Every ngram of self contributes the absolute difference of its ranks in self and in other.
		An ngram missing from other contributes the number of ngrams in other instead.

		The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample).

		:param other: a reference model in known language
		:return: the sum of the contributions of all ngrams of self"""
		reference = other.ranked_ngrams
		# penalty for an ngram the reference model does not know at all
		missing_penalty = len(reference)

		return sum(
			(abs(rank - reference[ngram]) if ngram in reference else missing_penalty)
			for (ngram, rank) in self.ranked_ngrams.items()
		)
 

	
 

	
 
def load_models(model_path: str) -> list[Sample]:
	"""Load language models from a gzipped JSON file.

	:param model_path: path to the gzip compressed, UTF-8 encoded JSON file with a list of exported Samples
	:return: a list of the loaded Samples"""
	with gzip.open(model_path, mode="rt", encoding="utf-8") as model_file:
		exported_models = json.load(model_file)
		return [Sample.load(exported) for exported in exported_models]
 

	
 

	
 
def identify(text: str, models: "list[Sample] | None" = None) -> str:
	"""Return the language closest to text among the models.

	:param text: the text to identify
	:param models: list of models to choose from. When omitted or empty, the models are loaded from MODEL_PATH
	:return: best matching language (cs, de, en, ...)"""
	# None replaces a mutable [] default; an explicitly passed empty list still triggers loading, as before
	if not models:
		models = load_models(MODEL_PATH)

	sample = Sample.extract(text)

	# the model with the lowest score wins; the stable sort keeps the first one among equals
	return sorted(models, key=sample.compare)[0].language
tests/test_predict.py
Show inline comments
 
from unittest import TestCase
 

	
 
from languedoc.predict import preprocess, rank_ngram_counts, Sample, identify
 
from languedoc.predict import preprocess, extract_kgram_counts, extract_ngram_counts, rank_ngram_counts, \
 
	extract_ranked_ngrams, Sample, identify
 

	
 

	
 
class TestPredict(TestCase):
	"""Tests of the preprocessing and ngram extraction functions."""

	def test_preprocess(self):
		"""Non-letters collapse into single spaces, the text gets padded and lowercased."""
		cases = [
			("abc", " abc "),
			("A  b.c", " a b c "),
			("1% ", " "),
			("Глава ĚŠČŘŽ", " глава ěščřž ")
		]

		for (raw, expected) in cases:
			self.assertEqual(preprocess(raw), expected)

	def test_extract_kgram_counts(self):
		"""Letters and bigrams are counted including overlapping occurrences."""
		expected_by_k = {
			1: {"a": 2, "b": 5},
			2: {"ab": 2, "bb": 3, "ba": 1}
		}

		for (k, expected) in expected_by_k.items():
			self.assertEqual(extract_kgram_counts("abbbabb", k), expected)

	def test_extract_ngram_counts(self):
		"""Counts of 1- to 3-grams are merged into a single dict."""
		cases = [
			("aab", {"a": 2, "b": 1, "aa": 1, "ab": 1, "aab": 1}),
			("abbbabb", {"a": 2, "b": 5, "ab": 2, "bb": 3, "ba": 1, "abb": 2, "bbb": 1, "bba": 1, "bab": 1})
		]

		for (text, expected) in cases:
			self.assertEqual(extract_ngram_counts(text), expected)

	def test_rank_ngram_counts(self):
		"""Ngrams are ranked by count, then by length, then alphabetically."""
		ranking = rank_ngram_counts({"a": 3, "aa": 1, "b": 4, "bb": 1, "c": 1})
		self.assertEqual(ranking, {"b": 0, "a": 1, "c": 2, "aa": 3, "bb": 4})

	def test_extract_ranked_ngrams(self):
		"""Extraction and ranking work together on a raw text."""
		cases = [
			("aab", {"a": 0, "b": 1, "aa": 2, "ab": 3, "aab": 4}),
			("abbbabb", {"b": 0, "bb": 1, "a": 2, "ab": 3, "abb": 4, "ba": 5, "bab": 6, "bba": 7, "bbb": 8})
		]

		for (text, expected) in cases:
			self.assertEqual(extract_ranked_ngrams(text), expected)
 

	
 

	
 
class TestSample(TestCase):
	"""Tests of the Sample class."""

	def test_extract(self):
		"""Sample.extract preprocesses the text and ranks its ngrams."""
		single_word = Sample.extract("aaaaaa", "a")
		self.assertEqual(single_word.language, "a")
		self.assertEqual(
			single_word.ranked_ngrams,
			{'a': 0, 'aa': 1, 'aaa': 2, ' a': 3, 'a ': 4, ' aa': 5, 'aa ': 6}
		)

		three_words = Sample.extract("aa aa aa", "b")
		self.assertEqual(
			three_words.ranked_ngrams,
			{'a': 0, ' a': 1, 'a ': 2, 'aa': 3, ' aa': 4, 'aa ': 5, 'a a': 6}
		)

		# the language falls back to the "??" placeholder when not provided
		unknown_language = Sample.extract("aa")
		self.assertEqual(unknown_language.language, "??")
		self.assertEqual(
			unknown_language.ranked_ngrams,
			{'a': 0, ' a': 1, 'a ': 2, 'aa': 3, ' aa': 4, 'aa ': 5}
		)
 

	
 

	
 
class TestIdentify(TestCase):
	"""Tests of the identify function with its default models."""

	def test_identify(self):
		"""One sentence per language is identified as that language."""
		sentences_by_language = {
			"cs": "Severní ledový oceán je nejmenší světový oceán.",
			"de": "Der Arktische Ozean ist der kleinste Ozean der Erde.",
			"en": "The Arctic Ocean is the smallest of the world's oceans.",
			"es": "Océano Ártico más pequeña y más septentrional del planeta",
			"fr": "L'océan Arctique ce qui en fait le plus petit des océans.",
			"it": "Il Mar Glaciale Artico è una massa d'acqua...",
			"ru": "Се́верный Ледови́тый океа́н — наименьший по площади океан Земли"
		}

		for (expected_language, sentence) in sentences_by_language.items():
			self.assertEqual(expected_language, identify(sentence))
0 comments (0 inline, 0 general)