# Languedoc/src/languedoc/predict.py
import os
import re
import itertools
import json
import gzip
from typing import Union

TOP_NGRAM_COUNT = 3000
SINGLE_SEGMENT_LENGTH = 1000
SINGLE_SAMPLE_LENGTH = 32
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
MODEL = []


def preprocess(text: str) -> str:
	"""Preprocess text by stripping non-letter characters, collapsing whitespace and converting to lowercase."""
	text = re.sub(r"[\W\d_]+", " ", " "+text+" ")
	return text.lower()
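
# Example: only letters survive preprocessing; everything else collapses into
# single spaces, and the result is padded with a space on both sides:
#   preprocess("Hello, World! 123")  ->  " hello world "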


def sample_text(text: str, segment_length: int) -> str:
	"""Extract a sample of reasonable, uniform length from a long text.

	:param text: the input text
	:param segment_length: the length of one sampled segment. The whole sample is going to be three to four times longer.
	:return: a text sample cut from the original text, consisting of three segments
	"""
	n = len(text)
	# a text too short to sample
	if n < 4*segment_length:
		return text
	# take one segment centered around each of the 1/4, 1/2 and 3/4 marks of the text, to get a representative sample
	else:
		f = lambda i: n*i//4 - segment_length//2
		regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s")
		matches = [regexp.search(text, f(i)) for i in range(1, 4)]
		return " ".join(m.group(1) for m in matches if m)


def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
	"""Extract k-gram counts from the text for a provided k.

	:param text: the source text
	:param k: the length of the k-grams to extract: 1 for letters, 2 for bigrams, ...
	:return: a dict mapping k-grams to their counts in the text"""
	n = len(text)
	counts = dict()

	for i in range(0, n-k+1):
		key = text[i:i+k]
		if key.isspace():
			continue

		counts[key] = counts.get(key, 0) + 1

	return counts
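
# Example: overlapping k-grams are counted, while k-grams consisting purely of
# whitespace are skipped:
#   extract_kgram_counts("abab", 2)  ->  {"ab": 2, "ba": 1}
#   extract_kgram_counts(" a a", 1)  ->  {"a": 2}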


def extract_ngram_counts(text: str) -> dict[str, int]:
	"""Extract counts of 1- to 3-grams from the text.

	:param text: the source text
	:return: a dict mapping ngrams to their counts in the text"""
	counts = dict()

	for k in range(1, 4):
		counts.update(extract_kgram_counts(text, k))

	return counts
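
# Example: counts of 1-, 2- and 3-grams are merged into a single dict,
# e.g. for the padded text " ab ":
#   extract_ngram_counts(" ab ")
#   ->  {"a": 1, "b": 1, " a": 1, "ab": 1, "b ": 1, " ab": 1, "ab ": 1}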


def rank_ngram_counts(counts: dict[str, Union[int, float]]) -> dict[str, int]:
	"""Order supplied ngrams by their counts (then length, then alphabetically) and return their ranking.

	:param counts: a dict mapping ngrams to their counts
	:return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""
	ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT]
	return dict(zip([key for (key, count) in ordered_ngrams], itertools.count(0)))
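
# Example: higher counts rank first; ties are broken by shorter length, then
# alphabetically:
#   rank_ngram_counts({"a": 5, "ab": 2, "b": 2})  ->  {"a": 0, "b": 1, "ab": 2}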


def extract_ranked_ngrams(text: str) -> dict[str, int]:
	"""Extract ngrams from the text and rank them from the most common.

	:param text: the source text
	:return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
	counts = extract_ngram_counts(text)
	return rank_ngram_counts(counts)


class Sample:
	def __init__(self, language: str, ranked_ngrams: dict[str, float]):
		"""Create a new Sample from language and ngrams.

		Constructing a Sample directly is usually impractical; Sample.extract or Sample.load are preferred."""
		self.language = language
		self.ranked_ngrams = ranked_ngrams

	@classmethod
	def extract(cls, text: str, language="??") -> "Sample":
		"""Create a new Sample by extracting it from the text.

		:param text: a string, from which to extract the ngrams into a Sample
		:param language: a two-letter language code, if it is known (cs|de|en|...)"""
		preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH))
		sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH)

		return cls(language, extract_ranked_ngrams(sample))

	@classmethod
	def load(cls, exported: dict) -> "Sample":
		"""Load a previously exported dict and create a new Sample.

		:param exported: {"language": str, "ngrams": [str, ...]}"""
		ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
		return cls(exported["language"], ranked_ngrams)

	def export(self) -> dict:
		"""Export to a dict. Complement to Sample.load()

		:return: {"language": str, "ngrams": [str, ...]}"""
		return {
			"language": self.language,
			"ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])]
		}
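
	# Example: export() and load() round-trip each other for integer-ranked samples,
	#   Sample("en", {"a": 0, "b": 1}).export()
	#   ->  {"language": "en", "ngrams": ["a", "b"]}
	# and Sample.load() of that dict recreates an equivalent Sample.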

	def compare(self, other: "Sample") -> int:
		"""Compute a similarity score between self and other.

		The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample).

		:param other: a reference model in a known language"""
		m = len(other.ranked_ngrams)

		res = sum(
			(abs(v - other.ranked_ngrams[k]) if k in other.ranked_ngrams else m)
			for (k, v) in self.ranked_ngrams.items()
		)

		return res
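
	# Example: ranks of shared ngrams are compared directly, and every ngram
	# missing from the model costs the full size of the model's ngram set:
	#   Sample("??", {"a": 0, "b": 1}).compare(Sample("en", {"c": 0, "a": 1}))
	#   ->  abs(0 - 1) + 2  ==  3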


def load_models(model_path: str) -> list[Sample]:
	"""Load language models from path and return as a list."""
	with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
		return [Sample.load(obj) for obj in json.load(f)]
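
# The model file is expected to be a gzipped JSON list of exported samples, e.g.
#   [{"language": "en", "ngrams": [" ", "e", ...]}, {"language": "de", "ngrams": [...]}]
# (the concrete ngrams shown here are only illustrative).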


def identify(text: str, models: Union[list[Sample], None] = None) -> str:
	"""Return the language closest to the text among the models.

	:param text: the text to identify
	:param models: a list of models to choose from. The default is loaded lazily from MODEL_PATH and cached in MODEL
	:return: the best matching language (cs, de, en, ...)"""
	global MODEL
	if not MODEL and not models:
		MODEL = load_models(MODEL_PATH)
	if not models:
		models = MODEL

	sample = Sample.extract(text)

	return min(models, key=sample.compare).language
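

# A minimal usage sketch. It assumes the bundled models.json.gz exists next to
# this file; the exact result depends on the shipped models, but with English
# among them a sentence like the one below should yield "en".
if __name__ == "__main__":
	print(identify("This is a short example sentence written in plain English."))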