import os import re import itertools import json import gzip from typing import Union TOP_NGRAM_COUNT = 3000 SINGLE_SEGMENT_LENGTH = 1000 SINGLE_SAMPLE_LENGTH = 32 MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz") MODEL = [] def preprocess(text: str) -> str: """Preprocess text by stripping non-letter characters, collapsing whitespace and converting to lowercase.""" text = re.sub(r"[\W\d_]+", " ", " "+text+" ") return text.lower() def sample_text(text: str, segment_length: int): """Extract a reasonably and uniformly long sample from a long text. :param text: the input text :param segment_length: a text segment length. The sample is going to be 3-4 times longer. :return: a text sample cut from the original text, consisting of three segments """ n = len(text) # a text too short to sample if n < 4*segment_length: return text # take a segment from the 1st, 2nd and 3rd quarter of the text, to get a representative sample else: f = lambda i: n*i//4 - segment_length//2 regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s") matches = [regexp.search(text, f(i)) for i in range(1, 4)] return " ".join(m.group(1) for m in matches if m) def extract_kgram_counts(text: str, k: int) -> dict[str, int]: """Extract k-gram counts from the text for a provided k. :param text: the source text :param k: length of the kgrams to extract. 1 for letters, 2 for bigrams, ... :return: a dict mapping kgrams to their counts in the text""" n = len(text) counts = dict() for i in range(0, n-k+1): key = text[i:i+k] if key.isspace(): continue counts[key] = counts.get(key, 0) + 1 return counts def extract_ngram_counts(text: str) -> dict[str, int]: """Extract counts of 1- to 3-grams from the text. :param text: the source text :return: a dict mapping ngrams to their counts in the text""" counts = dict() for k in range(1, 4): counts.update(extract_kgram_counts(text, k)) return counts def rank_ngram_counts(counts: dict[str, Union[int, float]]) -> dict[str, int]: """Order supplied ngrams by their counts (then length, then alphabetically) and return their ranking. :param counts: a dict mapping ngrams to their counts :return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)""" ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT] return dict(zip([key for (key, count) in ordered_ngrams], itertools.count(0))) def extract_ranked_ngrams(text: str) -> dict[str, int]: """Extract ngrams from the text and rank them from the most common. :param text: the source text :return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}""" counts = extract_ngram_counts(text) return rank_ngram_counts(counts) class Sample: def __init__(self, language: str, ranked_ngrams: dict[str, float]): """Create a new Sample from language and ngrams. This is usually impractical and Sample.extract or Sample.load are preferred.""" self.language = language self.ranked_ngrams = ranked_ngrams @classmethod def extract(cls, text: str, language="??") -> "Sample": """Create a new Sample by extracting it from the text. :param text: a string, from which to extract the ngrams into a Sample :param language: a two letter language code if it is known (cs|de|en|...)""" preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH)) sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH) return cls(language, extract_ranked_ngrams(sample)) @classmethod def load(cls, exported: dict) -> "Sample": """Load a previously exported dict and create a new Sample. :param exported: {"language": str, "ngrams": [str, ...]}""" ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])} return cls(exported["language"], ranked_ngrams) def export(self) -> dict: """Export to a dict. Complement to Sample.load() :return: {"language": str, "ngrams": [str, ...]}""" return { "language": self.language, "ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])] } def compare(self, other: "Sample") -> int: """Compute a similarity score between self and other. The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample). :param other: a reference model in a known language""" m = len(other.ranked_ngrams) res = sum( (abs(v - other.ranked_ngrams[k]) if k in other.ranked_ngrams else m) for (k, v) in self.ranked_ngrams.items() ) return res def load_models(model_path: str) -> list[Sample]: """Load language models from path and return as a list.""" with gzip.open(model_path, mode="rt", encoding="utf-8") as f: return [Sample.load(obj) for obj in json.load(f)] def identify(text: str, models=[]) -> str: """Return the language closest to text among the models. :param text: the text to identify :param models: list of models to choose from. The default is loaded from MODEL_PATH :return: best matching language (cs, de, en, ...)""" global MODEL if not MODEL and not models: MODEL = load_models(MODEL_PATH) if not models: models = MODEL sample = Sample.extract(text) return sorted(models, key=lambda m: sample.compare(m))[0].language