diff --git a/src/languedoc/predict.py b/src/languedoc/predict.py
--- a/src/languedoc/predict.py
+++ b/src/languedoc/predict.py
@@ -6,7 +6,10 @@ import gzip
 from typing import Union
 
 TOP_NGRAM_COUNT = 3000
+SINGLE_SEGMENT_LENGTH = 1000
+SINGLE_SAMPLE_LENGTH = 32
 MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
+MODEL = []
 
 
 def preprocess(text: str) -> str:
@@ -15,6 +18,29 @@ def preprocess(text: str) -> str:
     return text.lower()
 
 
+def sample_text(text: str, segment_length: int) -> str:
+    """Extract a reasonably and uniformly long sample from a long text.
+
+    :param text: the input text
+    :param segment_length: a text segment length. The sample is going to be 3-4 times longer.
+    :return: a text sample cut from the original text, consisting of three segments
+    """
+    n = len(text)
+    # a text too short to sample
+    if n < 4*segment_length:
+        return text
+    # take a segment from the 1st, 2nd and 3rd quarter of the text, to get a representative sample
+    # DOTALL lets a segment span line breaks; without it "." stops at every newline
+    regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s", re.DOTALL)
+    segments = []
+    for i in range(1, 4):
+        start = n*i//4 - segment_length//2
+        match = regexp.search(text, start)
+        # no whitespace-delimited segment (e.g. unspaced scripts): fall back to a plain slice
+        segments.append(match.group(1) if match else text[start:start + segment_length])
+    return " ".join(segments)
+
+
 def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
     """Extract k-gram counts from the text for a provided k.
 
@@ -75,11 +101,14 @@ class Sample:
 
     @classmethod
     def extract(cls, text: str, language="??") -> "Sample":
-        """Create a new Sample by extracting it from text.
+        """Create a new Sample by extracting it from the text.
 
         :param text: a string, from which to extract the ngrams into a Sample
         :param language: a two letter language code if it is known (cs|de|en|...)"""
-        return cls(language, extract_ranked_ngrams(preprocess(text)))
+        preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH))
+        sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH)
+
+        return cls(language, extract_ranked_ngrams(sample))
 
     @classmethod
     def load(cls, exported: dict) -> "Sample":
@@ -103,7 +132,7 @@ class Sample:
 
         The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample).
 
-        :param other: a reference model in known language"""
+        :param other: a reference model in a known language"""
 
         m = len(other.ranked_ngrams)
         res = sum(
@@ -126,8 +155,11 @@ def identify(text: str, models=[]) -> st
     :param text: the text to identify
     :param models: list of models to choose from. The default is loaded from MODEL_PATH
     :return: best matching language (cs, de, en, ...)"""
+    global MODEL
+    if not MODEL and not models:
+        MODEL = load_models(MODEL_PATH)
     if not models:
-        models = load_models(MODEL_PATH)
+        models = MODEL
 
     sample = Sample.extract(text)
 