# Single whitespace character, as matched by ``\s`` for str patterns.
_WHITESPACE = re.compile(r"\s")


def _segment_at(text: str, start: int, length: int) -> str:
    """Return one sample segment of roughly *length* characters near *start*.

    The segment begins right after the first whitespace character found at or
    after *start*, spans at least *length* characters and is extended up to
    (not including) the next whitespace character, so that it starts and ends
    on word boundaries. Line breaks inside the segment are allowed.

    If no such whitespace-delimited segment exists (for instance in scripts
    written without spaces), the plain slice ``text[start:start + length]``
    is returned instead, so the region still contributes to the sample.
    """
    opening = _WHITESPACE.search(text, start)
    if opening is not None:
        begin = opening.end()
        # The closing whitespace must come after at least `length` characters.
        closing = _WHITESPACE.search(text, begin + length)
        if closing is not None:
            return text[begin:closing.start()]
    return text[start:start + length]


def sample_text(text: str, segment_length: int) -> str:
    """Reduce a long text to three segments taken around its quarter points.

    Texts shorter than ``4 * segment_length`` are returned unchanged.
    Otherwise three segments are extracted, starting near the offsets
    ``len(text) * i // 4 - segment_length // 2`` for ``i`` in 1, 2, 3, and
    joined with a single space.

    Each segment is aligned to whitespace boundaries when possible and may
    span line breaks; when the text offers no suitable whitespace, a
    fixed-width slice of ``segment_length`` characters is used instead, so a
    long input never collapses to an empty sample.

    :param text: the text to take the sample from
    :param segment_length: minimal length of each of the three segments
    :return: the text itself if it is short, otherwise the joined segments
    """
    n = len(text)
    if n < 4 * segment_length:
        return text

    segments = (
        _segment_at(text, n * i // 4 - segment_length // 2, segment_length)
        for i in range(1, 4)
    )
    # Empty segments can only arise for a zero segment length; skip them so
    # that they do not add stray separators.
    return " ".join(segment for segment in segments if segment)