Changeset - ee446af216d7
[Not reviewed]
default
0 2 0
Laman - 21 months ago 2023-07-12 21:02:01

Shorten the analyzed text to achieve constant-time performance
2 files changed with 18 insertions and 2 deletions:
0 comments (0 inline, 0 general)
setup.cfg
Show inline comments
 
[metadata]
 
name = languedoc
 
version = 1.0
 
version = 1.1
 
license = GPLv3
 
description = A simple language identification library.
 

	
src/languedoc/predict.py
Show inline comments
 
@@ -6,6 +6,8 @@ import gzip
 
from typing import Union
 

	
 
# NOTE(review): presumably the number of top-ranked n-grams kept per sample;
# its use is not visible in this hunk -- confirm.
TOP_NGRAM_COUNT = 3000
 
# Segment length handed to sample_text() for the first pass over the raw input text.
SINGLE_SEGMENT_LENGTH = 1000
 
# Segment length handed to sample_text() for the second pass over the preprocessed text.
SINGLE_SAMPLE_LENGTH = 32
 
# Location of the gzipped JSON models file, placed next to this module.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
 
# Starts out empty. NOTE(review): presumably filled from MODEL_PATH on first use -- confirm.
MODEL = []
 

	
 
@@ -16,6 +18,17 @@ def preprocess(text: str) -> str:
 
	return text.lower()
 

	
 

	
 
def sample_text(text: str, segment_length: int) -> str:
	"""Cut a bounded-size sample out of a long text.

	Texts shorter than four segment lengths are returned unchanged. Longer texts
	are reduced to three segments located around 1/4, 2/4 and 3/4 of the text.
	Each segment starts after the first whitespace found at or behind its anchor,
	is at least `segment_length` characters long and is extended to the end of
	the word it finishes in. If no such whitespace-delimited segment exists
	(e.g. the text contains no whitespace at all), a plain slice
	of `segment_length` characters starting at the anchor is used instead,
	so a long text never yields an empty sample.

	:param text: the text to be sampled
	:param segment_length: the minimal length of a single segment
	:return: the original text if it is short, otherwise the three segments joined by single spaces"""
	n = len(text)
	if n < 4*segment_length:
		return text

	# DOTALL lets a segment span line breaks. Without it, a text whose lines are
	# shorter than segment_length could never match and the sample came out empty.
	regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s", re.DOTALL)
	segments = []

	for i in range(1, 4):
		# center the segment around the i-th quarter of the text
		start = n*i//4 - segment_length//2
		match = regexp.search(text, start)
		if match:
			segments.append(match.group(1))
		else:
			# no word boundaries available, fall back to a raw cut
			segments.append(text[start:start+segment_length])

	return " ".join(segments)
 

	
 

	
 
def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
 
	"""Extract k-gram counts from the text for a provided k.
 

	
 
@@ -80,7 +93,10 @@ class Sample:
 

	
 
		:param text: a string, from which to extract the ngrams into a Sample
 
		:param language: a two letter language code if it is known (cs|de|en|...)"""
 
		return cls(language, extract_ranked_ngrams(preprocess(text)))
 
		preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH))
 
		sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH)
 

	
 
		return cls(language, extract_ranked_ngrams(sample))
 

	
 
	@classmethod
 
	def load(cls, exported: dict) -> "Sample":
0 comments (0 inline, 0 general)