Languedoc Changeset - ee446af216d7

Changeset - ee446af216d7

Parent rev.

Child rev.

[Not reviewed]

default

0 2 0

Laman - 2 years ago 2023-07-12 21:02:01

shortening the analyzed text to achieve constant-time performance

2 files changed with 18 insertions and 2 deletions:

setup.cfg

src/languedoc/predict.py

0 comments (0 inline, 0 general)

setup.cfg

➞

Show inline comments

 [metadata]
 name = languedoc
-version = 1.0
+version = 1.1
 license = GPLv3
 description = A simple language identification library.

src/languedoc/predict.py

➞

Show inline comments

@@ @@ -6,6 +6,8 @@ import gzip @@
 from typing import Union
 # NOTE(review): presumably the number of top-ranked ngrams kept per sample -- its use is not visible here, confirm
 TOP_NGRAM_COUNT = 3000
 # segment_length passed to sample_text() for the raw input text when building a Sample
 SINGLE_SEGMENT_LENGTH = 1000
 # segment_length passed to sample_text() for the already preprocessed text
 SINGLE_SAMPLE_LENGTH = 32
 # path of the "models.json.gz" file located in the same directory as this module
 MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
 # NOTE(review): starts out empty; presumably filled from MODEL_PATH elsewhere -- confirm
 MODEL = []
@@ @@ -16,6 +18,17 @@ def preprocess(text: str) -> str: @@
 	return text.lower()
 def sample_text(text: str, segment_length: int) -> str:
 	"""Shorten a long text to three segments taken around its quarter points.

 	A text shorter than four segment lengths is returned unchanged. Otherwise a segment is searched
 	near 1/4, 2/4 and 3/4 of the text. Each segment starts right after a whitespace character,
 	spans at least `segment_length` characters and is extended up to the next whitespace,
 	so that it is not cut in the middle of a word. Where no such segment can be found,
 	a plain slice of `segment_length` characters is used instead.

 	:param text: the text to be sampled
 	:param segment_length: the minimal length of a single extracted segment
 	:return: the original text if it is short enough, otherwise the segments joined by single spaces"""
 	n = len(text)
 	if n < 4*segment_length:
 		return text

 	# re.DOTALL lets a segment span line breaks. Without it a hard-wrapped text
 	# with lines shorter than segment_length never matches and is sampled to "".
 	regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s", re.DOTALL)
 	segments = []
 	for i in range(1, 4):
 		# start half a segment before the i-th quarter, so the segment is roughly centered on it
 		start = n*i//4 - segment_length//2
 		match = regexp.search(text, start)
 		if match:
 			segments.append(match.group(1))
 		else:
 			# no whitespace delimited segment available (e.g. a script written without spaces),
 			# fall back to a fixed-width slice rather than dropping the segment
 			segments.append(text[start:start+segment_length])
 	return " ".join(segments)
 def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
 	"""Extract k-gram counts from the text for a provided k.
@@ @@ -80,7 +93,10 @@ class Sample: @@
 		:param text: a string, from which to extract the ngrams into a Sample
 		:param language: a two letter language code if it is known (cs|de|en|...)"""
 		return cls(language, extract_ranked_ngrams(preprocess(text)))
 		preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH))
 		sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH)
 		return cls(language, extract_ranked_ngrams(sample))
 	@classmethod
 	def load(cls, exported: dict) -> "Sample":

0 comments (0 inline, 0 general)