Changeset - 3998161856de
[Not reviewed]
default
0 1 0
Laman - 21 months ago 2023-07-10 18:47:29

optimized the model loading
1 file changed with 5 insertions and 1 deletion:
0 comments (0 inline, 0 general)
src/languedoc/predict.py
Show inline comments
 
@@ -4,12 +4,13 @@ import itertools
 
import json
 
import gzip
 
from typing import Union
 

	
 
# Number of top n-grams to keep -- presumably per language model; its use is not visible here (TODO confirm).
TOP_NGRAM_COUNT = 3000

# Path of the `models.json.gz` file stored in the same directory as this module.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")

# Module-level cache of the default models; empty until identify() loads them from MODEL_PATH.
MODEL = []
 

	
 

	
 
def preprocess(text: str) -> str:
	"""Normalize text before n-gram extraction.

	The text is padded with one space on each side, every run of non-letter characters
	(non-word characters, digits and underscores) is replaced by a single space
	and the result is lowercased.

	:param text: the raw input text
	:return: lowercase text containing only letters separated by single spaces"""
	padded = " " + text + " "
	# the padding guarantees the result always starts and ends with exactly one space
	letters_only = re.sub(r"[\W\d_]+", " ", padded)
	return letters_only.lower()
 
@@ -123,12 +124,15 @@ def load_models(model_path: str) -> list
 
def identify(text: str, models=[]) -> str:
	"""Return the language closest to text among the models.

	When no models are passed, the default models are read from MODEL_PATH on the first call
	and kept in the module-level MODEL cache, so later calls do not touch the file again.

	:param text: the text to identify
	:param models: list of models to choose from. The default is loaded from MODEL_PATH
	:return: best matching language (cs, de, en, ...)"""
	global MODEL

	# NOTE: the shared default list is only ever read, never mutated, so the mutable default is harmless
	if not models:
		if not MODEL:
			# lazy one-time load; avoids re-reading the models file on every call
			MODEL = load_models(MODEL_PATH)
		models = MODEL

	sample = Sample.extract(text)

	# the model with the lowest compare() value wins;
	# min() returns the first minimum, the same element as sorted(...)[0], without sorting everything
	return min(models, key=sample.compare).language
0 comments (0 inline, 0 general)