Location: Languedoc/src/languedoc/predict.py
switched from frequencies to basic counts
import os
import re
import itertools
import json
import gzip

TOP_NGRAM_COUNT = 3000
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")


def preprocess(text: str) -> str:
    """Replace runs of non-letter characters (digits and underscores included) with single spaces,
    pad the text with spaces and convert it to lowercase."""
    text = re.sub(r"[\W\d_]+", " ", " " + text + " ")
    return text.lower()
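# For illustration, this is what preprocess does to a short mixed string
# (punctuation and digits collapse into single spaces, case is folded):
#   >>> preprocess("Hello, World! 42")
#   ' hello world '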


def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
    """Count every k-character substring of text, skipping keys that consist only of whitespace."""
    n = len(text)
    counts = dict()
    for i in range(0, n - k + 1):
        key = text[i:i + k]
        if key.isspace():
            continue
        counts[key] = counts.get(key, 0) + 1
    return counts
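# A small worked example of the sliding window (spaces survive inside keys,
# but purely whitespace keys are dropped):
#   >>> extract_kgram_counts(" banana ", 2)
#   {' b': 1, 'ba': 1, 'an': 2, 'na': 2, 'a ': 1}
#   >>> extract_kgram_counts(" banana ", 1)
#   {'b': 1, 'a': 3, 'n': 2}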


def extract_ngram_counts(text: str) -> dict[str, int]:
    """Count all 1-, 2- and 3-character ngrams of text in a single dict."""
    counts = dict()
    for k in range(1, 4):
        counts.update(extract_kgram_counts(text, k))
    return counts
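# Keys of different lengths never collide, so one dict can hold all three sizes:
#   >>> extract_ngram_counts(" ab ")
#   {'a': 1, 'b': 1, ' a': 1, 'ab': 1, 'b ': 1, ' ab': 1, 'ab ': 1}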


def rank_ngram_counts(counts: dict[str, int]) -> dict[str, int]:
    """Keep the TOP_NGRAM_COUNT most frequent ngrams and map each one to its rank,
    starting at 0 for the most frequent. Ties are broken by ngram length, then alphabetically."""
    ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT]
    return dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0)))
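# Ranking turns raw counts into positions; ties fall back to length and then
# alphabetical order, so the result is deterministic:
#   >>> rank_ngram_counts({"a": 3, "n": 2, "b": 2})
#   {'a': 0, 'b': 1, 'n': 2}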


def extract_ranked_ngrams(text: str) -> dict[str, int]:
    """Extract the ngrams of text and rank them by frequency."""
    counts = extract_ngram_counts(text)
    return rank_ngram_counts(counts)


class Sample:
    def __init__(self, language: str, ranked_ngrams: dict[str, int]):
        """Create a new Sample from a language code and a dict of ranked ngrams.
        Constructing a Sample directly is rarely practical; Sample.extract or Sample.load are preferred."""
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text: str, language: str = "??") -> "Sample":
        """Create a new Sample by extracting ranked ngrams from text.

        :param text: a string from which to extract the ngrams into a Sample
        :param language: a two-letter language code, if it is known (cs|de|en|...)"""
        return cls(language, extract_ranked_ngrams(preprocess(text)))
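    # Typical construction from raw text; the "??" placeholder marks an unknown language:
    #   >>> s = Sample.extract("Guten Morgen", language="de")
    #   >>> s.language
    #   'de'
    #   >>> "en" in s.ranked_ngrams   # the bigram "en" occurs in the text
    #   True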

    @classmethod
    def load(cls, exported: dict) -> "Sample":
        """Load a previously exported dict and create a new Sample.

        :param exported: {"language": str, "ngrams": [str, ...]}"""
        ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
        return cls(exported["language"], ranked_ngrams)

    def export(self) -> dict:
        """Export to a dict. Complement to Sample.load().

        :return: {"language": str, "ngrams": [str, ...]}"""
        return {
            "language": self.language,
            "ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])]
        }
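    # export() and load() are inverses: the ranks form a dense 0..n-1 sequence,
    # so a round trip through a plain dict preserves the sample exactly:
    #   >>> s = Sample.extract("dobrý den", language="cs")
    #   >>> Sample.load(s.export()).ranked_ngrams == s.ranked_ngrams
    #   True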

    def compare(self, other: "Sample") -> int:
        """Compute a distance between self and other; lower means more similar.

        Each ngram of self contributes the difference between its rank here and its rank in other,
        or the size of other's ngram table if other does not contain it at all.
        The measure is asymmetric: use sample.compare(model), not model.compare(sample).

        :param other: a reference model in a known language"""
        m = len(other.ranked_ngrams)
        res = sum(
            (abs(v - other.ranked_ngrams[k]) if k in other.ranked_ngrams else m)
            for (k, v) in self.ranked_ngrams.items()
        )
        return res
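    # A tiny hand-checked example of the scoring and its asymmetry:
    #   >>> sample = Sample("??", {"a": 0, "b": 1, "c": 2})
    #   >>> model = Sample("xx", {"a": 0})
    #   >>> sample.compare(model)   # 0 for "a", plus 1 + 1 for the two misses
    #   2
    #   >>> model.compare(sample)   # only "a" is compared
    #   0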


def load_models(model_path: str) -> list[Sample]:
    """Load the language models from a gzipped JSON file and return them as a list of Samples."""
    with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
        return [Sample.load(obj) for obj in json.load(f)]


def identify(text: str, models=None) -> str:
    """Return the language of the model closest to text.

    :param text: the text to identify
    :param models: list of models to choose from; by default they are loaded from MODEL_PATH
    :return: the best matching language code (cs, de, en, ...)"""
    if not models:
        models = load_models(MODEL_PATH)
    sample = Sample.extract(text)
    return min(models, key=lambda m: sample.compare(m)).language
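

if __name__ == "__main__":
    # Minimal demo, assuming models.json.gz ships next to this module and the
    # bundled models include English (see the language codes mentioned above).
    print(identify("The quick brown fox jumps over the lazy dog"))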