Files
@ 9c518a47ef7f
Branch filter:
Location: Languedoc/src/languedoc/predict.py
9c518a47ef7f
4.2 KiB
text/x-python
1.0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | import os
import re
import itertools
import json
import gzip
from typing import Union
# Upper bound on the number of ngrams kept in a ranking: rank_ngram_counts()
# truncates its ordered ngram list to this many entries.
TOP_NGRAM_COUNT = 3000
# Location of the model file that identify() falls back to when no models are
# supplied. It sits next to this module and is read by load_models() as
# gzip-compressed JSON.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
def preprocess(text: str) -> str:
    """Normalize text so that only lowercase letters separated by single spaces remain.

    The text is padded with one space on each side, every run of non-word characters,
    digits and underscores is replaced by a single space, and the result is lowercased.

    :param text: the raw input text
    :return: the normalized text; it always starts and ends with a space"""
    # Padding guarantees that the first and the last word are delimited by a space too.
    padded = "".join((" ", text, " "))
    letters_only = re.sub(r"[\W\d_]+", " ", padded)
    return letters_only.lower()
def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
    """Count every substring of length k in the text, ignoring whitespace-only ones.

    :param text: the source text
    :param k: length of the kgrams to extract. 1 for letters, 2 for bigrams, ...
    :return: a dict mapping kgrams to their counts in the text"""
    tally: dict[str, int] = {}
    # Slide a window of width k over the text; the range is empty when k > len(text).
    windows = (text[start:start + k] for start in range(len(text) - k + 1))
    for gram in windows:
        if not gram.isspace():
            tally[gram] = tally.get(gram, 0) + 1
    return tally
def extract_ngram_counts(text: str) -> dict[str, int]:
    """Count all letters, bigrams and trigrams of the text in a single dict.

    :param text: the source text
    :return: a dict mapping ngrams (of length 1, 2 and 3) to their counts in the text"""
    merged: dict[str, int] = {}
    for size in (1, 2, 3):
        # Keys of different lengths never collide, so merging cannot overwrite a count.
        merged |= extract_kgram_counts(text, size)
    return merged
def rank_ngram_counts(counts: dict[str, Union[int, float]]) -> dict[str, int]:
    """Rank the supplied ngrams from the most frequent one and keep the top TOP_NGRAM_COUNT.

    Ties in count are broken by ngram length (shorter first) and then alphabetically.

    :param counts: a dict mapping ngrams to their counts
    :return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""

    def sort_key(item):
        ngram, count = item
        # Negated count puts the most frequent ngrams first in an ascending sort.
        return (-count, len(ngram), ngram)

    best = sorted(counts.items(), key=sort_key)[:TOP_NGRAM_COUNT]
    return {ngram: rank for rank, (ngram, _count) in enumerate(best)}
def extract_ranked_ngrams(text: str) -> dict[str, int]:
    """Count the ngrams of the text and turn the counts into a ranking.

    :param text: the source text
    :return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
    return rank_ngram_counts(extract_ngram_counts(text))
class Sample:
    """A language label together with a ranking of ngrams (ngram -> rank, 0 is the most common)."""

    def __init__(self, language: str, ranked_ngrams: dict[str, float]):
        """Build a Sample directly from a language code and an ngram ranking.

        Sample.extract and Sample.load are usually the more convenient constructors.

        :param language: language code of the sample
        :param ranked_ngrams: a dict mapping ngrams to their ranks"""
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text: str, language="??") -> "Sample":
        """Build a Sample from raw text by preprocessing it and ranking its ngrams.

        :param text: a string, from which to extract the ngrams into a Sample
        :param language: a two letter language code if it is known (cs|de|en|...)"""
        cleaned = preprocess(text)
        ranking = extract_ranked_ngrams(cleaned)
        return cls(language, ranking)

    @classmethod
    def load(cls, exported: dict) -> "Sample":
        """Rebuild a Sample from the dict produced by Sample.export().

        The position of each ngram in the list becomes its rank.

        :param exported: {"language": str, "ngrams": [str, ...]}"""
        ranking = {}
        for position, ngram in enumerate(exported["ngrams"]):
            ranking[ngram] = position
        return cls(exported["language"], ranking)

    def export(self) -> dict:
        """Serialize to a plain dict with the ngrams listed in rank order. Complement to Sample.load()

        :return: {"language": str, "ngrams": [str, ...]}"""
        ranking = self.ranked_ngrams
        return {
            "language": self.language,
            # Sorting the keys by their rank value yields the ngrams from the most common one.
            "ngrams": sorted(ranking, key=ranking.get),
        }

    def compare(self, other: "Sample") -> int:
        """Compute the rank distance between self and other; a lower value means a closer match.

        For every ngram of self the absolute rank difference is added when the ngram is
        also ranked in other; otherwise a penalty equal to the size of other's ranking is added.
        The method is asymmetric. You are supposed to use sample.compare(model), not model.compare(sample).

        :param other: a reference model in known language
        :return: the accumulated distance (0 when self has no ngrams)"""
        reference = other.ranked_ngrams
        penalty = len(reference)
        total = 0
        for ngram, rank in self.ranked_ngrams.items():
            if ngram in reference:
                total += abs(rank - reference[ngram])
            else:
                total += penalty
        return total
def load_models(model_path: str) -> list[Sample]:
    """Read the gzip-compressed JSON file at model_path and turn each entry into a Sample.

    :param model_path: path to the gzipped JSON file holding a list of exported samples
    :return: the loaded models as a list of Sample objects"""
    with gzip.open(model_path, mode="rt", encoding="utf-8") as stream:
        exported_models = json.load(stream)
    return [Sample.load(entry) for entry in exported_models]
def identify(text: str, models=None) -> str:
    """Return the language closest to text among the models.

    The text is turned into a Sample and compared against every model; the model
    with the lowest Sample.compare() value wins (the first one on ties).

    :param text: the text to identify
    :param models: list of models to choose from. When omitted (or empty), the models
        are loaded from MODEL_PATH
    :return: best matching language (cs, de, en, ...)
    :raises IndexError: if there are no models to choose from"""
    # None instead of a mutable [] default; any falsy value still triggers loading.
    if not models:
        models = load_models(MODEL_PATH)
    sample = Sample.extract(text)
    # min() finds the closest model in one pass instead of sorting all of them.
    best = min(models, key=sample.compare, default=None)
    if best is None:
        # Kept as IndexError so callers see the same exception type as before.
        raise IndexError("no language models available")
    return best.language
|