Location: Languedoc/src/languedoc/predict.py
import os
import re
import itertools
import json
import gzip
from typing import Union
TOP_NGRAM_COUNT = 3000
SINGLE_SEGMENT_LENGTH = 1000
SINGLE_SAMPLE_LENGTH = 32
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
MODEL = []


def preprocess(text: str) -> str:
    """Preprocess text by stripping non-letter characters, collapsing whitespace and converting to lowercase."""
    text = re.sub(r"[\W\d_]+", " ", " "+text+" ")
    return text.lower()
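# Example (illustrative): preprocess pads the text with spaces, replaces each run
# of non-letter characters with a single space and lowercases the result:
#   preprocess("Hello, World! 123")  ->  " hello world "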


def sample_text(text: str, segment_length: int):
    n = len(text)
    if n < 4*segment_length:
        return text
    else:
        f = lambda i: n*i//4 - segment_length//2
        regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s")
        matches = [regexp.search(text, f(i)) for i in range(1, 4)]
        return " ".join(m.group(1) for m in matches if m)


def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
    """Extract k-gram counts from the text for a provided k.
    :param text: the source text
    :param k: length of the kgrams to extract. 1 for letters, 2 for bigrams, ...
    :return: a dict mapping kgrams to their counts in the text"""
    n = len(text)
    counts = dict()
    for i in range(0, n-k+1):
        key = text[i:i+k]
        if key.isspace():
            continue
        counts[key] = counts.get(key, 0) + 1
    return counts
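# Example (illustrative): keys made up entirely of whitespace are skipped,
# everything else is counted:
#   extract_kgram_counts("ab ab", 2)  ->  {"ab": 2, "b ": 1, " a": 1}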


def extract_ngram_counts(text: str) -> dict[str, int]:
    """Extract counts of 1- to 3-grams from the text.
    :param text: the source text
    :return: a dict mapping ngrams to their counts in the text"""
    counts = dict()
    for k in range(1, 4):
        counts.update(extract_kgram_counts(text, k))
    return counts
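# Example (illustrative): unigram, bigram and trigram counts end up in one dict:
#   extract_ngram_counts("aa")  ->  {"a": 2, "aa": 1}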


def rank_ngram_counts(counts: dict[str, Union[int, float]]) -> dict[str, int]:
    """Order supplied ngrams by their counts (then length, then alphabetically) and return their ranking.
    :param counts: a dict mapping ngrams to their counts
    :return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""
    ordered_ngrams = sorted(counts.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0]))[:TOP_NGRAM_COUNT]
    return dict(zip([key for (key, count) in ordered_ngrams], itertools.count(0)))
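# Example (illustrative): ties are broken by ngram length, then alphabetically,
# and only the TOP_NGRAM_COUNT most frequent ngrams are kept:
#   rank_ngram_counts({"b": 2, "ab": 2, "a": 5})  ->  {"a": 0, "b": 1, "ab": 2}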


def extract_ranked_ngrams(text: str) -> dict[str, int]:
    """Extract ngrams from the text and rank them from the most common.
    :param text: the source text
    :return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
    counts = extract_ngram_counts(text)
    return rank_ngram_counts(counts)


class Sample:
    def __init__(self, language: str, ranked_ngrams: dict[str, float]):
        """Create a new Sample from a language code and ranked ngrams.
        Direct construction is usually impractical; Sample.extract or Sample.load are preferred."""
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text: str, language="??") -> "Sample":
        """Create a new Sample by extracting it from text.
        :param text: a string, from which to extract the ngrams into a Sample
        :param language: a two letter language code if it is known (cs|de|en|...)"""
        preprocessed_text = preprocess(sample_text(text, SINGLE_SEGMENT_LENGTH))
        sample = sample_text(preprocessed_text, SINGLE_SAMPLE_LENGTH)
        return cls(language, extract_ranked_ngrams(sample))
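    # Note (illustrative): the raw text is first reduced to up to three
    # SINGLE_SEGMENT_LENGTH-character segments, preprocessed, then sampled again
    # with SINGLE_SAMPLE_LENGTH-character segments before the ngrams are ranked:
    #   Sample.extract("Guten Tag, wie geht es dir?", "de").language  ->  "de"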

    @classmethod
    def load(cls, exported: dict) -> "Sample":
        """Load a previously exported dict and create a new Sample.
        :param exported: {"language": str, "ngrams": [str, ...]}"""
        ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
        return cls(exported["language"], ranked_ngrams)

    def export(self) -> dict:
        """Export to a dict. Complement to Sample.load()
        :return: {"language": str, "ngrams": [str, ...]}"""
        return {
            "language": self.language,
            "ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])]
        }

    def compare(self, other: "Sample") -> int:
        """Compute a distance-like similarity score between self and other; lower means more similar.
        The method is asymmetric; use sample.compare(model), not model.compare(sample).
        :param other: a reference model in a known language"""
        m = len(other.ranked_ngrams)
        res = sum(
            (abs(v - other.ranked_ngrams[k]) if k in other.ranked_ngrams else m)
            for (k, v) in self.ranked_ngrams.items()
        )
        return res
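    # Example (illustrative): ngrams missing from the model are penalised with the
    # model's size m:
    #   Sample("??", {"a": 0, "b": 1}).compare(Sample("en", {"a": 1, "c": 0}))  ->  abs(0 - 1) + 2 == 3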


def load_models(model_path: str) -> list[Sample]:
    """Load language models from path and return as a list."""
    with gzip.open(model_path, mode="rt", encoding="utf-8") as f:
        return [Sample.load(obj) for obj in json.load(f)]


def identify(text: str, models=[]) -> str:
    """Return the language closest to text among the models.
    :param text: the text to identify
    :param models: list of models to choose from. The default is loaded from MODEL_PATH
    :return: best matching language (cs, de, en, ...)"""
    global MODEL
    if not MODEL and not models:
        MODEL = load_models(MODEL_PATH)
    if not models:
        models = MODEL
    sample = Sample.extract(text)
    return sorted(models, key=lambda m: sample.compare(m))[0].language
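

# A minimal usage sketch, assuming the bundled models.json.gz is present next to
# this file as MODEL_PATH expects; identify() loads the models on first use and
# returns the two-letter code of the closest language model.
if __name__ == "__main__":
    print(identify("The quick brown fox jumps over the lazy dog."))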