Files
@ f82e9a5b1c2c
Branch filter:
Location: Languedoc/src/languedoc/predict.py - annotation
f82e9a5b1c2c
4.8 KiB
text/x-python
added more tests
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 8b18810a3c7c d443541818b2 d443541818b2 ee446af216d7 ee446af216d7 252d3b1bca60 3998161856de d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 d443541818b2 d443541818b2 d443541818b2 d443541818b2 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 d443541818b2 dbaf68186bdf d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 dbaf68186bdf d443541818b2 dbaf68186bdf d443541818b2 d443541818b2 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 dbaf68186bdf d443541818b2 d443541818b2 dbaf68186bdf d443541818b2 dbaf68186bdf d443541818b2 d443541818b2 8b18810a3c7c 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 dbaf68186bdf 8e4769dd4ca6 d443541818b2 d443541818b2 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 8e4769dd4ca6 dbaf68186bdf dbaf68186bdf d443541818b2 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 d443541818b2 d443541818b2 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 ee446af216d7 ee446af216d7 ee446af216d7 ee446af216d7 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 d443541818b2 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 d443541818b2 d443541818b2 
d443541818b2 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 d443541818b2 d443541818b2 d443541818b2 d443541818b2 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 781dc476bf41 3998161856de 3998161856de 3998161856de d443541818b2 3998161856de d443541818b2 d443541818b2 d443541818b2 d443541818b2 | import os
import re
import itertools
import json
import gzip
from typing import Union
# Maximum number of ngrams kept in a ranking; rank_ngram_counts() truncates to this many.
TOP_NGRAM_COUNT = 3000
# Segment length passed to sample_text() when Sample.extract() samples the raw input text.
SINGLE_SEGMENT_LENGTH = 1000
# Segment length passed to sample_text() when Sample.extract() samples the preprocessed text.
SINGLE_SAMPLE_LENGTH = 32
# Location of the gzipped JSON file with language models, stored next to this module.
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models.json.gz")
# Cache of the models loaded from MODEL_PATH; identify() fills it on first use.
MODEL = []
def preprocess(text: str) -> str:
    """Normalise text into lowercase words separated by single spaces.

    The text is padded with one space on each side, every run of non-word
    characters, digits and underscores is replaced by a single space, and the
    result is lowercased.

    :param text: the raw text
    :return: the normalised text, always starting and ending with a space"""
    padded = " " + text + " "
    collapsed = re.sub(r"[\W\d_]+", " ", padded)
    return collapsed.lower()
def sample_text(text: str, segment_length: int) -> str:
    """Pick up to three whitespace-delimited excerpts spread across a long text.

    Texts shorter than 4*segment_length are returned unchanged. Otherwise one
    excerpt is searched for around each quarter point of the text (1/4, 2/4, 3/4).
    An excerpt starts right after the first whitespace found at or after
    the search position, spans at least segment_length characters and is extended
    up to (not including) the next whitespace. Excerpts that cannot be found
    are skipped.

    :param text: the source text
    :param segment_length: minimum length of a single excerpt
    :return: the whole text if it is short, otherwise the found excerpts joined by spaces"""
    n = len(text)
    if n < 4 * segment_length:
        return text

    # DOTALL lets an excerpt span line breaks. Without it, raw multi-line text
    # whose lines are shorter than segment_length would never match and the
    # function would return an empty string.
    regexp = re.compile(fr"\s(.{{{segment_length}}}.*?)\s", re.DOTALL)
    half_segment = segment_length // 2
    # start searching half a segment before each quarter point
    starts = (n * i // 4 - half_segment for i in range(1, 4))
    matches = (regexp.search(text, start) for start in starts)
    return " ".join(m.group(1) for m in matches if m)
def extract_kgram_counts(text: str, k: int) -> dict[str, int]:
    """Count every substring of length k in the text, ignoring whitespace-only ones.

    :param text: the source text
    :param k: length of the kgrams to count. 1 for letters, 2 for bigrams, ...
    :return: a dict mapping each kgram to the number of its occurrences in the text"""
    tally = {}
    for start in range(len(text) - k + 1):
        kgram = text[start:start + k]
        # kgrams made only of whitespace carry no information
        if not kgram.isspace():
            tally[kgram] = tally.get(kgram, 0) + 1
    return tally
def extract_ngram_counts(text: str) -> dict[str, int]:
    """Count all unigrams, bigrams and trigrams in the text.

    :param text: the source text
    :return: a single dict mapping every 1-, 2- and 3-gram to its count in the text"""
    merged = {}
    for size in (1, 2, 3):
        merged |= extract_kgram_counts(text, size)
    return merged
def rank_ngram_counts(counts: dict[str, Union[int, float]]) -> dict[str, int]:
    """Rank the ngrams from the most frequent and keep only the top TOP_NGRAM_COUNT.

    Ties in count are broken by ngram length (shorter first), then alphabetically.

    :param counts: a dict mapping ngrams to their counts
    :return: a dict mapping ngrams to their rank (the most frequent: 0, the second: 1, ...)"""
    def sort_key(ngram):
        return (-counts[ngram], len(ngram), ngram)

    best_ngrams = sorted(counts, key=sort_key)[:TOP_NGRAM_COUNT]
    return {ngram: rank for (rank, ngram) in enumerate(best_ngrams)}
def extract_ranked_ngrams(text: str) -> dict[str, int]:
    """Count the 1- to 3-grams of the text and convert the counts to ranks.

    :param text: the source text
    :return: a dict mapping ngrams to their ranks {most_common_ngram: 0, second: 1, ...}"""
    return rank_ngram_counts(extract_ngram_counts(text))
class Sample:
    """A ranking of ngrams together with the language it belongs to."""

    def __init__(self, language: str, ranked_ngrams: dict[str, float]):
        """Build a Sample directly from a language and an ngram ranking.

        Prefer Sample.extract or Sample.load in ordinary use.

        :param language: language code of the sample
        :param ranked_ngrams: a dict mapping ngrams to their ranks"""
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text: str, language="??") -> "Sample":
        """Build a Sample from the ngrams found in a text.

        The text is sampled, preprocessed and sampled again before the ngrams are ranked.

        :param text: a string, from which to extract the ngrams into a Sample
        :param language: language code if it is known (cs|de|en|...)"""
        raw_excerpt = sample_text(text, SINGLE_SEGMENT_LENGTH)
        cleaned = preprocess(raw_excerpt)
        sample = sample_text(cleaned, SINGLE_SAMPLE_LENGTH)
        ranking = extract_ranked_ngrams(sample)
        return cls(language, ranking)

    @classmethod
    def load(cls, exported: dict) -> "Sample":
        """Rebuild a Sample from a dict produced by Sample.export().

        The position of an ngram in the list becomes its rank.

        :param exported: {"language": str, "ngrams": [str, ...]}"""
        ranking = {}
        for (position, ngram) in enumerate(exported["ngrams"]):
            ranking[ngram] = position
        return cls(exported["language"], ranking)

    def export(self) -> dict:
        """Convert the Sample to a plain dict. Complement to Sample.load()

        :return: {"language": str, "ngrams": [str, ...]} with the ngrams ordered by rank"""
        by_rank = sorted(self.ranked_ngrams, key=self.ranked_ngrams.get)
        return {"language": self.language, "ngrams": by_rank}

    def compare(self, other: "Sample") -> int:
        """Compute the distance of self from other; lower means more similar.

        Every ngram of self contributes the absolute difference of its ranks in both samples,
        or the size of other's ranking if other does not contain it.
        The method is asymmetric. Use sample.compare(model), not model.compare(sample).

        :param other: a reference model in known language
        :return: sum of the contributions of all ngrams of self"""
        reference = other.ranked_ngrams
        missing_penalty = len(reference)
        total = 0
        for (ngram, rank) in self.ranked_ngrams.items():
            if ngram in reference:
                total += abs(rank - reference[ngram])
            else:
                total += missing_penalty
        return total
def load_models(model_path: str) -> list[Sample]:
    """Read exported language models from a gzipped JSON file.

    :param model_path: path to the gzipped, UTF-8 encoded JSON file with a list of exported Samples
    :return: the models as a list of Samples"""
    with gzip.open(model_path, mode="rt", encoding="utf-8") as stream:
        exported_models = json.load(stream)
    return [Sample.load(exported) for exported in exported_models]
def identify(text: str, models=None) -> str:
    """Return the language closest to text among the models.

    When no models are supplied (None or an empty list), the bundled models are loaded
    from MODEL_PATH on the first call and cached in the module-level MODEL for later calls.

    :param text: the text to identify
    :param models: list of models (Samples) to choose from. The default is loaded from MODEL_PATH
    :return: best matching language (cs, de, en, ...)"""
    global MODEL

    if not models:
        if not MODEL:
            MODEL = load_models(MODEL_PATH)
        models = MODEL

    sample = Sample.extract(text)
    # the lowest score from compare() marks the most similar model
    return sorted(models, key=sample.compare)[0].language
|