Files
@ 3299400fb3ff
Branch filter:
Location: Languedoc/shared.py - annotation
3299400fb3ff
2.0 KiB
text/x-python
closing a dead end
f896b3675ee7 d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 d2fa9460c0fb f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 d2fa9460c0fb f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 d2fa9460c0fb f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 d2fa9460c0fb f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 f896b3675ee7 d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb 7897aa8656bc 7897aa8656bc 7897aa8656bc 7897aa8656bc d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb d2fa9460c0fb f896b3675ee7 d2fa9460c0fb d2fa9460c0fb | import re
import itertools
# Upper bound on how many of the most frequent n-grams are kept when a
# frequency table is turned into a rank table (see rank_ngram_freqs).
TOP_NGRAM_COUNT = 5000
def preprocess(text):
    """Normalise raw text before n-gram extraction.

    The text is padded with one space on each side, every run of
    non-word characters, digits and underscores is collapsed into a
    single space, and the result is lowercased.
    """
    padded = " " + text + " "
    collapsed = re.sub(r"[\W\d_]+", " ", padded)
    return collapsed.lower()
def extract_kgram_freqs(text, k):
    """Return the relative frequency of every k-character window of *text*.

    Windows made up only of whitespace are skipped. Each remaining count
    is divided by the total number of counted windows. The returned dict
    lists k-grams in order of first occurrence; it is empty when no
    window was counted.
    """
    windows = (text[start:start + k] for start in range(len(text) - k + 1))

    counts = {}
    for gram in windows:
        if not gram.isspace():
            counts[gram] = counts.get(gram, 0) + 1

    total = sum(counts.values())
    # When counts is empty the comprehension never runs, so there is no
    # division by zero.
    return {gram: occurrences / total for gram, occurrences in counts.items()}
def extract_ngram_freqs(text):
    """Collect the frequencies of all 1-, 2- and 3-grams of *text* in one dict.

    Each n-gram length is normalised separately by extract_kgram_freqs,
    then the three tables are merged in increasing order of length.
    """
    return {
        gram: freq
        for size in (1, 2, 3)
        for gram, freq in extract_kgram_freqs(text, size).items()
    }
def rank_ngram_freqs(frequencies):
    """Turn a frequency table into a rank table.

    N-grams are ordered by descending frequency (ties keep their order
    in *frequencies*), truncated to the TOP_NGRAM_COUNT most frequent,
    and mapped to their position in that order, starting at 0.
    """
    most_frequent_first = sorted(frequencies.items(), key=lambda item: -item[1])
    kept = most_frequent_first[:TOP_NGRAM_COUNT]
    return {ngram: rank for rank, (ngram, _freq) in enumerate(kept)}
def extract_ranked_ngrams(text):
    """Return the n-gram rank table of *text* (frequencies, then ranks)."""
    return rank_ngram_freqs(extract_ngram_freqs(text))
class Sample:
    """An n-gram profile: a language label plus a mapping n-gram -> rank."""

    def __init__(self, language, ranked_ngrams):
        """Store the language label and the n-gram -> rank mapping as given."""
        self.language = language
        self.ranked_ngrams = ranked_ngrams

    @classmethod
    def extract(cls, text, language="??"):
        """Build a sample from raw *text*; *language* defaults to "??"."""
        cleaned = preprocess(text)
        ranking = extract_ranked_ngrams(cleaned)
        return cls(language, ranking)

    @classmethod
    def load(cls, exported):
        """Rebuild a sample from the dict produced by export().

        The rank of each n-gram is its position in exported["ngrams"].
        """
        ranking = {}
        for position, ngram in enumerate(exported["ngrams"]):
            ranking[ngram] = position
        return cls(exported["language"], ranking)

    def export(self):
        """Return a plain dict with the language and the n-grams sorted by rank."""
        ranking = self.ranked_ngrams
        ngrams_by_rank = sorted(ranking, key=ranking.get)
        return {
            "language": self.language,
            "ngrams": ngrams_by_rank
        }

    def compare(self, other):
        """Return a rank-based distance between this sample and *other*.

        For every n-gram of each sample, add the absolute difference
        between its rank there and its rank in the opposite sample. An
        n-gram missing from the opposite sample is treated as if ranked
        at that sample's size. N-grams present in both samples are
        therefore counted once per direction. identify() treats the
        smallest value as the best match.
        """
        mine = self.ranked_ngrams
        theirs = other.ranked_ngrams
        absent_from_theirs = len(theirs)
        absent_from_mine = len(mine)

        forward = sum(
            abs(rank - theirs.get(ngram, absent_from_theirs))
            for ngram, rank in mine.items()
        )
        backward = sum(
            abs(rank - mine.get(ngram, absent_from_mine))
            for ngram, rank in theirs.items()
        )
        return forward + backward
def identify(text, models):
    """Return the language of the model whose profile is closest to *text*.

    A sample is extracted from *text*, every model in *models* is scored
    with its compare() against that sample, and the language of the
    lowest-scoring model is returned (the first one on ties).
    """
    query = Sample.extract(text)

    def distance(model):
        return model.compare(query)

    closest_first = sorted(models, key=distance)
    best = closest_first[0]
    return best.language
|