diff --git a/languedoc.py b/languedoc.py --- a/languedoc.py +++ b/languedoc.py @@ -1,9 +1,8 @@ import os -import re import random import itertools -from shared import identify, extract_ngram_freqs, TOP_NGRAM_COUNT +from shared import preprocess, identify, extract_ngram_freqs, rank_ngram_freqs, Sample random.seed(19181028) @@ -11,11 +10,6 @@ CROSSVALIDATION_SOURCE_COUNT = 5 TEST_LENS = [8, 16, 32, 64] -def preprocess(text): - text = re.sub(r"[\W\d_]+", " ", " "+text+" ") - return text.lower() - - def merge_ngram_freqs(freqs): n = len(freqs) res = dict() @@ -28,78 +22,28 @@ def merge_ngram_freqs(freqs): return res -class Sample: - def __init__(self, language="??", text=""): - self.language = language - self.frequencies = dict() - self._ranked_ngrams = dict() - - if text: - self._extract(text) - - def _extract(self, text): - for k in range(1, 4): - self.frequencies.update(extract_ngram_freqs(text, k)) - - @staticmethod - def merge(samples): - assert len({x.language for x in samples}) == 1 - - res = Sample(samples[0].language) - res.frequencies = merge_ngram_freqs([x.frequencies for x in samples]) - - return res - - @property - def ranked_ngrams(self): - if not self._ranked_ngrams: - ordered_ngrams = sorted(self.frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] - self._ranked_ngrams = dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0))) - - return self._ranked_ngrams - - def compare(self, other): - """take k most common - use frequencies x order - use letter, digrams, trigrams - use absolute x square""" - res = sum(abs(v-other.ranked_ngrams.get(k, len(other.ranked_ngrams))) for (k, v) in self.ranked_ngrams.items()) + \ - sum(abs(v-self.ranked_ngrams.get(k, len(self.ranked_ngrams))) for (k, v) in other.ranked_ngrams.items()) - - return res - - def print_overview(self): - print(f"Sample({self.language}):") - - for freqs in self.frequencies: - x = [ - (k, round(v, 3)) - for (k, v) in sorted(freqs.items(), key=lambda kv: -kv[1]) - ] - print(" ", x[:8], "...", x[-8:]) - - print() - - class SampleSet: def __init__(self, language): self.language = language self.texts = [] - self.samples = [] + self.frequencies = [] def add(self, text): self.texts.append(text) - self.samples.append(Sample(self.language, text)) + self.frequencies.append(extract_ngram_freqs(text)) def create_model(self): - return Sample.merge(self.samples) + merged_frequencies = merge_ngram_freqs(self.frequencies) + res = Sample(self.language, rank_ngram_freqs(merged_frequencies)) + return res def generate_tests(self, n): - for (i, (text, sample)) in enumerate(itertools.cycle(zip(self.texts, self.samples))): + for (i, (text, freqs)) in enumerate(itertools.cycle(zip(self.texts, self.frequencies))): if i >= n: break - yield (text, Sample.merge([x for x in self.samples if x is not sample])) + ranked_ngrams = rank_ngram_freqs(merge_ngram_freqs([f for f in self.frequencies if f is not freqs])) + yield (text, Sample(self.language, ranked_ngrams)) def cross_validate(sample_sets): diff --git a/shared.py b/shared.py --- a/shared.py +++ b/shared.py @@ -1,9 +1,15 @@ +import re import itertools TOP_NGRAM_COUNT = 5000 -def extract_ngram_freqs(text, k): +def preprocess(text): + text = re.sub(r"[\W\d_]+", " ", " "+text+" ") + return text.lower() + + +def extract_kgram_freqs(text, k): n = len(text) d = dict() @@ -19,26 +25,44 @@ def extract_ngram_freqs(text, k): return {key: val/count for (key, val) in d.items()} -class Sample: - def __init__(self, language="??", text=""): - self.language = language - self.frequencies = dict() - self._ranked_ngrams = dict() +def extract_ngram_freqs(text): + frequencies = {} + + for k in range(1, 4): + frequencies.update(extract_kgram_freqs(text, k)) + + return frequencies + - if text: - self._extract(text) +def rank_ngram_freqs(frequencies): + ordered_ngrams = sorted(frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] + return dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0))) + + +def extract_ranked_ngrams(text): + frequencies = extract_ngram_freqs(text) + return rank_ngram_freqs(frequencies) + - def _extract(self, text): - for k in range(1, 4): - self.frequencies.update(extract_ngram_freqs(text, k)) +class Sample: + def __init__(self, language, ranked_ngrams): + self.language = language + self.ranked_ngrams = ranked_ngrams + + @classmethod + def extract(cls, text, language="??"): + return cls(language, extract_ranked_ngrams(preprocess(text))) - @property - def ranked_ngrams(self): - if not self._ranked_ngrams: - ordered_ngrams = sorted(self.frequencies.items(), key=lambda kv: -kv[1])[:TOP_NGRAM_COUNT] - self._ranked_ngrams = dict(zip([key for (key, freq) in ordered_ngrams], itertools.count(0))) + @classmethod + def load(cls, exported): + ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])} + return cls(exported["language"], ranked_ngrams) - return self._ranked_ngrams + def export(self): + return { + "language": self.language, + "ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])] + } def compare(self, other): """take k most common @@ -51,20 +75,8 @@ class Sample: return res - def print_overview(self): - print(f"Sample({self.language}):") - - for freqs in self.frequencies: - x = [ - (k, round(v, 3)) - for (k, v) in sorted(freqs.items(), key=lambda kv: -kv[1]) - ] - print(" ", x[:8], "...", x[-8:]) - - print() - def identify(text, models): - sample = Sample(text=text) + sample = Sample.extract(text) return sorted(models, key=lambda m: m.compare(sample))[0].language