Files @ 7897aa8656bc
Branch filter:

Location: Languedoc/shared.py

Laman
basic gets
import re
import itertools

# Upper bound on how many of the most frequent n-grams are kept when
# frequencies are turned into a rank table (see rank_ngram_freqs).
TOP_NGRAM_COUNT = 5000


def preprocess(text):
	"""Normalize raw text before n-gram extraction.

	The text is padded with one space on each side, every run of
	non-word characters, digits and underscores is collapsed into a
	single space, and the result is lower-cased. The returned string
	therefore always starts and ends with a space.
	"""
	padded = " " + text + " "
	# Substitute first, lower-case second: lower-casing can introduce
	# characters that the pattern would otherwise strip.
	collapsed = re.sub(r"[\W\d_]+", " ", padded)
	return collapsed.lower()


def extract_kgram_freqs(text, k):
	"""Return the relative frequency of every k-character substring.

	Slides a window of width ``k`` over ``text`` and counts each
	substring, ignoring windows that consist solely of whitespace.
	Each count is divided by the total number of counted windows, so
	the returned values sum to 1 (or the dict is empty when nothing
	was counted).
	"""
	windows = (text[start:start+k] for start in range(len(text)-k+1))

	tally = {}
	for gram in windows:
		if not gram.isspace():
			tally[gram] = tally.get(gram, 0) + 1

	total = sum(tally.values())

	# When tally is empty the comprehension body never runs, so the
	# zero total is never used as a divisor.
	return {gram: hits/total for gram, hits in tally.items()}


def extract_ngram_freqs(text):
	"""Collect relative frequencies of all 1-, 2- and 3-grams of ``text``.

	Frequencies are computed separately for each n-gram length by
	extract_kgram_freqs and merged into a single dict.
	"""
	return {
		gram: freq
		for size in (1, 2, 3)
		for (gram, freq) in extract_kgram_freqs(text, size).items()
	}


def rank_ngram_freqs(frequencies):
	"""Convert a frequency table into a rank table.

	The n-grams are ordered from the most to the least frequent (ties
	keep the iteration order of ``frequencies``), only the first
	TOP_NGRAM_COUNT are kept, and each is mapped to its zero-based
	position in that ordering.
	"""
	# reverse=True keeps the sort stable, so equal frequencies stay in
	# their original relative order.
	by_frequency = sorted(frequencies, key=frequencies.get, reverse=True)
	top = by_frequency[:TOP_NGRAM_COUNT]
	return {gram: rank for (rank, gram) in enumerate(top)}


def extract_ranked_ngrams(text):
	"""Return the rank table of the n-grams found in ``text``.

	Composes extract_ngram_freqs (frequencies) with rank_ngram_freqs
	(frequency -> rank).
	"""
	return rank_ngram_freqs(extract_ngram_freqs(text))


class Sample:
	"""A language label together with a rank table of n-grams.

	``ranked_ngrams`` maps an n-gram to its rank; lower rank means the
	n-gram was more frequent in the text the sample was built from.
	"""

	def __init__(self, language, ranked_ngrams):
		self.language = language
		self.ranked_ngrams = ranked_ngrams

	@classmethod
	def extract(cls, text, language="??"):
		"""Build a sample from raw ``text``; ``language`` defaults to "??"."""
		normalized = preprocess(text)
		return cls(language, extract_ranked_ngrams(normalized))

	@classmethod
	def load(cls, exported):
		"""Rebuild a sample from the dict produced by export().

		The position of each n-gram in ``exported["ngrams"]`` becomes
		its rank.
		"""
		ranks = {}
		for (position, gram) in enumerate(exported["ngrams"]):
			ranks[gram] = position
		return cls(exported["language"], ranks)

	def export(self):
		"""Return a plain dict with the language and the n-grams listed by ascending rank."""
		table = self.ranked_ngrams
		return {
			"language": self.language,
			"ngrams": sorted(table, key=table.get),
		}

	def compare(self, other):
		"""Return a rank-distance between this sample and ``other``.

		For every n-gram in either sample the absolute difference of
		its ranks in the two samples is added up. An n-gram absent
		from one sample is given a rank equal to that sample's size.
		The sum runs over both tables, so n-grams present in both
		contribute twice. Smaller results mean more similar samples.
		"""
		mine = self.ranked_ngrams
		theirs = other.ranked_ngrams
		# Penalty ranks for n-grams that the respective table lacks.
		absent_in_theirs = len(theirs)
		absent_in_mine = len(mine)

		forward = sum(
			abs(rank - theirs.get(gram, absent_in_theirs))
			for (gram, rank) in mine.items()
		)
		backward = sum(
			abs(rank - mine.get(gram, absent_in_mine))
			for (gram, rank) in theirs.items()
		)

		return forward + backward


def identify(text, models):
	"""Return the language of the model closest to ``text``.

	A Sample is extracted from ``text`` and every model is scored with
	its compare() method against that sample; the language of the
	lowest-scoring model is returned. On ties the model that comes
	first in ``models`` wins.
	"""
	sample = Sample.extract(text)

	def distance(model):
		return model.compare(sample)

	closest_first = sorted(models, key=distance)
	return closest_first[0].language