Changeset - a27c2661a846
[Not reviewed]
default
0 1 0
Laman - 2 years ago 2022-10-10 22:12:27

fixed the prediction code to match the original paper
1 file changed with 8 insertions and 5 deletions:
0 comments (0 inline, 0 general)
shared.py
Show inline comments
 
import re
 
import itertools
 

	
 
# Maximum number of most frequent n-grams kept in a ranked profile
# (used as the slice bound in rank_ngram_freqs).
TOP_NGRAM_COUNT = 3000
 

	
 

	
 
def preprocess(text):
	"""Normalize raw text before n-gram extraction.

	The text is padded with one space on each side, every run of non-word
	characters, digits and underscores is collapsed into a single space,
	and the result is lowercased.
	"""
	padded = " " + text + " "
	letters_only = re.sub(r"[\W\d_]+", " ", padded)
	return letters_only.lower()
 

	
 

	
 
def extract_kgram_freqs(text, k):
	"""Return the relative frequency of every k-gram occurring in text.

	Every substring of length k is counted, except those made up only of
	whitespace. Each count is then divided by the total number of counted
	k-grams. When nothing was counted the result is an empty dict.
	"""
	last_start = len(text) - k
	grams = (text[start:start + k] for start in range(last_start + 1))

	counts = {}
	for gram in grams:
		# whitespace-only k-grams are skipped
		if not gram.isspace():
			counts[gram] = counts.get(gram, 0) + 1

	total = sum(counts.values())
	return {gram: hits / total for gram, hits in counts.items()}
 

	
 

	
 
def extract_ngram_freqs(text):
	"""Collect the relative frequencies of all 1-, 2- and 3-grams of text.

	Frequencies are computed separately for each n-gram length by
	extract_kgram_freqs and merged into a single dict, shortest n-grams first.
	"""
	return {
		ngram: freq
		for size in (1, 2, 3)
		for (ngram, freq) in extract_kgram_freqs(text, size).items()
	}
 

	
 

	
 
def rank_ngram_freqs(frequencies):
	"""Turn an n-gram -> frequency mapping into an n-gram -> rank mapping.

	N-grams are ordered by descending frequency (ties keep their original
	order), only the first TOP_NGRAM_COUNT are kept, and each is assigned
	its zero-based position in that ordering.
	"""
	by_frequency = sorted(frequencies, key=lambda ngram: -frequencies[ngram])
	top_ngrams = by_frequency[:TOP_NGRAM_COUNT]
	return {ngram: rank for (rank, ngram) in enumerate(top_ngrams)}
 

	
 

	
 
def extract_ranked_ngrams(text):
	"""Build the ranked n-gram profile of text.

	Computes the n-gram frequencies of text and converts them to ranks.
	"""
	return rank_ngram_freqs(extract_ngram_freqs(text))
 

	
 

	
 
class Sample:
	"""A ranked n-gram profile labelled with a language."""

	def __init__(self, language, ranked_ngrams):
		"""Store the language label and the n-gram -> rank mapping."""
		self.language = language
		self.ranked_ngrams = ranked_ngrams

	@classmethod
	def extract(cls, text, language="??"):
		"""Build a sample from raw text.

		The text is preprocessed and converted to ranked n-grams; the
		language label defaults to "??".
		"""
		return cls(language, extract_ranked_ngrams(preprocess(text)))

	@classmethod
	def load(cls, exported):
		"""Rebuild a sample from a dict in the format produced by export().

		The position of each n-gram in exported["ngrams"] becomes its rank.
		"""
		ranked_ngrams = {key: order for (order, key) in enumerate(exported["ngrams"])}
		return cls(exported["language"], ranked_ngrams)

	def export(self):
		"""Return a dict with the language and the n-grams ordered by rank."""
		return {
			"language": self.language,
			"ngrams": [key for (key, order) in sorted(self.ranked_ngrams.items(), key=lambda key_order: key_order[1])]
		}

	def compare(self, other):
		"""Return the out-of-place distance from this sample to other.

		For every n-gram of this sample, the absolute difference between its
		rank here and its rank in other is added up. An n-gram that other
		does not contain contributes the maximum penalty, which is the number
		of n-grams in other. N-grams present only in other are ignored, so
		the measure is not symmetric. Lower values mean more similar profiles.
		"""
		max_penalty = len(other.ranked_ngrams)

		return sum(
			(abs(rank - other.ranked_ngrams[ngram]) if ngram in other.ranked_ngrams else max_penalty)
			for (ngram, rank) in self.ranked_ngrams.items()
		)
 

	
 

	
 
def identify(text, models):
	"""Return the language of the model closest to text.

	A Sample is extracted from text and its distance to every model is
	computed with sample.compare(model), i.e. the n-grams of the text are
	looked up in each model. The language of the model with the smallest
	distance is returned; on ties the earliest model in models wins.

	Raises IndexError when models is empty.
	"""
	sample = Sample.extract(text)

	# The direction matters: compare() is not symmetric, and the distance
	# must be measured from the text's profile to the model's profile.
	ranked_models = sorted(models, key=lambda model: sample.compare(model))
	return ranked_models[0].language
0 comments (0 inline, 0 general)