# Post-patch form of the n-gram helpers from src/languedoc/predict.py.
# The former *_freqs helpers are named *_counts and return raw occurrence
# counts instead of per-k relative frequencies.
# ``TOP_NGRAM_COUNT`` is a module-level constant defined outside this hunk.


def extract_kgram_counts(text, k):
    """Count every k-character substring of *text*.

    Windows made up solely of whitespace are skipped.

    Args:
        text: The string to scan.
        k: Window length in characters.

    Returns:
        A dict mapping each k-gram to its number of occurrences. The dict is
        empty when *text* is shorter than *k*.
    """
    counts = {}

    for start in range(len(text) - k + 1):
        key = text[start:start + k]
        if key.isspace():
            continue

        counts[key] = counts.get(key, 0) + 1

    return counts


def extract_ngram_counts(text):
    """Count all 1-, 2- and 3-grams of *text* in a single dict.

    Keys of different lengths can never be equal, so merging the per-length
    results with ``update`` does not overwrite any count.
    """
    counts = {}

    for k in range(1, 4):
        counts.update(extract_kgram_counts(text, k))

    return counts


def rank_ngram_counts(counts, limit=None):
    """Rank n-grams by descending count and keep the top entries.

    Ties are broken by n-gram length (shorter first) and then by the n-gram
    itself, so the ranking is deterministic.

    Args:
        counts: Mapping of n-gram to occurrence count.
        limit: Maximum number of n-grams to keep. ``None`` (the default)
            uses the module-level ``TOP_NGRAM_COUNT``.

    Returns:
        A dict mapping each retained n-gram to its zero-based rank, inserted
        in rank order.
    """
    if limit is None:
        limit = TOP_NGRAM_COUNT

    ordered_ngrams = sorted(
        counts.items(),
        key=lambda kv: (-kv[1], len(kv[0]), kv[0]),
    )[:limit]

    return {ngram: rank for rank, (ngram, _) in enumerate(ordered_ngrams)}


def extract_ranked_ngrams(text):
    """Return the rank table of the most frequent 1- to 3-grams in *text*."""
    counts = extract_ngram_counts(text)
    return rank_ngram_counts(counts)