Changeset - 3980aeb455b0
[Not reviewed]
default
0 1 0
Laman - 3 years ago 2022-09-28 22:56:06

sorted input files
1 file changed with 2 insertions and 3 deletions:
0 comments (0 inline, 0 general)
languedoc.py
Show inline comments
 
@@ -116,13 +116,12 @@ def cross_validate(sample_sets):
 
	models = [s.create_model() for s in sample_sets]
 
	score = 0
 
	max_score = 0
 

	
 
	for s in sample_sets:
 
		for (test_text, partial_model) in s.generate_tests():
 
			partial_model.print_overview()
 
			real_lang = partial_model.language
 
			test_models = [partial_model] + [m for m in models if m.language != real_lang]
 

	
 
			for k in TEST_LENS:
 
				j = random.randrange(0, len(test_text)-k)
 
				t = test_text[j:j+k]
 
@@ -139,23 +138,23 @@ def identify(text, models):
 
	sample = Sample(text=text)
 

	
 
	return min(map(lambda m: (m.compare(sample), m.language), models))[1]
 

	
 

	
 
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 
LANG_DIRS = [x.path for x in os.scandir(DATA_DIR)]
 
LANG_DIRS = sorted([x.path for x in os.scandir(DATA_DIR)])
 

	
 
if __name__ == "__main__":
 
	samples = []
 

	
 
	for d in LANG_DIRS:
 
		lang = os.path.basename(d)
 
		lang_samples = SampleSet(lang)
 
		samples.append(lang_samples)
 

	
 
		for file in os.scandir(d):
 
		for file in sorted(os.scandir(d), key=lambda f: f.name):
 
			with open(file) as f:
 
				text = f.read()
 
				text = preprocess(text)
 
				print(f"{file.name} ({len(text)})")
 
				print(text[:256])
 
				print()
0 comments (0 inline, 0 general)