Changeset - 5982aac96259
[Not reviewed]
default
0 2 0
Laman - 8 years ago 2017-02-08 22:36:20

optimized text parsing
2 files changed with 37 insertions and 20 deletions:
0 comments (0 inline, 0 general)
src/sgfParser/propValues.py
Show inline comments
 
modified file chmod 100644 => 100755
 
import re
 

	
 
from . import ParserError, skipWhitespace
 

	
 

	
 
class Regexp:
	"""Precompiled regular expressions used by the property value parsers."""

	# integer with an optional sign
	number=re.compile(r"(\+|-|)\d+")
	# decimal number with an optional sign and an optional fractional part
	real=re.compile(r"(\+|-|)\d+(\.\d+)?")
	# exactly two ASCII letters, or the empty string
	point=re.compile(r"[a-zA-Z]{2}|")
	# shortest run of characters that is followed by a ']' preceded by an even number of backslashes
	text=re.compile(r"(?:.*?[^\\])??(?:\\\\)*(?=])", re.DOTALL)
	# same as text, but a ':' ends the value as well
	composedText=re.compile(r"(?:.*?[^\\])??(?:\\\\)*(?=]|:)", re.DOTALL)

	class Text:
		"""Patterns for post-processing a matched text value.

		Group 1 of softBreaks and removeSlashes is an empty group holding only
		a lookbehind ("not preceded by a backslash"). It keeps the group numbers
		expected by the substitution templates (\\1\\2 and \\1\\2\\4) without
		consuming the preceding character, so escapes directly following each
		other (e.g. \\a\\b) are all processed.
		"""

		# backslash followed by a line break, after an even number of backslashes
		softBreaks=re.compile(r"((?<!\\))((\\\\)*)\\((\n\r)|(\r\n)|\r|\n)")
		# whitespace other than space and line breaks
		whitespace=re.compile(r"[\t\f\v]")
		# whitespace other than space, line breaks included
		simpleWhitespace=re.compile(r"[\t\f\v\n\r]")
		# single escaping backslash followed by a non-backslash character or by the end of the string
		removeSlashes=re.compile(r"((?<!\\))((\\\\)*)\\($|[^\\])")
		# escaped backslash
		unescapeSlashes=re.compile(r"\\\\")
 

	
 

	
 
class Composed:
	"""A pair of values, printed as the two parts joined by a colon."""

	def __init__(self, a=None, b=None):
		# both parts are stored as given
		self.a, self.b = a, b

	def __str__(self):
		template = "{0}:{1}"
		return template.format(self.a, self.b)
 

	
 

	
 
class Point:
	"""Board coordinates stored as zero-based column (c) and row (r) indices."""

	def __init__(self,c,r):
		self.r=r
		self.c=c

	def __iter__(self):
		"""Yield the column first, then the row."""
		yield self.c
		yield self.r

	def __str__(self):
		"""Return the two-letter form of the point, column letter first.

		Indices 0-25 map to 'a'-'z'. Indices from 26 up map to 'A' onwards,
		which is the inverse of the mapping used by the point parser in this
		module; previously they produced characters following 'z'.
		"""
		def letter(i):
			if i<26: return chr(ord("a")+i)
			return chr(ord("A")+i-26)
		return letter(self.c)+letter(self.r)
 

	
 
@@ -103,75 +112,67 @@ def number(s,start):
 

	
 
def real(s,start):
	"""Parse a real number located at index start of s.

	Returns a pair (index just past the number, the number as a float).
	Raises ParserError if s does not match Regexp.real at start.
	"""
	match=Regexp.real.match(s,start)
	if match is None:
		message="expected a real number matching '{0}'".format(Regexp.real.pattern)
		raise ParserError(message,s,start)
	return (match.end(),float(match.group(0)))
 

	
 

	
 
def double(s,start):
	"""Parse a double value: the single character '1' or '2'.

	s -- the string being parsed
	start -- index of the character to read

	Returns a pair (start+1, the character).
	Raises ParserError if the character is neither '1' nor '2', or if start
	lies at or past the end of s (formerly an IndexError).
	"""
	c=s[start] if start<len(s) else "" # "" is never an accepted value
	if c not in ("1", "2"):
		raise ParserError("expected a double value, either '1' or '2'",s,start)
	return (start+1,c)
 

	
 

	
 
def color(s,start):
	"""Parse a color value: the single character 'B' or 'W'.

	s -- the string being parsed
	start -- index of the character to read

	Returns a pair (start+1, the character).
	Raises ParserError if the character is neither 'B' nor 'W', or if start
	lies at or past the end of s (formerly an IndexError).
	"""
	c=s[start] if start<len(s) else "" # "" is never an accepted value
	if c not in ("B", "W"):
		raise ParserError("expected a color value, either 'B' or 'W'",s,start)
	return (start+1,c)
 

	
 

	
 
def text(simple=True,composed=False):
	"""Create a parser of text values.

	simple -- if true, line breaks are converted to spaces like any other
		whitespace; if false, line breaks are kept
	composed -- if true, the value is matched with Regexp.composedText,
		which also stops at a ':'; otherwise Regexp.text is used

	Returns a function f(s,start) returning a pair
	(index where the value ends, the processed text).
	"""
	def f(s,start):
		regexps=Regexp.Text
		boundary=Regexp.composedText if composed else Regexp.text
		m=boundary.match(s,start)
		if m is None: # no terminator found in the rest of the input
			raise ParserError("expected a text value matching '{0}'".format(boundary.pattern),s,start)
		res=m.group(0)
		res=regexps.softBreaks.sub(r"\1\2",res) # remove soft line breaks
		if simple:
			res=regexps.simpleWhitespace.sub(" ",res) # convert whitespace to spaces, line breaks included
		else:
			res=regexps.whitespace.sub(" ",res) # convert whitespace to spaces, line breaks kept
		res=regexps.removeSlashes.sub(r"\1\2\4",res) # drop the escaping backslashes
		res=regexps.unescapeSlashes.sub(r"\\",res) # unescape slashes

		return (m.end(),res)
	return f
 

	
 

	
 
def empty(s,start):
	"""Parse an empty value: consume nothing and yield an empty string."""
	value=""
	return (start,value)
 

	
 

	
 
def anything(s,start):
	"""Read a raw value up to the first unescaped ']'.

	s -- the string being parsed
	start -- index where the value begins

	Returns a pair (index of the closing bracket, the raw text before it,
	escapes left untouched). If no unescaped ']' follows, the index of the
	last character of s is returned together with the text before it; if
	start is at or past the end of s, the result is (start, "").

	The string is scanned by index, so the rest of the input is not copied
	on every call.
	"""
	escaped=False
	pos=start
	for pos in range(start,len(s)):
		ch=s[pos]
		if escaped: escaped=False # an escaped character never ends the value
		elif ch=="\\": escaped=True
		elif ch=="]": break
	return (pos,s[start:pos])
 

	
 

	
 
# go specific
 
def point(s,start):
	"""Parse a point value located at index start of s.

	Returns a pair (index just past the value, the value). The value is an
	empty tuple for an empty match and a Point(col,row) otherwise.
	Lowercase letters give indices 0-25, uppercase letters 26-51.
	Raises ParserError if Regexp.point does not match.
	"""
	def index(letter):
		if "a"<=letter<="z":
			return ord(letter)-ord("a")
		return ord(letter)-(ord("A")-26)

	match=Regexp.point.match(s,start) # !! limit to board size
	if match is None:
		raise ParserError("expected a point value matching '{0}'".format(Regexp.point.pattern),s,start)
	letters=match.group(0)
	end=match.end()
	if letters=="": # pass, !! tt
		return (end,tuple())
	colIndex=index(letters[0])
	rowIndex=index(letters[1])
	return (end,Point(colIndex,rowIndex))
 

	
 

	
 
# move and stone values are parsed by the very same function as points
move=point

stone=point
src/tests/testSgfParser.py
Show inline comments
 
from itertools import chain
 
import unittest
 
from unittest import TestCase
 
import os
 

	
 
from sgfParser import strRowCol
 
from sgfParser.collection import Collection
 
from sgfParser.property import Property
 
from sgfParser.propValues import text,compose
 

	
 

	
 
# path of the "data" directory located next to this test module
dataDir=os.path.join(os.path.dirname(__file__), "data")
 

	
 

	
 
class TestUtils(TestCase):
	# tests of the helper functions

	def testTextPos(self):
		# strRowCol is expected to turn a string index into a 1-based (row, column) pair
		s="abc\ndef\rgh\r\nij\n\rklmn"
		# expected column numbers, one list per row
		columnsPerRow=[
			[1,2,3,4],
			[1,2,3,4],
			[1,2,3,4],
			[1,2,3], [1], # don't care about LFCR, we unicode now
			[1,2,3,4]
		]
		expected=[
			(rowNumber,column)
			for (rowNumber,columns) in enumerate(columnsPerRow,1)
			for column in columns
		]
		for (i,position) in zip(range(len(s)+1), expected):
			self.assertEqual(strRowCol(s, i), position)
 

	
 

	
 
class TestProperty(TestCase):
	# tests of property parsing and of the text value parser

	def testName(self):
		# a property with a missing or non-letter name is expected to fail an assertion
		with self.assertRaises(AssertionError):
			Property.create("[99]",0)
		with self.assertRaises(AssertionError):
			Property.create("99[99]",0)

		i,prop=Property.create("MN[99]",0)
		self.assertNotEqual((i,prop), (0,None))
		self.assertEqual((i,prop.name), (6,"MN"))

	def testText(self):
		# backslash + line break is a soft break and disappears,
		# a plain line break becomes a space in simple text and is kept otherwise.
		# The value is spelled with explicit escapes so that it holds exactly
		# these two line breaks and no other whitespace.
		s="[abc\\\ndef\nghi]"
		self.assertEqual(text()(s,1)[1], "abcdef ghi")
		self.assertEqual(text(False)(s,1)[1], "abcdef\nghi")

		# escaped characters lose their backslash, an escaped tab turns into a space
		s="""[m\\no\\\tpqr\\]\\\\]"""
		self.assertEqual(text()(s,1)[1], "mno pqr]\\")
		self.assertEqual(text(False)(s,1)[1], "mno pqr]\\")

		# composed value: two texts separated by a colon
		s="""[abc:def]"""
		parsed=compose(text(composed=True),text(composed=True))(s,1)
		self.assertEqual(str(parsed[1]), "abc:def")
 

	
 

	
 
class TestCollection(TestCase):
	# tests of parsing whole SGF collections

	def testSubtrees(self):
		# a root node with two variations, whitespace between the nodes
		sgf=(
			"\n \n(;B[aa]"
			"\n \n\t(;W[ab]PB[Some Black]PW[Some White];B[ac])"
			"\n \n\t(;W[bb]PB[Other Black]PW[Other White])"
			"\n \n)"
		)
		collection=Collection(sgf)
		games=list(collection.listGames())

		self.assertEqual(len(games),2)
		# each variation is exported as a game of its own; its properties may come in any order
		firstExport=games[0].export()
		self.assertRegex(firstExport, r"^\(;B\[aa];(PB\[Some Black]|PW\[Some White]|W\[ab]){3};B\[ac]\)$")
		secondExport=games[1].export()
		self.assertRegex(secondExport, r"^\(;B\[aa];(PB\[Other Black]|PW\[Other White]|W\[bb]){3}\)$")

	def testEmptySgf(self):
		# the smallest game tree has to parse without raising
		Collection("(;)")

	def testSimpleSgf(self):
		# the test passes when the file parses without raising
		path=os.path.join(dataDir, "simple.sgf")
		with open(path) as f:
			Collection(f.read())

	def testComplexSgf(self):
		# the test passes when the file parses without raising
		path=os.path.join(dataDir, "kogos.sgf")
		with open(path) as f:
			Collection(f.read())
0 comments (0 inline, 0 general)