Add a basic SGML parser

I tried BeautifulSoup, which was okay, but it was missing an understanding
of how OFX does SGML. That's fine; writing my own parser was not that
big of a deal.
This commit is contained in:
Eli Ribble 2016-06-22 14:08:13 -06:00
parent 104289418b
commit 95244d2974
2 changed files with 83 additions and 0 deletions

17
tests/test_sgml.py Normal file
View File

@ -0,0 +1,17 @@
import vanth.sgml
def child_values(node):
    """Return the ``(name, value)`` pair of every direct child of ``node``.

    The pairs are listed in the same order as ``node.children``.
    """
    pairs = []
    for child in node.children:
        pairs.append((child.name, child.value))
    return pairs
def test_siblings():
    """Unclosed elements that follow each other share the same parent."""
    root = vanth.sgml.parse("<A><B><C>1<D>2<E>3</B></A>")
    expected_children = [('C', '1'), ('D', '2'), ('E', '3')]
    assert root.name == 'A'
    assert child_values(root['B']) == expected_children
def test_closing():
    """A closing tag ends its element, so the next tag becomes its sibling."""
    document = "<A><B><C>1</B><D><E>2</D></A>"
    root = vanth.sgml.parse(document)
    assert root.name == 'A'
    # Both containers hang directly off the root and carry an empty value.
    top_level = child_values(root)
    assert top_level == [('B', ''), ('D', '')]
    # Each container holds only the leaf that was opened inside it.
    first_container = root['B']
    second_container = root['D']
    assert child_values(first_container) == [('C', '1')]
    assert child_values(second_container) == [('E', '2')]

66
vanth/sgml.py Normal file
View File

@ -0,0 +1,66 @@
import logging
LOGGER = logging.getLogger(__name__)
class Node(): # pylint: disable=too-few-public-methods
    """A single element of a parsed SGML tree.

    Attributes are plain and public:

    * ``name``     -- the tag name.
    * ``value``    -- the text content, or ``None`` if none was assigned.
    * ``parent``   -- the enclosing ``Node``, or ``None`` for a root.
    * ``children`` -- list of child ``Node`` objects in insertion order.
    """

    def __init__(self, parent, name, children=None, value=None):
        """Create a node and, if ``parent`` is given, append it to ``parent.children``."""
        self.children = children or []
        self.name = name
        self.parent = parent
        self.value = value
        if parent is not None:
            parent.children.append(self)

    def __getitem__(self, key):
        """Return the first direct child named ``key``, or ``None`` if there is none."""
        for child in self.children:
            if child.name == key:
                return child
        return None

    def __iter__(self):
        """Iterate over the direct children.

        Without this method Python would fall back to calling
        ``__getitem__(0)``, ``__getitem__(1)``, ... until ``IndexError``.
        ``__getitem__`` never raises, so ``for x in node``, ``list(node)``
        and ``x in node`` would loop forever.
        """
        return iter(self.children)

    def __repr__(self):
        return "SGMLNode {} ({})".format(self.name, self.parent.name if self.parent else None)
def parse(content):
    """Parse an SGML string into a tree of ``Node`` objects.

    The parser is a character-by-character state machine. Elements do not
    need a closing tag: an element that has text before the next ``<`` is
    treated as a leaf, and the following element becomes its sibling. An
    element immediately followed by another tag (no text at all) becomes the
    parent of the elements that follow, until a closing tag names it or one
    of its ancestors.

    Note that any character between tags, including whitespace, counts as
    text content.

    :param content: the SGML text to parse.
    :returns: the root ``Node`` (found by walking up from the last node
        created), or ``None`` if ``content`` contains no opening tag.
    """
    state = 'node-content'
    buf = ''
    parent_node = None
    current_node = None
    for c in content:
        if c == '<':
            if state == 'node-content':
                if buf == '':
                    # No text since the last opening tag, so that element is
                    # a container for whatever is opened next.
                    parent_node = current_node
                    LOGGER.debug("Node content was empty, setting parent node to %s", parent_node)
                if current_node is not None:
                    current_node.value = buf
                    LOGGER.debug("Set %s to %s", current_node.name, current_node.value)
            buf = ''
            state = 'node-name'
        elif c == '>':
            if state == 'node-name':
                LOGGER.debug("Saw opening tag %s. With parent %s", buf, parent_node)
                state = 'node-content'
                current_node = Node(parent_node, buf)
                buf = ''
            elif state == 'closing-tag':
                LOGGER.debug("Saw closing tag %s", buf)
                state = 'closed-tag'
                # Walk up from the most recent node to the element being
                # closed (or the root if nothing matches); new elements then
                # attach to that element's parent.
                closing = current_node
                while closing is not None and closing.parent is not None and closing.name != buf:
                    closing = closing.parent
                # ``closing`` is None only when a closing tag appears before
                # any opening tag; there is nothing to close in that case.
                parent_node = closing.parent if closing is not None else None
                buf = ''
                LOGGER.debug("Set new parent to %s", parent_node.name if parent_node else None)
            else:
                # A '>' outside of a tag is ordinary text, not markup.
                buf += c
        elif c == '/' and state == 'node-name' and buf == '':
            # Only a '/' directly after '<' starts a closing tag; a '/' that
            # begins a text value (e.g. "<URL>/path") must stay in the value.
            state = 'closing-tag'
            parent_node = current_node.parent if current_node else None
        else:
            buf += c
    if state == 'node-content' and current_node is not None and buf != '':
        # Input ended inside an unclosed element: keep its trailing text.
        current_node.value = buf
    root = current_node or parent_node
    if root is None:
        # No opening tag was seen (e.g. empty input): there is no tree.
        return None
    while root.parent:
        root = root.parent
    if LOGGER.isEnabledFor(logging.DEBUG):
        # Rendering the tree is only worth the cost when it will be logged.
        LOGGER.debug("Parsed tree:\n%s", pformat(root))
    return root
def pformat(node, indent=0):
    """Render ``node`` and its descendants as an indented multi-line string.

    Each node is written as ``name: value`` on its own line, prefixed with
    one tab per nesting level; ``indent`` is the level of ``node`` itself.
    Children follow their parent, one level deeper.
    """
    prefix = '\t' * indent
    if node.children:
        rendered = [pformat(child, indent + 1) for child in node.children]
        tail = "\n" + '\n'.join(rendered)
    else:
        tail = ''
    return "{}{}: {}{}".format(prefix, node.name, node.value, tail)