Add a basic SGML parser
I tried BeautifulSoup, which was okay, but it lacked an understanding of how OFX does SGML. That's fine; writing my own parser was not that big of a deal.
This commit is contained in:
parent
104289418b
commit
95244d2974
|
@ -0,0 +1,17 @@
|
||||||
|
import vanth.sgml
|
||||||
|
|
||||||
|
|
||||||
|
def child_values(node):
    """Return ``(name, value)`` tuples for the direct children of *node*.

    The tuples appear in the same order as ``node.children``.
    """
    pairs = []
    for kid in node.children:
        pairs.append((kid.name, kid.value))
    return pairs
|
||||||
|
|
||||||
|
def test_siblings():
    """Leaf tags without closing tags end up as siblings under their container."""
    document = "<A><B><C>1<D>2<E>3</B></A>"
    expected_children = [('C', '1'), ('D', '2'), ('E', '3')]

    root = vanth.sgml.parse(document)

    assert root.name == 'A'
    assert child_values(root['B']) == expected_children
|
||||||
|
|
||||||
|
def test_closing():
    """A closing tag ends its container so the next tag attaches one level up."""
    document = "<A><B><C>1</B><D><E>2</D></A>"

    root = vanth.sgml.parse(document)

    assert root.name == 'A'
    # Both containers hang off the root and carry an empty value.
    assert child_values(root) == [('B', ''), ('D', '')]
    # Each container holds exactly the leaf that was opened inside it.
    first_container = root['B']
    second_container = root['D']
    assert child_values(first_container) == [('C', '1')]
    assert child_values(second_container) == [('E', '2')]
|
|
@ -0,0 +1,66 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Module-level logger; parse() emits its state-machine trace here at DEBUG level.
LOGGER = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class Node():  # pylint: disable=too-few-public-methods
    """A single element in a parsed SGML tree.

    Attributes:
        children: List of child ``Node`` objects, in insertion order.
        name: The tag name of this element.
        parent: The enclosing ``Node``, or ``None`` for a root element.
        value: The text content of the element, or ``None`` if never set.
    """

    def __init__(self, parent, name, children=None, value=None):
        """Create a node and, when *parent* is given, attach it to that parent.

        Args:
            parent: The enclosing ``Node`` or ``None`` for a root.
            name: The tag name.
            children: Optional initial list of children; a new empty list is
                used when omitted or empty.
            value: Optional text content.
        """
        self.children = children or []
        self.name = name
        self.parent = parent
        self.value = value
        # Compare against None explicitly so attachment never depends on how
        # a parent object happens to evaluate in a boolean context.
        if parent is not None:
            parent.children.append(self)

    def __getitem__(self, key):
        """Return the first direct child whose name equals *key*.

        Returns ``None`` when no child has that name.
        """
        for child in self.children:
            if child.name == key:
                return child
        return None

    def __iter__(self):
        """Iterate over the direct children in order.

        Without this method Python falls back to calling ``__getitem__`` with
        0, 1, 2, ... until ``IndexError`` is raised; ``__getitem__`` never
        raises, so ``for child in node`` and ``x in node`` would never end.
        """
        return iter(self.children)

    def __repr__(self):
        return "SGMLNode {} ({})".format(self.name, self.parent.name if self.parent else None)
|
||||||
|
|
||||||
|
def parse(content):
    """Parse OFX-flavoured SGML text into a tree of ``Node`` objects.

    The input is scanned one character at a time.  Leaf elements do not need
    closing tags: an opening tag that is immediately followed by another tag
    becomes the parent of the elements that follow, while an opening tag
    followed by text is a leaf whose ``value`` is that text.  A closing tag
    looks upward from the most recently opened element for the nearest
    element with the same name; later elements attach to that element's
    parent.

    Every character between tags counts as content, whitespace included, so
    a container's opening tag must be directly followed by its first child.

    Args:
        content: The SGML text to parse.

    Returns:
        The topmost ancestor of the last element that was opened, or ``None``
        when ``content`` contains no complete opening tag.
    """
    state = 'node-content'
    buf = ''
    parent_node = None
    current_node = None
    for c in content:
        if c == '<':
            if state == 'node-content':
                if buf == '':
                    # No text followed the last opening tag, so that element
                    # is a container for whatever is opened next.
                    parent_node = current_node
                    LOGGER.debug("Node content was empty, setting parent node to %s", parent_node)
                if current_node is not None:
                    current_node.value = buf
                    LOGGER.debug("Set %s to %s", current_node.name, current_node.value)
            buf = ''
            state = 'node-name'
        elif c == '>':
            if state == 'node-name':
                LOGGER.debug("Saw opening tag %s. With parent %s", buf, parent_node)
                state = 'node-content'
                current_node = Node(parent_node, buf)
                buf = ''
            elif state == 'closing-tag':
                LOGGER.debug("Saw closing tag %s", buf)
                state = 'closed-tag'
                # Climb from the last opened element to the element being
                # closed (or the root if nothing matches).  A closing tag seen
                # before any opening tag leaves nothing to climb.
                closed = current_node
                while closed is not None and closed.parent is not None and closed.name != buf:
                    closed = closed.parent
                parent_node = closed.parent if closed is not None else None
                buf = ''
                LOGGER.debug("Set new parent to %s", parent_node.name if parent_node is not None else None)
        elif c == '/' and buf == '' and state == 'node-name':
            # Only a slash directly after '<' starts a closing tag; a slash at
            # the start of element text is ordinary content.
            state = 'closing-tag'
            parent_node = current_node.parent if current_node is not None else None
        else:
            buf += c

    if current_node is None:
        # No element was ever opened, so there is no tree to return.
        return None

    root = current_node
    while root.parent is not None:
        root = root.parent
    # Dump the tree through the module logger rather than stdout, and only
    # build the dump when it will actually be emitted.
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug("Parsed tree:\n%s", pformat(root))
    return root
|
||||||
|
|
||||||
|
def pformat(node, indent=0):
    """Render *node* and its descendants as an indented, multi-line string.

    Each element is written as ``name: value`` on its own line, prefixed with
    one tab per nesting level; *indent* is the nesting level of *node* itself.
    """
    heading = "{}{}: {}".format('\t' * indent, node.name, node.value)
    if not node.children:
        return heading
    rendered = [pformat(kid, indent + 1) for kid in node.children]
    return heading + "\n" + '\n'.join(rendered)
|
Loading…
Reference in New Issue