From 95244d29744003459b41e8d6bdc8ba2ebbd4ea4f Mon Sep 17 00:00:00 2001
From: Eli Ribble <eli@authentise.com>
Date: Wed, 22 Jun 2016 14:08:13 -0600
Subject: [PATCH] Add a basic SGML parser

I tried BeautifulSoup, which was okay, but it was missing an understanding
of how OFX does SGML. That's fine; writing my own parser was not that
big of a deal.
---
 tests/test_sgml.py | 17 ++++++++++++
 vanth/sgml.py      | 66 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 tests/test_sgml.py
 create mode 100644 vanth/sgml.py

diff --git a/tests/test_sgml.py b/tests/test_sgml.py
new file mode 100644
index 0000000..f6f869a
--- /dev/null
+++ b/tests/test_sgml.py
@@ -0,0 +1,21 @@
+import vanth.sgml
+
+
+def child_values(node):
+    """List the (name, value) pair of every direct child of ``node``."""
+    return [(kid.name, kid.value) for kid in node.children]
+
+def test_siblings():
+    """Consecutive unclosed leaf elements all land under the enclosing element."""
+    root = vanth.sgml.parse("<A><B><C>1<D>2<E>3</B></A>")
+    assert root.name == 'A'
+    expected = [('C', '1'), ('D', '2'), ('E', '3')]
+    assert child_values(root['B']) == expected
+
+def test_closing():
+    """After a closing tag the next element is a sibling of the closed one."""
+    root = vanth.sgml.parse("<A><B><C>1</B><D><E>2</D></A>")
+    assert root.name == 'A'
+    assert child_values(root) == [('B', ''), ('D', '')]
+    for name, expected in (('B', [('C', '1')]), ('D', [('E', '2')])):
+        assert child_values(root[name]) == expected
diff --git a/vanth/sgml.py b/vanth/sgml.py
new file mode 100644
index 0000000..38bc4f9
--- /dev/null
+++ b/vanth/sgml.py
@@ -0,0 +1,90 @@
+import logging
+
+LOGGER = logging.getLogger(__name__)
+
+class Node(): # pylint: disable=too-few-public-methods
+    """A single element of a parsed SGML tree.
+
+    Constructing a node with a parent also appends the new node to that
+    parent's ``children`` list.
+    """
+    def __init__(self, parent, name, children=None, value=None):
+        self.children   = children or []
+        self.name       = name
+        self.parent     = parent
+        self.value      = value
+        if parent is not None:
+            parent.children.append(self)
+
+    def __getitem__(self, key):
+        """Return the first child whose name equals ``key``, or None if no child matches."""
+        for child in self.children:
+            if child.name == key:
+                return child
+        return None
+
+    def __repr__(self):
+        return "SGMLNode {} ({})".format(self.name, self.parent.name if self.parent is not None else None)
+
+def parse(content):
+    """Parse OFX-style SGML text into a tree of Node objects.
+
+    An element followed directly by another opening tag becomes the parent of
+    the elements after it; an element followed by text is a leaf holding that
+    text, so leaf elements do not need closing tags.
+
+    Returns the root Node, or None if ``content`` contains no opening tag.
+    """
+    state = 'node-content'
+    buf = ''
+    parent_node = None
+    current_node = None
+    for c in content:
+        if c == '<':
+            if state == 'node-content':
+                if buf == '':
+                    # No text since the opening tag: treat that element as a container.
+                    parent_node = current_node
+                    LOGGER.debug("Node content was empty, setting parent node to %s", parent_node)
+                if current_node is not None:
+                    current_node.value = buf
+                    LOGGER.debug("Set %s to %s", current_node.name, current_node.value)
+            buf = ''
+            state = 'node-name'
+        elif c == '>':
+            if state == 'node-name':
+                LOGGER.debug("Saw opening tag %s. With parent %s", buf, parent_node)
+                state = 'node-content'
+                current_node = Node(parent_node, buf)
+                buf = ''
+            elif state == 'closing-tag':
+                LOGGER.debug("Saw closing tag %s", buf)
+                state = 'closed-tag'
+                # Walk up from the most recent node to the element being closed (or the
+                # root if nothing matches); new elements then attach to its parent.
+                closed = current_node
+                while closed is not None and closed.parent is not None and closed.name != buf:
+                    closed = closed.parent
+                parent_node = closed.parent if closed is not None else None
+                buf = ''
+                LOGGER.debug("Set new parent to %s", parent_node.name if parent_node is not None else None)
+        elif c == '/' and buf == '' and state == 'node-name':
+            # Only a '/' directly after '<' starts a closing tag; elsewhere it is plain text.
+            state = 'closing-tag'
+            parent_node = current_node.parent if current_node is not None else None
+        else:
+            buf += c
+    root = current_node or parent_node
+    if root is None:
+        # No element was ever opened (e.g. empty input), so there is no tree.
+        return None
+    while root.parent is not None:
+        root = root.parent
+    if LOGGER.isEnabledFor(logging.DEBUG):
+        LOGGER.debug("Parsed tree:\n%s", pformat(root))
+    return root
+
+def pformat(node, indent=0):
+    """Return a tab-indented, multi-line rendering of ``node`` and its descendants."""
+    children = '\n'.join(pformat(child, indent+1) for child in node.children)
+    return "{}{}: {}{}".format('\t' * indent, node.name, node.value, "\n" + children if node.children else '')