import html5lib
import html5lib.treebuilders.dom
# Expected use:
#   curl --compressed <URL of the spec> >current-work
#   python <this script>
#
# Reads ./current-work and generates current-work-canvas.xhtml, which is
# then used to create the annotated spec document.
def extract(site_root='', base_url=''):
    """Cut the canvas section out of ``current-work`` and save it as XHTML.

    Parses the HTML file ``current-work`` (current directory) into a DOM
    tree and then:

    * removes ``<script>`` children of ``<head>``;
    * among the siblings of ``<h4 id="the-canvas-element">``, keeps only the
      first ``<header>`` element, that heading, and every node after it up
      to (not including) the next ``<h4>`` sibling;
    * trims the children of ``<header>``;
    * rewrites ``<link href>`` and ``<img src>`` values with a URL prefix;
    * adds the XHTML namespace and ``xml:lang``, drops the DOCTYPE, and
      writes the document to ``current-work-canvas.xhtml`` as UTF-8.

    Args:
        site_root: Prefix for references that start with ``/``.
        base_url: Prefix for every other non-``data:`` reference.
            NOTE(review): both prefixes were empty string literals in the
            code this replaces, so the defaults leave references unchanged;
            pass the real spec URLs to get absolute references.

    Raises:
        IndexError: If the document has no ``<head>``, no ``<header>``, or
            no ``<h4>`` whose id is ``the-canvas-element``.
        IOError / OSError: If ``current-work`` cannot be read or the output
            file cannot be written.
    """
    parser = html5lib.html5parser.HTMLParser(
        tree=html5lib.treebuilders.dom.TreeBuilder)
    # Read bytes and let the parser decode them with the stated encoding;
    # the context manager closes the handle even if parsing fails.
    with open('current-work', 'rb') as source:
        doc = parser.parse(source, encoding='utf-8')

    head = doc.getElementsByTagName('head')[0]
    # Loop over a snapshot: removing from the live childNodes list while
    # iterating it skips the node that follows each removal.  nodeName is
    # used instead of tagName because text and comment nodes have no
    # tagName attribute.
    for child in list(head.childNodes):
        if child.nodeName == 'script':
            head.removeChild(child)

    header = doc.getElementsByTagName('header')[0]
    # doc.getElementById() was reported not to find this heading, so scan
    # the h4 elements for the id instead.
    thecanvas = [h for h in doc.getElementsByTagName('h4')
                 if h.getAttribute('id') == 'the-canvas-element'][0]

    # Collect the heading plus everything up to the next h4 sibling.  The
    # None check covers the case where the canvas section is the last one.
    keep = [header, thecanvas]
    node = thecanvas.nextSibling
    while node is not None and node.nodeName != 'h4':
        keep.append(node)
        node = node.nextSibling

    # Drop every sibling of the heading that was not collected above.
    parent = thecanvas.parentNode
    for child in list(parent.childNodes):
        if child not in keep:
            parent.removeChild(child)

    # Keep only the first three and the last four children of <header>.
    # NOTE(review): these offsets presumably match the layout of the spec's
    # header at the time of writing -- confirm against the current input.
    for child in list(header.childNodes)[3:-4]:
        header.removeChild(child)

    def make_absolute(uri):
        """Return *uri* with the matching prefix; ``data:`` URIs pass through."""
        if uri.startswith('data:'):
            return uri
        # startswith() is safe on an empty string, unlike uri[0].
        if uri.startswith('/'):
            return site_root + uri
        return base_url + uri

    # Fix the stylesheet, icon and image references.  Elements without the
    # attribute are left alone rather than given a bare prefix.
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('href'):
            link.setAttribute('href', make_absolute(link.getAttribute('href')))
    for img in doc.getElementsByTagName('img'):
        if img.hasAttribute('src'):
            img.setAttribute('src', make_absolute(img.getAttribute('src')))

    # Convert to XHTML, because it's quicker to re-parse than HTML5.  The
    # root element needs the XHTML namespace for XML consumers to treat the
    # output as XHTML; an empty xmlns would put it in no namespace.
    root = doc.documentElement
    root.setAttribute('xmlns', 'http://www.w3.org/1999/xhtml')
    root.setAttribute('xml:lang', root.getAttribute('lang'))

    # Remove the DOCTYPE, but only if the first child really is one, so a
    # doctype-less input does not lose its root element.
    first = doc.firstChild
    if first is not None and first.nodeType == first.DOCUMENT_TYPE_NODE:
        doc.removeChild(first)

    # toxml(encoding=...) returns encoded bytes, so write in binary mode.
    with open('current-work-canvas.xhtml', 'wb') as out:
        out.write(doc.toxml(encoding='UTF-8'))