changeset 251:2e73abddad21

Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff). Finally write up what fixhtml.py is actually doing.
author Bryan O'Sullivan <bos@serpentine.com>
date Wed, 30 May 2007 21:50:21 -0700
parents 5ecf66974def
children f2061ece8ed9
files en/fixhtml.py
diffstat 1 files changed, 21 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/en/fixhtml.py	Wed May 30 20:06:05 2007 -0700
+++ b/en/fixhtml.py	Wed May 30 21:50:21 2007 -0700
@@ -1,14 +1,33 @@
 #!/usr/bin/env python
+#
+# This script attempts to work around some of the more bizarre and
+# quirky behaviours of htlatex.
+#
+# - We've persuaded htlatex to produce UTF-8, which unfortunately
+#   causes it to emit numeric character references (e.g. &#x0041;)
+#   even for the safe 7-bit ASCII subset of UTF-8.  We fix that up.
+#
+# - BUT we have to leave angle brackets (for example, redirections in
+#   shell script snippets) escaped, otherwise they'll break the
+#   generated HTML.  (Reported by Johannes Hoff.)
+#
+# - For some reason, htlatex gives a unique ID to each fancyvrb
+#   environment, which makes writing a sane, small CSS stylesheet
+#   impossible.  We squish all those IDs down to nothing.
 
 import os
 import sys
 import re
 
-unicode_re = re.compile(r'&#x00([0-7][0-9a-f]);', re.I)
+angle_re = re.compile(r'(&#x003[CE];)')
+unicode_re = re.compile(r'&#x00([0-7][0-9A-F]);')
 fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I)
 
 tmpsuffix = '.tmp.' + str(os.getpid())
 
+def hide_angle(m):
+    return m.group(1).lower()  # lower-case hex so upper-case-only unicode_re skips it
+
 def fix_ascii(m):
     return chr(int(m.group(1), 16))
 
@@ -16,6 +35,7 @@
     tmpname = name + tmpsuffix
     ofp = file(tmpname, 'w')
     for line in file(name):
+        line = angle_re.sub(hide_angle, line)
         line = unicode_re.sub(fix_ascii, line)
         line = fancyvrb_re.sub('id="fancyvrb"', line)
         ofp.write(line)