Mercurial > hgbook
changeset 251:2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Finally write up what fixhtml.py is actually doing.
author | Bryan O'Sullivan <bos@serpentine.com> |
---|---|
date | Wed, 30 May 2007 21:50:21 -0700 |
parents | 5ecf66974def |
children | f2061ece8ed9 |
files | en/fixhtml.py |
diffstat | 1 files changed, 21 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/en/fixhtml.py Wed May 30 20:06:05 2007 -0700 +++ b/en/fixhtml.py Wed May 30 21:50:21 2007 -0700 @@ -1,14 +1,33 @@ #!/usr/bin/env python +# +# This script attempts to work around some of the more bizarre and +# quirky behaviours of htlatex. +# +# - We've persuaded htlatex to produce UTF-8, which unfortunately +# causes it to use huge character sequences to represent even the +# safe 7-bit ASCII subset of UTF-8. We fix that up. +# +# - BUT we have to treat angle brackets (for example, redirections in +# shell script snippets) specially, otherwise they'll break the +# generated HTML. (Reported by Johannes Hoff.) +# +# - For some reason, htlatex gives a unique ID to each fancyvrb +# environment, which makes writing a sane, small CSS stylesheet +# impossible. We squish all those IDs down to nothing. import os import sys import re -unicode_re = re.compile(r'&#x00([0-7][0-9a-f]);', re.I) +angle_re = re.compile(r'(&#x003[CE];)') +unicode_re = re.compile(r'&#x00([0-7][0-9A-F]);') fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I) tmpsuffix = '.tmp.' + str(os.getpid()) +def hide_angle(m): + return m.group(1).lower() + def fix_ascii(m): return chr(int(m.group(1), 16)) @@ -16,6 +35,7 @@ tmpname = name + tmpsuffix ofp = file(tmpname, 'w') for line in file(name): + line = angle_re.sub(hide_angle, line) line = unicode_re.sub(fix_ascii, line) line = fancyvrb_re.sub('id="fancyvrb"', line) ofp.write(line)