annotate en/fixhtml.py @ 275:96ea24a916f9

Merge with myself.
author Bryan O'Sullivan <bos@serpentine.com>
date Mon, 26 Nov 2007 20:42:36 -0800
parents ec6a3bb10986
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
149
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
1 #!/usr/bin/env python
251
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
2 #
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
3 # This script attempts to work around some of the more bizarre and
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
4 # quirky behaviours of htlatex.
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
5 #
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
6 # - We've persuaded htlatex to produce UTF-8, which unfortunately
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
7 # causes it to use huge character sequences to represent even the
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
8 # safe 7-bit ASCII subset of UTF-8. We fix that up.
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
9 #
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
10 # - BUT we have to treat angle brackets (for example, redirections in
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
11 # shell script snippets) specially, otherwise they'll break the
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
12 # generated HTML. (Reported by Johannes Hoff.)
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
13 #
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
14 # - For some reason, htlatex gives a unique ID to each fancyvrb
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
15 # environment, which makes writing a sane, small CSS stylesheet
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
16 # impossible. We squish all those IDs down to the single id "fancyvrb".
149
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
17
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
18 import os
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
19 import sys
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
20 import re
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
21
251
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
# Upper-case hex entities for '<' (3C) and '>' (3E).  Group 1 is the whole
# entity so a substitution callback can rewrite it.
angle_re = re.compile(r'(&#x003[CE];)')

# Entities in the 7-bit ASCII range (00-7F, upper-case hex digits only).
# Group 1 is the two-digit hex code.  The ampersand entity (26) is excluded
# by the lookahead: decoding it to a bare '&' could splice a new character
# reference out of the following text (e.g. "&#x0026;lt;" -> "&lt;") or
# leave malformed markup behind.
unicode_re = re.compile(r'&#x00(?!26;)([0-7][0-9A-F]);')

# id attributes of the form id="fancyvrb<digits>", matched case-insensitively.
fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I)

# Entities U+FB00 through U+FB04; group 1 is the final hex digit (0-4).
ligature_re = re.compile(r'&#xFB0([0-4]);')

# Suffix for temporary output files; includes the process id so that
# concurrent runs do not write to the same temporary name.
tmpsuffix = '.tmp.' + str(os.getpid())
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
28
251
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
def hide_angle(m):
    """Return the entity captured in group 1 of *m*, converted to lower case.

    Used as a substitution callback for angle_re.  Lower-casing the hex
    digits (3C -> 3c, 3E -> 3e) keeps the entity out of reach of unicode_re,
    which only matches upper-case hex digits, so angle brackets stay
    encoded instead of being turned into raw '<' and '>'.
    """
    entity = m.group(1)
    return entity.lower()
2e73abddad21 Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents: 149
diff changeset
31
149
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
def fix_ascii(m):
    """Return the character whose hexadecimal code is group 1 of *m*.

    Used as a substitution callback for unicode_re, whose group 1 holds
    two hex digits.
    """
    hex_digits = m.group(1)
    code = int(hex_digits, 16)
    return chr(code)
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
34
260
ec6a3bb10986 HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents: 251
diff changeset
# Plain-ASCII replacements, indexed by the final hex digit that ligature_re
# captures (0 through 4).
ligatures = ['ff', 'fi', 'fl', 'ffi', 'ffl']


def expand_ligature(m):
    """Return the plain-ASCII letters for the ligature entity matched by *m*.

    Group 1 of *m* is a single digit, which is used as an index into the
    module-level ``ligatures`` list.
    """
    index = int(m.group(1))
    return ligatures[index]
ec6a3bb10986 HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents: 251
diff changeset
39
149
16f02802f448 Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff changeset
# Rewrite every file named on the command line in place.  Each file is
# filtered line by line into a temporary sibling file, which is then renamed
# over the original once it has been written completely.
for name in sys.argv[1:]:
    tmpname = name + tmpsuffix
    finished = False
    # open() rather than file(): file() exists only in Python 2, while
    # open() works everywhere.  The input is opened first so that an
    # unreadable input does not leave an empty temporary file behind.
    ifp = open(name)
    try:
        ofp = open(tmpname, 'w')
        try:
            for line in ifp:
                # Order matters: the angle-bracket entities are lower-cased
                # first so that unicode_re (upper-case hex digits only)
                # does not decode them in the next step.
                line = angle_re.sub(hide_angle, line)
                line = unicode_re.sub(fix_ascii, line)
                line = ligature_re.sub(expand_ligature, line)
                line = fancyvrb_re.sub('id="fancyvrb"', line)
                ofp.write(line)
        finally:
            ofp.close()
        finished = True
    finally:
        ifp.close()
        # On failure, discard the partial output; the original file is
        # left untouched because the rename below is never reached.
        if not finished and os.path.exists(tmpname):
            os.remove(tmpname)
    os.rename(tmpname, name)