Mercurial > hgbook
annotate ja/fixhtml.py @ 386:63060ad65ab8
started intro.tex
author | Yoshiki Yazawa <yaz@honeyplanet.jp> |
---|---|
date | Sat, 21 Feb 2009 23:56:55 +0900 |
parents | b0db5adf11c1 |
children |
rev | line source |
---|---|
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
251
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
2 # |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
3 # This script attempts to work around some of the more bizarre and |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
4 # quirky behaviours of htlatex. |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
5 # |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
6 # - We've persuaded htlatex to produce UTF-8, which unfortunately |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
7 # causes it to use huge character sequences to represent even the |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
8 # safe 7-bit ASCII subset of UTF-8. We fix that up. |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
9 # |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
10 # - BUT we have to treat angle brackets (for example, redirections in |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
11 # shell script snippets) specially, otherwise they'll break the |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
12 # generated HTML. (Reported by Johannes Hoff.) |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
13 # |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
14 # - For some reason, htlatex gives a unique ID to each fancyvrb |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
15 # environment, which makes writing a sane, small CSS stylesheet |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
16 # impossible. We squish all those IDs down to nothing. |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
17 |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
18 import os |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
19 import sys |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
20 import re |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
21 |
251
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
22 angle_re = re.compile(r'([CE];)') |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
23 unicode_re = re.compile(r'�([0-7][0-9A-F]);') |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
24 fancyvrb_re = re.compile(r'id="fancyvrb\d+"', re.I) |
260
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
25 ligature_re = re.compile(r'ྰ([0-4]);') |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
26 |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
27 tmpsuffix = '.tmp.' + str(os.getpid()) |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
28 |
251
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
29 def hide_angle(m): |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
30 return m.group(1).lower() |
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
31 |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
32 def fix_ascii(m): |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
33 return chr(int(m.group(1), 16)) |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
34 |
260
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
35 ligatures = ['ff', 'fi', 'fl', 'ffi', 'ffl'] |
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
36 |
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
37 def expand_ligature(m): |
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
38 return ligatures[int(m.group(1))] |
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
39 |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
40 for name in sys.argv[1:]: |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
41 tmpname = name + tmpsuffix |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
42 ofp = file(tmpname, 'w') |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
43 for line in file(name): |
251
2e73abddad21
Avoid converting UTF8-encoded angle brackets into ASCII (per Johannes Hoff).
Bryan O'Sullivan <bos@serpentine.com>
parents:
149
diff
changeset
|
44 line = angle_re.sub(hide_angle, line) |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
45 line = unicode_re.sub(fix_ascii, line) |
260
ec6a3bb10986
HTML: replace Unicode ligatures with plain ASCII.
Bryan O'Sullivan <bos@serpentine.com>
parents:
251
diff
changeset
|
46 line = ligature_re.sub(expand_ligature, line) |
149
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
47 line = fancyvrb_re.sub('id="fancyvrb"', line) |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
48 ofp.write(line) |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
49 ofp.close() |
16f02802f448
Start to produce saner HTML and CSS.
Bryan O'Sullivan <bos@serpentine.com>
parents:
diff
changeset
|
50 os.rename(tmpname, name) |