diff en/autoid.py @ 749:7e7c47481e4f

Oops, this is the real merge for my hg's oddity
author Dongsheng Song <dongsheng.song@gmail.com>
date Fri, 20 Mar 2009 16:43:35 +0800
parents c838b3975bc6
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/en/autoid.py	Fri Mar 20 16:43:35 2009 +0800
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+# Add unique ID attributes to para tags.  This script should only be
+# run by one person, since otherwise it introduces the possibility of
+# chaotic conflicts among tags.
+
+import glob, os, re, sys
+
+tagged = re.compile('<para[^>]* id="x_([0-9a-f]+)"[^>]*>', re.M)
+untagged = re.compile('<para>')
+
+names = glob.glob('ch*.xml') + glob.glob('app*.xml')
+
+# First pass: find the highest-numbered paragraph ID.
+
+biggest_id = 0
+seen = set()
+errs = 0
+
+for name in names:
+    for m in tagged.finditer(open(name).read()):
+        i = int(m.group(1),16)
+        if i in seen:
+            print >> sys.stderr, '%s: duplication of ID %s' % (name, i)
+            errs += 1
+        seen.add(i)
+        if i > biggest_id:
+            biggest_id = i
+
+def retag(s):
+    global biggest_id
+    biggest_id += 1
+    return '<para id="x_%x">' % biggest_id
+
+# Second pass: add IDs to paragraphs that currently lack them.
+
+for name in names:
+    f = open(name).read()
+    f1 = untagged.sub(retag, f)
+    if f1 != f:
+        tmpname = name + '.tmp'
+        fp = open(tmpname, 'w')
+        fp.write(f1)
+        fp.close()
+        os.rename(tmpname, name)
+
+sys.exit(errs)