changeset 91922:89b7a215ead9

* international/mule.el (sgml-xml-auto-coding-function): Detect and warn if file encoding is not utf-8 and encoding not specified. (xml-find-file-coding-system): New function. * international/mule-conf.el (file-coding-system-alist): Use it.
author Jason Rumney <jasonr@gnu.org>
date Mon, 18 Feb 2008 01:45:54 +0000
parents 2e27479c19fe
children 950bd398d6c2
files lisp/ChangeLog lisp/international/mule-conf.el lisp/international/mule.el
diffstat 3 files changed, 48 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Sun Feb 17 23:31:06 2008 +0000
+++ b/lisp/ChangeLog	Mon Feb 18 01:45:54 2008 +0000
@@ -1,3 +1,10 @@
+2008-02-18  Jason Rumney  <jasonr@gnu.org>
+
+	* international/mule.el (sgml-xml-auto-coding-function): Detect
+	and warn if file encoding is not utf-8 and encoding not specified.
+	(xml-find-file-coding-system): New function.
+	* international/mule-conf.el (file-coding-system-alist): Use it.
+
 2008-02-17  Glenn Morris  <rgm@gnu.org>
 
 	* international/mule-cmds.el (set-locale-environment): Pass
--- a/lisp/international/mule-conf.el	Sun Feb 17 23:31:06 2008 +0000
+++ b/lisp/international/mule-conf.el	Mon Feb 18 01:45:54 2008 +0000
@@ -1470,11 +1470,7 @@
 (setq file-coding-system-alist
       '(("\\.elc\\'" . utf-8-emacs)
 	("\\.utf\\(-8\\)?\\'" . utf-8)
-	;; This is the defined default for XML documents.  It may be
-	;; overridden by a charset specification in the header.  That
-	;; should be grokked by the auto-coding mechanism, but rms
-	;; vetoed that.  -- fx
-	("\\.xml\\'" . utf-8)
+	("\\.xml\\'" . xml-find-file-coding-system)
 	;; We use raw-text for reading loaddefs.el so that if it
 	;; happens to have DOS or Mac EOLs, they are converted to
 	;; newlines.  This is required to make the special treatment
--- a/lisp/international/mule.el	Sun Feb 17 23:31:06 2008 +0000
+++ b/lisp/international/mule.el	Mon Feb 18 01:45:54 2008 +0000
@@ -2288,7 +2288,22 @@
 		  sym
 		(message "Warning: unknown coding system \"%s\"" match)
 		nil))
-	  'utf-8)))))
+          ;; Files without an encoding tag should be UTF-8. But users
+          ;; may be naive about encodings, and have saved the file from
+          ;; another editor that does not help them get the encoding right.
+          ;; Detect the encoding and warn the user if it is detected as
+          ;; something other than UTF-8.
+	  (let ((detected
+                 (with-coding-priority '(utf-8)
+                   (coding-system-base
+                    (detect-coding-region (point-min) size t)))))
+            ;; Pure ASCII always comes back as undecided.
+            (if (memq detected '(utf-8 undecided))
+                'utf-8
+              (warn "File contents detected as %s.
+  Consider adding an encoding attribute to the xml declaration,
+  or saving as utf-8, as mandated by the xml specification." detected)
+              detected)))))))
 
 (defun sgml-html-meta-auto-coding-function (size)
   "If the buffer has an HTML meta tag, use it to determine encoding.
@@ -2314,6 +2329,30 @@
 	  (message "Warning: unknown coding system \"%s\"" match)
 	  nil)))))
 
+(defun xml-find-file-coding-system (args)
+  "Determine the coding system of an XML file without a declaration.
+Strictly speaking, the file should be utf-8, but mistakes are
+made, and there are genuine cases where XML fragments are saved,
+with the encoding properly specified in a master document, or
+added by processing software.
+
+ARGS is the list of arguments that was passed to
+`find-operation-coding-system'; its first element is the operation.
+
+When the operation is `insert-file-contents', examine the current
+buffer and return `utf-8' for UTF-8 or pure ASCII contents, the
+detected coding system without a warning for UTF-16 with a byte
+order mark, and the detected coding system after a warning for
+anything else.  For every other operation return `undecided'."
+  (if (eq (car args) 'insert-file-contents)
+      (let ((detected
+             ;; Prefer utf-8 whenever the bytes are valid UTF-8.
+             (with-coding-priority '(utf-8)
+               (coding-system-base
+                (detect-coding-region (point-min) (point-max) t)))))
+        (cond
+         ;; Pure ASCII always comes back as undecided.
+         ((memq detected '(utf-8 undecided))
+          'utf-8)
+         ;; UTF-16 with a byte order mark is valid XML even without an
+         ;; encoding declaration, so accept it silently.
+         ((memq detected '(utf-16le-with-signature
+                           utf-16be-with-signature))
+          detected)
+         (t
+          (warn "File contents detected as %s.
+  Consider adding an xml declaration with the encoding specified,
+  or saving as utf-8, as mandated by the xml specification." detected)
+          detected)))
+    ;; Don't interfere with the user's wishes for saving the buffer.
+    ;; We did what we could when the buffer was created to ensure the
+    ;; correct encoding was used, or the user was warned, so any
+    ;; non-conformity here is deliberate on the part of the user.
+    'undecided))
+
 ;;;
 (provide 'mule)