Mercurial > emacs
view admin/charsets/mapconv @ 97528:184bb2071e3f
mail/: Add new (temporary) libraries with which to test Rmail/mbox such
that Rmail/babyl is not affected. This creates a facility/feature
called "pmail" (analogous to "rmail") that can be used independently
from Rmail for testing purposes. The plan is to replace the "rmail"
files eventually and remove "pmail" entirely at that point. In the
interim, interested developers can use either Rmail or Pmail or both
(which is not recommended for the casual User or the faint of heart).
author | Paul Reilly <pmr@pajato.com> |
---|---|
date | Mon, 18 Aug 2008 04:51:28 +0000 |
parents | eb2d9dfc8486 |
children | ce88a631c161 |
line wrap: on
line source
#!/bin/sh

# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
#   National Institute of Advanced Industrial Science and Technology (AIST)
#   Registration Number H13PRO009

# This file is part of GNU Emacs.

# GNU Emacs is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# GNU Emacs is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.

# Commentary:

# Convert charset map of various format into this:
#   0xXX 0xYYYY
# where,
#   XX is a code point of the charset in hexa-decimal,
#   YYYY is the corresponding Unicode character code in hexa-decimal.
#
# The result is written to stdout, preceded by a "# Generated from ..."
# line.  Diagnostics are written to stderr.

# Arguments are:
#   $1: source map file
#   $2: address pattern for sed (optionally with substitution command)
#   $3: format of source map file
#       GLIBC-1 GLIBC-2 GLIBC-2-7 CZYBORRA IANA UNICODE UNICODE2
#       YASUOKA MICROSOFT KANJI-DATABASE
#   $4: awk script (optional; run with gawk when given)
#
# Exit status: 1 if the format is unknown or the awk script does not
# exist; otherwise the status of the last command of the conversion
# pipeline.

# Print the arguments as one diagnostic line on stderr, so that a
# message can never end up inside the map written to stdout.
mapconv_warn () {
  printf '%s\n' "$*" >&2
}

# Print where a map of format $1 with base file name $2 comes from.
# Return 1 (printing nothing) when the format has no known origin.
mapconv_origin () {
  case "$1" in
    GLIBC*)
      printf '%s\n' "glibc-2.3.2/localedata/charmaps/$2" ;;
    CZYBORRA)
      printf '%s\n' "http://czyborra.com/charsets/$2" ;;
    IANA)
      printf '%s\n' "http://www.iana.org/assignments/charset-reg/$2" ;;
    UNICODE | UNICODE2)
      printf '%s\n' "http://www.unicode.org/Public/MAPPINGS/.../$2" ;;
    YASUOKA)
      printf '%s\n' "http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/.../$2" ;;
    MICROSOFT)
      printf '%s\n' "http://www.microsoft.com/globaldev/reference/oem/$2" ;;
    KANJI-DATABASE)
      printf '%s\n' "data at http://sourceforge.net/cvs/?group_id=26261" ;;
    *)
      return 1 ;;
  esac
}

# Print the lines of file $1 that sed selects with "$2 p".
mapconv_select () {
  sed -n -e "$2 p" < "$1"
}

# Copy stdin to stdout, through "gawk -f $1" when $1 is non-empty.
mapconv_awk () {
  if [ -n "$1" ]; then
    gawk -f "$1"
  else
    cat
  fi
}

# Entry point; takes the script's arguments (see "Arguments are" above).
# Uses `return' rather than `exit' so that its status becomes the
# status of the script through the final call below.
mapconv_main () {
  mc_file=${1-}
  mc_addr=${2-}
  mc_format=${3-}
  mc_awk=${4-}

  mc_base=$(basename "$mc_file")
  if ! mc_origin=$(mapconv_origin "$mc_format" "$mc_base"); then
    mapconv_warn "Unknown file type: $mc_format"
    return 1
  fi

  # Check the awk script before producing any output, so that a failure
  # leaves nothing on stdout.
  if [ -n "$mc_awk" ] && [ ! -f "$mc_awk" ]; then
    mapconv_warn "Awk program does not exist: $mc_awk"
    return 1
  fi

  printf '# Generated from %s\n' "$mc_origin"

  # In the patterns below [[:blank:]] matches a space or a tab.
  case "$mc_format" in
    GLIBC-1)
      # Source format is:
      #   <UYYYY> /xXX
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's,<U\([^>]*\)>[[:blank:]]*/x\(..\).*,0x\2 0x\1,' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    GLIBC-2)
      # Source format is:
      #   <UYYYY> /xXX/xZZ
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's,<U\([^>]*\)>[[:blank:]]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    GLIBC-2-7)
      # Source format is:
      #   <UYYYY> /xXX/xZZ
      # We must drop MSBs of XX and ZZ: a leading hex digit a..f is
      # rewritten to 2..7 before the columns are swapped.
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/xa/x2/g' -e 's/xb/x3/g' -e 's/xc/x4/g' \
              -e 's/xd/x5/g' -e 's/xe/x6/g' -e 's/xf/x7/g' \
              -e 's,<U\([^>]*\)>[[:blank:]]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    CZYBORRA)
      # Source format is:
      #   =XX U+YYYY
      # The source file is compressed, hence zcat.
      zcat "$mc_file" \
        | sed -n -e "$mc_addr p" \
        | sed -e 's/=\(..\)[^U]*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    IANA)
      # Source format is:
      #   0xXX 0xYYYY
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/\(0x[0-9A-Fa-f]*\)[^0]*\(0x[0-9A-Fa-f]*\).*/\1 \2/' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    UNICODE)
      # Source format is:
      #   YYYY XX
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/\([0-9A-F]*\)[^0-9A-F]*\([0-9A-F]*\).*/0x\2 0x\1/' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    UNICODE2)
      # Source format is:
      #   0xXXXX 0xYYYY # ...
      # Unlike the other formats, the awk script runs before the sort,
      # and the sort is numeric on the fourth field.
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/\([0-9A-Fx]*\)[^0]*\([0-9A-Fx]*\).*/\1 \2/' \
        | mapconv_awk "$mc_awk" \
        | sort -n -k 4,4
      ;;
    YASUOKA)
      # Source format is:
      #   YYYY 0-XXXX (XXXX is a Kuten code)
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/\([0-9A-F]*\)[^0]*0-\([0-9]*\).*/0x\2 0x\1/' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    MICROSOFT)
      # Source format is:
      #   XX = U+YYYY
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/\([0-9A-F]*\).*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    KANJI-DATABASE)
      # Source format is:
      #   C?-XXXX U+YYYYY .....
      mapconv_select "$mc_file" "$mc_addr" \
        | sed -e 's/...\(....\)[[:blank:]]U+\([0-9A-F]*\).*/0x\1 0x\2/' \
        | sort \
        | mapconv_awk "$mc_awk"
      ;;
    *)
      # Reached for a GLIBC* name other than the three handled above.
      mapconv_warn "Invalid arguments"
      return 1
      ;;
  esac
}

# Keep this the last command: its status is the script's exit status.
mapconv_main "$@"

# arch-tag: c33acb47-7eb6-4872-b871-15e1447e8f0e