changeset 3215:ce2d24746c09 trunk

Add support for Greek and Hebrew character set detection.
author William Pitcock <nenolod@atheme-project.org>
date Thu, 02 Aug 2007 02:00:21 -0500
parents faf6daa29d5c
children e91acf24afbc
files src/libguess/Makefile src/libguess/greek_impl.c src/libguess/guess.c src/libguess/hebrew_impl.c src/libguess/libguess.h
diffstat 5 files changed, 53 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/src/libguess/Makefile	Thu Aug 02 01:38:22 2007 -0500
+++ b/src/libguess/Makefile	Thu Aug 02 02:00:21 2007 -0500
@@ -12,6 +12,8 @@
 	guess.c \
 	arabic_impl.c \
 	cjk_impl.c \
+	greek_impl.c \
+	hebrew_impl.c \
 	russian_impl.c \
 	turkish_impl.c
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/libguess/greek_impl.c	Thu Aug 02 02:00:21 2007 -0500
@@ -0,0 +1,22 @@
+static const char *_guess_gr(const unsigned char *ptr, int size) /* Pick "CP1253" or "ISO-8859-7" for a byte buffer. */
+{ /* ptr: bytes to inspect; size: how many of them are scanned. Returns a string literal. */
+    int i; /* index of the byte under test */
+
+    for (i = 0; i < size; i++) /* size <= 0 skips the loop and yields the fallback below */
+    {
+        if (ptr[i] == 0x80 || /* one byte from this set is enough to decide for CP1253 */
+            (ptr[i] >= 0x82 && ptr[i] <= 0x87) || /* presumably values CP1253 assigns but ISO-8859-7 does not - verify against the code-page tables */
+            ptr[i] == 0x89 || ptr[i] == 0x8B ||
+            (ptr[i] >= 0x91 && ptr[i] <= 0x97) ||
+            ptr[i] == 0x99 || ptr[i] == 0x9B || ptr[i] == 0xA4 ||
+            ptr[i] == 0xA5 || ptr[i] == 0xAE)
+            return "CP1253"; /* first match wins; the rest of the buffer is not read */
+    }
+
+    return "ISO-8859-7"; /* fallback: no byte from the set above was seen */
+}
+
+const char *guess_gr(const char *ptr, int size) /* Greek guesser with the (const char *, int) signature declared in libguess.h. */
+{
+    return _guess_gr((const unsigned char *) ptr, size); /* view bytes as unsigned so the 0x80+ comparisons hold even where char is signed */
+}
--- a/src/libguess/guess.c	Thu Aug 02 01:38:22 2007 -0500
+++ b/src/libguess/guess.c	Thu Aug 02 02:00:21 2007 -0500
@@ -33,6 +33,8 @@
     guess_impl_register(GUESS_REGION_RU, guess_ru);
     guess_impl_register(GUESS_REGION_AR, guess_ar);
     guess_impl_register(GUESS_REGION_TR, guess_tr);
+    guess_impl_register(GUESS_REGION_GR, guess_gr);
+    guess_impl_register(GUESS_REGION_HW, guess_hw);
 }
 
 const char *guess_encoding(const char *inbuf, int buflen, const char *lang)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/libguess/hebrew_impl.c	Thu Aug 02 02:00:21 2007 -0500
@@ -0,0 +1,23 @@
+const char *_guess_hw(const unsigned char *ptr, int size) /* Pick "CP1255" or "ISO-8859-8-I" for a byte buffer. NOTE(review): not static, unlike _guess_gr in greek_impl.c - confirm external linkage is intended. */
+{ /* ptr: bytes to inspect; size: how many of them are scanned. Returns a string literal. */
+    int i; /* index of the byte under test */
+
+    for (i = 0; i < size; i++) /* size <= 0 skips the loop and yields the fallback below */
+    {
+        if (ptr[i] == 0x80 || (ptr[i] >= 0x82 && ptr[i] <= 0x89) || ptr[i] == 0x8B || /* one byte from this set decides for CP1255 */
+            (ptr[i] >= 0x91 && ptr[i] <= 0x99) || ptr[i] == 0x9B || ptr[i] == 0xA1 || /* presumably values CP1255 assigns but ISO-8859-8 does not - verify against the code-page tables */
+            (ptr[i] >= 0xBF && ptr[i] <= 0xC9) ||
+            (ptr[i] >= 0xCB && ptr[i] <= 0xD8))
+            return "CP1255"; /* first match wins; the rest of the buffer is not read */
+
+        if (ptr[i] == 0xDF) /* 0xDF stops the scan with the ISO answer, so later bytes can no longer select CP1255 */
+            return "ISO-8859-8-I";
+    }
+
+    return "ISO-8859-8-I"; /* fallback: no deciding byte was seen */
+}
+
+const char *guess_hw(const char *ptr, int size) /* Hebrew guesser with the (const char *, int) signature declared in libguess.h. */
+{
+    return _guess_hw((const unsigned char *) ptr, size); /* view bytes as unsigned so the 0x80+ comparisons hold even where char is signed */
+}
--- a/src/libguess/libguess.h	Thu Aug 02 01:38:22 2007 -0500
+++ b/src/libguess/libguess.h	Thu Aug 02 02:00:21 2007 -0500
@@ -49,6 +49,8 @@
 const char *guess_ru(const char *buf, int buflen);
 const char *guess_ar(const char *buf, int buflen);
 const char *guess_tr(const char *buf, int buflen);
+const char *guess_gr(const char *buf, int buflen); /* Greek: returns "CP1253" or "ISO-8859-7" */
+const char *guess_hw(const char *buf, int buflen); /* Hebrew: returns "CP1255" or "ISO-8859-8-I" */
 int dfa_validate_utf8(const char *buf, int buflen);
 
 #define GUESS_REGION_JP		"japanese"
@@ -58,6 +60,8 @@
 #define GUESS_REGION_RU		"russian"
 #define GUESS_REGION_AR		"arabic"
 #define GUESS_REGION_TR		"turkish"
+#define GUESS_REGION_GR		"greek" /* region string that guess.c registers together with guess_gr */
+#define GUESS_REGION_HW		"hebrew" /* region string that guess.c registers together with guess_hw */
 
 const char *guess_encoding(const char *buf, int buflen, const char *lang);
 void guess_init(void);