changeset 3206:6bcfc6561711 trunk

Implement support for Arabic and Turkish.
author William Pitcock <nenolod@atheme-project.org>
date Wed, 01 Aug 2007 08:08:13 -0500
parents e9f66c3905ec
children 8f6f2e194499
files src/libguess/Makefile src/libguess/arabic_impl.c src/libguess/guess.c src/libguess/libguess.h src/libguess/turkish_impl.c
diffstat 5 files changed, 62 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/src/libguess/Makefile	Wed Aug 01 07:48:03 2007 -0500
+++ b/src/libguess/Makefile	Wed Aug 01 08:08:13 2007 -0500
@@ -10,7 +10,9 @@
 
 SOURCES = \
 	guess.c \
-	russian_impl.c
+	arabic_impl.c \
+	russian_impl.c \
+	turkish_impl.c
 
 OBJECTS = ${SOURCES:.c=.o}
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/libguess/arabic_impl.c	Wed Aug 01 08:08:13 2007 -0500
@@ -0,0 +1,28 @@
+#include "libguess.h"
+
+static const char *_guess_ar(const unsigned char *ptr, int size) /* scan size bytes at ptr; returns a string-literal encoding name */
+{
+    int i;
+    /* A single byte in the set below decides for "CP1256". NOTE(review): presumably these are the values ISO-8859-6 leaves unassigned - confirm against the code charts. */
+    for (i = 0; i < size; i++)
+    {
+        if ((ptr[i] >= 0x80 && ptr[i] <= 0x9F) ||
+            ptr[i] == 0xA1 || ptr[i] == 0xA2 || ptr[i] == 0xA3 ||
+            (ptr[i] >= 0xA5 && ptr[i] <= 0xAB) ||
+            (ptr[i] >= 0xAE && ptr[i] <= 0xBA) ||
+            ptr[i] == 0xBC || ptr[i] == 0xBD ||
+            ptr[i] == 0xBE || ptr[i] == 0xC0 ||
+            (ptr[i] >= 0xDB && ptr[i] <= 0xDF) || (ptr[i] >= 0xF3))
+            return "CP1256";
+    }
+    /* No byte matched (this includes size <= 0, where the loop body never runs). */
+    return "ISO-8859-6";
+}
+
+const char *guess_ar(const char *ptr, int size) /* Arabic guesser; registered in guess.c under GUESS_REGION_AR */
+{
+    if (dfa_validate_utf8(ptr, size)) /* a nonzero result short-circuits to "UTF-8"; presumably it means the buffer is valid UTF-8 - verify in guess.c */
+        return "UTF-8";
+    /* Otherwise choose between the two 8-bit names; the cast lets the helper compare bytes >= 0x80 regardless of char signedness. */
+    return _guess_ar((const unsigned char *)ptr, size);
+}
--- a/src/libguess/guess.c	Wed Aug 01 07:48:03 2007 -0500
+++ b/src/libguess/guess.c	Wed Aug 01 08:08:13 2007 -0500
@@ -458,6 +458,8 @@
     guess_impl_register(GUESS_REGION_CN, guess_cn);
     guess_impl_register(GUESS_REGION_KR, guess_kr);
     guess_impl_register(GUESS_REGION_RU, guess_ru);
+    guess_impl_register(GUESS_REGION_AR, guess_ar);
+    guess_impl_register(GUESS_REGION_TR, guess_tr);
 }
 
 const char *guess_encoding(const char *inbuf, int buflen, const char *lang)
--- a/src/libguess/libguess.h	Wed Aug 01 07:48:03 2007 -0500
+++ b/src/libguess/libguess.h	Wed Aug 01 08:08:13 2007 -0500
@@ -47,6 +47,8 @@
 const char *guess_cn(const char *buf, int buflen);
 const char *guess_kr(const char *buf, int buflen);
 const char *guess_ru(const char *buf, int buflen);
+const char *guess_ar(const char *buf, int buflen);	/* returns "UTF-8", "CP1256" or "ISO-8859-6" (see arabic_impl.c) */
+const char *guess_tr(const char *buf, int buflen);	/* returns "UTF-8", "CP1254" or "ISO-8859-9" (see turkish_impl.c) */
 int dfa_validate_utf8(const char *buf, int buflen);
 
 #define GUESS_REGION_JP		"japanese"
@@ -54,6 +56,8 @@
 #define GUESS_REGION_CN		"chinese"
 #define GUESS_REGION_KR		"korean"
 #define GUESS_REGION_RU		"russian"
+#define GUESS_REGION_AR		"arabic"	/* region key under which guess.c registers guess_ar */
+#define GUESS_REGION_TR		"turkish"	/* region key under which guess.c registers guess_tr */
 
 const char *guess_encoding(const char *buf, int buflen, const char *lang);
 void guess_init(void);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/libguess/turkish_impl.c	Wed Aug 01 08:08:13 2007 -0500
@@ -0,0 +1,25 @@
+#include "libguess.h"
+
+static const char *_guess_tr(const unsigned char *ptr, int size) /* scan size bytes at ptr; returns a string-literal encoding name */
+{
+    int i;
+    /* A single byte in the set below decides for "CP1254". NOTE(review): presumably these are the 0x80-0x9F positions CP1254 assigns to printable characters - confirm against the code charts. */
+    for (i = 0; i < size; i++)
+    {
+        if (ptr[i] == 0x80 || 
+            (ptr[i] >= 0x82 && ptr[i] <= 0x8C) ||
+            (ptr[i] >= 0x91 && ptr[i] <= 0x9C) || 
+            ptr[ i ] == 0x9F)
+            return "CP1254";
+    }
+    /* No byte matched (this includes size <= 0, where the loop body never runs). */
+    return "ISO-8859-9";
+}
+
+const char *guess_tr(const char *ptr, int size) /* Turkish guesser; passed to guess_impl_register() in guess.c */
+{
+    if (dfa_validate_utf8(ptr, size)) /* a nonzero result short-circuits to "UTF-8"; presumably it means the buffer is valid UTF-8 - verify in guess.c */
+        return "UTF-8";
+    /* Otherwise choose between the two 8-bit names; the cast lets the helper compare bytes >= 0x80 regardless of char signedness. */
+    return _guess_tr((const unsigned char *)ptr, size);
+}