changeset 110302:fd8902911ce3

Add support for the libxml2 library. This adds the html-parse-string and xml-parse-string functions in the new file src/xml.c, as well as autoconf detection of the library.
author Lars Magne Ingebrigtsen <larsi@gnus.org>
date Fri, 10 Sep 2010 18:44:35 +0200
parents e59d99dc1c71
children fdcc2e819a9b
files ChangeLog configure configure.in doc/lispref/text.texi src/ChangeLog src/Makefile.in src/config.in src/emacs.c src/lisp.h src/xml.c
diffstat 10 files changed, 344 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Fri Sep 10 13:20:51 2010 +0200
+++ b/ChangeLog	Fri Sep 10 18:44:35 2010 +0200
@@ -1,3 +1,7 @@
+2010-09-10  Lars Magne Ingebrigtsen  <larsi@gnus.org>
+
+	* configure.in: Check for libxml2.
+
 2010-09-09  Glenn Morris  <rgm@gnu.org>
 
 	* make-dist: No more TODO files under lisp/.
--- a/configure	Fri Sep 10 13:20:51 2010 +0200
+++ b/configure	Fri Sep 10 18:44:35 2010 +0200
@@ -660,6 +660,8 @@
 LIBS_MAIL
 liblockfile
 ALLOCA
+LIBXML2_LIBS
+LIBXML2_CFLAGS
 LIBXSM
 LIBGPM
 LIBGIF
@@ -807,6 +809,7 @@
 with_gif
 with_png
 with_rsvg
+with_xml2
 with_imagemagick
 with_xft
 with_libotf
@@ -1514,6 +1517,7 @@
   --without-gif           don't compile with GIF image support
   --without-png           don't compile with PNG image support
   --without-rsvg          don't compile with SVG image support
+  --without-xml2          don't compile with XML parsing support
   --with-imagemagick      compile with ImageMagick image support
   --without-xft           don't use XFT for anti aliased fonts
   --without-libotf        don't use libotf for OpenType font support
@@ -2732,6 +2736,14 @@
 fi
 
 
+# Check whether --with-xml2 was given.
+if test "${with_xml2+set}" = set; then :
+  withval=$with_xml2;
+else
+     with_xml2=yes
+fi
+
+
 # Check whether --with-imagemagick was given.
 if test "${with_imagemagick+set}" = set; then :
   withval=$with_imagemagick;
@@ -11070,6 +11082,112 @@
 fi
 
 
+### Use libxml (-lxml2) if available
+if test "${with_xml2}" != "no"; then
+  ### I'm not sure what the version number should be, so I just guessed.
+
+  succeeded=no
+
+  # Extract the first word of "pkg-config", so it can be a program name with args.
+set dummy pkg-config; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if test "${ac_cv_path_PKG_CONFIG+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  case $PKG_CONFIG in
+  [\\/]* | ?:[\\/]*)
+  ac_cv_path_PKG_CONFIG="$PKG_CONFIG" # Let the user override the test with a path.
+  ;;
+  *)
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
+    ac_cv_path_PKG_CONFIG="$as_dir/$ac_word$ac_exec_ext"
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_path_PKG_CONFIG" && ac_cv_path_PKG_CONFIG="no"
+  ;;
+esac
+fi
+PKG_CONFIG=$ac_cv_path_PKG_CONFIG
+if test -n "$PKG_CONFIG"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $PKG_CONFIG" >&5
+$as_echo "$PKG_CONFIG" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+
+  if test "$PKG_CONFIG" = "no" ; then
+     HAVE_LIBXML2=no
+  else
+     PKG_CONFIG_MIN_VERSION=0.9.0
+     if $PKG_CONFIG --atleast-pkgconfig-version $PKG_CONFIG_MIN_VERSION; then
+        { $as_echo "$as_me:${as_lineno-$LINENO}: checking for libxml-2.0 > 2.2.0" >&5
+$as_echo_n "checking for libxml-2.0 > 2.2.0... " >&6; }
+
+        if $PKG_CONFIG --exists "libxml-2.0 > 2.2.0" 2>&5; then
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+            succeeded=yes
+
+            { $as_echo "$as_me:${as_lineno-$LINENO}: checking LIBXML2_CFLAGS" >&5
+$as_echo_n "checking LIBXML2_CFLAGS... " >&6; }
+            LIBXML2_CFLAGS=`$PKG_CONFIG --cflags "libxml-2.0 > 2.2.0"|sed -e 's,///*,/,g'`
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIBXML2_CFLAGS" >&5
+$as_echo "$LIBXML2_CFLAGS" >&6; }
+
+            { $as_echo "$as_me:${as_lineno-$LINENO}: checking LIBXML2_LIBS" >&5
+$as_echo_n "checking LIBXML2_LIBS... " >&6; }
+            LIBXML2_LIBS=`$PKG_CONFIG --libs "libxml-2.0 > 2.2.0"|sed -e 's,///*,/,g'`
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LIBXML2_LIBS" >&5
+$as_echo "$LIBXML2_LIBS" >&6; }
+        else
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+            LIBXML2_CFLAGS=""
+            LIBXML2_LIBS=""
+            ## If we have a custom action on failure, don't print errors, but
+            ## do set a variable so people can do so.
+            LIBXML2_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "libxml-2.0 > 2.2.0"`
+
+        fi
+
+
+
+     else
+        echo "*** Your version of pkg-config is too old. You need version $PKG_CONFIG_MIN_VERSION or newer."
+        echo "*** See http://www.freedesktop.org/software/pkgconfig"
+     fi
+  fi
+
+  if test $succeeded = yes; then
+     HAVE_LIBXML2=yes
+  else
+     HAVE_LIBXML2=no
+  fi
+
+  if test "${HAVE_LIBXML2}" = "yes"; then
+
+$as_echo "#define HAVE_LIBXML2 1" >>confdefs.h
+
+  fi
+fi
+
+
+
 # If netdb.h doesn't declare h_errno, we must declare it by hand.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether netdb declares h_errno" >&5
 $as_echo_n "checking whether netdb declares h_errno... " >&6; }
--- a/configure.in	Fri Sep 10 13:20:51 2010 +0200
+++ b/configure.in	Fri Sep 10 18:44:35 2010 +0200
@@ -155,6 +155,7 @@
 OPTION_DEFAULT_ON([gif],[don't compile with GIF image support])
 OPTION_DEFAULT_ON([png],[don't compile with PNG image support])
 OPTION_DEFAULT_ON([rsvg],[don't compile with SVG image support])
+OPTION_DEFAULT_ON([xml2],[don't compile with XML parsing support])
 OPTION_DEFAULT_OFF([imagemagick],[compile with ImageMagick image support])
 
 OPTION_DEFAULT_ON([xft],[don't use XFT for anti aliased fonts])
@@ -2535,6 +2536,17 @@
 fi
 AC_SUBST(LIBXSM)
 
+### Use libxml (-lxml2) if available
+if test "${with_xml2}" != "no"; then
+  ### I'm not sure what the version number should be, so I just guessed.
+  PKG_CHECK_MODULES(LIBXML2, libxml-2.0 > 2.2.0, HAVE_LIBXML2=yes, HAVE_LIBXML2=no)
+  if test "${HAVE_LIBXML2}" = "yes"; then
+    AC_DEFINE(HAVE_LIBXML2, 1, [Define to 1 if you have the libxml library (-lxml2).])
+  fi
+fi
+AC_SUBST(LIBXML2_LIBS)
+AC_SUBST(LIBXML2_CFLAGS)
+
 # If netdb.h doesn't declare h_errno, we must declare it by hand.
 AC_CACHE_CHECK(whether netdb declares h_errno,
 	       emacs_cv_netdb_declares_h_errno,
--- a/doc/lispref/text.texi	Fri Sep 10 13:20:51 2010 +0200
+++ b/doc/lispref/text.texi	Fri Sep 10 18:44:35 2010 +0200
@@ -59,6 +59,7 @@
                        position stored in a register.
 * Base 64::          Conversion to or from base 64 encoding.
 * MD5 Checksum::     Compute the MD5 "message digest"/"checksum".
+* Parsing HTML::     Parsing HTML and XML.
 * Atomic Changes::   Installing several buffer changes "atomically".
 * Change Hooks::     Supplying functions to be run when text is changed.
 @end menu
@@ -4106,6 +4107,49 @@
 coding instead.
 @end defun
 
+@node Parsing HTML
+@section Parsing HTML
+@cindex parsing html
+@cindex parsing xml
+
+Emacs provides an interface to the @code{libxml2} library via two
+functions: @code{html-parse-string} and @code{xml-parse-string}.  The
+HTML function will parse ``real world'' HTML and try to return a
+sensible parse tree, while the XML function is somewhat stricter about
+syntax.

+They both take two parameters.  The first is the string to parse, and
+the second, which is optional, is a base URL to be used to expand
+relative URLs in the document, if any.
+
+Here's an example demonstrating the structure of the parsed data you
+get out.  Given this HTML document:
+
+@example
+<html><hEad></head><body width=101><div class=thing>Foo<div>Yes
+@end example
+
+You get this parse tree:
+
+@example
+(html
+ (head)
+ (body
+  (:width . "101")
+  (div
+   (:class . "thing")
+   (text . "Foo")
+   (div
+    (text . "Yes\n")))))
+@end example
+
+It's a simple tree structure, where the @code{car} for each node is
+the name of the node, and the @code{cdr} is the value, or the list of
+values.
+
+Attributes are coded the same way as child nodes, but with @samp{:} as
+the first character.
+
 @node Atomic Changes
 @section Atomic Change Groups
 @cindex atomic changes
--- a/src/ChangeLog	Fri Sep 10 13:20:51 2010 +0200
+++ b/src/ChangeLog	Fri Sep 10 18:44:35 2010 +0200
@@ -1,3 +1,13 @@
+2010-09-09  Lars Magne Ingebrigtsen  <larsi@gnus.org>
+
+	* xml.c (Fxml_parse_string): New function to parse XML strings.
+
+2010-09-08  Lars Magne Ingebrigtsen  <larsi@gnus.org>
+
+	* xml.c: New file.
+	(Fhtml_parse_string): New function to interface to the libxml2
+	html parsing function.
+
 2010-09-05  Juanma Barranquero  <lekktu@gmail.com>
 
 	* biditype.h: Regenerate.
--- a/src/Makefile.in	Fri Sep 10 13:20:51 2010 +0200
+++ b/src/Makefile.in	Fri Sep 10 18:44:35 2010 +0200
@@ -226,6 +226,8 @@
 IMAGEMAGICK_LIBS= @IMAGEMAGICK_LIBS@
 IMAGEMAGICK_CFLAGS= @IMAGEMAGICK_CFLAGS@
 
+LIBXML2_LIBS = @LIBXML2_LIBS@
+LIBXML2_CFLAGS = @LIBXML2_CFLAGS@
 
 ## widget.o if USE_X_TOOLKIT, otherwise empty.
 WIDGET_OBJ=@WIDGET_OBJ@
@@ -320,7 +322,8 @@
 ## FIXME? MYCPPFLAGS only referenced in etc/DEBUG.
 ALL_CFLAGS=-Demacs -DHAVE_CONFIG_H $(MYCPPFLAGS) -I. -I${srcdir} \
   ${C_SWITCH_MACHINE} ${C_SWITCH_SYSTEM} ${C_SWITCH_X_SITE} \
-  ${C_SWITCH_X_SYSTEM} ${CFLAGS_SOUND} ${RSVG_CFLAGS} ${IMAGEMAGICK_CFLAGS} ${DBUS_CFLAGS} \
+  ${C_SWITCH_X_SYSTEM} ${CFLAGS_SOUND} ${RSVG_CFLAGS} ${IMAGEMAGICK_CFLAGS} \
+  ${LIBXML2_CFLAGS} ${DBUS_CFLAGS} \
   ${GCONF_CFLAGS} ${FREETYPE_CFLAGS} ${FONTCONFIG_CFLAGS} \
   ${LIBOTF_CFLAGS} ${M17N_FLT_CFLAGS} ${DEPFLAGS} ${PROFILING_CFLAGS} \
   ${C_WARNINGS_SWITCH} ${CFLAGS}
@@ -349,7 +352,7 @@
 	syntax.o $(UNEXEC_OBJ) bytecode.o \
 	process.o callproc.o \
 	region-cache.o sound.o atimer.o \
-	doprnt.o strftime.o intervals.o textprop.o composite.o md5.o \
+	doprnt.o strftime.o intervals.o textprop.o composite.o md5.o xml.o \
 	$(MSDOS_OBJ) $(MSDOS_X_OBJ) $(NS_OBJ) $(CYGWIN_OBJ) $(FONT_OBJ)
 
 ## Object files used on some machine or other.
@@ -595,7 +598,8 @@
 ## duplicated symbols.  If the standard libraries were compiled
 ## with GCC, we might need LIB_GCC again after them.
 LIBES = $(LIBS) $(LIBX_BASE) $(LIBX_OTHER) $(LIBSOUND) \
-   $(RSVG_LIBS) ${IMAGEMAGICK_LIBS}  $(DBUS_LIBS) $(LIBGPM) $(LIBRESOLV) $(LIBS_SYSTEM) \
+   $(RSVG_LIBS) ${IMAGEMAGICK_LIBS} $(DBUS_LIBS) \
+   ${LIBXML2_LIBS} $(LIBGPM) $(LIBRESOLV) $(LIBS_SYSTEM) \
    $(LIBS_TERMCAP) $(GETLOADAVG_LIBS) ${GCONF_LIBS} ${LIBSELINUX_LIBS} \
    $(FREETYPE_LIBS) $(FONTCONFIG_LIBS) $(LIBOTF_LIBS) $(M17N_FLT_LIBS) \
    $(LIB_GCC) $(LIB_MATH) $(LIB_STANDARD) $(LIB_GCC)
--- a/src/config.in	Fri Sep 10 13:20:51 2010 +0200
+++ b/src/config.in	Fri Sep 10 18:44:35 2010 +0200
@@ -813,6 +813,9 @@
 /* Define to 1 if you have the SM library (-lSM). */
 #undef HAVE_X_SM
 
+/* Define to 1 if you have the libxml2 library (-lxml2). */
+#undef HAVE_LIBXML2
+
 /* Define to 1 if you want to use the X window system. */
 #undef HAVE_X_WINDOWS
 
--- a/src/emacs.c	Fri Sep 10 13:20:51 2010 +0200
+++ b/src/emacs.c	Fri Sep 10 18:44:35 2010 +0200
@@ -1544,6 +1544,10 @@
 #endif
 #endif /* HAVE_X_WINDOWS */
 
+#ifdef HAVE_LIBXML2
+      syms_of_xml ();
+#endif
+
       syms_of_menu ();
 
 #ifdef HAVE_NTGUI
--- a/src/lisp.h	Fri Sep 10 13:20:51 2010 +0200
+++ b/src/lisp.h	Fri Sep 10 18:44:35 2010 +0200
@@ -3577,6 +3577,11 @@
 EXFUN (Fmsdos_downcase_filename, 1);
 #endif
 
+#ifdef HAVE_LIBXML2
+/* Defined in xml.c */
+extern void syms_of_xml (void);
+#endif
+
 #ifdef HAVE_MENUS
 /* Defined in (x|w32)fns.c, nsfns.m...  */
 extern int have_menus_p (void);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/xml.c	Fri Sep 10 18:44:35 2010 +0200
@@ -0,0 +1,137 @@
+/* Interface to libxml2.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+
+This file is part of GNU Emacs.
+
+GNU Emacs is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+GNU Emacs is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#ifdef HAVE_LIBXML2
+
+#include <setjmp.h>
+#include <libxml/tree.h>
+#include <libxml/parser.h>
+#include <libxml/HTMLparser.h>
+
+#include "lisp.h"
+#include "buffer.h"
+
+/* Convert libxml2 NODE to a Lisp tree.  An element becomes
+   (NAME (:ATTR . "value")... CHILD...), a text node (NAME . CONTENT),
+   and any other node type nil.  */
+Lisp_Object make_dom (xmlNode *node)
+{
+  if (node->type == XML_ELEMENT_NODE) {
+    /* Accumulated back to front; Fnreverse restores document order.  */
+    Lisp_Object result = Fcons (intern ((const char *) node->name), Qnil);
+    xmlNode *child;
+    xmlAttr *property;
+
+    /* Attributes first.  Those lacking a value with content are skipped.  */
+    for (property = node->properties; property; property = property->next) {
+      const char *name, *value;
+      char *pname;
+
+      if (!property->children || !property->children->content)
+        continue;
+      /* libxml2 strings are unsigned char; the functions below take char.  */
+      name = (const char *) property->name;
+      value = (const char *) property->children->content;
+      pname = xmalloc (strlen (name) + 2);  /* ':' + name + NUL */
+      *pname = ':';
+      strcpy (pname + 1, name);
+      result = Fcons (Fcons (intern (pname), build_string (value)), result);
+      xfree (pname);
+    }
+    /* Then the children, each converted recursively.  */
+    for (child = node->children; child; child = child->next)
+      result = Fcons (make_dom (child), result);
+    return Fnreverse (result);
+  } else if (node->type == XML_TEXT_NODE) {
+    Lisp_Object content = Qnil;
+    if (node->content)
+      content = build_string ((const char *) node->content);
+    return Fcons (intern ((const char *) node->name), content);
+  } else
+    return Qnil;
+}
+
+/* Parse STRING with libxml2, as HTML if HTMLP is nonzero and as XML
+   otherwise, using BASE_URL (if non-nil) as the document's base URL.
+   Return the make_dom tree for the root element, or nil if parsing
+   fails or there is no root element.  */
+static Lisp_Object
+parse_buffer (Lisp_Object string, Lisp_Object base_url, int htmlp)
+{
+  xmlDoc *doc;
+  xmlNode *node;
+  Lisp_Object result = Qnil;	/* Returned as is if parsing fails.  */
+  const char *burl = "";
+
+  LIBXML_TEST_VERSION;
+  CHECK_STRING (string);
+  if (! NILP (base_url)) {
+    CHECK_STRING (base_url);
+    burl = (const char *) SDATA (base_url);
+  }
+
+  if (htmlp)
+    doc = htmlReadMemory ((const char *) SDATA (string), SBYTES (string),
+                          burl, "utf-8", HTML_PARSE_RECOVER|HTML_PARSE_NONET|
+                          HTML_PARSE_NOWARNING|HTML_PARSE_NOERROR);
+  else
+    doc = xmlReadMemory ((const char *) SDATA (string), SBYTES (string),
+                         burl, "utf-8",
+                         XML_PARSE_NONET|XML_PARSE_NOWARNING|XML_PARSE_NOERROR);
+
+  if (doc != NULL) {
+    node = xmlDocGetRootElement (doc);
+    if (node != NULL)
+      result = make_dom (node);
+    /* Free only this document.  xmlCleanupParser releases library-wide
+       state and belongs at process exit, not after every parse.  */
+    xmlFreeDoc (doc);
+  }
+  return result;
+}
+
+DEFUN ("html-parse-string", Fhtml_parse_string, Shtml_parse_string, 1, 2, 0,
+       doc: /* Parse STRING as an HTML document and return the parse tree.
+If optional BASE-URL is non-nil, the parser uses it as the document's base URL.  */)
+  (Lisp_Object string, Lisp_Object base_url)
+{
+  return parse_buffer (string, base_url, 1);
+}
+
+DEFUN ("xml-parse-string", Fxml_parse_string, Sxml_parse_string, 1, 2, 0,
+       doc: /* Parse STRING as an XML document and return the parse tree.
+If optional BASE-URL is non-nil, the parser uses it as the document's base URL.  */)
+  (Lisp_Object string, Lisp_Object base_url)
+{
+  return parse_buffer (string, base_url, 0);
+}
+
+
+/***********************************************************************
+   Initialization: hand the subrs defined in this file to defsubr.
+ ***********************************************************************/
+void
+syms_of_xml (void)
+{
+  defsubr (&Shtml_parse_string);	/* html-parse-string */
+  defsubr (&Sxml_parse_string);	/* xml-parse-string */
+}
+
+#endif /* HAVE_LIBXML2 */