libxml2使用说明

Libxml2库的编译和使用

Libxml2库提供了C语言解析和构造xml文档的接口,为后台C语言处理程序和前台应用程序提供了一种通用的通信方式。

本文以libxml2-2.6.30版本来说明Libxml2库的使用方法。

1.    编译库文件

libxml2-2.6.30.tar.gz文件解压后,进入libxml2-2.6.30文件夹,顺序执行以下命令:

chmod +x ./configure

./configure

make

make install

“chmod +x ./configure”命令增加configure脚本的可执行权限;

“./configure”脚本根据当前编译系统的实际情况生成相应的makefile文件;

“make”命令执行上一命令中生成的makefile文件生成相应的目标文件;

“make install”命令主要把目标文件拷贝到/usr/local目录下,

/usr/local/lib目录下为以下库文件:

libxml2.a  libxml2.la  libxml2.so  libxml2.so.2  libxml2.so.2.6.30  pkgconfig  xml2Conf.sh

/usr/local/include/libxml2目录下是使用Libxml2库时需要的头文件,这些头文件位于其中的libxml子目录下;

2.    使用Libxml2库

Libxml2库的api参考可以从http://www.xmlsoft.org/html/index.html查询。下面以解析一个简单的xml文件为例,给出一个完整的例子。

Xml文档:

<ioMsg>

    <type>she</type>

    <subtype>

       <st1>123</st1>

       <st2>563</st2>

    </subtype>

</ioMsg>

C解析代码xmltest.c:

view plaincopy to clipboardprint?

   1. #include <libxml/parser.h>  
   2. #include <libxml/tree.h>  
   3.   
   4. int main(int argc, char* argv[])  
   5. {  
   6.     xmlDocPtr doc;           //定义解析文档指针  
   7.     xmlNodePtr curNode;      //定义结点指针(你需要它为了在各个结点间移动)  
   8.     xmlChar *szKey;          //临时字符串变量  
   9.     char *szDocName;  
  10.       
  11.     if (argc <= 1)   
  12.     {  
  13.        printf("Usage: %s docname\n", argv[0]);  
  14.        return(0);  
  15.     }  
  16.     szDocName = argv[1];  
  17.     doc = xmlReadFile(szDocName,"GB2312",XML_PARSE_RECOVER); //解析文件  
  18.     if (NULL == doc)  
  19.     {    
  20.        printf("Document not parsed successfully\n");      
  21.        return -1;  
  22.     }  
  23.     curNode = xmlDocGetRootElement(doc); //确定文档根元素  
  24.     if (NULL == curNode)  
  25.     {  
  26.        printf("empty document\n");  
  27.        xmlFreeDoc(doc);  
  28.        return -1;  
  29.     }  
  30.     if (xmlStrcmp(curNode->name, BAD_CAST "ioMsg"))  
  31.     {  
  32.        printf("document of the wrong type, root node != ioMsg\n");  
  33.        xmlFreeDoc(doc);  
  34.        return -1;  
  35.     }  
  36.     curNode = curNode->children;  
  37.     while(curNode != NULL)  
  38.     {  
  39.        //取出节点中的内容  
  40.        szKey = xmlNodeGetContent(curNode);  
  41.        printf("Content value =%s\n", szKey);  
  42.        curNode = curNode->next;  
  43.      }  
  44.      xmlFreeDoc(doc);  
  45.     return 0;     
  46. }  

#include <libxml/parser.h> #include <libxml/tree.h> int main(int argc, char* argv[]) { xmlDocPtr doc; //定义解析文档指针 xmlNodePtr curNode; //定义结点指针(你需要它为了在各个结点间移动) xmlChar *szKey; //临时字符串变量 char *szDocName; if (argc <= 1) { printf("Usage: %s docname\n", argv[0]); return(0); } szDocName = argv[1]; doc = xmlReadFile(szDocName,"GB2312",XML_PARSE_RECOVER); //解析文件 if (NULL == doc) { printf("Document not parsed successfully\n"); return -1; } curNode = xmlDocGetRootElement(doc); //确定文档根元素 if (NULL == curNode) { printf("empty document\n"); xmlFreeDoc(doc); return -1; } if (xmlStrcmp(curNode->name, BAD_CAST "ioMsg")) { printf("document of the wrong type, root node != ioMsg\n"); xmlFreeDoc(doc); return -1; } curNode = curNode->children; while(curNode != NULL) { //取出节点中的内容 szKey = xmlNodeGetContent(curNode); printf("Content value =%s\n", szKey); curNode = curNode->next; } xmlFreeDoc(doc); return 0; }

3.    编译xml解析程序

假设Libxml2库是按步骤1的编译方式,其库文件和头文件分别位于/usr/local/lib和/usr/local/include/libxml2目录下。

动态库编译方式:

cc -o xmltest -I/usr/local/include/libxml2 xmltest.c -L/usr/local/lib -lxml2

 

静态库的编译方式:

cc -o xmltest -I/usr/local/include/libxml2 xmltest.c /usr/local/lib/libxml2.a -lz -lm

“-I/usr/local/include/libxml2”指定Libxml2库的头文件所在的路径,“-L/usr/local/lib”指定动态库所在路径。

静态链接时,除需要libxml2.a外,还要链接libz.a和数学库(-lm)。


支持GB2312的LIBXML2库源代码的修改

xmlSaveFileEnc( this->szConfigFile, this->m_doc, "GB2312" );

Libxml2是开源的xml解析器,使用中发现它不支持中文,于是自己加了些代码,支持了GB2312,改了如下代码:
//encoding.c
xmlCharEncoding
xmlParseCharEncoding(const char* name)
{
    const char *alias;
    char upper[500];
    int i;

    if (name == NULL)
    return(XML_CHAR_ENCODING_NONE);

    /**//*
     * Do the alias resolution
     */
    alias = xmlGetEncodingAlias(name);
    if (alias != NULL)
    name = alias;

    for (i = 0;i < 499;i++) {
        upper[i] = toupper(name[i]);
    if (upper[i] == 0) break;
    }
    upper[i] = 0;

    if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
    if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
    if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);

    //中文化处理,让该XML分析器支持中文GB2312 BY JRuiui.NET 2005.12.23
    if (!strcmp(upper, "GB2312")) return (XML_CHAR_ENCODING_GB2312);
    /**//*
     * NOTE: if we were able to parse this, the endianness of UTF16 is
     *       already found and in use
     */
    if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
    if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
    
    if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
    if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
    if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);

    /**//*
     * NOTE: if we were able to parse this, the endianness of UCS4 is
     *       already found and in use
     */
    if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
    if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
    if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);

    
    if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
    if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
    if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);

    if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
    if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
    if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);

    if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
    if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
    if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
    if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
    if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
    if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
    if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);

    if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
    if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
    if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);

#ifdef DEBUG_ENCODING
    xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s ", name);
#endif
    return(XML_CHAR_ENCODING_ERROR);
}

/*
 * xmlGetCharEncodingName:
 * @enc:  an xmlCharEncoding value
 *
 * Give the name string associated with an encoding constant. Constants
 * that share a name are grouped under one case list.
 *
 * Returns the name as a constant string, or NULL for
 * XML_CHAR_ENCODING_ERROR, XML_CHAR_ENCODING_NONE,
 * XML_CHAR_ENCODING_ASCII and any value not listed here.
 */
const char*
xmlGetCharEncodingName(xmlCharEncoding enc) {
    const char *label = NULL;

    switch (enc) {
        /* GB2312 (Chinese) support */
        case XML_CHAR_ENCODING_GB2312:
            label = "GB2312";
            break;

        case XML_CHAR_ENCODING_UTF8:
            label = "UTF-8";
            break;
        case XML_CHAR_ENCODING_UTF16LE:
        case XML_CHAR_ENCODING_UTF16BE:
            label = "UTF-16";
            break;
        case XML_CHAR_ENCODING_EBCDIC:
            label = "EBCDIC";
            break;
        case XML_CHAR_ENCODING_UCS4LE:
        case XML_CHAR_ENCODING_UCS4BE:
        case XML_CHAR_ENCODING_UCS4_2143:
        case XML_CHAR_ENCODING_UCS4_3412:
            label = "ISO-10646-UCS-4";
            break;
        case XML_CHAR_ENCODING_UCS2:
            label = "ISO-10646-UCS-2";
            break;

        case XML_CHAR_ENCODING_8859_1:
            label = "ISO-8859-1";
            break;
        case XML_CHAR_ENCODING_8859_2:
            label = "ISO-8859-2";
            break;
        case XML_CHAR_ENCODING_8859_3:
            label = "ISO-8859-3";
            break;
        case XML_CHAR_ENCODING_8859_4:
            label = "ISO-8859-4";
            break;
        case XML_CHAR_ENCODING_8859_5:
            label = "ISO-8859-5";
            break;
        case XML_CHAR_ENCODING_8859_6:
            label = "ISO-8859-6";
            break;
        case XML_CHAR_ENCODING_8859_7:
            label = "ISO-8859-7";
            break;
        case XML_CHAR_ENCODING_8859_8:
            label = "ISO-8859-8";
            break;
        case XML_CHAR_ENCODING_8859_9:
            label = "ISO-8859-9";
            break;

        case XML_CHAR_ENCODING_2022_JP:
            label = "ISO-2022-JP";
            break;
        case XML_CHAR_ENCODING_SHIFT_JIS:
            label = "Shift-JIS";
            break;
        case XML_CHAR_ENCODING_EUC_JP:
            label = "EUC-JP";
            break;

        /* No name is associated with these. */
        case XML_CHAR_ENCODING_ERROR:
        case XML_CHAR_ENCODING_NONE:
        case XML_CHAR_ENCODING_ASCII:
        default:
            break;
    }
    return(label);
}

//parserInternals.c
/*
 * xmlCurrentChar:
 * @ctxt:  the XML parser context
 * @len:   out parameter receiving the number of input bytes that make up
 *         the returned character (0 when a pushed-back token is returned)
 *
 * Return the character found at the current input position, without
 * advancing past it (the only pointer movement is the skip over a CR that
 * is followed by LF).
 *
 *  - At end of parsing (XML_PARSER_EOF) 0 is returned and @len is not set.
 *  - A pending ctxt->token is returned with *len == 0.
 *  - Bytes in 0x20..0x7F are returned directly with *len == 1.
 *  - When the input's declared encoding is exactly "GB2312", the raw byte
 *    is returned with *len == 1 and ctxt->charset is set to
 *    XML_CHAR_ENCODING_GB2312.
 *  - When ctxt->charset is UTF-8, a multi-byte sequence is decoded and
 *    checked; a malformed sequence reports an error, switches
 *    ctxt->charset to ISO-8859-1 and returns the raw byte.
 *  - Otherwise a single-byte encoding is assumed.
 *
 * In the single-byte paths (other than the GB2312 one) CR and CR LF are
 * both returned as 0xA.
 */
int
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    if (ctxt->instate == XML_PARSER_EOF)
        return(0);

    if (ctxt->token != 0) {
        *len = 0;
        return(ctxt->token);
    }
    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
        *len = 1;
        return((int) *ctxt->input->cur);
    }
    /*
     * GB2312 (Chinese) support: hand back the raw byte.
     * The encoding string is NULL when the input carries no declared
     * encoding, and every byte outside 0x20..0x7F (a newline, for
     * instance) reaches this test, so it must be checked before strcmp().
     * Note that this early return bypasses the CR/LF folding done below.
     */
    if ((ctxt->input->encoding != NULL) &&
        (!strcmp((const char *) ctxt->input->encoding, "GB2312")))
    {
        ctxt->charset = XML_CHAR_ENCODING_GB2312;
        *len = 1;
        return((int) *ctxt->input->cur);
    }
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
         *
         * Check for the 0x110000 limit too
         */
        const unsigned char *cur = ctxt->input->cur;
        unsigned char c;
        unsigned int val;

        c = *cur;
        if (c & 0x80) {
            /*
             * A zero continuation byte may just mean the buffer ran out:
             * ask for more input. xmlParserInputGrow() may relocate the
             * input buffer, so the local pointer is re-read afterwards.
             */
            if (cur[1] == 0) {
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
            if ((cur[1] & 0xc0) != 0x80)
                goto encoding_error;
            if ((c & 0xe0) == 0xe0) {

                if (cur[2] == 0) {
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
                if ((cur[2] & 0xc0) != 0x80)
                    goto encoding_error;
                if ((c & 0xf0) == 0xf0) {
                    if (cur[3] == 0) {
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        cur = ctxt->input->cur;
                    }
                    if (((c & 0xf8) != 0xf0) ||
                        ((cur[3] & 0xc0) != 0x80))
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    val = (cur[0] & 0x7) << 18;
                    val |= (cur[1] & 0x3f) << 12;
                    val |= (cur[2] & 0x3f) << 6;
                    val |= cur[3] & 0x3f;
                } else {
                    /* 3-byte code */
                    *len = 3;
                    val = (cur[0] & 0xf) << 12;
                    val |= (cur[1] & 0x3f) << 6;
                    val |= cur[2] & 0x3f;
                }
            } else {
                /* 2-byte code */
                *len = 2;
                val = (cur[0] & 0x1f) << 6;
                val |= cur[1] & 0x3f;
            }
            /* Decoded fine, but the code point may still be illegal in XML. */
            if (!IS_CHAR(val)) {
                if ((ctxt->sax != NULL) &&
                    (ctxt->sax->error != NULL))
                    ctxt->sax->error(ctxt->userData,
                                     "Char 0x%X out of allowed range ", val);
                ctxt->errNo = XML_ERR_INVALID_ENCODING;
                ctxt->wellFormed = 0;
                ctxt->disableSAX = 1;
            }
            return(val);
        } else {
            /* 1-byte code */
            *len = 1;
            if (*ctxt->input->cur == 0xD) {
                /* CR LF: step over the CR so the pair counts as one 0xA */
                if (ctxt->input->cur[1] == 0xA) {
                    ctxt->nbChars++;
                    ctxt->input->cur++;
                }
                return(0xA);
            }
            return((int) *ctxt->input->cur);
        }
    }
    /*
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if (*ctxt->input->cur == 0xD) {
        if (ctxt->input->cur[1] == 0xA) {
            ctxt->nbChars++;
            ctxt->input->cur++;
        }
        return(0xA);
    }
    return((int) *ctxt->input->cur);
encoding_error:
    /*
     * If we detect an UTF8 error that probably means that the
     * input encoding didn't get properly advertised in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
        ctxt->sax->error(ctxt->userData,
                         "Input is not proper UTF-8, indicate encoding ! ");
        ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X ",
                         ctxt->input->cur[0], ctxt->input->cur[1],
                         ctxt->input->cur[2], ctxt->input->cur[3]);
    }
    ctxt->errNo = XML_ERR_INVALID_ENCODING;

    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return((int) *ctxt->input->cur);
}


//encoding.h
/*
 * xmlCharEncoding:
 *
 * Identifiers for the character encodings this code refers to by constant.
 * The numeric values are spelled out explicitly; XML_CHAR_ENCODING_GB2312
 * is the entry appended for GB2312 (Chinese) support.
 */
typedef enum {
    XML_CHAR_ENCODING_ERROR     = -1, /* No char encoding detected */
    XML_CHAR_ENCODING_NONE      = 0,  /* No char encoding detected */
    XML_CHAR_ENCODING_UTF8      = 1,  /* UTF-8 */
    XML_CHAR_ENCODING_UTF16LE   = 2,  /* UTF-16 little endian */
    XML_CHAR_ENCODING_UTF16BE   = 3,  /* UTF-16 big endian */
    XML_CHAR_ENCODING_UCS4LE    = 4,  /* UCS-4 little endian */
    XML_CHAR_ENCODING_UCS4BE    = 5,  /* UCS-4 big endian */
    XML_CHAR_ENCODING_EBCDIC    = 6,  /* EBCDIC uh! */
    XML_CHAR_ENCODING_UCS4_2143 = 7,  /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS4_3412 = 8,  /* UCS-4 unusual ordering */
    XML_CHAR_ENCODING_UCS2      = 9,  /* UCS-2 */
    XML_CHAR_ENCODING_8859_1    = 10, /* ISO-8859-1 ISO Latin 1 */
    XML_CHAR_ENCODING_8859_2    = 11, /* ISO-8859-2 ISO Latin 2 */
    XML_CHAR_ENCODING_8859_3    = 12, /* ISO-8859-3 */
    XML_CHAR_ENCODING_8859_4    = 13, /* ISO-8859-4 */
    XML_CHAR_ENCODING_8859_5    = 14, /* ISO-8859-5 */
    XML_CHAR_ENCODING_8859_6    = 15, /* ISO-8859-6 */
    XML_CHAR_ENCODING_8859_7    = 16, /* ISO-8859-7 */
    XML_CHAR_ENCODING_8859_8    = 17, /* ISO-8859-8 */
    XML_CHAR_ENCODING_8859_9    = 18, /* ISO-8859-9 */
    XML_CHAR_ENCODING_2022_JP   = 19, /* ISO-2022-JP */
    XML_CHAR_ENCODING_SHIFT_JIS = 20, /* Shift_JIS */
    XML_CHAR_ENCODING_EUC_JP    = 21, /* EUC-JP */
    XML_CHAR_ENCODING_ASCII     = 22, /* pure ASCII */
    XML_CHAR_ENCODING_GB2312    = 23  /* GB2312 (Chinese support) */
} xmlCharEncoding;


  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值