MyXML源码分析系列:我自己写的一个XML解析-基于状态图

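项目需要,就自己写了一个解析XML报文的函数,返回值有4种(下面的序号只是列举顺序,实际取值以后面的宏定义为准):

1 当前XML不完整,还需要继续接收字符(XML_NO_COMPLETE_LOSS_CHAR)。

2 能找到一个完整的XML,并且没有多余的字符剩下(XML_COMPLETE)。

3 能找到一个完整的XML节点,并且还有多余的字符留在缓冲区中(XML_NO_COMPLETE_MORE_CHAR)。

4 报文格式错误(XML_STRING_ERROR)。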

稍微改造下,添加一些处理节点和属性的代码,

就可以构造成码农版XML解析器。(有兴趣的码农可以拿过去改造下,很简单。)

PS:我项目中的XML是从网络报文中抓取的,不包含换行符和多余的空白字符;

如果是从文件中读取,就需要考虑这些因素,当然也不难,仔细一点就可以。

废话不多说,上代码:


//Result codes returned by im_xml_is_complete().
#define XML_STRING_ERROR		0//the input is malformed; scanning cannot continue
#define XML_COMPLETE			1 //exactly one complete element, with no characters left over
#define XML_NO_COMPLETE_LOSS_CHAR	2//the input ended before the element was complete (more characters are needed)
#define XML_NO_COMPLETE_MORE_CHAR	3//one complete element was found and further characters follow it

//States of the scanner in im_xml_is_complete().
//The numbers below label the delimiter characters, in order of appearance,
//of the three constructs the state diagram was drawn from:
//<iq>...</iq> 	1 2 3 4 5
//<iq />			1  6 7
// from=" " 		8  9 10
#define XML_STATE_CLOSED						0//initial state, nothing consumed yet
#define XML_STATE_RECEIVED_TAG_FIRST_CHAR				1//character kind 1 seen: the '<' that opens a start tag (see the two element forms above)
#define XML_STATE_RECEIVED_TAG_SECOND_CHAR				2//kind 2: the '>' that ends a start tag; also re-entered after a child element closes
#define XML_STATE_RECEIVED_TAG_THIRD_CHAR				3//kind 3: the '<' that begins an end tag
#define XML_STATE_RECEIVED_TAG_FORTH_CHAR				4//kind 4: the '/' of an end tag
#define XML_STATE_RECEIVED_TAG_FIFTH_CHAR				5//kind 5 in the legend; im_xml_is_complete() never assigns this state
#define XML_STATE_RECEIVED_TAG_SIXTH_CHAR				6//kind 6: the '/' of a self-closing tag
#define XML_STATE_RECEIVED_TAG_SEVENTH_CHAR 				7//kind 7 in the legend; im_xml_is_complete() never assigns this state
#define XML_STATE_RECEIVED_TAG_EIGHTH_CHAR				8//kind 8: the '=' after an attribute name
#define XML_STATE_RECEIVED_TAG_NINTH_CHAR				9//kind 9: the '"' that opens an attribute value
#define XML_STATE_RECEIVED_TAG_TENTH_CHAR				10//kind 10: the '"' that closes an attribute value
#define XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR			11//inside the name of a start tag
#define XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR 		        12//inside the name of an end tag
#define XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR			13//after a space inside a start tag, waiting for an attribute name	
#define XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR				14//inside an attribute name
#define XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR		        15//inside an attribute value
#define XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR			16//inside the text content of an element


C语言代码如下:

//originlength表示其最初的长度,不需要str以\0结尾
STATIC INT32 im_xml_is_complete(INT8* str,UINT32 originlength,UINT32* length,INT8** innertextBegin,INT8** innertextEnd)
{	
	UINT32 layer=0;//当前存在的节点的层次
	INT32 result=R_ERROR;
	INT32 state=XML_STATE_CLOSED;//最开始为关闭状态 
	INT8  c=0;
	INT8* p=str;
	UINT32	innertext=0;//0表示还没有登记过,1表示已经登记过
	//innertextBegin,innertextEnd可以为空
	if(NULL==str || 0==originlength || str[0]!='<' || NULL==length)
	{
		return XML_STRING_ERROR;
	}
	//im_log(DEBUG_LEVEL,"state: XML_STATE_CLOSED");
	//表明是一个有效的字符串
	while('\0'!=*p)
	{
		c=*p;
		//im_log(DEBUG_LEVEL,"get char %c",c);
		if(XML_STATE_CLOSED==state)
		{
			if('<'==c) 
			{
				state=XML_STATE_RECEIVED_TAG_FIRST_CHAR;
				layer++;
			}
			else 
				goto error;
		}
		else if(XML_STATE_RECEIVED_TAG_FIRST_CHAR==state)
		{
			if('<'==c||'/'==c||'>'==c||' '==c)
				goto error;
			else 
				state=XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR;
		}
		else if(XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR==state)
		{
			if('<'==c)
				goto error;
			else if('/'==c)
			{
				state=XML_STATE_RECEIVED_TAG_SIXTH_CHAR;
			}
			else if('>'==c) state=XML_STATE_RECEIVED_TAG_SECOND_CHAR;
			else if(' '==c) state=XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR;
			else 
				state=XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR;
		}
		else if(XML_STATE_RECEIVED_TAG_SIXTH_CHAR==state)
		{
			if('>'==c)
			{
				//表明当前标签结束了
				layer--;
				//但是有可能存在父亲节点,所以需要根据父亲节点的层次来判断
				if(0==layer)//表明没有节点了,到达了closed状态
				{
					//此时已经可以返回了,返回时返回两种状态
					//if(*(p+1)=='\0')//表明正好是一个完整的XML
					if(p+1-str==originlength)//表明正好是一个完整的XML
					{
						result=XML_COMPLETE;
						*length=p-str+1;
						return result;
					}
					else
					{
						//表明还存在多余的字符串
						//则计算出当前的字符串长度
						*length=p-str+1;
						return XML_NO_COMPLETE_MORE_CHAR;
					}
				}
				else
				{
					//表明还存在父节点
					state=XML_STATE_RECEIVED_TAG_SECOND_CHAR;
				}
			}
			else 
				goto error;
		}
		else if(XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR==state)
		{
			//表明在等待收到属性字符串
			if('<'==c||'='==c||'"'==c) 
							goto error;			
			else if('>'==c)
			{
				state=XML_STATE_RECEIVED_TAG_SECOND_CHAR;
			}
			else if('/'==c)
			{
				state=XML_STATE_RECEIVED_TAG_SIXTH_CHAR;
			}
			else if(' '==c) //保持当前状态
				state=XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR;
			else //收到了有效属性字符
				state=XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR;
				
		}
		else if(XML_STATE_RECEIVED_TAG_SECOND_CHAR==state)
		{
			if('<'==c)
			{
				//收到这个字符时,不清楚当前的状态,需要根据后面一个字符来判断
				//if(*(p+1)=='\0')
				if(p+1-str==originlength)
				{
					return XML_NO_COMPLETE_LOSS_CHAR;
				}
				else if(*(p+1)=='/')
				{
					//表明是准备结束的
					state=XML_STATE_RECEIVED_TAG_THIRD_CHAR;
				}
				else
				{
					//确实是起一个新的标签
						layer++;
						state=XML_STATE_RECEIVED_TAG_FIRST_CHAR;
				}
			}
			else if('/'==c ||'>'==c)
			{
				goto error;
			}
			else
			{
				state=XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR;
				//如果是第一个innertext则登记
				if(0==innertext&&NULL!=innertextBegin)
				{
					//此时不应该将innertext置为1
					*innertextBegin=p;
				}
			}
		}
		else if(XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR==state)
		{
			if('<'==c)
			{
				state=XML_STATE_RECEIVED_TAG_THIRD_CHAR;
				if(0==innertext&&NULL!=innertextEnd)
				{
					*innertextEnd=p-1;
					innertext=1;//表示以后不写了,找到第一个就行了
				}
			}
			//else if('/'==c||'>'==c) goto error;
			//支持里面包含/字符
			else if('>'==c) goto error;
			else 
					state=XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR;
		}
		else if(XML_STATE_RECEIVED_TAG_THIRD_CHAR==state)
		{
			if('/'==c)
				state=XML_STATE_RECEIVED_TAG_FORTH_CHAR;
			else 	
				goto error;
		}
		else if(XML_STATE_RECEIVED_TAG_FORTH_CHAR==state)
		{
			//第四种状态
			if('<'==c || '/'==c ||'>'==c)
					goto error;
			else 
				state=XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR;
		}
		else if(XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR==state)
		{
			//已经收到了结束字符
			if('<'==c || '/'==c || ' '==c)
				goto error;
			else if('>'==c)
			{
				//表明结束了
				layer--;
				//但是有可能存在父亲节点,所以需要根据父亲节点的层次来判断
				if(0==layer)//表明没有节点了,到达了closed状态
				{
					//此时已经可以返回了,返回时返回两种状态
					//if(*(p+1)=='\0')
					if(p+1-str==originlength)
					{
						result=XML_COMPLETE;
						*length=p-str+1;
						return result;
					}
					else
					{
						//表明还存在多余的字符串
						//则计算出当前的字符串长度
						*length=p-str+1;
						return XML_NO_COMPLETE_MORE_CHAR;
					}
				}
				else
				{
					//表明还存在父节点
					state=XML_STATE_RECEIVED_TAG_SECOND_CHAR;
				}
			}
			else
				state=XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR;
				
		}
		else if(XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR==state)
		{
			if('<'==c||' '==c||'>'==c||'/'==c||'"'==c)
				goto error;
			else if('='==c)
			{
				state=XML_STATE_RECEIVED_TAG_EIGHTH_CHAR;
			}
			else
				//保持状态
				state=XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR;
				
		}
		else if(XML_STATE_RECEIVED_TAG_EIGHTH_CHAR==state)
		{
			//这种情况下只能是收到"
			if('"'==c) state=XML_STATE_RECEIVED_TAG_NINTH_CHAR;
			else 
				goto error;
		}
		else if(XML_STATE_RECEIVED_TAG_NINTH_CHAR==state)
		{
			//已经收到了属性内容的开始的"
			//这里不对属性字符串的内容做过多限制
			if(c=='"')
				state=XML_STATE_RECEIVED_TAG_TENTH_CHAR;
			else
			{
				state=XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR;
			}
		}
		else if(XML_STATE_RECEIVED_TAG_TENTH_CHAR==state)
		{
			if('<'==c||'='==c||'"'==c)
				goto error;
			else if('/'==c)
			{
				state=XML_STATE_RECEIVED_TAG_SIXTH_CHAR;
			}
			else if('>'==c) state=XML_STATE_RECEIVED_TAG_SECOND_CHAR;
			else if(' '==c) state=XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR;
			else 
			{
				//错误的
				goto error;
			}				
		}
		else if(XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR==state)
		{
			if(c=='"')
				state=XML_STATE_RECEIVED_TAG_TENTH_CHAR;
			else
			{
					state=XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR;
			}
		}
		if(0)//暂时不打印
		{
			switch(state)
			{
				case XML_STATE_CLOSED:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_CLOSED");
					break;
				case XML_STATE_RECEIVED_TAG_FIRST_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_FIRST_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_SECOND_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_SECOND_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_THIRD_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_THIRD_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_FORTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_FORTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_FIFTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_FIFTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_SIXTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_SIXTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_SEVENTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_SEVENTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_EIGHTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_EIGHTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_NINTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_NINTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_TENTH_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_TENTH_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_START_CONTENT_CHAR");
					break;
				case XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_TAG_END_CONTENT_CHAR");
					break;
				case XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_WAITING_FOR_RECEIVE_ATTRIBUTES_CHAR");
					break;
				case XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_ATTRIBUTE_TAG_CHAR");
					break;
				case XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_ATTRIBUTE_CONTENT_CHAR");
					break;
				case XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR:
					im_log(DEBUG_LEVEL,"state:%s","XML_STATE_RECEIVED_INNER_TEXT_CONTENT_CHAR");
					break;
				default:
					break;
			}
		}
		p++;
	}	
	return XML_NO_COMPLETE_LOSS_CHAR;
error:
	return XML_STRING_ERROR;	
}



根据这种思路,就可以写出JSON的解析器,

思路还是如上,根据状态图来考虑即可。

特点是流式解析。

转载于:https://my.oschina.net/qiangzigege/blog/178772

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值