使用XmlLite+WinHttp下载RSS

最新推荐文章于 2024-10-01 10:57:28 发布

optman

最新推荐文章于 2024-10-01 10:57:28 发布

阅读量2.1k

点赞数

文章标签： integer stream session null whitespace xml

本文链接：https://blog.csdn.net/optman/article/details/4019729

版权

我在CSDN的第一篇Blog是关于XML，介绍使用MSXML的DOM API来处理XML。当时曾经说后续介绍另一种处理模式SAX，结果一晃过了很多年也没有写。实在是因为SAX模型太过麻烦，也没实际用过。SAX是够麻烦的，连.NET Framework都没有提供类似的API，而改为提供XmlReader/Writer。在非托管环境下，XmlLite提供了类似的XmlReader/Writer接口。

SAX和XmlReader一样，都是用于流式处理XML的。使用DOM API必须先把整个XML文档加载进来，这样会占用大量内存，特别是XML文档很大的时候。那么，我们能不能一边读一边处理呢？这就是SAX和XmlReader要解决的问题。区别在于SAX采用Push模式，而XmlReader采用Pull模式。在Push模式下，调用者被动地等待通知（当前读到了什么位置）；而在Pull模式下，调用者控制着读取过程：当前读到了什么、要不要继续往下读，都由调用者决定。因此在Pull模式下，程序流程更清晰明了。

为了更好的理解，试着使用XmlLite+WinHttp写了一个简单的RSS下载解析测试。

这里遇到的一个难题是：如果读到了不认识的XML节点，想跳过怎么办？特别是，如果这个节点还包含更多的子节点怎么办？我的解决办法是递归地读，直到遇到结束节点(EndElement)，具体请参见ReadElementToEnd函数。另外需要创建一个IMultiLanguage2对象并设置给XmlReader，这样XmlLite就可以自动识别和转换GB2312编码的Stream了，因为默认只能处理UTF-8编码的。

//FeedReader.h #include <xmllite.h> #include <mlang.h> class XmlLiteReaderHelper { protected: IXmlReader* m_reader; public: XmlLiteReaderHelper(IXmlReader* reader) { m_reader=reader; } const wchar_t* GetAttributeValue(wchar_t* attributeName) { if(S_OK==m_reader->MoveToAttributeByName(attributeName,L"")) { PCWSTR value = 0; m_reader->GetValue(&value,NULL); m_reader->MoveToElement(); //back return value; } return NULL; } bool ReadElement(LPCWSTR* namespaceUri,LPCWSTR* localName) { XmlNodeType nodeType = XmlNodeType_None; while(S_OK == m_reader->Read(&nodeType)) { if(nodeType==XmlNodeType_Element) { m_reader->GetNamespaceUri(namespaceUri,NULL); m_reader->GetLocalName(localName,NULL); return true; } else if(nodeType==XmlNodeType_EndElement) { return false; } } return false; } void ReadElementToEnd() { if(m_reader->IsEmptyElement()) return; XmlNodeType nodeType = XmlNodeType_None; while(S_OK == m_reader->Read(&nodeType)) { if(nodeType==XmlNodeType_Element) { ReadElementToEnd(); } else if(nodeType==XmlNodeType_EndElement) { return; } } } bool FindElement(LPCWSTR namespaceUri,LPCWSTR localName) { PCWSTR _namespaceUri = 0; PCWSTR _localName = 0; while(true) { if(!ReadElement(&_namespaceUri,&_localName)) return false; if(StrCmp(_localName,localName)==0&&StrCmp(_namespaceUri,namespaceUri)==0) { return true; } ReadElementToEnd(); } return false; } const wchar_t* GetNodeValue() { XmlNodeType nodeType = XmlNodeType_None; do{ m_reader->Read(&nodeType); }while(nodeType==XmlNodeType_Whitespace || nodeType==XmlNodeType_Comment); PCWSTR value = 0; if(nodeType==XmlNodeType_Text || nodeType==XmlNodeType_CDATA) m_reader->GetValue(&value,NULL); return value; } }; void ReadRssFeedItem(XmlLiteReaderHelper& helper) { PCWSTR namespaceUri = 0; PCWSTR localName = 0; while(helper.ReadElement(&namespaceUri,&localName)) { if(StrCmp(localName,L"title")==0) { printf("%s",CW2A(helper.GetNodeValue())); } else if(StrCmp(localName,L"link")==0) { printf("%s",CW2A(helper.GetNodeValue())); } else 
if(StrCmp(localName,L"description")==0) { printf("%s",CW2A(helper.GetNodeValue())); } helper.ReadElementToEnd(); } printf("/r/n"); } void ReadAtomFeedItem(XmlLiteReaderHelper& helper) { PCWSTR namespaceUri = 0; PCWSTR localName = 0; while(helper.ReadElement(&namespaceUri,&localName)) { if(StrCmp(localName,L"title")==0) { printf("%s",CW2A(helper.GetNodeValue())); } else if(StrCmp(localName,L"link")==0) { printf("%s",CW2A(helper.GetAttributeValue(L"href"))); } else if(StrCmp(localName,L"summary")==0) { printf("%s",CW2A(helper.GetNodeValue())); } helper.ReadElementToEnd(); } printf("/r/n"); } void ReadFeed(IStream* stream) { CComPtr<IXmlReader> reader; ::CreateXmlReader(__uuidof(IXmlReader),reinterpret_cast<void**>(&reader),0); CComPtr<IMultiLanguage2> ml; CoCreateInstance(CLSID_CMultiLanguage, NULL,CLSCTX_ALL, IID_IMultiLanguage2 , (void**)&ml); reader->SetProperty(XmlReaderProperty_MultiLanguage,(LONG_PTR)(IMultiLanguage2*)ml); reader->SetInput(stream); XmlLiteReaderHelper helper(reader); int feedItemCount=0; PCWSTR namespaceUri = 0; PCWSTR localName = 0; helper.ReadElement(&namespaceUri,&localName); if(StrCmp(localName,L"rss")==0) { printf("rss version: %S/r/n",helper.GetAttributeValue(L"version")); if(helper.FindElement(L"",L"channel")) { while(helper.ReadElement(&namespaceUri,&localName)) { if(StrCmp(localName,L"item")==0) { printf("%d - ",feedItemCount++); ReadRssFeedItem(helper); } else helper.ReadElementToEnd(); } } } else if(StrCmp(localName,L"feed")==0) { printf("atom version: %S/r/n",helper.GetAttributeValue(L"version")); while(helper.ReadElement(&namespaceUri,&localName)) { if(StrCmp(localName,L"entry")==0) { printf("%d - ",feedItemCount++); ReadAtomFeedItem(helper); } else helper.ReadElementToEnd(); } } else printf("not support format!/r/n"); }

XmlReader只能从IStream对象中读取数据，但是WinHttp并不提供这样的接口怎么办？于是自己构造一个IStream对象：只实现IUnknown接口以及Read、Write方法，IStream的其余方法一律返回E_NOTIMPL。

//WinHttpStream.h #pragma once #include <Winhttp.h> class WinHttpStream : public IStream { private: LONG m_refcount; HINTERNET m_Request; public: WinHttpStream(HINTERNET request) { m_refcount = 1; m_Request=request; } ~WinHttpStream() { } public: //IStream public: virtual HRESULT STDMETHODCALLTYPE QueryInterface(REFIID iid, void ** ppvObject) { if (iid == __uuidof(IUnknown) || iid == __uuidof(IStream) || iid == __uuidof(ISequentialStream)) { *ppvObject = static_cast<IStream*>(this); AddRef(); return S_OK; } else return E_NOINTERFACE; } virtual ULONG STDMETHODCALLTYPE AddRef(void) { return (ULONG)InterlockedIncrement(&m_refcount); } virtual ULONG STDMETHODCALLTYPE Release(void) { ULONG res = (ULONG) InterlockedDecrement(&m_refcount); if (res == 0) delete this; return res; } // ISequentialStream Interface public: virtual HRESULT STDMETHODCALLTYPE Read(void* pv, ULONG cb, ULONG* pcbRead) { BOOL result; if (result=::WinHttpReceiveResponse(m_Request,0)) { result=WinHttpReadData( m_Request, (LPVOID)pv,cb , pcbRead ); } return (result) ? S_OK : HRESULT_FROM_WIN32(GetLastError()); } virtual HRESULT STDMETHODCALLTYPE Write(void const* pv, ULONG cb, ULONG* pcbWritten) { BOOL result=::WinHttpWriteData( m_Request, (LPVOID)pv,cb , pcbWritten ); return (result) ? 
S_OK : HRESULT_FROM_WIN32(GetLastError()); } // IStream Interface public: virtual HRESULT STDMETHODCALLTYPE SetSize(ULARGE_INTEGER) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE CopyTo(IStream*, ULARGE_INTEGER, ULARGE_INTEGER*, ULARGE_INTEGER*) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE Commit(DWORD) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE Revert(void) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE Clone(IStream **) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE Seek(LARGE_INTEGER liDistanceToMove, DWORD dwOrigin, ULARGE_INTEGER* lpNewFilePointer) { return E_NOTIMPL; } virtual HRESULT STDMETHODCALLTYPE Stat(STATSTG* pStatstg, DWORD grfStatFlag) { return E_NOTIMPL; } }; class WinHttpRequest { public: CComPtr<IStream> m_Stream; HINTERNET m_Request,m_Connection,m_Session; WinHttpRequest() { m_Request=m_Connection=m_Session=NULL; } ~WinHttpRequest() { if( m_Request ) WinHttpCloseHandle( m_Request ); if( m_Connection ) WinHttpCloseHandle( m_Connection ); if( m_Session ) WinHttpCloseHandle( m_Session ); } public: DWORD GetStatusCode() { ::WinHttpReceiveResponse(m_Request,0); DWORD statusCode = 0; DWORD statusCodeSize = sizeof(DWORD); ::WinHttpQueryHeaders(m_Request, WINHTTP_QUERY_STATUS_CODE | WINHTTP_QUERY_FLAG_NUMBER, WINHTTP_HEADER_NAME_BY_INDEX, &statusCode, &statusCodeSize, WINHTTP_NO_HEADER_INDEX); return statusCode; } IStream* Open(wchar_t* url) { wchar_t hostName[100]; URL_COMPONENTS urlComp; ZeroMemory(&urlComp, sizeof(urlComp)); urlComp.dwStructSize = sizeof(urlComp); urlComp.dwSchemeLength = -1; urlComp.dwHostNameLength = -1; urlComp.dwUrlPathLength = -1; urlComp.dwExtraInfoLength = -1; if(!::WinHttpCrackUrl(url,wcslen(url),0,&urlComp)) return NULL; 
wcsncpy_s(hostName,sizeof(hostName)/sizeof(wchar_t),urlComp.lpszHostName,urlComp.dwHostNameLength); m_Session = WinHttpOpen( L"WinHTTP Example/1.0", WINHTTP_ACCESS_TYPE_DEFAULT_PROXY, WINHTTP_NO_PROXY_NAME, WINHTTP_NO_PROXY_BYPASS, 0); if( !m_Session ) return NULL; m_Connection = WinHttpConnect( m_Session, hostName,urlComp.nPort, 0 ); if(!m_Connection ) return NULL; m_Request = WinHttpOpenRequest( m_Connection, L"GET", urlComp.lpszUrlPath, NULL, WINHTTP_NO_REFERER, WINHTTP_DEFAULT_ACCEPT_TYPES, 0 ); if(!m_Request ) return NULL; if( !::WinHttpSendRequest( m_Request,WINHTTP_NO_ADDITIONAL_HEADERS, 0, WINHTTP_NO_REQUEST_DATA, 0, 0, 0 )) return NULL; m_Stream.Attach(new WinHttpStream(m_Request)); return m_Stream; } };

这样就搞定了，由XmlReader驱动WinHttp从网络上读取数据，边读边处理，绝不浪费多余的内存和带宽。

// Downloads the feed at `url` and prints its items to stdout.
// The request object owns the WinHTTP handles and the response stream, so
// parsing has to finish before it goes out of scope. Failures are reported
// on stderr instead of being dropped silently.
void ReadFeed(wchar_t* url)
{
    WinHttpRequest request;

    IStream* stream = request.Open(url);
    if (stream == NULL)
    {
        // Bad URL, or the connection/request could not be set up.
        fprintf(stderr, "ReadFeed: unable to open the request\r\n");
        return;
    }

    const unsigned long status = request.GetStatusCode();
    if (status != HTTP_STATUS_OK)
    {
        fprintf(stderr, "ReadFeed: unexpected HTTP status %lu\r\n", status);
        return;
    }

    ReadFeed(stream);
}

参考：

MSDN Magazine介绍XmlLite的文章

http://msdn.microsoft.com/zh-cn/magazine/cc163436.aspx

MSDN文档

http://msdn.microsoft.com/en-us/ms752872.aspx