VC++ 根据网址获取网页标题栏和关键字

#include <afxinet.h>  
#include <string>  
using namespace std;
// Opens strUrl and returns the URL that WinINet reports for the opened resource
// (INTERNET_OPTION_URL), truncated at its last '/'. Per the WinINet option
// documentation this URL differs from strUrl when the request was redirected.
//
// Returns an empty string when the URL cannot be opened as an Internet file,
// when the option query fails, or when the reported URL contains no '/'.
// Exceptions thrown by CInternetSession::OpenURL are not caught here; they
// propagate to the caller.
//
// NOTE: uses a char buffer, i.e. this assumes an MBCS (non-Unicode) build.
CString GetRedirectUrl(CString strUrl)
{
    CString sNewUrl;

    CInternetSession session;
    CStdioFile* pFile = session.OpenURL(strUrl, 1, INTERNET_FLAG_TRANSFER_BINARY);
    if (pFile == NULL)
        return sNewUrl;

    // OpenURL hands back a plain CStdioFile for file:// URLs, so verify the
    // dynamic type before calling the CInternetFile-only QueryOption.
    CInternetFile* pInetFile = DYNAMIC_DOWNCAST(CInternetFile, pFile);
    if (pInetFile != NULL)
    {
        // 2084 is the value of wininet.h's INTERNET_MAX_URL_LENGTH; MAX_PATH
        // (260) was too small for many real-world URLs and made the query fail.
        const DWORD kUrlBufLen = 2084;
        char szNewUrl[kUrlBufLen] = { 0 };
        DWORD dwLen = kUrlBufLen;
        if (pInetFile->QueryOption(INTERNET_OPTION_URL, (LPVOID)szNewUrl, &dwLen))
        {
            sNewUrl = szNewUrl;
            // Keep everything before the last '/'; no '/' at all yields "".
            const int nLastSlash = sNewUrl.ReverseFind('/');
            if (nLastSlash >= 0)
                sNewUrl = sNewUrl.Left(nLastSlash);
            else
                sNewUrl.Empty();
        }
    }

    // The object returned by OpenURL is owned by the caller: close and free it.
    pFile->Close();
    delete pFile;

    return sNewUrl;
}

BOOL AnalyzeUrl(CString strUrl, CString& strTitle, CString& strKeywords)
{ 
    ASSERT(strUrl != "");
    TRACE("AccessThreadId=%d\n", GetCurrentThreadId());

    CInternetSession session ; 
    CHttpFile * pHttpFile = NULL ; 
    session.SetOption(INTERNET_OPTION_CONNECT_TIMEOUT, 3000);      // 5秒的连接超时  
    session.SetOption(INTERNET_OPTION_SEND_TIMEOUT, 1000);          // 1秒的发送超时  
    session.SetOption(INTERNET_OPTION_RECEIVE_TIMEOUT, 5000);       // 7秒的接收超时  
    session.SetOption(INTERNET_OPTION_DATA_SEND_TIMEOUT, 1000);     // 1秒的发送超时  
    session.SetOption(INTERNET_OPTION_DATA_RECEIVE_TIMEOUT, 5000);  // 5秒的接收超时  
    session.SetOption(INTERNET_OPTION_CONNECT_RETRIES, 1);          // 1次重试
goto_redirect:
    try
    {
        pHttpFile = (CHttpFile*)session.OpenURL(strUrl);
        if ( pHttpFile )
        {
            DWORD dwStateCode;
            pHttpFile->QueryInfoStatusCode(dwStateCode); 
            if(dwStateCode == HTTP_STATUS_OK || dwStateCode == HTTP_STATUS_REDIRECT || dwStateCode == HTTP_STATUS_REDIRECT_METHOD || dwStateCode == HTTP_STATUS_MOVED || dwStateCode == HTTP_STATUS_USE_PROXY ) 
            {
                if (dwStateCode != HTTP_STATUS_OK)
                {
                    if ( dwStateCode == HTTP_STATUS_USE_PROXY )
                    {
//                          INTERNET_PROXY_INFO proxyinfo;  
//                          proxyinfo.dwAccessType = INTERNET_OPEN_TYPE_PROXY;  
//                          proxyinfo.lpszProxy ="127.0.0.1:8080";  
//                          proxyinfo.lpszProxyBypass = NULL;  
//                          session.SetOption(INTERNET_OPTION_PROXY,(LPVOID)&proxyinfo,  sizeof(INTERNET_PROXY_INFO)); 
                    }
                    else
                    {
                        strUrl = GetRedirectUrl(strUrl);
                    }

                    if(pHttpFile != NULL) 
                    { 
                        pHttpFile->Close(); 
                        delete pHttpFile;
                        pHttpFile = NULL;
                    }
                    session.Close();
                    goto goto_redirect;
                }

                CString strRespond;
                CString strContext;
                
//                 /*设置请求相关参数*/  
//                 pHttpFile->AddRequestHeaders(_T("Accept: */*,application/json"));
//                 pHttpFile->AddRequestHeaders(_T("Accept-Charset:UTF8"));  
//                 pHttpFile->AddRequestHeaders(_T("Accept-Language: zh-cn;q=0.8,en;q=0.6,ja;q=0.4"));  
//                 pHttpFile->AddRequestHeaders(_T("Content-Type:application/json"));

                UINT CodePage = CP_UTF8;
                while(pHttpFile->ReadString(strContext))   //读取网页数据 
                {
                    //先判断编码格式
                    if (-1!=strContext.Find(_T("charset=gbk")))                    
                        CodePage = CP_ACP;
                    else if (-1!=strContext.Find(_T("charset=GBK")))
                        CodePage = CP_ACP;
                    else if (-1!=strContext.Find(_T("charset=gb2312")))
                        CodePage = CP_ACP;
                    else if (-1!=strContext.Find(_T("charset=GB2312")))
                        CodePage = CP_ACP;
                    else if (-1!=strContext.Find(_T("charset=utf-8")))
                        CodePage = CP_UTF8;
                    else if (-1!=strContext.Find(_T("charset=UTF-8")))
                        CodePage = CP_UTF8;

                    // -->unicode
                    char *utf8 = strContext.GetBuffer(strContext.GetLength());
                    int nBufferSize = MultiByteToWideChar(CodePage, 0, utf8, -1, NULL, 0);
                    wchar_t *unicode = new wchar_t[nBufferSize + 1];
                    MultiByteToWideChar(CodePage, 0, utf8, -1 , unicode, nBufferSize);

                    // unicode->gb2312
                    nBufferSize = WideCharToMultiByte(CP_ACP, 0, unicode, -1, NULL, 0, NULL, NULL);
                    char *gb2312 = new char[nBufferSize + 1];
                    WideCharToMultiByte(CP_ACP, 0, unicode, -1, gb2312, nBufferSize, NULL, NULL);

                    strRespond += gb2312;
                    delete[] unicode;
                    delete[] gb2312; 
                }

                strRespond = strRespond.MakeLower();
                int nTitlePos = strRespond.Find(_T("<title>")) + strlen(_T("<title>"));
                int nTitleEnd = strRespond.Find(_T("</title>"), nTitlePos);
                if ( nTitlePos>strlen(_T("<title>")) && nTitleEnd>nTitlePos )
                    strTitle = strRespond.Mid(nTitlePos, nTitleEnd-nTitlePos);
       
                int nKeywordsPos = strRespond.Find(_T("<meta name=\"keywords\" content=\"")) + strlen(_T("<meta name=\"keywords\" content=\""));
                int nKeywordsEnd = strRespond.Find(_T("/>"), nKeywordsPos);
                if ( nKeywordsPos>strlen(_T("<meta name=\"keywords\" content=\"")) && nKeywordsEnd>nKeywordsPos )
                {
                    strKeywords = strRespond.Mid(nKeywordsPos, nKeywordsEnd-nKeywordsPos);
                    int pos = strKeywords.ReverseFind(_T('\"'));
                    strKeywords = strKeywords.Left(pos);
                }

                /*
                <html>
                    <head>
                        <script>
                            location.replace(location.href.replace("https://","http://"));
                        </script>
                    </head>
                    <body>
                        <noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
                    </body>
                </html>
                */
                if (strTitle.IsEmpty() && strKeywords.IsEmpty() )
                {
                    // 可能重定向了
                    int nBodyPos = strRespond.Find(_T("<body>")) + strlen(_T("<body>"));
                    int nBodyEnd = strRespond.Find(_T("</body>"), nBodyPos);
                    if ( nBodyEnd>strlen(_T("<body>")) && nBodyEnd>nBodyPos )
                    {
                        CString strBody = strRespond.Mid(nBodyPos, nBodyEnd-nBodyPos);
                        if ( strBody.Find(_T("http-equiv=\"refresh\"")) > 0 )
                        {
                            int nUrlPos = strBody.Find(_T("url=")) + strlen(_T("url="));
                            int nUrlEnd = strBody.Find(_T("\""), nUrlPos);
                            if (nUrlEnd>nUrlPos)
                            {
                                strUrl = strBody.Mid(nUrlPos, nUrlEnd-nUrlPos);
                                if(pHttpFile != NULL) 
                                { 
                                    pHttpFile->Close(); 
                                    delete pHttpFile;
                                    pHttpFile = NULL;
                                }
                                session.Close();
                                goto goto_redirect;
                            }
                        }
                    }
                }

                if(pHttpFile != NULL) 
                { 
                    pHttpFile->Close(); 
                    delete pHttpFile; 
                    pHttpFile = NULL;
                }
                session.Close();
                return TRUE;
             }
        }
        
        if(pHttpFile != NULL) 
        { 
            pHttpFile->Close(); 
            delete pHttpFile; 
            pHttpFile = NULL;
        }
        session.Close();
    }
    catch(CInternetException* e) 
    { 
        TCHAR tszErrString[256]; 
        e->GetErrorMessage(tszErrString, sizeof(tszErrString)); 
        TRACE(_T("AnalyzeUrl fail. URL: %s, Error: %s"), strUrl, tszErrString); 
        e->Delete();
        if(pHttpFile != NULL) 
        { 
            pHttpFile->Close(); 
            delete pHttpFile; 
            pHttpFile = NULL;
        }
        session.Close();
    } 
    catch(...) 
    { 
        if(pHttpFile != NULL) 
        { 
            pHttpFile->Close(); 
            delete pHttpFile; 
            pHttpFile = NULL;
        }
        session.Close();
    } 
    
    if(pHttpFile != NULL) 
    { 
        pHttpFile->Close(); 
        delete pHttpFile; 
        pHttpFile = NULL;
    }
    return FALSE;
}

调用函数AnalyzeUrl结果如下:

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值