#include <afxinet.h>
#include <string>
using namespace std;
// Opens strUrl and asks the opened file for its INTERNET_OPTION_URL value,
// which is presumably the address the request ended up at after redirection
// (or the original address when no redirect happened).
//
// Parameters:
//   strUrl - the URL to open.
// Returns:
//   The reported URL with its last path segment removed, i.e. cut just before
//   the final '/'. When the only slashes are those of the "://" separator
//   (no path at all) the URL is returned whole instead of being cut to
//   "scheme:/". An empty string is returned when the URL does not open as an
//   internet file, when the option query fails, or when the URL has no '/'.
// Notes:
//   Nothing is caught here; whatever OpenURL raises reaches the caller.
CString GetRedirectUrl(CString strUrl)
{
    CInternetSession session;
    CStdioFile* pFile = session.OpenURL(strUrl, 1, INTERNET_FLAG_TRANSFER_BINARY);

    // QueryOption is only available on internet files; a checked downcast
    // avoids calling it on a plain local file object.
    CInternetFile* pInetFile = DYNAMIC_DOWNCAST(CInternetFile, pFile);

    // Sized for the longest URL WinINet handles rather than MAX_PATH, and
    // TCHAR so the buffer matches the build's character type.
    TCHAR szNewUrl[INTERNET_MAX_URL_LENGTH] = { 0 };
    BOOL bResult = FALSE;
    if (pInetFile != NULL)
    {
        // One element is held back so the buffer always stays terminated.
        DWORD dwLen = INTERNET_MAX_URL_LENGTH - 1;
        bResult = pInetFile->QueryOption(INTERNET_OPTION_URL, szNewUrl, &dwLen);
    }

    // The object returned by OpenURL is owned by the caller.
    if (pFile != NULL)
    {
        pFile->Close();
        delete pFile;
        pFile = NULL;
    }

    if (!bResult)
        return CString();

    CString sNewUrl(szNewUrl);
    const int nLastSlash = sNewUrl.ReverseFind(_T('/'));
    if (nLastSlash < 0)
        return CString();

    // "http://host" has no path segment to strip; cutting at the last slash
    // would leave the meaningless "http:/".
    const int nScheme = sNewUrl.Find(_T("://"));
    if (nScheme >= 0 && nLastSlash <= nScheme + 2)
        return sNewUrl;

    return sNewUrl.Left(nLastSlash);
}
BOOL AnalyzeUrl(CString strUrl, CString& strTitle, CString& strKeywords)
{
ASSERT(strUrl != "");
TRACE("AccessThreadId=%d\n", GetCurrentThreadId());
CInternetSession session ;
CHttpFile * pHttpFile = NULL ;
session.SetOption(INTERNET_OPTION_CONNECT_TIMEOUT, 3000); // 5秒的连接超时
session.SetOption(INTERNET_OPTION_SEND_TIMEOUT, 1000); // 1秒的发送超时
session.SetOption(INTERNET_OPTION_RECEIVE_TIMEOUT, 5000); // 7秒的接收超时
session.SetOption(INTERNET_OPTION_DATA_SEND_TIMEOUT, 1000); // 1秒的发送超时
session.SetOption(INTERNET_OPTION_DATA_RECEIVE_TIMEOUT, 5000); // 5秒的接收超时
session.SetOption(INTERNET_OPTION_CONNECT_RETRIES, 1); // 1次重试
goto_redirect:
try
{
pHttpFile = (CHttpFile*)session.OpenURL(strUrl);
if ( pHttpFile )
{
DWORD dwStateCode;
pHttpFile->QueryInfoStatusCode(dwStateCode);
if(dwStateCode == HTTP_STATUS_OK || dwStateCode == HTTP_STATUS_REDIRECT || dwStateCode == HTTP_STATUS_REDIRECT_METHOD || dwStateCode == HTTP_STATUS_MOVED || dwStateCode == HTTP_STATUS_USE_PROXY )
{
if (dwStateCode != HTTP_STATUS_OK)
{
if ( dwStateCode == HTTP_STATUS_USE_PROXY )
{
// INTERNET_PROXY_INFO proxyinfo;
// proxyinfo.dwAccessType = INTERNET_OPEN_TYPE_PROXY;
// proxyinfo.lpszProxy ="127.0.0.1:8080";
// proxyinfo.lpszProxyBypass = NULL;
// session.SetOption(INTERNET_OPTION_PROXY,(LPVOID)&proxyinfo, sizeof(INTERNET_PROXY_INFO));
}
else
{
strUrl = GetRedirectUrl(strUrl);
}
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
session.Close();
goto goto_redirect;
}
CString strRespond;
CString strContext;
// /*设置请求相关参数*/
// pHttpFile->AddRequestHeaders(_T("Accept: */*,application/json"));
// pHttpFile->AddRequestHeaders(_T("Accept-Charset:UTF8"));
// pHttpFile->AddRequestHeaders(_T("Accept-Language: zh-cn;q=0.8,en;q=0.6,ja;q=0.4"));
// pHttpFile->AddRequestHeaders(_T("Content-Type:application/json"));
UINT CodePage = CP_UTF8;
while(pHttpFile->ReadString(strContext)) //读取网页数据
{
//先判断编码格式
if (-1!=strContext.Find(_T("charset=gbk")))
CodePage = CP_ACP;
else if (-1!=strContext.Find(_T("charset=GBK")))
CodePage = CP_ACP;
else if (-1!=strContext.Find(_T("charset=gb2312")))
CodePage = CP_ACP;
else if (-1!=strContext.Find(_T("charset=GB2312")))
CodePage = CP_ACP;
else if (-1!=strContext.Find(_T("charset=utf-8")))
CodePage = CP_UTF8;
else if (-1!=strContext.Find(_T("charset=UTF-8")))
CodePage = CP_UTF8;
// -->unicode
char *utf8 = strContext.GetBuffer(strContext.GetLength());
int nBufferSize = MultiByteToWideChar(CodePage, 0, utf8, -1, NULL, 0);
wchar_t *unicode = new wchar_t[nBufferSize + 1];
MultiByteToWideChar(CodePage, 0, utf8, -1 , unicode, nBufferSize);
// unicode->gb2312
nBufferSize = WideCharToMultiByte(CP_ACP, 0, unicode, -1, NULL, 0, NULL, NULL);
char *gb2312 = new char[nBufferSize + 1];
WideCharToMultiByte(CP_ACP, 0, unicode, -1, gb2312, nBufferSize, NULL, NULL);
strRespond += gb2312;
delete[] unicode;
delete[] gb2312;
}
strRespond = strRespond.MakeLower();
int nTitlePos = strRespond.Find(_T("<title>")) + strlen(_T("<title>"));
int nTitleEnd = strRespond.Find(_T("</title>"), nTitlePos);
if ( nTitlePos>strlen(_T("<title>")) && nTitleEnd>nTitlePos )
strTitle = strRespond.Mid(nTitlePos, nTitleEnd-nTitlePos);
int nKeywordsPos = strRespond.Find(_T("<meta name=\"keywords\" content=\"")) + strlen(_T("<meta name=\"keywords\" content=\""));
int nKeywordsEnd = strRespond.Find(_T("/>"), nKeywordsPos);
if ( nKeywordsPos>strlen(_T("<meta name=\"keywords\" content=\"")) && nKeywordsEnd>nKeywordsPos )
{
strKeywords = strRespond.Mid(nKeywordsPos, nKeywordsEnd-nKeywordsPos);
int pos = strKeywords.ReverseFind(_T('\"'));
strKeywords = strKeywords.Left(pos);
}
/*
<html>
<head>
<script>
location.replace(location.href.replace("https://","http://"));
</script>
</head>
<body>
<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
*/
if (strTitle.IsEmpty() && strKeywords.IsEmpty() )
{
// 可能重定向了
int nBodyPos = strRespond.Find(_T("<body>")) + strlen(_T("<body>"));
int nBodyEnd = strRespond.Find(_T("</body>"), nBodyPos);
if ( nBodyEnd>strlen(_T("<body>")) && nBodyEnd>nBodyPos )
{
CString strBody = strRespond.Mid(nBodyPos, nBodyEnd-nBodyPos);
if ( strBody.Find(_T("http-equiv=\"refresh\"")) > 0 )
{
int nUrlPos = strBody.Find(_T("url=")) + strlen(_T("url="));
int nUrlEnd = strBody.Find(_T("\""), nUrlPos);
if (nUrlEnd>nUrlPos)
{
strUrl = strBody.Mid(nUrlPos, nUrlEnd-nUrlPos);
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
session.Close();
goto goto_redirect;
}
}
}
}
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
session.Close();
return TRUE;
}
}
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
session.Close();
}
catch(CInternetException* e)
{
TCHAR tszErrString[256];
e->GetErrorMessage(tszErrString, sizeof(tszErrString));
TRACE(_T("AnalyzeUrl fail. URL: %s, Error: %s"), strUrl, tszErrString);
e->Delete();
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
session.Close();
}
catch(...)
{
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
session.Close();
}
if(pHttpFile != NULL)
{
pHttpFile->Close();
delete pHttpFile;
pHttpFile = NULL;
}
return FALSE;
}
调用函数AnalyzeUrl结果如下: