/**
 * Downloads a web page and extracts list data from it according to a rule table.
 *
 * The response is read line by line. Each line is decoded from code page BM to
 * wide characters and then to a CStringA for matching. Nothing is extracted
 * until a line containing the page-start marker has been seen. Reading stops at
 * the first line containing the page-end marker (that line itself is not
 * processed) or when ReadString reports no more data.
 *
 * Rule table layout (only the cells listed here are read):
 *   arr[0][1]           page-start marker
 *   arr[1][1]           page-end marker
 *   arr[2][1]           item-start marker
 *   arr[3][1]           item-end marker; "" means every item sits on one line
 *   arr[4..length-1]    extraction rules { op, arg1, arg2, flag }
 *       op "截取"  -> subCString(text, arg1, arg2)
 *                     (presumably the text between arg1 and arg2 - confirm in subCString)
 *       op "取值"  -> the working text unchanged
 *       op "替换"  -> replace_all_distinct(text, arg1, arg2)
 *       op "转义"  -> replaceHTML(text)
 *       flag "1"   -> the result becomes the working text for the next rule
 *       flag "0"   -> the result is appended to the output row and the working
 *                     text is reset to the whole item text
 *       any other flag -> the rule is skipped
 *
 * Item text:
 *   - single-line mode: the line that contains the item-start marker;
 *   - multi-line mode: every line from the first item-start line onwards is
 *     accumulated, and a row is emitted as soon as a line containing the
 *     item-end marker has been appended. The accumulator is then cleared. The
 *     line directly after an item-end line is not accumulated unless it
 *     contains the item-start marker (legacy behaviour, kept so existing rule
 *     tables produce the same rows).
 *
 * @param url     address of the page to download
 * @param arr     rule table as described above
 * @param length  number of rows in arr; must be at least 4
 * @param BM      source code page handed to MultiByteToWideChar, e.g. CP_UTF8 or CP_ACP
 * @return one inner vector per extracted item, holding one CString per
 *         flag-"0" rule in rule order; empty if arr is NULL, length < 4,
 *         OpenURL returns NULL, or nothing matched.
 *
 * Exceptions raised by OpenURL or ReadString are not caught here. Exceptions
 * raised while a single line is being processed are reported with a message
 * box and that line is skipped.
 *
 * Example:
 *   string testUrl = "http://zhannei.baidu.com/cse/search?q=%E7%B3%BB%E7%BB%9F&p=1&s=7845455592055299828&nsid=";
 *   const int ruleCount = 11;
 *   string rules[ruleCount][4] = {
 *     { "home", "class=\"result-list\"" },                       // page-start marker
 *     { "end", "id=\"footer\"" },                                // page-end marker
 *     { "itmeHome", "class=\"result-item result-game-item\"" },  // item-start marker
 *     { "itmeEnd", "preBold\">更新时间" },                        // item-end marker ("" = single-line items)
 *     { "截取", "cpos=\"title\" href=\"", "\" title=\"", "0" },   // column 0: url
 *     { "截取", "title=\"", "\" class=\"result-game-item-title-link\"", "0" }, // column 1: name
 *     { "截取", "class=\"result-game-item-desc\">", "<div class=\"result-game-item-info\"", "1" },
 *     { "转义", "", "", "0" },                                    // column 2: description
 *     { "截取", "preBold\">作者:</span>", "<span class=\"result-game-item-info-tag-title preBold\">类型:", "1" },
 *     { "截取", "<span>", "</span>", "1" },
 *     { "转义", "", "", "0" }                                     // column 3: author
 *   };
 *   vector<vector<CString>> rows = util->html(testUrl.c_str(), rules, ruleCount, CP_UTF8);
 */
vector<vector<CString>> FileUtil::html(CString url, string arr[][4], int length, UINT BM){
	vector<vector<CString>> arrVector; // one row per extracted item

	// Rows 0-3 (the four markers) are read unconditionally below.
	if (arr == NULL || length < 4){
		return arrVector;
	}

	CInternetSession sess(_TEXT(""));
	CHttpFile* file = (CHttpFile*)sess.OpenURL(url);
	if (file == NULL){
		// OpenURL can hand back NULL instead of a file object; nothing to read.
		return arrVector;
	}

	char tmp[61440] = { 0 };                          // raw bytes of the current line
	const bool singleLineItems = (arr[3][1] == "");   // empty item-end marker => one item per line
	bool started = false;       // page-start marker seen
	bool collecting = false;    // multi-line mode: first item-start marker seen
	bool skipNextLine = false;  // multi-line mode: previous line closed an item
	bool reachedEnd = false;    // page-end marker seen
	CString itmeValue;          // multi-line mode: text accumulated for the current item

	// ReadString copies raw bytes into the buffer, hence the cast of a char array.
	while (!reachedEnd && file->ReadString((LPTSTR)tmp, 30720))
	{
		try{
			// First call sizes the buffer (in characters, including the terminator).
			const int nBufferSize = MultiByteToWideChar(BM, 0, tmp, -1, NULL, 0);
			if (nBufferSize <= 0){
				continue; // line cannot be decoded with this code page
			}
			// vector owns the buffer so it is released even if a helper throws.
			vector<wchar_t> wide(static_cast<size_t>(nBufferSize));
			// The last argument is a character count, not a byte count.
			if (MultiByteToWideChar(BM, 0, tmp, -1, &wide[0], nBufferSize) == 0){
				continue;
			}
			const wchar_t* pBuffer = &wide[0];
			const CStringA line(pBuffer); // ANSI form used for all marker checks

			// The page-end marker is checked first and wins over everything else.
			if (checkIsStr((LPCSTR)line, arr[1][1])){
				reachedEnd = true;
				continue;
			}
			if (checkIsStr((LPCSTR)line, arr[0][1])){
				started = true;
			}
			if (!started){
				continue;
			}

			// Work out whether this line completes an item and what its text is.
			bool itemReady = false;
			CString itemText;
			if (singleLineItems){
				if (checkIsStr((LPCSTR)line, arr[2][1])){
					itemText = (LPCSTR)line;
					itemReady = true;
				}
			}
			else{
				const bool isItemStart = checkIsStr((LPCSTR)line, arr[2][1]) ? true : false;
				if (isItemStart){
					collecting = true;
				}
				if (collecting){
					// The line after an item end is dropped (legacy behaviour),
					// but never when it opens the next item.
					if (!skipNextLine || isItemStart){
						itmeValue += pBuffer;
					}
					skipNextLine = false;
					if (checkIsStr((LPCSTR)line, arr[3][1])){
						// Emit right away so the item cannot be lost or merged
						// when the page ends or the next item starts on the
						// following line.
						itemText = (LPCSTR)(CStringA)itmeValue;
						itmeValue = _T("");
						skipNextLine = true;
						itemReady = true;
					}
				}
			}

			if (itemReady){
				vector<CString> vect;        // columns of this item
				CString linshi = itemText;   // working text the rules operate on
				for (int i = 4; i < length; i++){
					const string& op = arr[i][0];
					const string& flag = arr[i][3];
					const bool chain = (flag == "1"); // result feeds the next rule
					const bool keep = (flag == "0");  // result becomes an output column
					if (!chain && !keep){
						continue;
					}

					CString value;
					bool known = true;
					if ("截取" == op){
						value = subCString((LPCSTR)(CStringA)linshi, arr[i][1].c_str(), arr[i][2].c_str());
					}
					else if ("取值" == op){
						value = linshi;
					}
					else if ("替换" == op){
						value = replace_all_distinct((string)(CStringA)linshi, arr[i][1], arr[i][2]).c_str();
					}
					else if ("转义" == op){
						value = replaceHTML(linshi);
					}
					else{
						known = false; // unrecognised op: produces nothing
					}

					if (known){
						if (chain){
							linshi = value;
						}
						else{
							vect.push_back(value);
						}
					}
					// A flag-"0" rule closes a rule chain: start again from the full item text.
					if (keep){
						linshi = itemText;
					}
				}
				arrVector.push_back(vect);
			}
		}
		catch (...){
			AfxMessageBox((LPCSTR)(CStringA)"存在异常数据!");
		}
	}
	file->Close();
	delete file;
	file = NULL;
	return arrVector;
}
// 这是一个根据自定义规则数组抓取网页内容的工具函数,可以在此基础上继续完善。目前执行效率并不高,有兴趣的大神可以帮忙完善~
// 完整源码在这个工具类里面:http://download.csdn.net/download/jkl012789/9986841