在之前的几篇博文中,曾经设计并实现了单线程爬虫和多线程爬虫。但私下想了想,之前实现多线程爬虫时,所有线程都是自己直接调用底层API创建的,这样的调用方式一般来说并不理想。鉴于最近刚学习了boost::thread相关的内容,索性将之前的多线程爬虫程序改写为boost::thread版本。好了,废话不多说,下面是头文件(测试程序中以curlTest.h引入)的代码:
#ifndef __HTTP_CURL__H
#define __HTTP_CURL__H
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <iostream>
#include <set>
#include <string>
#include <boost/bind.hpp>
#include <boost/function.hpp>
#include <boost/smart_ptr.hpp>
#include <boost/thread.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include <curl/curl.h>
using namespace std;
using namespace boost;
// 10 * 1024. The expansion is parenthesized so it stays a single operand when
// used inside a larger expression (e.g. `x / MAX_BUFFERSIZE`).
// Not referenced by the code shown in this file.
#define MAX_BUFFERSIZE (1024*10)
// Presumably the intended upper bound on crawler threads; not referenced by
// the code shown in this file.
#define MAX_THREAD 10
// Running number used by Spider::start to name saved pages "<n>.html".
// `static` gives it internal linkage, so every translation unit that includes
// this header gets its own counter.
static int fileIndex = 1;
// Locked by Spider::insertUrl and Spider::getUrl around their accesses to
// urlSet / finishUrlSet.
boost::mutex mut;
// URLs that are waiting to be fetched.
std::set<std::string> urlSet;
// URLs that have already been handed out by Spider::getUrl.
std::set<std::string> finishUrlSet;
typedef std::set<std::string>::iterator urlSet_Iter;
// Seeds the crawl with `path`. Wrapped in do/while(0) so that
// `BEGIN_SPIDER(x);` behaves as one statement, including inside if/else.
// The macro does not lock `mut`, so use it before worker threads are started.
#define BEGIN_SPIDER(path) do { urlSet.insert(path); } while (0)
class HttpCurl
{
public:
HttpCurl()
{
conn = NULL;
}
~HttpCurl()
{
curl_easy_cleanup(conn);
}
static bool HttpCurlInit()
{
urlSet.clear();
finishUrlSet.clear();
CURLcode code;
code = curl_global_init(CURL_GLOBAL_DEFAULT);
if(CURLE_OK != code)
{
printf("Failed to global init default\n");
return false;
}
return true;
}
bool InitCurlObject(string& context)
{
CURLcode code;
conn = curl_easy_init();
if(NULL == conn)
{
printf("Failed to create CURL\n");
return false;
}
if(!setWriteFunc())
{
printf("Failed to set write\n");
return false;
}
if(!setWriteBuff(context))
{
printf("Failed to set buffer\n");
return false;
}
return true;
}
bool setWriteFunc()
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
if(CURLE_OK != code)
{
printf("Failed to set write\n");
return false;
}
return true;
}
bool setWriteBuff(string& context)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
if(CURLE_OK != code)
{
printf("Failed to set write data\n");
return false;
}
return true;
}
bool setUrl(string& url)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
if(CURLE_OK != code)
{
printf("Failed to set URL\n");
return false;
}
return true;
}
bool getHttpResponse()
{
CURLcode code;
assert(conn);
code = curl_easy_perform(conn);
if(CURLE_OK != code)
{
printf("Failed to get response\n");
return false;
}
return true;
}
static long write(void* data,int size,int nmemb,string& context)
{
long sizes = size*nmemb;
std::string temp((char*)data,sizes);
context += temp;
return sizes;
}
bool save(const string& context,string filename)
{
CURLcode code;
int retcode = 0;
code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
if((CURLE_OK == code)&& retcode ==200)
{
int length = strlen(context.c_str());
FILE* file = fopen(filename.c_str(),"w+");
fseek(file,0,SEEK_SET);
fwrite(context.c_str(),1,length,file);
fclose(file);
return true;
}
return false;
}
private:
CURL* conn;
};
class Spider
{
public:
Spider(shared_ptr<HttpCurl>& cul):httpCurl(cul)
{
httpCurlUrlSet.clear();
context.clear();
initCurl(httpCurl,context);
}
~Spider(){}
bool initCurl(shared_ptr<HttpCurl>& httpCurl,string& context)
{
return httpCurl->InitCurlObject(context);
}
void parseUrl(const string& context)
{
const string tag = "href";
const string tag2 = "\"";
const string tag3 = "http";
string::size_type tempBegin,tempEnd,iter,httpIter;
tempBegin = tempEnd = 0;
iter= context.find(tag);
while(iter != string::npos)
{
tempBegin = context.find(tag2,iter);
if(tempBegin != string::npos)
{
++tempBegin;
tempEnd = context.find(tag2,tempBegin);
}
if(tempEnd != string::npos && tempEnd > tempBegin)
{
string url;
url.assign(context,tempBegin,(tempEnd-tempBegin));
httpIter = url.find(tag3);
if(httpIter != string::npos)
httpCurlUrlSet.insert(url);
}
iter = context.find(tag,tempEnd);
}
printf("httpCurlUrlSet.size():%d\n",httpCurlUrlSet.size());
}
bool write(const string& context,const string& filename)
{
return httpCurl->save(context,filename);
}
void start(string url,string& context)
{
char filename[64];
memset(filename,0,sizeof(filename));
sprintf(filename,"%d.html",fileIndex++);
httpCurl->setUrl(url);
if(httpCurl->getHttpResponse())
{
parseUrl(context);
write(context,filename);
insertUrl();
}
}
void insertUrl()
{
boost::unique_lock<boost::mutex> lock(mut);
for( urlSet_Iter iter = httpCurlUrlSet.begin();iter != httpCurlUrlSet.end();++iter)
urlSet.insert(*iter);
httpCurlUrlSet.clear();
}
void displayUrl()
{
urlSet_Iter iter = urlSet.begin();
for(; iter != urlSet.end();++iter)
{
cout<<*iter<<endl;
}
}
string& getContext()
{
return context;
}
string getUrl()
{
urlSet_Iter iter;
string url;
boost::unique_lock<boost::mutex> lock(mut);
for(iter = urlSet.begin();iter != urlSet.end();++iter)
{
if(finishUrlSet.find(*iter) != finishUrlSet.end())
continue;
break;
}
if(iter != urlSet.end())
{
url = *iter;
urlSet.erase(iter);
finishUrlSet.insert(url);
return url;
}
return "";
}
private:
shared_ptr<HttpCurl> httpCurl;
std::set<std::string> httpCurlUrlSet;
std::string context;
};
/**
 * Thread entry point: repeatedly takes a URL from the spider and crawls it.
 * The loop has no exit condition, so the thread runs until the process ends.
 *
 * @param spider worker to drive; must not be NULL and must outlive the thread.
 */
static void task(Spider* spider)
{
    assert(spider);
    for(;;)
    {
        const std::string url = spider->getUrl();
        if (url.empty())
        {
            // Nothing to fetch yet: back off briefly instead of spinning on
            // the shared mutex at full CPU.
            usleep(100 * 1000);
            continue;
        }
        printf("url=%s\n",url.c_str());
        spider->start(url,spider->getContext());
    }
}
#endif
测试程序:
#include "curlTest.h"
int main()
{
HttpCurl::HttpCurlInit();
BEGIN_SPIDER("www.baidu.com");
shared_ptr<HttpCurl> curl1(new HttpCurl());
Spider spider1(curl1);
boost::thread thr1(boost::bind(&task,&spider1));
shared_ptr<HttpCurl> curl2(new HttpCurl());
Spider spider2(curl2);
boost::thread thr2(boost::bind(&task,&spider2));
thr1.join();
thr2.join();
//sleep(100);
return 0;
}
测试结果:
url=http://anquan.baidu.com/bbs/thread-10353-1-1.html
httpCurlUrlSet.size():221
url=http://anquan.baidu.com/bbs/thread-10356-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82280&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82424&ptid=10356
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82455&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82507&ptid=10353
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82703&ptid=10353
httpCurlUrlSet.size():223
url=http://anquan.baidu.com/bbs/thread-10360-1-1.html
httpCurlUrlSet.size():365
url=http://anquan.baidu.com/bbs/thread-10365-1-1.html
httpCurlUrlSet.size():224
url=http://anquan.baidu.com/bbs/thread-10454-1-1.html
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82343&ptid=10365
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82848&ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82964&ptid=10454
httpCurlUrlSet.size():229
httpCurlUrlSet.size():366
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=82969&ptid=10454
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=83171&ptid=10454
总结
本篇博文主要是对前几篇博文中程序的修改,即把线程相关的部分改用boost::thread库来实现。为了保持程序的完整性,这里将所有的程序都贴了出来,方便阅读。设计思想很简单:为boost::thread线程提供相应的处理函数即可。在实现的过程中,起初是想使用重载operator()的形式,但是测试下来发现爬取不到任何东西,个人感觉可能是在注册爬取网页的缓存区时出了问题(一个可能的原因是:boost::thread会拷贝传入的函数对象,而libcurl中注册的写缓冲区仍然指向原对象的context成员,线程中解析的却是副本里的空字符串)。于是将其改为普通函数的形式,再显式地将其注册给线程,发现可行。总之,先找到能够解决问题的方案就行,在时间很紧的情况下不必太追究其中的部分细节,等有时间再回头看看这个问题。本博文到此结束,多谢。
如需转载,请注明出处,多谢。