libcurl多线程下载图像网页

//ImgDown.h

#ifndef _ImgDown_H_200102_
#define _ImgDown_H_200102_

#include <zlib.h>
#include <queue>
#include "Tse.h"
#include "Url.h"


#include <curl/curl.h>

// One download job: the image URL to fetch together with the page it was
// listed under (the preceding "Root:" line of the seed file).
struct IMGURL
{
	string rootpage;	// page the image link was found on
	string imglink;		// URL of the image itself
};
typedef IMGURL ImgLINK;

// I have had My heart broken. 2007.5.22.

using namespace std;

/**
 * Multi-threaded image downloader.
 *
 * DoImgDown() loads image URLs from the seed file, starts num_scanner worker
 * threads that each run fetch(), and waits for them to finish. Every worker
 * repeatedly takes one URL from the shared collection and hands it to
 * DownloadFile().
 *
 * The URL collection, the seed-file offset and the "finish" flag live in
 * file-scope globals of ImgDown.cpp, so they are shared by all instances.
 */
class CImgDown
{
public:
	string workDBNum;	// not referenced by the code visible in this file

	// Number of worker threads DoImgDown() starts; set it before calling DoImgDown().
	unsigned int num_scanner = 0;

	// Number of images successfully written to disk (updated under a mutex).
	unsigned int Pic = 0;//Downed pictures

	unsigned int myDownPicLMT = 0;	// not referenced by the code visible in this file

public:
	CImgDown();
	CImgDown(string);

	~CImgDown();

	// Run the whole download; returns 0 on success, non-zero on failure.
	int DoImgDown();

	// Download one image; oPic is the sequence number used in the output file name.
	void DownloadFile(unsigned long oPic,ImgLINK );

	// fetch the web pages. Each thread just execute this function.
	void fetch(void *arg);

	// add a parsed url into the collection
	int AddUrl(string);

private:
	// Only downloads strictly between these sizes (in bytes) are saved.
	const long MIN_IMG_LEN = 10000 ;
	const long MAX_IMG_LEN = 9000000;

	// Number of "tickets" registered per host when URLs are loaded.
	const int SPIDERS_EACH_SITE = 3 ;
	const int SPIDERS_NUM = 8 ;

	// Running count of URLs handed out to workers; used to number output files.
	unsigned long ordRec = 0;

	// Growable buffer filled by WriteMemoryCallback().
	struct MemoryStruct {
		char *memory;
		size_t size;
	};

private:
	// Fetch strUrl into a malloc()ed buffer; returns its size or (size_t)-1.
	size_t imgFetch(string strUrl, char **fileBuf );
	// libcurl write callback appending received bytes to a MemoryStruct.
	static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp);

};
#endif

//ImgDown.cpp

#include "ImgDown.h"


// Number of worker threads; DoImgDown() copies num_scanner into it.
int num_pthread ;

string seedImgUrlFile;	// seed URL file name

// Held by fetch() while it reads or modifies mmapImgUrls / mapHosts.
pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER;
// Held by DownloadFile() while it increments the downloaded-picture counter.
pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
// Not used by the code visible in this file.
pthread_mutex_t mutexNewUrlMD5 = PTHREAD_MUTEX_INITIALIZER;

// b_fOver: set to true by DoImgDown() once the worker threads have been created.
// finish:  set to true by AddUrl() when the seed file cannot be opened or has
//          been read to the end.
bool b_fOver,finish;
// Position in the seed file at which the next AddUrl() call resumes reading.
long nOffset;

// Pending downloads, keyed by the host of the root page they belong to.
multimap<string,ImgLINK > mmapImgUrls;
// Per-host counter; fetch() decrements it each time a worker picks that host
// and erases the entry when it reaches zero.
map<string,int> mapHosts;
// Element type used when inserting into mmapImgUrls.
typedef map<string,ImgLINK>::value_type mImgVal;

void* start(void *arg)
{
	( (CImgDown*)arg )->fetch(arg);
}

// Worker-thread loop. `arg` is the CImgDown instance (start() passes the same
// pointer it calls fetch() on); it is used to invoke DownloadFile().
//
// Each iteration takes one URL out of the shared collection while holding
// mutexCollection, releases the lock, and downloads that URL. The thread
// returns as soon as an iteration ends without having obtained a URL.
void CImgDown::fetch(void *arg)
{
	// `host` is the host this thread is currently serving; it persists across
	// iterations, so the thread keeps taking URLs of the same host while any remain.
	string strUrl,host="";
	ImgLINK urlPair;

	string	strGHost = "";
	// Sequence number of the URL taken; only assigned (and only read) when cat is true.
	unsigned long oPic;//Downed pictures

	long seedNum = 0 ;


	for(;;){

		seedNum++; // = 0

		pthread_mutex_lock(&mutexCollection);

		urlPair.imglink = "";

		// Becomes true once a URL has been taken from the collection.
		bool cat = false;
		while(1){

			// Nothing queued: load more from the seed file, or give up if it is exhausted.
			if(mmapImgUrls.empty()){
				if(!finish){
					AddUrl(seedImgUrlFile);
				}else break;
			}

			multimap<string,ImgLINK>::iterator it7=mmapImgUrls.find(host);
			if( it7 != mmapImgUrls.end() ){
				// get an URL Pair;
				urlPair = (*it7).second ;
				// remove it from the collection
				mmapImgUrls.erase( it7++ );
				cat = true;
				// ordRec numbers the URLs in the order they are handed out.
				ordRec++;
				oPic = ordRec;

				break;
			}else{

				// No URL left for the current host: pick another host.
				// If no hosts are registered, read more seed lines unless the
				// queue is already large (>= 3000) or the seed file is finished.
				if(mapHosts.empty()){
					if(mmapImgUrls.size() < 3000 && !finish){
						AddUrl(seedImgUrlFile);
					}else break;
				}//Add host.

//if(mapHosts.empty()) break;//Bug 2011.12.28. Found ! Add this line;

				// Take the first registered host and use up one of its tickets;
				// the entry is dropped when its counter reaches zero.
				map<string,int>::iterator it7=mapHosts.begin();
				if( it7 != mapHosts.end() ){// get an host;
					host = it7->first ;
					if(--it7->second==0)
					// remove it from the collection;
						mapHosts.erase(host);
				}else break;
			}
		}//while(1)

		pthread_mutex_unlock(&mutexCollection);//??

		if( cat ){
			if( strGHost != host ){
				strGHost = host;
			}

			// The download itself runs outside the collection lock.
			(( CImgDown* )arg)->DownloadFile(oPic,urlPair);
		} else break;

		usleep(1);
	}//_for
}

// libcurl write callback. `userp` points to the MemoryStruct being filled;
// the size*nmemb bytes at `contents` are appended to it and the buffer is kept
// NUL-terminated. Returns the number of bytes consumed, or 0 if the buffer
// could not be enlarged (the existing buffer is left untouched in that case).
size_t  CImgDown::WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
	struct MemoryStruct *chunk = (struct MemoryStruct *)userp;
	const size_t incoming = size * nmemb;

	// One extra byte for the trailing terminator.
	char *grown = (char *)realloc(chunk->memory, chunk->size + incoming + 1);
	if(grown != NULL) {
		memcpy(grown + chunk->size, contents, incoming);
		chunk->memory = grown;
		chunk->size += incoming;
		grown[chunk->size] = 0;
		return incoming;
	}

	/* out of memory! */
	printf("not enough memory (realloc returned NULL)\n");
	return 0;
}
 

/**
 * Download the resource at strUrl into a heap buffer using a libcurl easy handle.
 *
 * @param strUrl   URL to fetch.
 * @param fileBuf  Out: on success receives a malloc()ed, NUL-terminated buffer
 *                 holding the response body; the caller must free() it.
 *                 On failure it is set to NULL.
 * @return Number of bytes downloaded, or (size_t)-1 on any failure.
 *
 * The transfer is limited to 20 seconds. On failure the partially filled
 * buffer is released here (the original leaked it, because the caller only
 * ever sees *fileBuf).
 */
size_t CImgDown::imgFetch(string strUrl, char **fileBuf )
{
	*fileBuf = NULL;

	struct MemoryStruct DataChunk;
	DataChunk.memory = (char *)malloc(1);  /* will be grown as needed by the write callback */
	DataChunk.size = 0;    /* no data at this point */
	if(DataChunk.memory == NULL)
		return (size_t)-1;
	DataChunk.memory[0] = 0;  /* an empty body is still a valid C string */

	/* init the curl session */
	CURL *curl_handle = curl_easy_init();
	if(curl_handle == NULL){
		fprintf(stderr, "curl_easy_init() failed\n");
		free(DataChunk.memory);
		return (size_t)-1;
	}

	/* specify URL to get */
	curl_easy_setopt(curl_handle, CURLOPT_URL, strUrl.c_str());

	/* complete within 20 seconds */
	curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 20L);

	/* this runs in several threads: keep libcurl from using signals for timeouts */
	curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1L);

	/* collect the body in DataChunk */
	curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
	curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&DataChunk);

	/* some servers don't like requests that are made without a user-agent field, so we provide one */
	curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");

	/* get it! */
	const CURLcode res = curl_easy_perform(curl_handle);
	if(res != CURLE_OK)
		fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));

	/* cleanup curl stuff */
	curl_easy_cleanup(curl_handle);

	if(res != CURLE_OK){
		free(DataChunk.memory);
		return (size_t)-1;
	}

	*fileBuf = DataChunk.memory;
	return DataChunk.size;
}


void CImgDown::DownloadFile(unsigned long oPic,ImgLINK urlPair )
{
	string dom;
	string::size_type idx;

	if( (idx = urlPair.imglink.rfind(".")) != string::npos ){
		dom = urlPair.imglink.substr(idx);
	}else // impossible .
		cerr << "Error 1; imgDown.cpp " << urlPair.imglink << endl;


	if( dom!=".jpg" &&  dom!=".jpeg" && dom!=".bmp" )
		return;

	char	*downloaded_file = NULL;

	size_t file_length = imgFetch(urlPair.imglink, &downloaded_file );

	FILE *fp;


	if(file_length == -1){ // unreachable, skipped.

		if (downloaded_file)
		{
			free(downloaded_file); downloaded_file=NULL;
		}
		return;
	}

	if ( !downloaded_file){
		if (downloaded_file){
			free(downloaded_file); downloaded_file=NULL;
		}
		return;
	}

//sss============================================================================

//make name:

	char food[128];
	sprintf(food,"Img%ld%s",oPic,dom.c_str());//store at central;

	if( file_length > MIN_IMG_LEN && file_length < MAX_IMG_LEN){

		fp=fopen(food,"wb");

		if(!fp){
			cerr << "Error 7; imgDown.cpp can not open the file." << endl;

			if (downloaded_file)
			{
				free(downloaded_file); downloaded_file=NULL;
			}


			return ;
		}

		int ret=fwrite(downloaded_file,file_length,1,fp);

		if(ret!=1){
			cerr << "Error 8; imgDown.cpp can not write the pixel data." << endl;

			if (downloaded_file){
				free(downloaded_file); downloaded_file=NULL;
			}

			fclose(fp);
			return ;
		}

		if (downloaded_file){
			free(downloaded_file); downloaded_file=NULL;
		}

		fclose(fp);
//************************************************/


//cout << "Please See Imagure :" << food << endl;

	}else{
		if (downloaded_file){
			free(downloaded_file); downloaded_file=NULL;
		}

		return;
	}//e


	pthread_mutex_lock(&mutexDetect);


	/
	// Record img down infomation

	Pic++;//indeed down success;

	pthread_mutex_unlock(&mutexDetect);

	usleep(1);

	return;
}


// Construction/Destruction


// Default constructor: zeroes every counter of the instance (Pic and
// num_scanner were previously left indeterminate) and resets the shared
// file-scope state (seed-file offset and the b_fOver / finish flags).
// The seed file name is left unchanged.
CImgDown::CImgDown()
	: num_scanner(0), Pic(0), myDownPicLMT(0), ordRec(0)
{
	nOffset=0;

	b_fOver=false;
	finish=false;
}

// Constructs a downloader that reads its URLs from inFile. Zeroes every
// counter of the instance (Pic and num_scanner were previously left
// indeterminate) and resets the shared file-scope state (seed-file name and
// offset, and the b_fOver / finish flags).
CImgDown::CImgDown(string inFile)
	: num_scanner(0), Pic(0), myDownPicLMT(0), ordRec(0)
{
	seedImgUrlFile = inFile;

	nOffset=0;

	b_fOver=false;
	finish=false;
}

// Nothing to release: the destructor body is empty.
CImgDown::~CImgDown()
{
}

// Signal handler installed by DoImgDown(): reports termination on stderr and
// ends the process with status 1.
//
// Only async-signal-safe calls are used. The original used cerr and exit();
// exit() would also run the destructors of the global URL containers while
// worker threads may still be using them.
static void SigTerm(int x)
{
	(void)x;

	static const char msg[] = "Error 2; imgDown.cpp Terminated!\n";
	const ssize_t ignored = write(STDERR_FILENO, msg, sizeof(msg) - 1);
	(void)ignored;

	_exit(1);
}

int CImgDown::DoImgDown()
{

	curl_global_init(CURL_GLOBAL_ALL);


	pthread_t thrd; 

	//* set the signal function */
	signal(SIGTERM, SigTerm);
	signal(SIGKILL, SigTerm);
	signal(SIGINT, SigTerm);
	signal(SIGPIPE, SIG_IGN);
	signal(SIGCHLD,SIG_IGN);

//	signal(SIGABRT, SigTerm);

	// open the files for output

	if(AddUrl(seedImgUrlFile)) return 1;

	num_pthread = num_scanner;

	// Create thread ID structures. 
	pthread_t *tids = (pthread_t*)malloc(num_scanner * sizeof(pthread_t)); 

	if( tids == NULL){
		cerr << "Error 4; imgDown.cpp" << endl;
	}

	for(unsigned int i=0; i< num_scanner; i++){
		if( pthread_create( &tids[i], NULL, start, this))
			cerr << "Error 5; imgDown.cpp create threads error" << endl;
	}

	b_fOver = true;



	for (unsigned int i = 0; i < num_scanner; ++i){
		(void)pthread_join(tids[i], NULL);
	}


	curl_global_cleanup();

	return 0;
}

/**
 * Read the next group of lines from the seed file into the shared collections.
 *
 * Reading resumes at the global offset nOffset. A line containing "Root:"
 * names a root page: the text after the marker is parsed for its host, and
 * every following line is queued in mmapImgUrls as an image link of that
 * host; the host is also registered in mapHosts with SPIDERS_EACH_SITE
 * tickets. Reading stops at a "Root:" line once more than 1000 links have
 * been queued (that line is re-read by the next call) or at end of file, in
 * which case the global `finish` flag is set.
 *
 * Called by fetch() with mutexCollection held; it does no locking itself.
 *
 * @param InputFile  Path of the seed file.
 * @return 0 on success, 1 if the file cannot be opened (finish is set too).
 */
int CImgDown::AddUrl(string InputFile)
{
	// open the seed url file
	ifstream ifsSeed(InputFile.c_str());
	if (!ifsSeed){
		finish=true;
		return 1;
	}

	ifsSeed.seekg(nOffset);

	const string::size_type kRootTagLen = 5;	// length of "Root:"

	CUrl iUrl;
	string strUrl,host;
	ImgLINK urlPair;

	int i=0;
	for(;;){//each time read a group of lines.

		// Remember where this line starts so an unfinished group can be re-read.
		nOffset = ifsSeed.tellg();

		if ( !getline(ifsSeed,strUrl) ){
			finish=true;
			break;
		}

		const string::size_type idx = strUrl.find("Root:");
		if( idx != string::npos ){
			if( i> 1000) break;

			// Skip the marker wherever it was found; the original always cut at
			// column 5 even though find() accepts the marker anywhere in the line.
			strUrl = strUrl.substr(idx + kRootTagLen);
			urlPair.rootpage = strUrl;

			// NOTE(review): ParseUrlEx() returns false without touching m_sHost
			// for non-"http://" URLs, so `host` then keeps the previous root's
			// host — confirm whether that is intended.
			iUrl.ParseUrlEx(strUrl);//wrong is impossible.
			host = iUrl.m_sHost;

			continue;
		}

		urlPair.imglink = strUrl;

		// make sure limited threads crawling on a site ?
		mmapImgUrls.insert(mImgVal( host, urlPair));
		// insert() leaves an existing entry (and its remaining tickets) untouched.
		mapHosts.insert(pair<string,int>( host, SPIDERS_EACH_SITE));
		//Number of spide crawl on a website.

		i++;
	}//_for

	ifsSeed.close();
	return 0;
}//_add

//Url.cpp

#include <iostream>
#include <string>
#include <sys/socket.h>
#include <netdb.h>

#include "Tse.h"
#include "Url.h"

//* Is X "."?  */
#define DOTP(x) ((*(x) == '.') && (!*(x + 1)))
//* Is X ".."?  */
#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))


typedef map<string,string>::value_type valTypeCHL;

// One row of the supported-scheme table.
struct scheme_data
{
	// URL prefix identifying the scheme, e.g. "http://". Declared const because
	// the table initialises it from string literals (a plain char* is ill-formed
	// for that since C++11). NULL marks the end of the table.
	const char *leading_string;
	int default_port;	// port to use when the URL names none
	int enabled;		// non-zero if the scheme may be used
};

//* Supported schemes: */
// ParseScheme() casts the index of the matching row to enum url_scheme, so the
// rows must stay in the same order as the enumerators. The row with a NULL
// prefix terminates the table.
static struct scheme_data supported_schemes[] =
{
	{ "http://",  DEFAULT_HTTP_PORT,  1 },
	{ "ftp://",   DEFAULT_FTP_PORT,   1 },

	//* SCHEME_INVALID */
	{ NULL,       -1,                 0 }
};

// Determines the scheme of `url` and stores it in m_eScheme (nothing is
// returned). The URL prefix is compared case-insensitively with each entry of
// supported_schemes; the index of the first matching entry becomes the scheme
// if that entry is enabled. In every other case m_eScheme is SCHEME_INVALID.
void CUrl::ParseScheme (const char *url)
{
	for (int idx = 0; supported_schemes[idx].leading_string != NULL; ++idx) {
		const char *prefix = supported_schemes[idx].leading_string;

		if (strncasecmp (url, prefix, strlen (prefix)) != 0)
			continue;

		// First matching prefix decides, whether enabled or not.
		this->m_eScheme = supported_schemes[idx].enabled
			? (enum url_scheme) idx
			: SCHEME_INVALID;
		return;
	}

	this->m_eScheme = SCHEME_INVALID;
}

//************************************************************************
// *  Function name: ParseUrlEx
// *  Input argv:
// *  	-- strUrl: the URL to parse
// *  Return:
//   	true:  the URL starts with "http://"; m_sUrl, m_sHost and m_sPath
//   	       have been updated, and m_nPort too when the parsed port is > 0
//   	false: the scheme is not HTTP; only m_eScheme has been updated
// *  Function Description: break an URL into scheme, host, port and request
// *  			and store the result in the member variables.
// *  Be careful:	host and request are held in fixed-size buffers
// *  			(HOST_LEN and 256 bytes); longer parts are truncated.
//************************************************************************/
bool CUrl::ParseUrlEx(string strUrl)
{
	char protocol[10];
	char host[HOST_LEN];
	char request[256];
	int port = -1;

	memset( protocol, 0, sizeof(protocol) );
	memset( host, 0, sizeof(host) );
	memset( request, 0, sizeof(request) );

	this->ParseScheme(strUrl.c_str());
	if( this->m_eScheme != SCHEME_HTTP ){
		return false;
	}

	// Pass one byte less than each buffer's size: the last byte then keeps the
	// 0 written by memset above, so every buffer is a terminated C string even
	// when the corresponding URL part is longer than the buffer.
	ParseUrlEx(strUrl.c_str(),
			protocol, sizeof(protocol) - 1,
			host, sizeof(host) - 1,
			request, sizeof(request) - 1,
			&port);

	m_sUrl  = strUrl;
	m_sHost = host;
	m_sPath = request;

	if( port > 0 ){
		m_nPort = port;
	}

	return true;
}

// Copies src into dst (capacity `cap` bytes including the terminator),
// truncating if necessary and always leaving dst NUL-terminated.
static void CopyBounded(char *dst, int cap, const char *src)
{
	if( cap <= 0 ){
		return;
	}
	strncpy( dst, src, cap - 1 );
	dst[cap - 1] = 0;
}

//************************************************************************
//*  Function name: ParseUrlEx
//*  Input argv:
//*  	-- url: the URL to parse (NULL yields the defaults described below)
//*  	-- protocol / lprotocol: result buffer for the protocol and its size
//*  	-- host / lhost: result buffer for the host and its size
//*  	-- request / lrequest: result buffer for the request and its size
//*  	-- port: receives the port number
//*  Output argv:
//*  	-- protocol: text before the first ':', or "HTTP" if there is none
//*  	-- host: the run of valid host characters after an optional "//",
//*  	         with any ":port" suffix removed
//*  	-- request: everything in url after the host[:port] part
//*  	-- port: the number after ':' in the host part, otherwise 80
//*  Function Description: break an URL into scheme, host, port and request.
//*  			Results that do not fit are truncated; every result
//*  			buffer is always NUL-terminated.
//*  Be careful: every buffer must hold at least one byte.
//************************************************************************/
void CUrl::ParseUrlEx(const char *url,
		char *protocol, int lprotocol,
		char *host, int lhost,
		char *request, int lrequest,
		int *port)
{
	*protocol = *host = *request = 0;
	*port = 80;

	if( url == NULL ){
		return;
	}

	// Work on a private, writable copy: separators are overwritten with 0 below.
	const size_t len = strlen(url);
	std::vector<char> buf(url, url + len + 1);
	char *work = &buf[0];

	// find protocol if any
	char *ptr = strchr(work, ':');
	if( ptr != NULL ){
		*(ptr++) = 0;
		CopyBounded( protocol, lprotocol, work );
	} else {
		CopyBounded( protocol, lprotocol, "HTTP" );
		ptr = work;
	}

	// skip past opening /'s
	if( (*ptr=='/') && (*(ptr+1)=='/') )
		ptr+=2;

	// find host (may still carry a ":port" suffix, ':' being a valid host char)
	char *ptr2 = ptr;
	while( *ptr2 && IsValidHostChar(*ptr2) )
		ptr2++;
	*ptr2 = 0;
	CopyBounded( host, lhost, ptr );

	// find the request: taken from the untouched original at the same offset
	const char *pStr = url + (ptr2 - work);
	CopyBounded( request, lrequest, pStr );

	// find the port number, if any
	ptr = strchr( host, ':' );
	if( ptr != NULL ){
		*ptr = 0;
		*port = atoi(ptr+1);
	}
}


// Starts with empty URL, host and path, an invalid scheme and the default
// HTTP port.
CUrl::CUrl()
	: m_sUrl(""),
	  m_eScheme(SCHEME_INVALID),
	  m_sHost(""),
	  m_nPort(DEFAULT_HTTP_PORT),
	  m_sPath("")
{
}

// Nothing to release: the destructor body is empty.
CUrl::~CUrl()
{

}


//**********************************************************************************
//*  Function name: IsValidHostChar
//*  Input argv:
//*  	-- ch: the character for testing
//*  Return:
//   	true: is valid
//   	false: is invalid
//*  Function Description: test whether the character may appear in the host
//*  			part of a URL: a letter, a digit, or one of - . : _
//**********************************************************************************/
bool CUrl::IsValidHostChar(char ch)
{
	// The <cctype> functions require a value representable as unsigned char;
	// passing a negative plain char is undefined behaviour.
	const unsigned char uch = static_cast<unsigned char>(ch);

	return( isalpha(uch) || isdigit(uch)
		|| ch=='-' || ch=='.' || ch==':' || ch=='_');
}

//**********************************************************************************
//*  Function name: IsValidHost
//*  Input argv:
//*  	-- host: the host name for testing (may be NULL)
//*  Return:
//   	true: is valid
//   	false: is invalid
//*  Function Description: a host is valid when it is at least 6 characters
//*  			long and every character passes IsValidHostChar.
//*  Be careful: the original loop advanced `host` while re-evaluating
//*  			strlen(host), so it stopped after about half the characters.
//**********************************************************************************/
bool CUrl::IsValidHost(const char *host)
{
	if( !host ){
		return false;
	}

	const size_t len = strlen(host);
	if( len < 6 ){ // in case host like "www", "pku", etc.
		return false;
	}

	for(size_t i=0; i<len; i++){
		if( !IsValidHostChar(host[i]) ){
			return false;
		}
	}

	return true;
}

//**********************************************************************************
//*  Function name: IsValidIp
//*  Input argv:
//*  	-- ip: textual IPv4 address (may be NULL)
//*  Return:
//   	true: inet_addr() could parse the text
//   	false: ip is NULL or inet_addr() returned INADDR_NONE
//*  Function Description: syntactic check of an IPv4 address. No address
//*  			block (range) is consulted.
//*  Be careful: INADDR_NONE is also the value of 255.255.255.255, so that
//*  			address is reported as invalid.
//**********************************************************************************/

bool CUrl::IsValidIp(const char *ip)
{
	const bool parsable = ( ip != NULL )
		&& ( (unsigned long)inet_addr(ip) != INADDR_NONE );

	return parsable;
}


//Url.h

#ifndef _URL_H_200102_
#define _URL_H_200102_

#include <string>
const unsigned int URL_LEN	= 256;
const unsigned int HOST_LEN	= 256;

using namespace std;

// URL schemes. The numeric values double as indexes into the
// supported_schemes table in Url.cpp, so the order must match that table.
enum url_scheme {
	SCHEME_HTTP,
	SCHEME_FTP,
	SCHEME_INVALID
};

const int DEFAULT_HTTP_PORT = 80;
const int DEFAULT_FTP_PORT  = 21;

// Splits a URL into scheme, host, port and request path and offers a few
// syntactic validity checks for hosts and IP addresses.
class CUrl
{
public:
	string m_sUrl;			// Original URL
	enum url_scheme m_eScheme;	// URL scheme

	string	m_sHost;		// Extracted hostname 
	int	m_nPort;		// Port number
	string	m_sPath;		// Request

public:
	CUrl();
	~CUrl();

	// break  an URL into scheme, host, port and request.
	// result as member variants
	// Returns false (leaving host, port and path untouched) unless the
	// scheme is HTTP.
	bool ParseUrlEx(string strUrl);

	// break an URL into scheme, host, port and request.
	// result url as argvs
	// lprotocol / lhost / lrequest are the sizes of the matching buffers.
	void ParseUrlEx(const char *url, char *protocol, int lprotocol,
			char *host, int lhost,
			char *request, int lrequest, int *port);

	// get the ip address by host name
	// NOTE(review): no definition of GetIpByHost is visible in this file.
	char *GetIpByHost(const char *host);

	// true if the host is at least 6 characters long and made of valid host characters
	bool IsValidHost(const char *ip);

	// true if the text parses as an IPv4 address
	bool IsValidIp(const char *ip);
	// NOTE(review): no definition of IsVisitedUrl is visible in this file.
	bool IsVisitedUrl(const char *url);

	// NOTE(review): no definition of IsUnReachedUrl is visible in this file.
	bool IsUnReachedUrl(const char *url);
	// true for letters, digits and the characters - . : _
	bool IsValidHostChar(char ch);

private:
	// Sets m_eScheme from the prefix of url.
	void ParseScheme (const char *url);

};

#endif

//micSky.cpp

#include <string>
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "ImgDown.h"
 
// Entry point: downloads the images listed in the seed file "imgUrl" using
// 8 worker threads. The exit status is the value returned by DoImgDown()
// (0 on success), instead of always being 0.
int main(int arc, char* arv[])
{
	(void)arc;	// command-line arguments are not used
	(void)arv;

	CImgDown iImgDown("imgUrl");
	iImgDown.num_scanner = 8;

	return iImgDown.DoImgDown();
}



//Tse.h

#ifndef _TSE_H_080112_
#define _TSE_H_080112_

// Common include file for the project.
//
// Each header is included directly. The former per-header wrappers
// (#ifndef _STDIO_H_ / #define _STDIO_H_ / #include <stdio.h> ...) were
// redundant, because system headers carry their own include guards, and
// harmful: names such as _STDIO_H_, _UNISTD_H_ or _STRING_ are reserved for
// the implementation and are the real include guards of system headers on
// some platforms, so defining them first would turn those headers into
// empty files.

//============= Include file ==================

// C++ standard library
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <deque>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <set>
#include <streambuf>
#include <string>
#include <vector>

// C library
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// POSIX / system
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/dir.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <dirent.h>
#include <error.h>
#include <ftw.h>
#include <unistd.h>

using namespace std;

#endif
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值