//ImgDown.h
#ifndef _ImgDown_H_200102_
#define _ImgDown_H_200102_
#include <zlib.h>
#include <queue>
#include "Tse.h"
#include "Url.h"
#include <curl/curl.h>
// One queued download: an image link together with the page it came from.
struct IMGURL
{
    string rootpage;   // text of the "Root:" line that preceded the link in the seed file
    string imglink;    // URL of the image handed to DownloadFile()
};
typedef IMGURL ImgLINK;
// I have had My heart broken. 2007.5.22.
using namespace std;
// Multi-threaded image downloader driven by a seed file of "Root:" lines
// followed by image links (see AddUrl / fetch / DownloadFile).
class CImgDown
{
public:
    string workDBNum;              // not referenced in the visible source
    unsigned int num_scanner;      // number of worker threads DoImgDown() creates
    unsigned int Pic;              // count of images saved successfully
    unsigned int myDownPicLMT;     // not referenced in the visible source

    CImgDown();
    CImgDown(string);
    ~CImgDown();

    // Loads the first batch of links, runs the worker threads and waits for them.
    int DoImgDown();

    // Downloads one link and stores it as "Img<oPic><extension>".
    void DownloadFile(unsigned long oPic, ImgLINK);

    // Worker loop; every thread created by DoImgDown() runs this function.
    void fetch(void *arg);

    // Reads the next group of links from the given seed file into the shared collections.
    int AddUrl(string);

private:
    const long MIN_IMG_LEN = 10000 ;     // downloads must be strictly larger than this to be kept
    const long MAX_IMG_LEN = 9000000;    // ...and strictly smaller than this
    const int SPIDERS_EACH_SITE = 3 ;    // initial per-host counter stored in mapHosts
    const int SPIDERS_NUM = 8 ;
    unsigned long ordRec;                // running ordinal used to number the saved files

    // Growable buffer filled by WriteMemoryCallback().
    struct MemoryStruct {
        char *memory;
        size_t size;
    };

    size_t imgFetch(string strUrl, char **fileBuf);
    static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp);
};
#endif
//ImgDown.cpp
#include "ImgDown.h"
// Copy of num_scanner stored by DoImgDown(); not read elsewhere in the visible source.
int num_pthread ;
string seedImgUrlFile; // seed URL file name
// Held by fetch() while it picks the next link: covers mmapImgUrls, mapHosts,
// ordRec and the AddUrl() calls made from the worker threads.
pthread_mutex_t mutexCollection = PTHREAD_MUTEX_INITIALIZER;
// Held by DownloadFile() around the Pic counter update.
pthread_mutex_t mutexDetect = PTHREAD_MUTEX_INITIALIZER;
// Not locked anywhere in the visible source.
pthread_mutex_t mutexNewUrlMD5 = PTHREAD_MUTEX_INITIALIZER;
// b_fOver: set to true by DoImgDown() once the worker threads have been created.
// finish:  set to true by AddUrl() when the seed file cannot be opened or has been read to its end.
bool b_fOver,finish;
// Offset in the seed file at which the next AddUrl() call resumes reading.
long nOffset;
// Pending image links, keyed by the host parsed from their "Root:" page.
multimap<string,ImgLINK > mmapImgUrls;
// Host -> how many more times fetch() may select that host (starts at SPIDERS_EACH_SITE).
map<string,int> mapHosts;
// Pair type used by AddUrl() when inserting into mmapImgUrls.
typedef map<string,ImgLINK>::value_type mImgVal;
void* start(void *arg)
{
( (CImgDown*)arg )->fetch(arg);
}
void CImgDown::fetch(void *arg)
{
string strUrl,host="";
ImgLINK urlPair;
string strGHost = "";
unsigned long oPic;//Downed pictures
long seedNum = 0 ;
for(;;){
seedNum++; // = 0
pthread_mutex_lock(&mutexCollection);
urlPair.imglink = "";
bool cat = false;
while(1){
if(mmapImgUrls.empty()){
if(!finish){
AddUrl(seedImgUrlFile);
}else break;
}
multimap<string,ImgLINK>::iterator it7=mmapImgUrls.find(host);
if( it7 != mmapImgUrls.end() ){
// get an URL Pair;
urlPair = (*it7).second ;
// remove it from the collection
mmapImgUrls.erase( it7++ );
cat = true;
ordRec++;
oPic = ordRec;
break;
}else{
if(mapHosts.empty()){
if(mmapImgUrls.size() < 3000 && !finish){
AddUrl(seedImgUrlFile);
}else break;
}//Add host.
//if(mapHosts.empty()) break;//Bug 2011.12.28. Found ! Add this line;
map<string,int>::iterator it7=mapHosts.begin();
if( it7 != mapHosts.end() ){// get an host;
host = it7->first ;
if(--it7->second==0)
// remove it from the collection;
mapHosts.erase(host);
}else break;
}
}//while(1)
pthread_mutex_unlock(&mutexCollection);//??
if( cat ){
if( strGHost != host ){
strGHost = host;
}
(( CImgDown* )arg)->DownloadFile(oPic,urlPair);
} else break;
usleep(1);
}//_for
}
size_t CImgDown::WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
size_t realsize = size * nmemb;
struct MemoryStruct *mem = (struct MemoryStruct *)userp;
char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
if(ptr == NULL) {
/* out of memory! */
printf("not enough memory (realloc returned NULL)\n");
// exit(1);
return 0;
}
mem->memory = ptr;
memcpy(&(mem->memory[mem->size]), contents, realsize);
mem->size += realsize;
mem->memory[mem->size] = 0;
return realsize;
}
size_t CImgDown::imgFetch(string strUrl, char **fileBuf )
{
CURL *curl_handle;
CURLcode res;
struct MemoryStruct DataChunk;
DataChunk.memory = (char *)malloc(1); /* will be grown as needed by the realloc above */
DataChunk.size = 0; /* no data at this point */
/* init the curl session */
curl_handle = curl_easy_init();
/* specify URL to get */
curl_easy_setopt(curl_handle, CURLOPT_URL, strUrl.c_str());
/* complete within 20 seconds */
curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 20L);
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&DataChunk);
/* some servers don't like requests that are made without a user-agent field, so we provide one */
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
/* get it! */
res = curl_easy_perform(curl_handle);
/* check for errors */
if(res != CURLE_OK) {
fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
curl_easy_cleanup(curl_handle);
return -1;
}else{
*fileBuf = DataChunk.memory;
}
/* cleanup curl stuff */
curl_easy_cleanup(curl_handle);
return DataChunk.size ;// +(int)chunk.size;
}
// Downloads one image and stores it in the current directory.
//   oPic:    ordinal used in the output name "Img<oPic><extension>"
//   urlPair: the link to fetch (urlPair.imglink)
// Only links ending in ".jpg", ".jpeg" or ".bmp" are fetched, and only
// downloads strictly between MIN_IMG_LEN and MAX_IMG_LEN bytes are written.
// Pic is incremented (under mutexDetect) for every file written successfully.
// Fixes over the previous version: a stray "/" line that did not compile is
// removed, the unsigned ordinal is formatted with %lu through snprintf, and
// the download buffer is released on a single path.
void CImgDown::DownloadFile(unsigned long oPic,ImgLINK urlPair )
{
    // extension = everything from the last '.' of the link
    string dom;
    const string::size_type idx = urlPair.imglink.rfind(".");
    if( idx != string::npos ){
        dom = urlPair.imglink.substr(idx);
    }else{
        cerr << "Error 1; imgDown.cpp " << urlPair.imglink << endl;
    }
    if( dom!=".jpg" && dom!=".jpeg" && dom!=".bmp" )
        return;

    char *downloaded_file = NULL;
    const size_t file_length = imgFetch(urlPair.imglink, &downloaded_file );
    if( file_length == (size_t)-1 || downloaded_file == NULL ){ // unreachable, skipped.
        free(downloaded_file);
        return;
    }

    bool saved = false;
    if( file_length > (size_t)MIN_IMG_LEN && file_length < (size_t)MAX_IMG_LEN ){
        // make name:
        char food[128];
        snprintf(food, sizeof(food), "Img%lu%s", oPic, dom.c_str());
        FILE *fp = fopen(food,"wb");
        if( !fp ){
            cerr << "Error 7; imgDown.cpp can not open the file." << endl;
        }else{
            if( fwrite(downloaded_file,file_length,1,fp) != 1 ){
                cerr << "Error 8; imgDown.cpp can not write the pixel data." << endl;
            }else{
                saved = true;
            }
            fclose(fp);
        }
    }
    free(downloaded_file);
    downloaded_file = NULL;
    if( !saved )
        return;

    // Record img down information
    pthread_mutex_lock(&mutexDetect);
    Pic++; // indeed down success
    pthread_mutex_unlock(&mutexDetect);
    usleep(1);
}
// Construction/Destruction
// Default constructor: the seed file name is left as it is.
// Also resets the file-scope crawl state (nOffset, b_fOver, finish), so
// constructing an object restarts reading of the seed file from offset 0.
// Pic, myDownPicLMT and num_scanner used to be left uninitialised although
// Pic is later incremented; they now start at 0, 0 and SPIDERS_NUM.
CImgDown::CImgDown()
{
    num_scanner = SPIDERS_NUM;   // callers may overwrite this before DoImgDown()
    Pic = 0;
    myDownPicLMT = 0;
    ordRec = 0;
    nOffset = 0;
    b_fOver = false;
    finish = false;
}
// Constructs a downloader that reads its links from inFile.
// Stores the name in the file-scope seedImgUrlFile and resets the file-scope
// crawl state (nOffset, b_fOver, finish).
// Pic, myDownPicLMT and num_scanner used to be left uninitialised although
// Pic is later incremented; they now start at 0, 0 and SPIDERS_NUM.
CImgDown::CImgDown(string inFile)
{
    seedImgUrlFile = inFile;
    num_scanner = SPIDERS_NUM;   // callers may overwrite this before DoImgDown()
    Pic = 0;
    myDownPicLMT = 0;
    ordRec = 0;
    nOffset = 0;
    b_fOver = false;
    finish = false;
}
// Intentionally empty: the destructor performs no work of its own.
CImgDown::~CImgDown() {}
// Handler installed by DoImgDown() for termination signals: reports the
// termination on stderr and ends the process with status 1.
// Only async-signal-safe calls are used (write / _exit).  The previous version
// used iostream output and exit() inside the handler, which are not safe to
// call from a signal handler, least of all while worker threads are running.
static void SigTerm(int x)
{
    (void)x;   // signal number is not needed
    static const char msg[] = "Error 2; imgDown.cpp Terminated!\n";
    const ssize_t written = write(STDERR_FILENO, msg, sizeof(msg) - 1);
    (void)written;   // nothing useful can be done if the report itself fails
    _exit(1);
}
int CImgDown::DoImgDown()
{
curl_global_init(CURL_GLOBAL_ALL);
pthread_t thrd;
//* set the signal function */
signal(SIGTERM, SigTerm);
signal(SIGKILL, SigTerm);
signal(SIGINT, SigTerm);
signal(SIGPIPE, SIG_IGN);
signal(SIGCHLD,SIG_IGN);
// signal(SIGABRT, SigTerm);
// open the files for output
if(AddUrl(seedImgUrlFile)) return 1;
num_pthread = num_scanner;
// Create thread ID structures.
pthread_t *tids = (pthread_t*)malloc(num_scanner * sizeof(pthread_t));
if( tids == NULL){
cerr << "Error 4; imgDown.cpp" << endl;
}
for(unsigned int i=0; i< num_scanner; i++){
if( pthread_create( &tids[i], NULL, start, this))
cerr << "Error 5; imgDown.cpp create threads error" << endl;
}
b_fOver = true;
for (unsigned int i = 0; i < num_scanner; ++i){
(void)pthread_join(tids[i], NULL);
}
curl_global_cleanup();
return 0;
}
// Reads the next group of lines from the seed file into the shared collections.
//   InputFile: path of the seed file.  A line starting with "Root:" names the
//              page the following lines (image links) were found on.
// Reading resumes at nOffset and stops at the first "Root:" line seen after
// more than 1000 links have been queued, or at end of file (finish is then set).
// Every link is stored in mmapImgUrls under the host of its root page, and the
// host is entered in mapHosts with SPIDERS_EACH_SITE if it is not already there.
// Returns 0 on success, 1 (and sets finish) when the file cannot be opened.
// Fixes over the previous version: "Root:" is only recognised at the start of
// a line (it used to be searched anywhere but stripped as a 5-character
// prefix), and a root page that cannot be parsed no longer reuses the host of
// the previous root page; its links are queued under an empty host instead.
int CImgDown::AddUrl(string InputFile)
{
    static const char kRootTag[] = "Root:";
    const string::size_type kRootTagLen = sizeof(kRootTag) - 1;

    // open the seed url file
    ifstream ifsSeed(InputFile.c_str());
    if( !ifsSeed ){
        finish = true;
        return 1;
    }
    ifsSeed.seekg(nOffset);

    CUrl iUrl;
    string strUrl, host;
    ImgLINK urlPair;
    int i = 0;   // links queued by this call
    for(;;){ // each time read a group of lines.
        // remember where this line starts so that the next call can resume here
        nOffset = ifsSeed.tellg();
        if( !getline(ifsSeed, strUrl) ){
            finish = true;
            break;
        }
        if( strUrl.compare(0, kRootTagLen, kRootTag) == 0 ){
            // stop on a root boundary; nOffset points at this line, so the
            // next call reads it again
            if( i > 1000 ) break;
            strUrl = strUrl.substr(kRootTagLen);
            urlPair.rootpage = strUrl;
            if( iUrl.ParseUrlEx(strUrl) ){
                host = iUrl.m_sHost;
            }else{
                host.clear();
            }
            continue;
        }
        urlPair.imglink = strUrl;
        mmapImgUrls.insert(mImgVal(host, urlPair));
        // insert() keeps an existing counter for a host that is already present
        mapHosts.insert(pair<string,int>(host, SPIDERS_EACH_SITE));
        i++;
    }
    ifsSeed.close();
    return 0;
}//_add
//Url.cpp
#include <iostream>
#include <string>
#include <sys/socket.h>
#include <netdb.h>
#include "Tse.h"
#include "Url.h"
//* Is X "."? */
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))
// Is X ".."?  (In both macros every use of the argument is parenthesised so
// that expression arguments such as `c ? a : b` expand correctly.)
#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))
// Element type of a string-to-string map; not referenced in the visible source.
typedef std::map<std::string, std::string>::value_type valTypeCHL;
// One row of the supported-scheme table.
// leading_string is const: the rows are initialised from string literals,
// which must not be reachable through a non-const char pointer.
struct scheme_data
{
    const char *leading_string;   // URL prefix such as "http://"; NULL ends the table
    int default_port;
    int enabled;                  // non-zero when URLs with this prefix are accepted
};
//* Supported schemes: */
static struct scheme_data supported_schemes[] =
{
{ "http://", DEFAULT_HTTP_PORT, 1 },
{ "ftp://", DEFAULT_FTP_PORT, 1 },
//* SCHEME_INVALID */
{ NULL, -1, 0 }
};
// Stores in m_eScheme the scheme of url: the index of the first row of
// supported_schemes whose prefix url starts with (compared case-insensitively),
// provided that row is enabled; SCHEME_INVALID otherwise.  Returns nothing.
void CUrl::ParseScheme (const char *url)
{
    this->m_eScheme = SCHEME_INVALID;
    for (int row = 0; supported_schemes[row].leading_string != NULL; ++row) {
        const char *prefix = supported_schemes[row].leading_string;
        if (strncasecmp (url, prefix, strlen (prefix)) != 0)
            continue;
        // first matching prefix decides, enabled or not
        if (supported_schemes[row].enabled)
            this->m_eScheme = (enum url_scheme) row;
        return;
    }
}
//************************************************************************
// * Function name: ParseUrlEx
// * Input argv:
// * -- strUrl: url
// * Return:
//     true:  success, m_sUrl / m_sHost / m_sPath (and m_nPort when the parsed
//            port is positive) have been updated
//     false: the URL does not start with "http://"; apart from m_eScheme the
//            members keep their previous values
// * Function Description: break an URL into scheme, host, port and request,
// *   storing the result in the member variables.
// * The low-level parser is given one byte less than each buffer really has,
// *   so the zero-filled last byte always terminates host and request.  With
// *   the full sizes a host or request of 256 or more characters was copied
// *   without a terminator and then read as a C string.
//************************************************************************/
bool CUrl::ParseUrlEx(string strUrl)
{
    this->ParseScheme(strUrl.c_str());
    if( this->m_eScheme != SCHEME_HTTP ){
        return false;
    }

    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;
    memset( protocol, 0, sizeof(protocol) );
    memset( host, 0, sizeof(host) );
    memset( request, 0, sizeof(request) );

    ParseUrlEx(strUrl.c_str(),
        protocol, (int)sizeof(protocol) - 1,
        host, (int)sizeof(host) - 1,
        request, (int)sizeof(request) - 1,
        &port);

    m_sUrl = strUrl;
    m_sHost = host;
    m_sPath = request;
    if( port > 0 ){
        m_nPort = port;
    }
    return true;
}
// Copies src into dst (capacity cap bytes), truncating when necessary and
// always leaving dst NUL-terminated.  Does nothing for a NULL dst or cap <= 0.
static void CopyBounded(char *dst, int cap, const char *src)
{
    if( dst == NULL || cap <= 0 ){
        return;
    }
    strncpy( dst, src, cap - 1 );
    dst[cap - 1] = 0;
}
//************************************************************************
//* Function name: ParseUrlEx
//* Input argv:
//* -- url: the URL to split
//* -- protocol / lprotocol: result buffer for the protocol and its size
//* -- host / lhost: result buffer for the host and its size
//* -- request / lrequest: result buffer for the request and its size
//* -- port: receives the port; 80 unless the host part carries ":<number>"
//* Return: nothing; the results are written to the arguments
//* Function Description: break an URL into scheme, host, port and request.
//*   The protocol is the text before the first ':' ("HTTP" when the URL has
//*   no ':'), the host is the following run of valid host characters (after
//*   an optional "//"), and the request is the rest of the URL.
//* Every result is truncated to its buffer and always NUL-terminated; the
//*   previous strncpy calls left a result unterminated when it filled the
//*   whole buffer.  A NULL url yields empty results and port 80.
//************************************************************************/
void CUrl::ParseUrlEx(const char *url,
    char *protocol, int lprotocol,
    char *host, int lhost,
    char *request, int lrequest,
    int *port)
{
    CopyBounded( protocol, lprotocol, "" );
    CopyBounded( host, lhost, "" );
    CopyBounded( request, lrequest, "" );
    if( port != NULL ){
        *port = 80;
    }
    if( url == NULL ){
        return;
    }

    // scratch copy of the URL (including its terminator) that can be cut up
    const size_t len = strlen(url);
    vector<char> work(url, url + len + 1);
    char *base = &work[0];

    // find protocol if any
    char *ptr = strchr(base, ':');
    if( ptr != NULL ){
        *(ptr++) = 0;
        CopyBounded( protocol, lprotocol, base );
    } else {
        CopyBounded( protocol, lprotocol, "HTTP" );
        ptr = base;
    }

    // skip past opening /'s
    if( (*ptr=='/') && (*(ptr+1)=='/') )
        ptr+=2;

    // find host (':' counts as a host character, so "host:port" stays together)
    char *ptr2 = ptr;
    while( *ptr2 && IsValidHostChar(*ptr2) )
        ptr2++;
    *ptr2 = 0;
    CopyBounded( host, lhost, ptr );

    // the request is everything in the original URL after the host part
    CopyBounded( request, lrequest, url + (ptr2 - base) );

    // find the port number, if any
    if( host != NULL && lhost > 0 ){
        char *colon = strchr( host, ':' );
        if( colon != NULL ){
            *colon = 0;
            if( port != NULL ){
                *port = atoi(colon+1);
            }
        }
    }
}
// Starts with empty URL, host and path, an invalid scheme and the default HTTP port.
CUrl::CUrl()
    : m_sUrl(""),
      m_eScheme(SCHEME_INVALID),
      m_sHost(""),
      m_nPort(DEFAULT_HTTP_PORT),
      m_sPath("")
{
}
// Intentionally empty: the destructor performs no work of its own.
CUrl::~CUrl() {}
//**********************************************************************************
//* Function name: IsValidHostChar
//* Input argv:
//* -- ch: the character for testing
//* Return:
//    true: is valid
//    false: is invalid
//* Function Description: test whether the character may appear in the host
//*   part of a URL: a letter, a digit, or one of - . : _
//* The character is converted to unsigned char before it reaches the <cctype>
//*   function; passing a negative char (any byte >= 0x80 where char is signed)
//*   to isalpha/isdigit is undefined behaviour.
//**********************************************************************************/
bool CUrl::IsValidHostChar(char ch)
{
    const unsigned char uc = static_cast<unsigned char>(ch);
    return( isalnum(uc) != 0
        || ch=='-' || ch=='.' || ch==':' || ch=='_');
}
//**********************************************************************************
//* Function name: IsValidHost
//* Input argv:
//* -- host: the host name for testing
//* Return:
//    true: is valid
//    false: is invalid
//* Function Description: a host name is accepted when it is not NULL, is at
//*   least 6 characters long and consists only of characters accepted by
//*   IsValidHostChar.
//* The previous loop advanced the pointer while re-evaluating strlen() on it,
//*   so only about the first half of the name was ever checked.
//**********************************************************************************/
bool CUrl::IsValidHost(const char *host)
{
    if( !host ){
        return false;
    }
    if( strlen(host) < 6 ){ // in case host like "www", "pku", etc.
        return false;
    }
    for(const char *p = host; *p != 0; ++p){
        if( !IsValidHostChar(*p) ){
            return false;
        }
    }
    return true;
}
//**********************************************************************************
//* Function name: IsValidIp
//* Input argv:
//* -- ip: textual IP address
//* Return:
//    true: inet_addr() accepted the text
//    false: ip is NULL or inet_addr() returned INADDR_NONE
//* Function Description: checks that the text parses as an IP address.  No IP
//*   block range is consulted here, so every parsable address counts as valid.
//**********************************************************************************/
bool CUrl::IsValidIp(const char *ip)
{
    if( !ip ){
        return false;
    }
    const unsigned long parsed = (unsigned long)inet_addr(ip);
    // INADDR_NONE is inet_addr()'s failure value
    return parsed != INADDR_NONE;
}
//Url.h
#ifndef _URL_H_200102_
#define _URL_H_200102_
#include <string>
// URL_LEN is not referenced in the visible source.
const unsigned int URL_LEN = 256;
// Size of the host buffer used by CUrl::ParseUrlEx(string).
const unsigned int HOST_LEN = 256;
using namespace std;
// The enumerator values double as row indices of supported_schemes[] in
// Url.cpp (ParseScheme casts the matching row index to url_scheme), so the
// order here must stay in step with that table.
enum url_scheme {
SCHEME_HTTP,
SCHEME_FTP,
SCHEME_INVALID
};
// Default ports for the rows of supported_schemes[]; DEFAULT_HTTP_PORT is also
// the initial value of CUrl::m_nPort.
const int DEFAULT_HTTP_PORT = 80;
const int DEFAULT_FTP_PORT = 21;
class CUrl
{
public:
string m_sUrl; // Original URL
enum url_scheme m_eScheme; // URL scheme
string m_sHost; // Extracted hostname
int m_nPort; // Port number
string m_sPath; // Request
public:
CUrl();
~CUrl();
// break an URL into scheme, host, port and request.
// result as member variants
bool ParseUrlEx(string strUrl);
// break an URL into scheme, host, port and request.
// result url as argvs
void ParseUrlEx(const char *url, char *protocol, int lprotocol,
char *host, int lhost,
char *request, int lrequest, int *port);
// get the ip address by host name
char *GetIpByHost(const char *host);
bool IsValidHost(const char *ip);
bool IsValidIp(const char *ip);
bool IsVisitedUrl(const char *url);
bool IsUnReachedUrl(const char *url);
bool IsValidHostChar(char ch);
private:
void ParseScheme (const char *url);
};
#endif
//micSky.cpp
#include <string>
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ImgDown.h"
// Program entry point: downloads the images listed in the seed file "imgUrl"
// using 8 worker threads.
// The exit status is now DoImgDown()'s result (0 on success, non-zero when the
// run could not start); it used to be 0 unconditionally.  Returning from main
// instead of calling exit() also lets the downloader object be destroyed.
int main(int arc, char* arv[])
{
    (void)arc;   // command-line arguments are not used
    (void)arv;
    CImgDown iImgDown("imgUrl");
    iImgDown.num_scanner = 8;
    return iImgDown.DoImgDown();
}
//Tse.h
#ifndef _TSE_H_080112_
#define _TSE_H_080112_
#ifndef _STRING_
#define _STRING_
#include <string>
#endif
using namespace std;
//============= Include file ==================
#ifndef _IOSTREAM_
#define _IOSTREAM_
#include <iostream>
#endif
#ifndef _FSTREAM_
#define _FSTREAM_
#include <fstream>
#endif
#ifndef _STDIO_H_
#define _STDIO_H_
#include <stdio.h>
#endif
#ifndef _STDLIB_H_
#define _STDLIB_H_
#include <stdlib.h>
#endif
#ifndef _CSTDLIB_
#define _CSTDLIB_
#include <cstdlib>
#endif
#ifndef _CSTRING_
#define _CSTRING_
#include <cstring>
#endif
#ifndef _DIRENT_H_
#define _DIRENT_H_
#include <dirent.h>
#endif
#ifndef _UNISTD_H_
#define _UNISTD_H_
#include <unistd.h>
#endif
#ifndef _S_DIR_H_
#define _S_DIR_H_
#include <sys/dir.h>
#endif
#ifndef _S_TYPES_H_
#define _S_TYPES_H_
#include <sys/types.h>
#endif
#ifndef _S_STAT_H_
#define _S_STAT_H_
#include <sys/stat.h>
#endif
#ifndef _FTW_H_
#define _FTW_H_
#include <ftw.h>
#endif
#ifndef _LIST_
#define _LIST_
#include <list>
#endif
#ifndef _MAP_
#define _MAP_
#include <map>
#endif
#ifndef _ERROR_H_
#define _ERROR_H_
#include <error.h>
#endif
#ifndef _STREAMBUF_
#define _STREAMBUF_
#include <streambuf>
#endif
#ifndef _IOMANIP_
#define _IOMANIP_
#include <iomanip>
#endif
#ifndef _TIME_H_
#define _TIME_H_
#include <time.h>
#endif
#ifndef _CTIME_
#define _CTIME_
#include <ctime>
#endif
#ifndef _ALGORITHM_
#define _ALGORITHM_
#include <algorithm>
#endif
#ifndef _CCTYPE_
#define _CCTYPE_
#include <cctype>
#endif
#ifndef _VECTOR_
#define _VECTOR_
#include <vector>
#endif
#ifndef _ITERATOR_
#define _ITERATOR_
#include <iterator>
#endif
#ifndef _DEQUE_
#define _DEQUE_
#include <deque>
#endif
#ifndef _SET_
#define _SET_
#include <set>
#endif
#ifndef _CASSERT_
#define _CASSERT_
#include <cassert>
#endif
#ifndef _SIGNAL_H_
#define _SIGNAL_H_
#include <signal.h>
#endif
#ifndef _SOCKET_H_
#define _SOCKET_H_
#include <sys/socket.h>
#endif
#ifndef _IN_H_
#define _IN_H_
#include <netinet/in.h>
#endif
#ifndef _INET_H_
#define _INET_H_
#include <arpa/inet.h>
#endif
#endif