Partial Source Code Analysis of the Tianwang Search Engine (TSE) - url.cpp

// Given a URL, build a request message and send it to the server that the URL points to.

// The Url class is defined for this purpose.
// url.cpp

#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <set>
#include <cstring>       // memset, strncpy, strchr, strlen
#include <cctype>        // isalpha, isdigit
#include <cstdlib>       // atoi
#include <strings.h>     // strncasecmp, bcopy
#include <pthread.h>     // pthread_mutex_t
#include <sys/socket.h>
#include <netdb.h>       // gethostbyname
#include <arpa/inet.h>   // inet_addr, inet_ntop, INADDR_NONE

#include "Tse.h"
#include "Url.h"
#include "Http.h"
#include "Md5.h"
#include "StrFun.h"

// Helpers for "." handling in URL paths: DOTP(x) tests whether the string x is
// exactly ".", DDOTP(x) whether it is exactly ".."
#define DOTP(x)  ((*(x) == '.') && (!*(x + 1)))

#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

 

// DNS lookup cache: host name -> IP address string
map<string, string> mapCacheHostLookup;
// hosts that turned out to be unreachable
extern vector<string> vsUnreachHost;
// protects mapCacheHostLookup against concurrent access by crawler threads
pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
// MD5 digests of URLs that have already been visited
extern set<string> setVisitedUrlMD5;
// configured IP blocks: network base -> host-part mask (see IsValidIp)
extern map<unsigned long, unsigned long> mapIpBlock;
typedef map<string, string>::value_type valTypeCHL;

 

// Structure describing one URL scheme (protocol)
struct scheme_data
{
    char *leading_string;   // scheme prefix, e.g. "http://"
    int default_port;       // default port for the scheme
    int enabled;            // whether crawling this scheme is enabled
};

 

// Table of all supported schemes
static struct scheme_data supported_schemes[] =
{
    { "http://", DEFAULT_HTTP_PORT, 1 },
    { "ftp://",  DEFAULT_FTP_PORT,  1 },

    { NULL,      -1,                0 }
};


// Determine the scheme of a URL and record it in the m_eScheme member
void CUrl::ParseScheme (const char *url)
{
    int i;

    for (i = 0; supported_schemes[i].leading_string; i++)
        if (0 == strncasecmp(url, supported_schemes[i].leading_string,
                             strlen(supported_schemes[i].leading_string))) {
            // The prefix matched; accept it only if the scheme is enabled
            if (supported_schemes[i].enabled) {
                this->m_eScheme = (enum url_scheme) i;
                return;
            } else {
                this->m_eScheme = SCHEME_INVALID;
                return;
            }
        }

    // No known prefix matched
    this->m_eScheme = SCHEME_INVALID;
    return;
}
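The cast "(enum url_scheme) i" only works because the url_scheme enum declared in Url.h follows the same order as supported_schemes[]. Url.h is not reproduced in this article; a minimal sketch of what that enum presumably looks like:

    enum url_scheme {
        SCHEME_HTTP,     // index 0, matches "http://"
        SCHEME_FTP,      // index 1, matches "ftp://"
        SCHEME_INVALID   // unknown or disabled scheme
    };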


bool CUrl::ParseUrlEx(string strUrl)
{
    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;

    // Zero out the buffers that will receive the parsed fields
    memset( protocol, 0, sizeof(protocol) );
    memset( host, 0, sizeof(host) );
    memset( request, 0, sizeof(request) );

    // Determine the scheme first
    this->ParseScheme(strUrl.c_str());

    // Only HTTP URLs are accepted; anything else is rejected
    if( this->m_eScheme != SCHEME_HTTP ){
        return false;
    }

    // For HTTP URLs, hand over to the low-level ParseUrlEx; note that every
    // output buffer is passed together with its length
    ParseUrlEx(strUrl.c_str(),
               protocol, sizeof(protocol),
               host, sizeof(host),
               request, sizeof(request),
               &port);

    // Store the extracted URL information in the member variables
    m_sUrl  = strUrl;
    m_sHost = host;
    m_sPath = request;

    if( port > 0 ){
        m_nPort = port;
    }

    return true;
}
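Note that even though supported_schemes[] lists FTP as enabled, this wrapper only accepts HTTP. A short illustration (the URLs are only examples):

    CUrl iUrl;
    iUrl.ParseUrlEx(string("ftp://ftp.pku.edu.cn/pub/"));    // returns false: scheme is FTP, not HTTP
    iUrl.ParseUrlEx(string("http://www.pku.edu.cn/"));       // returns true: members are filled in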


// The low-level parser: split a URL string into protocol, host, request and port
void CUrl::ParseUrlEx(const char *url,
                      char *protocol, int lprotocol,
                      char *host, int lhost,
                      char *request, int lrequest,
                      int *port)
{
    char *work, *ptr, *ptr2;

    *protocol = *host = *request = 0;
    *port = 80;

    // Work on a private, writable copy of the URL
    int len = strlen(url);
    //pthread_mutex_lock(&mutexMemory);
    work = new char[len + 1];
    //pthread_mutex_unlock(&mutexMemory);
    memset(work, 0, len + 1);
    strncpy(work, url, len);

    // Find the protocol, if any, by looking for ':'; if the URL carries a
    // scheme it is copied into protocol, otherwise HTTP is assumed
    ptr = strchr(work, ':');
    if( ptr != NULL ){
        *(ptr++) = 0;   // terminate the scheme so the copy below cannot run past it
        strncpy( protocol, work, lprotocol );
    } else {
        // Some URLs omit the "http" prefix
        strncpy( protocol, "HTTP", lprotocol );
        ptr = work;
    }

    // Skip past the opening "//"
    if( (*ptr == '/') && (*(ptr + 1) == '/') )
        ptr += 2;

    // Find the host: ptr2 walks forward while the characters are valid host characters
    ptr2 = ptr;
    while( IsValidHostChar(*ptr2) && *ptr2 )
        ptr2++;
    *ptr2 = 0;  // terminate the host so it is copied correctly and safely
    strncpy( host, ptr, lhost );

    // Find the request part: pStr points into the original url at the offset
    // where the host characters ended
    int offset = ptr2 - work;
    const char *pStr = url + offset;
    strncpy( request, pStr, lrequest );

    // Find the port number, if any, and strip it from the host
    ptr = strchr( host, ':' );
    if( ptr != NULL ){
        *ptr = 0;
        *port = atoi(ptr + 1);
    }

    //pthread_mutex_lock(&mutexMemory);
    delete [] work;
    //pthread_mutex_unlock(&mutexMemory);
    work = NULL;
}
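To make the hand-off between the two overloads concrete, here is a short trace for one sample URL (the URL itself is only an illustration):

    // For strUrl = "http://www.pku.edu.cn:8080/index.html" the low-level parser yields
    //   protocol = "http"
    //   host     = "www.pku.edu.cn"   (the ":8080" is stripped in the last step)
    //   request  = "/index.html"
    //   port     = 8080
    // and ParseUrlEx(string) then copies these into the members:
    //   m_sHost = "www.pku.edu.cn", m_sPath = "/index.html", m_nPort = 8080.
    // Note that ':' counts as a valid host character, which is exactly what keeps
    // the port attached to the host until the final strchr(host, ':') step.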


CUrl::CUrl()
{
    // The default constructor initializes the members
    this->m_sUrl    = "";
    this->m_eScheme = SCHEME_INVALID;

    this->m_sHost   = "";
    this->m_nPort   = DEFAULT_HTTP_PORT;

    this->m_sPath   = "";
}

CUrl::~CUrl()
{

}


// Resolve a host name string to an IP address string
char * CUrl::GetIpByHost(const char *host)
{
    if( !host ){                // null pointer
        return NULL;
    }

    if( !IsValidHost(host) ){   // invalid host
        return NULL;
    }

    unsigned long inaddr = 0;
    char *result = NULL;
    int len = 0;

    // Try to convert the host into a 32-bit binary IP address
    inaddr = (unsigned long)inet_addr( host );
    //if ( (int)inaddr != -1){
    if ( inaddr != INADDR_NONE ){   // the host is already an IP address
        // inet_addr succeeded, so the host has the dotted form XX.XX.XX.XX;
        // simply return a copy of it
        len = strlen(host);
        //pthread_mutex_lock(&mutexMemory);
        result = new char[len + 1];
        //pthread_mutex_unlock(&mutexMemory);
        memset(result, 0, len + 1);
        memcpy(result, host, len);

        return result;

    } else {
        // The host is a name, not an IP address:
        // first look it up in the DNS cache, which is indexed by host name
        map<string, string>::iterator it = mapCacheHostLookup.find(host);

        if( it != mapCacheHostLookup.end() ){   // found in the host lookup cache
            const char *strHostIp;

            strHostIp = (*it).second.c_str();

            inaddr = (unsigned long)inet_addr( strHostIp );
            //if ((int)inaddr != -1){
            if ( inaddr != INADDR_NONE ){
                // The cache held a valid IP address for this host
                len = strlen(strHostIp);
                //pthread_mutex_lock(&mutexMemory);
                result = new char[len + 1];
                //pthread_mutex_unlock(&mutexMemory);
                memset( result, 0, len + 1 );
                memcpy( result, strHostIp, len );

                //cout << ":)";

                return result;
            }
        }
    }

    // Neither case hit, so fall back to the DNS server

    // if still not found, then try the DNS server
    struct hostent *hp;
    hp = gethostbyname(host);   // resolve the host name to an address
    if( hp == NULL ){
        //cout << "gethostbyname() error in GetIpByHost: " << host << endl;
        return NULL;
    }

    // cache the host lookup

    // in holds the 32-bit IP address
    struct in_addr in;

    bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length);

    // inet_ntoa() converts a struct in_addr into a printable dotted-decimal IP
    // string; inet_ntop() does the same job, but inet_ntop supports both IPv4
    // and IPv6 while inet_ntoa only supports IPv4.
    // AF_INET selects the IPv4 (Internet) address family.
    char abuf[INET_ADDRSTRLEN];

    // Convert the address in "in" into a dotted-decimal string
    if( inet_ntop(AF_INET, (void *)&in, abuf, sizeof(abuf)) == NULL ){
        cout << "inet_ntop() return error in GetIpByHost" << endl;
        return NULL;
    } else {
        // The conversion succeeded and abuf now holds the IP string;
        // update the host/IP pair in the cache
        pthread_mutex_lock(&mutexCacheHost);
        //if(mapCacheHostLookup.count(host) == 0){
        if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){
            //cout << endl << host << " and " << abuf << endl;
            mapCacheHostLookup.insert( valTypeCHL(host, abuf) );
        }
        pthread_mutex_unlock(&mutexCacheHost);
    }

    // return the result
    len = strlen(abuf);
    //pthread_mutex_lock(&mutexMemory);
    result = new char[len + 1];
    //pthread_mutex_unlock(&mutexMemory);
    memset( result, 0, len + 1 );
    memcpy( result, abuf, len );

    return result;
}
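One detail worth noting for callers: every successful path allocates the returned string with new[], so the caller owns the buffer and must release it. A minimal usage sketch (the calling code here is hypothetical):

    CUrl iUrl;
    char *ip = iUrl.GetIpByHost("www.pku.edu.cn");
    if( ip != NULL ){
        // ... connect to ip and send the HTTP request ...
        delete [] ip;   // GetIpByHost allocated this buffer with new[]
    }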


// Letters, digits, '-', '.', ':' and '_' are the acceptable host characters
bool CUrl::IsValidHostChar(char ch)
{
    return( isalpha(ch) || isdigit(ch)
            || ch == '-' || ch == '.' || ch == ':' || ch == '_' );
}


bool CUrl::IsValidHost(const char *host)
{
    if( !host ){
        return false;
    }

    if( strlen(host) < 6 ){     // reject hosts like "www", "pku", etc.
        return false;
    }

    // Note: host is advanced inside the loop while strlen(host) is re-evaluated
    // on the advanced pointer, so this loop actually inspects only roughly the
    // first half of the string.
    char ch;
    for( unsigned int i = 0; i < strlen(host); i++ ){
        ch = *(host++);
        if( !IsValidHostChar(ch) ){
            return false;
        }
    }

    return true;
}


bool CUrl::IsVisitedUrl(const char *url)
{
    if( !url ){
        return true;    // a null URL is treated as already visited
    }

    // Hash the URL with MD5 and look the digest up in the set of visited URLs
    CMD5 iMD5;
    iMD5.GenerateMD5( (unsigned char*)url, strlen(url) );
    string strDigest = iMD5.ToString();

    if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ){
        return true;
    } else {
        return false;
    }
}



// Check whether an IP address falls inside one of the configured IP blocks
bool CUrl::IsValidIp(const char *ip)
{
    if( ip == NULL ){
        return false;
    }

    unsigned long inaddr = (unsigned long)inet_addr(ip);
    if( inaddr == INADDR_NONE ){    // not a valid IP string
        return false;
    }

    if( mapIpBlock.size() > 0 ){
        map<unsigned long, unsigned long>::iterator pos;
        for( pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos ){
            unsigned long ret;

            // Mask off the host part and compare against the block's network base
            ret = inaddr & ~((*pos).second);
            if( ret == (*pos).first ){  // inside this block
                return true;
            }
        }

        // not inside any block
        return false;
    }

    // if no block range is configured, every valid IP is considered inside
    return true;
}
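A small worked example of the mask test above, assuming mapIpBlock stores (network base, host-part mask) pairs in the byte order produced by inet_addr; the concrete addresses are only illustrative:

    // Suppose one entry of mapIpBlock is
    //     first  = inet_addr("162.105.0.0")     // network base of the block
    //     second = inet_addr("0.0.255.255")     // host-part (wildcard) mask
    // Then for ip = "162.105.80.44":
    //     inaddr & ~second  ==  inet_addr("162.105.0.0")  ==  first
    // so IsValidIp() reports the address as inside the block, while an address
    // such as "202.112.7.13" fails the comparison for this entry.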


// Decide whether a host is foreign (outside the domestic crawl scope) by its top-level domain
bool CUrl::IsForeignHost(string host)
{
    if( host.empty() ) return true;
    if( host.size() > HOST_LEN ) return true;

    unsigned long inaddr = 0;

    inaddr = (unsigned long)inet_addr( host.c_str() );
    if ( inaddr != INADDR_NONE ){   // the host is just an IP address
        return false;
    }

    // Take the part after the last '.' as the top-level domain
    string::size_type idx = host.rfind('.');
    string tmp;
    if( idx != string::npos ){
        tmp = host.substr(idx + 1);
    }

    CStrFun::Str2Lower( tmp, tmp.size() );
    const char *home_host[] = {
        "cn", "com", "net", "org", "info",
        "biz", "tv", "cc", "hk", "tw"
    };

    int home_host_num = 10;

    for( int i = 0; i < home_host_num; i++ ){
        if( tmp == home_host[i] )
            return false;
    }

    return true;
}
 
 
bool CUrl::IsImageUrl(string url)
{
    if( url.empty() ) return false;
    if( url.size() > HOST_LEN ) return false;

    string::size_type idx = url.rfind('.');
    string tmp;
    if( idx != string::npos ){
        tmp = url.substr(idx + 1);
    }

    CStrFun::Str2Lower( tmp, tmp.size() );
    const char *image_type[] = {
        "gif", "jpg", "jpeg", "png", "bmp",
        "tif", "psd"
    };

    int image_type_num = 7;

    for( int i = 0; i < image_type_num; i++ )
    {
        if( tmp == image_type[i] )
            return true;
    }

    return false;
}
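Putting the pieces together, this is roughly how a crawler thread could drive CUrl. The calling code below is a hypothetical sketch, not taken from Crawl.cpp, and it assumes the members filled in by ParseUrlEx (m_sHost, m_sPath, m_nPort) are publicly accessible:

    CUrl iUrl;
    const char *strUrl = "http://www.pku.edu.cn/index.html";   // illustrative seed URL

    if( !iUrl.IsVisitedUrl(strUrl)      // skip URLs whose MD5 digest is already recorded
        && !iUrl.IsImageUrl(strUrl)     // skip image links during normal (SE) crawling
        && iUrl.ParseUrlEx(strUrl) ){   // HTTP only; fills m_sHost, m_sPath, m_nPort
        char *ip = iUrl.GetIpByHost( iUrl.m_sHost.c_str() );
        if( ip && iUrl.IsValidIp(ip) && !iUrl.IsForeignHost(iUrl.m_sHost) ){
            // build the HTTP request for iUrl.m_sPath and send it to ip at port iUrl.m_nPort
        }
        delete [] ip;   // the buffer comes from new[] inside GetIpByHost
    }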
 

For reference, the TSE package README:

TSE (Tiny Search Engine)
=======================
(Temporary) Web home: http://162.105.80.44/~yhf/Realcourse/

TSE is a free utility for non-interactive download of files from the Web.
It supports HTTP. According to a query word or URL, it retrieves results
from crawled pages. It can follow links in HTML pages and create output
files in Tianwang (http://e.pku.edu.cn/) format or ISAM format files.
Additionally, it provides link structures which can be used to rebuild
the web frame.

---------------------------
Main functions in the TSE:
1) normal crawling, named SE, e.g. crawling all pages in PKU scope,
   and retrieving results from crawled pages according to a query word or URL,
2) crawling images and corresponding pages, named ImgSE.

---------------------------
INSTALL:
1) execute "tar xvfz tse.XXX.gz"

---------------------------
Before running the program, note: the program defaults to normal crawling (SE).
For ImgSE, you should:
1. change the code as follows:
   1) In the "Page.cpp" file, find the two identical functions
      "CPage::IsFilterLink(string plink)". One is for ImgSE, whose URLs must
      include "tupian", "photo", "ttjstk", etc.; the other is for normal crawling.
      For ImgSE, remember to comment out the paragraph and choose the right
      "CPage::IsFilterLink(string plink)". For SE, remember to open the paragraph
      and choose the right "CPage::IsFilterLink(string plink)".
   2) In the Http.cpp file,
      i. find "if( iPage.m_sContentType.find("image") != string::npos )"
         and comment out the right paragraph.
   3) In the Crawl.cpp file,
      i. find "if( iPage.m_sContentType != "text/html" " and comment out the
         right paragraph.
      ii. find "if(file_length < 40)" and choose the right line.
      iii. find "iMD5.GenerateMD5( (unsigned char*)iPage.m_sContent.c_str(),
           iPage.m_sContent.length() )" and comment out the right paragraph.
      iv. find "if (iUrl.IsImageUrl(strUrl))" and comment out the right paragraph.
2. sh Clean;
   (Note: do not remove link4History.url; you should comment out the
   "rm -f link4History.url" line first.)
   Then use "link4History.url" as the seed file. "link4History" is produced
   during normal crawling (SE).

---------------------------
EXECUTION:
execute "make clean; sh Clean; make".
1) for normal crawling and retrieving:
   ./Tse -c tse_seed.img
   According to a query word or URL, retrieve results from crawled pages:
   ./Tse -s
2) for ImgSE:
   ./Tse -c tse_seed.img
   After moving the Tianwang.raw.* data to a secure place, execute
   ./Tse -c link4History.url

---------------------------
Detailed functions:
1) supporting multithreaded crawling of pages
2) persistent HTTP connections
3) DNS cache
4) IP block
5) filtering unreachable hosts
6) parsing hyperlinks from crawled pages
7) recursively crawling pages
8) outputting Tianwang format or ISAM format files

---------------------------
Files in the package:
Tse                    --- Tse executable file
tse_unreachHost.list   --- unreachable hosts according to the PKU IP block
tse_seed.pku           --- PKU seeds
tse_ipblock            --- PKU IP block
...
Directories in the package:
hlink, include, lib, stack, uri directories --- parse links from a page

---------------------------
Please report bugs in TSE to MAINTAINERS: YAN Hongfei
* Created: YAN Hongfei, Network lab of Peking University.
* Created: July 15 2003.  version 0.1.1
*   # Can crawl web pages with a process
* Updated: Aug 20 2003.   version 1.0.0 !!!!
*   # Can crawl web pages with multithreads
* Updated: Nov 08 2003.   version 1.0.1
*   # more classes in the codes
* Updated: Nov 16 2003.   version 1.1.0
*   # integrate a new version linkparser provided by XIE Han
*   # according to all MD5 values of pages content,
*     for all the pages not seen before, store a new page
* Updated: Nov 21 2003.   version 1.1.1
*   # record all duplicate urls in terms of content MD5
