Partial Source Code Analysis of the Tianwang Search Engine (TSE) - url.cpp

// Given a URL, build a request message and send it to the server that the URL points to.

// The Url class is defined for this purpose.
// url.cpp

#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <set>
#include <cstring>       // memset, strncpy, strchr, strlen
#include <cctype>        // isalpha, isdigit
#include <cstdlib>       // atoi
#include <strings.h>     // strncasecmp, bcopy
#include <pthread.h>     // pthread_mutex_t
#include <sys/socket.h>
#include <netdb.h>       // gethostbyname
#include <arpa/inet.h>   // inet_addr, inet_ntop, INADDR_NONE

#include "Tse.h"
#include "Url.h"
#include "Http.h"
#include "Md5.h"
#include "StrFun.h"

// Helpers for "." handling in URL paths: DOTP(x) tests whether the string x is
// exactly ".", DDOTP(x) whether it is exactly ".."
#define DOTP(x)  ((*(x) == '.') && (!*(x + 1)))

#define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2)))

 

// DNS lookup cache: host name -> IP address string
map<string, string> mapCacheHostLookup;
// hosts that turned out to be unreachable
extern vector<string> vsUnreachHost;
// protects mapCacheHostLookup against concurrent access by crawler threads
pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
// MD5 digests of URLs that have already been visited
extern set<string> setVisitedUrlMD5;
// configured IP blocks: network base -> host-part mask (see IsValidIp)
extern map<unsigned long, unsigned long> mapIpBlock;
typedef map<string, string>::value_type valTypeCHL;

 

// Structure describing one URL scheme (protocol)
struct scheme_data
{
    char *leading_string;   // scheme prefix, e.g. "http://"
    int default_port;       // default port for the scheme
    int enabled;            // whether crawling this scheme is enabled
};

 

// Table of all supported schemes
static struct scheme_data supported_schemes[] =
{
    { "http://", DEFAULT_HTTP_PORT, 1 },
    { "ftp://",  DEFAULT_FTP_PORT,  1 },

    { NULL,      -1,                0 }
};


// Determine the scheme of a URL and record it in the m_eScheme member
void CUrl::ParseScheme (const char *url)
{
    int i;

    for (i = 0; supported_schemes[i].leading_string; i++)
        if (0 == strncasecmp(url, supported_schemes[i].leading_string,
                             strlen(supported_schemes[i].leading_string))) {
            // The prefix matched; accept it only if the scheme is enabled
            if (supported_schemes[i].enabled) {
                this->m_eScheme = (enum url_scheme) i;
                return;
            } else {
                this->m_eScheme = SCHEME_INVALID;
                return;
            }
        }

    // No known prefix matched
    this->m_eScheme = SCHEME_INVALID;
    return;
}
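The cast "(enum url_scheme) i" only works because the url_scheme enum declared in Url.h follows the same order as supported_schemes[]. Url.h is not reproduced in this article; a minimal sketch of what that enum presumably looks like:

    enum url_scheme {
        SCHEME_HTTP,     // index 0, matches "http://"
        SCHEME_FTP,      // index 1, matches "ftp://"
        SCHEME_INVALID   // unknown or disabled scheme
    };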


bool CUrl::ParseUrlEx(string strUrl)
{
    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;

    // Zero out the buffers that will receive the parsed fields
    memset( protocol, 0, sizeof(protocol) );
    memset( host, 0, sizeof(host) );
    memset( request, 0, sizeof(request) );

    // Determine the scheme first
    this->ParseScheme(strUrl.c_str());

    // Only HTTP URLs are accepted; anything else is rejected
    if( this->m_eScheme != SCHEME_HTTP ){
        return false;
    }

    // For HTTP URLs, hand over to the low-level ParseUrlEx; note that every
    // output buffer is passed together with its length
    ParseUrlEx(strUrl.c_str(),
               protocol, sizeof(protocol),
               host, sizeof(host),
               request, sizeof(request),
               &port);

    // Store the extracted URL information in the member variables
    m_sUrl  = strUrl;
    m_sHost = host;
    m_sPath = request;

    if( port > 0 ){
        m_nPort = port;
    }

    return true;
}
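Note that even though supported_schemes[] lists FTP as enabled, this wrapper only accepts HTTP. A short illustration (the URLs are only examples):

    CUrl iUrl;
    iUrl.ParseUrlEx(string("ftp://ftp.pku.edu.cn/pub/"));    // returns false: scheme is FTP, not HTTP
    iUrl.ParseUrlEx(string("http://www.pku.edu.cn/"));       // returns true: members are filled in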


// The low-level parser: split a URL string into protocol, host, request and port
void CUrl::ParseUrlEx(const char *url,
                      char *protocol, int lprotocol,
                      char *host, int lhost,
                      char *request, int lrequest,
                      int *port)
{
    char *work, *ptr, *ptr2;

    *protocol = *host = *request = 0;
    *port = 80;

    // Work on a private, writable copy of the URL
    int len = strlen(url);
    //pthread_mutex_lock(&mutexMemory);
    work = new char[len + 1];
    //pthread_mutex_unlock(&mutexMemory);
    memset(work, 0, len + 1);
    strncpy(work, url, len);

    // Find the protocol, if any, by looking for ':'; if the URL carries a
    // scheme it is copied into protocol, otherwise HTTP is assumed
    ptr = strchr(work, ':');
    if( ptr != NULL ){
        *(ptr++) = 0;   // terminate the scheme so the copy below cannot run past it
        strncpy( protocol, work, lprotocol );
    } else {
        // Some URLs omit the "http" prefix
        strncpy( protocol, "HTTP", lprotocol );
        ptr = work;
    }

    // Skip past the opening "//"
    if( (*ptr == '/') && (*(ptr + 1) == '/') )
        ptr += 2;

    // Find the host: ptr2 walks forward while the characters are valid host characters
    ptr2 = ptr;
    while( IsValidHostChar(*ptr2) && *ptr2 )
        ptr2++;
    *ptr2 = 0;  // terminate the host so it is copied correctly and safely
    strncpy( host, ptr, lhost );

    // Find the request part: pStr points into the original url at the offset
    // where the host characters ended
    int offset = ptr2 - work;
    const char *pStr = url + offset;
    strncpy( request, pStr, lrequest );

    // Find the port number, if any, and strip it from the host
    ptr = strchr( host, ':' );
    if( ptr != NULL ){
        *ptr = 0;
        *port = atoi(ptr + 1);
    }

    //pthread_mutex_lock(&mutexMemory);
    delete [] work;
    //pthread_mutex_unlock(&mutexMemory);
    work = NULL;
}
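To make the hand-off between the two overloads concrete, here is a short trace for one sample URL (the URL itself is only an illustration):

    // For strUrl = "http://www.pku.edu.cn:8080/index.html" the low-level parser yields
    //   protocol = "http"
    //   host     = "www.pku.edu.cn"   (the ":8080" is stripped in the last step)
    //   request  = "/index.html"
    //   port     = 8080
    // and ParseUrlEx(string) then copies these into the members:
    //   m_sHost = "www.pku.edu.cn", m_sPath = "/index.html", m_nPort = 8080.
    // Note that ':' counts as a valid host character, which is exactly what keeps
    // the port attached to the host until the final strchr(host, ':') step.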


CUrl::CUrl()
{
    // The default constructor initializes the members
    this->m_sUrl    = "";
    this->m_eScheme = SCHEME_INVALID;

    this->m_sHost   = "";
    this->m_nPort   = DEFAULT_HTTP_PORT;

    this->m_sPath   = "";
}

CUrl::~CUrl()
{

}


// Resolve a host name string to an IP address string
char * CUrl::GetIpByHost(const char *host)
{
    if( !host ){                // null pointer
        return NULL;
    }

    if( !IsValidHost(host) ){   // invalid host
        return NULL;
    }

    unsigned long inaddr = 0;
    char *result = NULL;
    int len = 0;

    // Try to convert the host into a 32-bit binary IP address
    inaddr = (unsigned long)inet_addr( host );
    //if ( (int)inaddr != -1){
    if ( inaddr != INADDR_NONE ){   // the host is already an IP address
        // inet_addr succeeded, so the host has the dotted form XX.XX.XX.XX;
        // simply return a copy of it
        len = strlen(host);
        //pthread_mutex_lock(&mutexMemory);
        result = new char[len + 1];
        //pthread_mutex_unlock(&mutexMemory);
        memset(result, 0, len + 1);
        memcpy(result, host, len);

        return result;

    } else {
        // The host is a name, not an IP address:
        // first look it up in the DNS cache, which is indexed by host name
        map<string, string>::iterator it = mapCacheHostLookup.find(host);

        if( it != mapCacheHostLookup.end() ){   // found in the host lookup cache
            const char *strHostIp;

            strHostIp = (*it).second.c_str();

            inaddr = (unsigned long)inet_addr( strHostIp );
            //if ((int)inaddr != -1){
            if ( inaddr != INADDR_NONE ){
                // The cache held a valid IP address for this host
                len = strlen(strHostIp);
                //pthread_mutex_lock(&mutexMemory);
                result = new char[len + 1];
                //pthread_mutex_unlock(&mutexMemory);
                memset( result, 0, len + 1 );
                memcpy( result, strHostIp, len );

                //cout << ":)";

                return result;
            }
        }
    }

    // Neither case hit, so fall back to the DNS server

    // if still not found, then try the DNS server
    struct hostent *hp;
    hp = gethostbyname(host);   // resolve the host name to an address
    if( hp == NULL ){
        //cout << "gethostbyname() error in GetIpByHost: " << host << endl;
        return NULL;
    }

    // cache the host lookup

    // in holds the 32-bit IP address
    struct in_addr in;

    bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length);

    // inet_ntoa() converts a struct in_addr into a printable dotted-decimal IP
    // string; inet_ntop() does the same job, but inet_ntop supports both IPv4
    // and IPv6 while inet_ntoa only supports IPv4.
    // AF_INET selects the IPv4 (Internet) address family.
    char abuf[INET_ADDRSTRLEN];

    // Convert the address in "in" into a dotted-decimal string
    if( inet_ntop(AF_INET, (void *)&in, abuf, sizeof(abuf)) == NULL ){
        cout << "inet_ntop() return error in GetIpByHost" << endl;
        return NULL;
    } else {
        // The conversion succeeded and abuf now holds the IP string;
        // update the host/IP pair in the cache
        pthread_mutex_lock(&mutexCacheHost);
        //if(mapCacheHostLookup.count(host) == 0){
        if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){
            //cout << endl << host << " and " << abuf << endl;
            mapCacheHostLookup.insert( valTypeCHL(host, abuf) );
        }
        pthread_mutex_unlock(&mutexCacheHost);
    }

    // return the result
    len = strlen(abuf);
    //pthread_mutex_lock(&mutexMemory);
    result = new char[len + 1];
    //pthread_mutex_unlock(&mutexMemory);
    memset( result, 0, len + 1 );
    memcpy( result, abuf, len );

    return result;
}
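One detail worth noting for callers: every successful path allocates the returned string with new[], so the caller owns the buffer and must release it. A minimal usage sketch (the calling code here is hypothetical):

    CUrl iUrl;
    char *ip = iUrl.GetIpByHost("www.pku.edu.cn");
    if( ip != NULL ){
        // ... connect to ip and send the HTTP request ...
        delete [] ip;   // GetIpByHost allocated this buffer with new[]
    }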


// Letters, digits, '-', '.', ':' and '_' are the acceptable host characters
bool CUrl::IsValidHostChar(char ch)
{
    return( isalpha(ch) || isdigit(ch)
            || ch == '-' || ch == '.' || ch == ':' || ch == '_' );
}


bool CUrl::IsValidHost(const char *host)
{
    if( !host ){
        return false;
    }

    if( strlen(host) < 6 ){     // reject hosts like "www", "pku", etc.
        return false;
    }

    // Note: host is advanced inside the loop while strlen(host) is re-evaluated
    // on the advanced pointer, so this loop actually inspects only roughly the
    // first half of the string.
    char ch;
    for( unsigned int i = 0; i < strlen(host); i++ ){
        ch = *(host++);
        if( !IsValidHostChar(ch) ){
            return false;
        }
    }

    return true;
}


bool CUrl::IsVisitedUrl(const char *url)
{
    if( !url ){
        return true;    // a null URL is treated as already visited
    }

    // Hash the URL with MD5 and look the digest up in the set of visited URLs
    CMD5 iMD5;
    iMD5.GenerateMD5( (unsigned char*)url, strlen(url) );
    string strDigest = iMD5.ToString();

    if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ){
        return true;
    } else {
        return false;
    }
}



// Check whether an IP address falls inside one of the configured IP blocks
bool CUrl::IsValidIp(const char *ip)
{
    if( ip == NULL ){
        return false;
    }

    unsigned long inaddr = (unsigned long)inet_addr(ip);
    if( inaddr == INADDR_NONE ){    // not a valid IP string
        return false;
    }

    if( mapIpBlock.size() > 0 ){
        map<unsigned long, unsigned long>::iterator pos;
        for( pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos ){
            unsigned long ret;

            // Mask off the host part and compare against the block's network base
            ret = inaddr & ~((*pos).second);
            if( ret == (*pos).first ){  // inside this block
                return true;
            }
        }

        // not inside any block
        return false;
    }

    // if no block range is configured, every valid IP is considered inside
    return true;
}
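A small worked example of the mask test above, assuming mapIpBlock stores (network base, host-part mask) pairs in the byte order produced by inet_addr; the concrete addresses are only illustrative:

    // Suppose one entry of mapIpBlock is
    //     first  = inet_addr("162.105.0.0")     // network base of the block
    //     second = inet_addr("0.0.255.255")     // host-part (wildcard) mask
    // Then for ip = "162.105.80.44":
    //     inaddr & ~second  ==  inet_addr("162.105.0.0")  ==  first
    // so IsValidIp() reports the address as inside the block, while an address
    // such as "202.112.7.13" fails the comparison for this entry.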


// Decide whether a host is foreign (outside the domestic crawl scope) by its top-level domain
bool CUrl::IsForeignHost(string host)
{
    if( host.empty() ) return true;
    if( host.size() > HOST_LEN ) return true;

    unsigned long inaddr = 0;

    inaddr = (unsigned long)inet_addr( host.c_str() );
    if ( inaddr != INADDR_NONE ){   // the host is just an IP address
        return false;
    }

    // Take the part after the last '.' as the top-level domain
    string::size_type idx = host.rfind('.');
    string tmp;
    if( idx != string::npos ){
        tmp = host.substr(idx + 1);
    }

    CStrFun::Str2Lower( tmp, tmp.size() );
    const char *home_host[] = {
        "cn", "com", "net", "org", "info",
        "biz", "tv", "cc", "hk", "tw"
    };

    int home_host_num = 10;

    for( int i = 0; i < home_host_num; i++ ){
        if( tmp == home_host[i] )
            return false;
    }

    return true;
}
 
 
bool CUrl::IsImageUrl(string url)
{
    if( url.empty() ) return false;
    if( url.size() > HOST_LEN ) return false;

    string::size_type idx = url.rfind('.');
    string tmp;
    if( idx != string::npos ){
        tmp = url.substr(idx + 1);
    }

    CStrFun::Str2Lower( tmp, tmp.size() );
    const char *image_type[] = {
        "gif", "jpg", "jpeg", "png", "bmp",
        "tif", "psd"
    };

    int image_type_num = 7;

    for( int i = 0; i < image_type_num; i++ )
    {
        if( tmp == image_type[i] )
            return true;
    }

    return false;
}
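Putting the pieces together, this is roughly how a crawler thread could drive CUrl. The calling code below is a hypothetical sketch, not taken from Crawl.cpp, and it assumes the members filled in by ParseUrlEx (m_sHost, m_sPath, m_nPort) are publicly accessible:

    CUrl iUrl;
    const char *strUrl = "http://www.pku.edu.cn/index.html";   // illustrative seed URL

    if( !iUrl.IsVisitedUrl(strUrl)      // skip URLs whose MD5 digest is already recorded
        && !iUrl.IsImageUrl(strUrl)     // skip image links during normal (SE) crawling
        && iUrl.ParseUrlEx(strUrl) ){   // HTTP only; fills m_sHost, m_sPath, m_nPort
        char *ip = iUrl.GetIpByHost( iUrl.m_sHost.c_str() );
        if( ip && iUrl.IsValidIp(ip) && !iUrl.IsForeignHost(iUrl.m_sHost) ){
            // build the HTTP request for iUrl.m_sPath and send it to ip at port iUrl.m_nPort
        }
        delete [] ip;   // the buffer comes from new[] inside GetIpByHost
    }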
 

For reference, the TSE package README:

TSE (Tiny Search Engine)
=======================
(Temporary) Web home: http://162.105.80.44/~yhf/Realcourse/

TSE is a free utility for non-interactive download of files from the Web.
It supports HTTP. According to a query word or URL, it retrieves results
from crawled pages. It can follow links in HTML pages and create output
files in Tianwang (http://e.pku.edu.cn/) format or ISAM format files.
Additionally, it provides link structures which can be used to rebuild
the web frame.

---------------------------
Main functions in the TSE:
1) normal crawling, named SE, e.g. crawling all pages in PKU scope,
   and retrieving results from crawled pages according to a query word or URL,
2) crawling images and corresponding pages, named ImgSE.

---------------------------
INSTALL:
1) execute "tar xvfz tse.XXX.gz"

---------------------------
Before running the program, note: the program defaults to normal crawling (SE).
For ImgSE, you should:
1. change the code as follows:
   1) In the "Page.cpp" file, find the two identical functions
      "CPage::IsFilterLink(string plink)". One is for ImgSE, whose URLs must
      include "tupian", "photo", "ttjstk", etc.; the other is for normal crawling.
      For ImgSE, remember to comment out the paragraph and choose the right
      "CPage::IsFilterLink(string plink)". For SE, remember to open the paragraph
      and choose the right "CPage::IsFilterLink(string plink)".
   2) In the Http.cpp file,
      i. find "if( iPage.m_sContentType.find("image") != string::npos )"
         and comment out the right paragraph.
   3) In the Crawl.cpp file,
      i. find "if( iPage.m_sContentType != "text/html" " and comment out the
         right paragraph.
      ii. find "if(file_length < 40)" and choose the right line.
      iii. find "iMD5.GenerateMD5( (unsigned char*)iPage.m_sContent.c_str(),
           iPage.m_sContent.length() )" and comment out the right paragraph.
      iv. find "if (iUrl.IsImageUrl(strUrl))" and comment out the right paragraph.
2. sh Clean;
   (Note: do not remove link4History.url; you should comment out the
   "rm -f link4History.url" line first.)
   Then use "link4History.url" as the seed file. "link4History" is produced
   during normal crawling (SE).

---------------------------
EXECUTION:
execute "make clean; sh Clean; make".
1) for normal crawling and retrieving:
   ./Tse -c tse_seed.img
   According to a query word or URL, retrieve results from crawled pages:
   ./Tse -s
2) for ImgSE:
   ./Tse -c tse_seed.img
   After moving the Tianwang.raw.* data to a secure place, execute
   ./Tse -c link4History.url

---------------------------
Detailed functions:
1) supporting multithreaded crawling of pages
2) persistent HTTP connections
3) DNS cache
4) IP block
5) filtering unreachable hosts
6) parsing hyperlinks from crawled pages
7) recursively crawling pages
8) outputting Tianwang format or ISAM format files

---------------------------
Files in the package:
Tse                    --- Tse executable file
tse_unreachHost.list   --- unreachable hosts according to the PKU IP block
tse_seed.pku           --- PKU seeds
tse_ipblock            --- PKU IP block
...
Directories in the package:
hlink, include, lib, stack, uri directories --- parse links from a page

---------------------------
Please report bugs in TSE to MAINTAINERS: YAN Hongfei
* Created: YAN Hongfei, Network lab of Peking University.
* Created: July 15 2003.  version 0.1.1
*   # Can crawl web pages with a process
* Updated: Aug 20 2003.   version 1.0.0 !!!!
*   # Can crawl web pages with multithreads
* Updated: Nov 08 2003.   version 1.0.1
*   # more classes in the codes
* Updated: Nov 16 2003.   version 1.1.0
*   # integrate a new version linkparser provided by XIE Han
*   # according to all MD5 values of pages content,
*     for all the pages not seen before, store a new page
* Updated: Nov 21 2003.   version 1.1.1
*   # record all duplicate urls in terms of content MD5
