Linux获取网页源码的几种方法

第一个为利用linux下的工具来获取网页源码,我用的是Wget,也可以使用Curl,curl的话更加的灵活,可以设置很多参数;

//通过Wget来获取网页
string GetHtmlByWget(string url)
{
    //获取待下载网页文件名
    string fileName = url.substr((int)url.find_last_of("/") + 1);
    if(fileName != "")
    {
        string strCom = "wget -q "; //wget命令,-q表示不显示下载信息
        strCom.append(url);
        system(strCom.c_str()); //执行wget

        ifstream fin(fileName.c_str());
        if(!fin)
        {
            return "";
        }
        string strHtml = "";
        char chTemp[1024] = "";
        //读取网页文件到内存中
        while(fin.getline(chTemp , 1024))
        {
            strHtml.append(string(chTemp));
            strcpy(chTemp , "");
        }
        fin.close();
        strCom = "rm -f ";  //删除文件命令,-f表示直接删除不做任何提示
        strCom.append(fileName);
        system(strCom.c_str()); //删除刚才下载下来的文件
        return strHtml; //返回网页源码
    }
    else
    {
        return "";
    }
}

第二个是用的socket的来获取源码:

//通过GET获取网页源码
string GetHtmlByGet(string url)
{
    string strHtmlContent = "";
    int sockfd;
    struct sockaddr_in addr;
    struct hostent *pURL;
    char text[RECVBUF];

    //分析链接
    UrlInfo urlInfo = ParseURL(url);
    string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: gzip, deflate";
    //不同的主机UserAgent不同
    string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";
    //将端口转换为字符串
    char t[6];
    string  strPort;
    sprintf(t,"%d", urlInfo.Port);
    strPort = t;
    //构造发送字符串
    string strRequest = "";
    strRequest.append("GET ");
    strRequest.append(urlInfo.File);
    strRequest.append("?");
    strRequest.append(urlInfo.Body);
    strRequest.append(" HTTP/1.1\r\n");
    strRequest.append(sAccept);
    strRequest.append("\r\nUser-Agent:");
    strRequest.append(sUserAgent);
    strRequest.append("\r\nHost:");
    strRequest.append(urlInfo.Host);
    strRequest.append(":");
    strRequest.append(strPort);
    strRequest.append("\r\nConnection: Keep-Alive\r\n\r\n");

    char* host = const_cast<char*>(urlInfo.Host.c_str());
    sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); //TCP方式发送
    pURL = gethostbyname(host);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr);
    addr.sin_port = htons(80);

    //连接
    connect(sockfd,(struct sockaddr *)&addr,sizeof(addr));
    //发送
    send(sockfd, const_cast<char*>(strRequest.c_str()), strRequest.length(), 0);
    //接受
    while(recv(sockfd, text, RECVBUF, 0) > 0)
    {
        strHtmlContent.append(text);
        bzero(text,RECVBUF);
    }
    //关闭socket
    close(sockfd);
    //返回接受结果
    return strHtmlContent;
}

第三个使用libcurl:

#include <stdio.h> 
 #include <string.h> 
 #include <curl/curl.h> 

 #define MAX_BUF 	 65536 

 char wr_buf[MAX_BUF+1]; 
 int  wr_index; 

 /* 
 * Write data callback function (called within the context of 
 * curl_easy_perform. 
 */ 
 size_t write_data( void *buffer, size_t size, size_t nmemb, void *userp ) 
 { 
  int segsize = size * nmemb; 

  /* Check to see if this data exceeds the size of our buffer. If so, 
   * set the user-defined context value and return 0 to indicate a 
   * problem to curl. 
   */ 
  if ( wr_index + segsize > MAX_BUF ) { 
    *(int *)userp = 1; 
    return 0; 
  } 

  /* Copy the data from the curl buffer into our buffer */ 
  memcpy( (void *)&wr_buf[wr_index], buffer, (size_t)segsize ); 

  /* Update the write index */ 
  wr_index += segsize; 

  /* Null terminate the buffer */ 
  wr_buf[wr_index] = 0; 

  /* Return the number of bytes received, indicating to curl that all is okay */ 
  return segsize; 
 } 


 /* 
 * Simple curl application to read the index.html file from a Web site. 
 */ 
 int main( void ) 
 { 
  CURL *curl; 
  CURLcode ret; 
  int  wr_error; 

  wr_error = 0; 
  wr_index = 0; 

  /* First step, init curl */ 
  curl = curl_easy_init(); 
  if (!curl) { 
    printf("couldn't init curl\n"); 
    return 0; 
  } 

  /* Tell curl the URL of the file we're going to retrieve */ 
  curl_easy_setopt( curl, CURLOPT_URL, "www.exampledomain.com" ); 

  /* Tell curl that we'll receive data to the function write_data, and 
   * also provide it with a context pointer for our error return. 
   */ 
  curl_easy_setopt( curl, CURLOPT_WRITEDATA, (void *)&wr_error ); 
  curl_easy_setopt( curl, CURLOPT_WRITEFUNCTION, write_data ); 

  /* Allow curl to perform the action */ 
  ret = curl_easy_perform( curl ); 

  printf( "ret = %d (write_error = %d)\n", ret, wr_error ); 

  /* Emit the page if curl indicates that no errors occurred */ 
  if ( ret == 0 ) printf( "%s\n", wr_buf ); 

  curl_easy_cleanup( curl ); 

  return 0; 
 } 



  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值