通过代理服务器下载网页
这几天在研究如何通过代理服务器下载网页,发现网上很难找到相关的代码,最后还是把 wget 的源码拿来研究了一下,才算弄明白了一点。
具体步骤如下:
1. 连接到代理服务器(这里使用非阻塞方式发起连接)。
// Creates a TCP socket and starts a non-blocking connect to the server
// (in this article: the proxy server).
//
// Parameters:
//   pServerAddress  IPv4 address, copied verbatim into sin_addr.s_addr, so it
//                   must already be in network byte order. Must not be NULL.
//   ServerPort      TCP port in host byte order (converted with htons here).
//   pHost, nTimeout, pLocalAddress, sLocalPort, pUrl
//                   kept for interface compatibility; not used by this function.
//
// Returns the socket descriptor, or -1 on failure. Because the socket is put
// into non-blocking mode before connect(), the connection may still be in
// progress (EINPROGRESS) when this function returns; the caller has to wait
// for the socket to become writable before using it.
// On failure (other than socket() itself failing) a message is printed to
// stdout, the socket is closed and errno is left describing the error.
int my_connect( uint32_t *pServerAddress , unsigned short ServerPort, const char * pHost, int nTimeout,
const char* pLocalAddress, unsigned short sLocalPort,URLS *pUrl )
{
    (void)pHost;
    (void)nTimeout;
    (void)pLocalAddress;
    (void)sLocalPort;
    (void)pUrl;

    struct sockaddr_in serverAddr;
    int nSavedErrno;
    int nSock = socket(AF_INET, SOCK_STREAM, 0);
    if( nSock < 0 )
        return -1;

    if( !pServerAddress ) // no address specified
    {
        errno = EINVAL; // report a meaningful error instead of a stale errno
        goto fail;
    }

    // Fill in the server address.
    memset(&serverAddr, 0, sizeof(serverAddr));
    serverAddr.sin_family = AF_INET;
    serverAddr.sin_addr.s_addr = *pServerAddress;
    serverAddr.sin_port = htons(ServerPort);

    if( !setNonBlocking(nSock,true) )
        goto fail;

    // On a non-blocking socket connect() normally returns -1/EINPROGRESS;
    // that is the expected "still connecting" case, not an error.
    if( ::connect(nSock, (sockaddr*)&serverAddr, sizeof(serverAddr)) == -1
        && errno != EINPROGRESS )
        goto fail;

    return nSock;

fail:
    // Keep the original error: close() may overwrite errno.
    nSavedErrno = errno;
    printf("connect error %d %s \n",nSavedErrno,strerror(nSavedErrno));
    close(nSock);
    errno = nSavedErrno;
    return -1;
}
2. 发送 GET 请求并接收数据。注意:通过代理服务器请求时,请求行中要写完整的 URL(而不只是路径)。请求格式如下:
GET http://3g.sina.com.cn/ HTTP/1.0
User-Agent: Wget/1.11.1
Accept: */*
Host: 3g.sina.com.cn
代码如下:
// Running counter used by iGetData (when PRINT_OUT is defined) to name the
// dump file "<FileNum>.html"; it is incremented after each file name is built.
int FileNum= 0;
// Removes chunked-transfer framing from a response body, in place.
//
//   buf  start of the NUL-terminated response (headers followed by body)
//   ph   first byte of the body inside buf
//   n    total number of valid bytes in buf
//
// Returns the new total length (headers + de-chunked body).
// SKIP_BLANK / FIND_BLANK / hexdec are defined elsewhere in the project;
// presumably they skip whitespace, find the next whitespace and parse a hex
// number of the given length -- TODO confirm against their definitions.
static int stripChunkFraming(char* buf, char* ph, int n)
{
    int size = 0;   // de-chunked body bytes stored so far
    char* pb = ph;  // write position
    char* p1 = pb;  // read position: start of the next chunk-size line
    while( pb-buf < n )
    {
        // chunk-size line
        SKIP_BLANK(p1);
        char* p0 = strchr(p1,'\n');
        if( !p0 )
            break;
        char* p2 = p1;
        FIND_BLANK(p2);
        int nChunkLen = hexdec(p1,p2-p1);
        p2 = p0+1; // first byte of the chunk data
        if( nChunkLen <= 0 ) // bad length, or the terminating zero-length chunk
            break;
        // Clamp a chunk that claims more bytes than were actually received.
        if( nChunkLen > (int)n-(p2-buf) )
            nChunkLen = n-(p2-buf);
        p1 = p2+nChunkLen; // end of the chunk data
        memmove(pb,p2,nChunkLen);
        size += nChunkLen;
        // Skip the line break that follows the chunk data.
        p0 = strchr(p1,'\n');
        if( !p0 )
            break;
        pb = ph+size;
        p1 = p0+1;
    }
    return size+(ph-buf);
}

// If the headers carry a "Content-Encoding:" field, runs the body through the
// project's CHttpDecompress and replaces buf with a newly allocated buffer
// holding the original headers followed by the decompressed body.
//
//   buf   response buffer (allocated with new[]); replaced on success
//   ph    first byte of the body inside buf
//   n     total number of valid bytes in buf
//   head  copy of the header bytes (everything before ph)
//
// Returns the new total length, or n unchanged when there is no such header,
// the encoding is not accepted by setType(), or decompression fails.
// NOTE(review): the replacement buffer is not NUL-terminated; use the returned
// length, not strlen().
static int inflateResponseBody(char*& buf, char* ph, int n, const std::string& head)
{
    const char* p = strstr(head.c_str(),"Content-Encoding:");
    if( !p )
        return n;

    p += 17; // length of "Content-Encoding:"
    while(*p&&*p==' ')p++;
    if( !*p )
        return n;
    const char* p1 = p;
    while(*p&&*p!='\r')p++;
    std::string type(p1,p-p1);

    CHttpDecompress decomp;
    if( decomp.setType(type.c_str()) != HTTP_DECOMPRESS_OK )
        return n;

    CompressStruct cs;
    memset(&cs,0,sizeof(cs));
    cs.m_inBuf = (Bytef*)ph;
    cs.m_inLen = n-(ph-buf);
    // Room for the headers plus up to 9 MiB of decompressed body.
    char* obuf = new char[1024*1024*9+head.size()];
    cs.m_outBuf = (Bytef*)obuf+head.size();
    cs.m_outLen = 1024*1024*9;
    if( decomp.decompress(cs) < 0 )
    {
        delete[] obuf;
        return n;
    }
    memcpy(obuf,head.c_str(),head.size());
    delete[] buf;
    buf = obuf;
    return cs.m_routLen+head.size();
}

// Sends an HTTP/1.0 GET request for pUrltest->m_url over an already
// connecting socket and reads the response.
//
// Steps:
//   1. Wait (epoll, EPOLLOUT, timeout MAX_TIME_OUT) for the non-blocking
//      connect on `sock` to finish, then switch the socket to blocking mode.
//   2. Send the request. The request line carries pUrltest->m_url as given
//      (for a proxy this is the absolute URL), plus User-Agent, optional
//      Cookie (from the COOKIE environment variable), Accept and Host headers.
//   3. Read the response with a single readBlock() call (at most 1023 KiB),
//      undo chunked framing and content encoding when the headers ask for it.
//   4. When PRINT_OUT is defined, append the result to "<FileNum>.html".
//
// Returns:
//    0  the request was sent and the read was attempted (also when the read
//       returned nothing -- NOTE(review): a read failure is not reported);
//       `sock` has been closed.
//    1  epoll setup or sending the request failed; `sock` is left open.
//   -1  epoll_wait timed out or failed; `sock` is left open.
int iGetData(int sock,URLS *pUrltest)
{
    // ---- 1. wait until the connect has completed ----
    int epfd = epoll_create(2);
    if( epfd == -1 )
        return 1;

    struct epoll_event ev;
    memset(&ev, 0, sizeof(ev));
    ev.data.fd = sock;
    ev.events = EPOLLOUT;
    if( epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) == -1 )
    {
        close(epfd);
        return 1;
    }

    struct epoll_event events[2];
    int nFds = epoll_wait(epfd, events, 2, MAX_TIME_OUT);
    close(epfd); // only needed for this single wait
    if( nFds <= 0 )
        return -1;

    // The socket is writable: do the rest with plain blocking I/O.
    setNonBlocking(sock,false);

    // ---- 2. build and send the request ----
    const char* ckie = getenv("COOKIE");
    if( !ckie )
        ckie = "";

    std::string request;
    request += "GET ";
    request += pUrltest->m_url.c_str();
    request += " HTTP/1.0\r\n";
    request += "User-Agent: Wget/1.11.1\r\n";
    if( ckie[0] )
    {
        request += "Cookie: ";
        request += ckie;
        request += "\r\n";
    }
    request += "Accept: */*\r\n";
    request += "Host: ";
    request += pUrltest->host.c_str();
    request += "\r\n\r\n";

    if( writeBlock1(sock,&request[0],request.size()) != request.size() )
    {
        fprintf(stderr,"write failed %d\n",errno);
        return 1;
    }

    // ---- 3. read and decode the response ----
    char* buf = new char[1024*1024];
    // Read at most 1023 KiB so there is always room for the terminating NUL.
    int n = readBlock(sock,buf,1024*1023);
    if( n > 0 )
    {
        buf[n] = '\0';
        char* ph = strstr(buf,"\r\n\r\n");
        if( ph )
        {
            ph += 4; // first byte of the body
            // Search only the headers, so body text cannot be mistaken for one.
            const std::string head(buf,ph-buf);
            if( strstr(head.c_str(),"Transfer-Encoding:") )
                n = stripChunkFraming(buf,ph,n);
            n = inflateResponseBody(buf,ph,n,head);
        }
    }

    // ---- 4. optional dump to a numbered file ----
#ifdef PRINT_OUT
    char acFileName[1024];
    sprintf(acFileName,"%d.html",FileNum);
    FileNum++;
    if( n > 0 )
    {
        FILE *pf = fopen(acFileName,"a+");
        if( pf )
        {
            // Write exactly the decoded length; buf may contain NUL bytes or
            // lack a terminator after decoding.
            fwrite(buf,1,n,pf);
            fclose(pf);
        }
    }
#endif

    delete []buf;
    close(sock);
    return 0;
}