A Small Image-Grabbing Web Crawler in C++
I recently had the chance to intern at a company, and practice quickly showed me how much I still don't know, especially about network programming. So I wrote a small socket-based web crawler to get a feel for how things fit together on the network. Haha.
The program's main job: given a starting web page, grab the images on that page and on the pages it links to.
The approach: take the page at the front of the page_url queue, extract the href links of its <a> tags and the src links of its <img> tags, and append them to the page_url queue and the image_url queue respectively. Then repeat: pop the head of page_url and extract its links. It is a plain breadth-first traversal.
Since I'm a beginner and the program is not very efficient, it stops collecting new URLs once too many pages have been queued.
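In outline, the crawl loop looks like this (just a sketch; the comment stands in for the real fetch-and-extract functions shown below):
#include <list>
#include <queue>
#include <string>
void Crawl(const std::string& main_url, std::list<std::string>& image_urls){
    std::queue<std::string> page_urls;   // pages still to visit
    page_urls.push(main_url);
    while (!page_urls.empty()){
        std::string page = page_urls.front();
        page_urls.pop();
        // fetch 'page', push every <a href> link onto page_urls,
        // and append every <img src> link to image_urls ...
    }
}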
Now for the code!
The functions.h header:
#ifndef FUNCTIONS_H
#define FUNCTIONS_H
#include <string>
#include <iostream>
#include <fstream>
#include <list>
#include "winsock2.h"
#include <time.h>
#include <queue>
#include <hash_set>
#pragma comment(lib, "ws2_32.lib")
bool GetHttpRespond(const std::string& url, char * &respond, int& bytes_read);
bool ParseUrl(const std::string& url, std::string& host, std::string& resource);
bool GetPagesAndImages(std::string main_url, std::list<std::string>& images_url);
bool UrlDownloadToFile(std::list<std::string> images_url);
std::string GetFileName(const std::string url);
#endif
functions.cpp:
#include "functions.h"
#include <iostream>
#include <urlmon.h>
#define DEFAULT_PAGE_BUF_SIZE 1048576   // initial receive buffer: 1 MiB
#define A_TAG_SIZE 500                  // max length of an extracted <a href> link
#define IMAGE_TAG_SIZE 500              // max length of an extracted <img src> link
#define TEMP_SIZE 500                   // scratch buffers in GetFileName
bool GetHttpRespond(const std::string& url, char * &respond, int& bytes_read){
//初始化WSA
WSADATA wsaData;
if (WSAStartup(MAKEWORD(2,2), &wsaData) != 0)
{
std::cout << "WSA failed to start!\n";
return false;
}
//解析url
std::string host,resource;
if (!ParseUrl(url, host, resource)){
std::cout << "can't Parse :" << url << std::endl;
return false;
}
//建立socket
hostent* p_host=gethostbyname(host.c_str());
if (!p_host){
std::cout << "page is invalid! :" << url << "\n";
return false;
}
SOCKET _socket = socket(PF_INET,SOCK_STREAM,0);
if (_socket == INVALID_SOCKET){
std::cout << "socket failed to create!\n";
return false;
}
//获取sockadd
SOCKADDR_IN addr_in;
addr_in.sin_family = PF_INET;
addr_in.sin_port = htons(80);
memcpy(&addr_in.sin_addr,p_host->h_addr,4);
//建立连接
if (connect(_socket, (SOCKADDR *)& addr_in, sizeof(addr_in))!=0){
std::cout << "connection failed!\n";
closesocket(_socket);
return false;
}
//准备http请求头
std::string request = "GET " + resource + " HTTP/1.1\r\n" + "Host:" + host + "\r\n" + "Connection:close\r\n\r\n";
    // Send the request
    if (send(_socket, request.c_str(), (int)request.size(), 0) == SOCKET_ERROR){
        std::cout << "can't send request!\n";
        closesocket(_socket);
        return false;
    }
    // Receive the response into a growable buffer
    char* recv_data = (char*)malloc(DEFAULT_PAGE_BUF_SIZE);
    int temp = 1, content_size = DEFAULT_PAGE_BUF_SIZE;
    if (!recv_data){
        std::cout << "malloc failed!\n";
        closesocket(_socket);
        return false;
    }
    memset(recv_data, 0, content_size); // without this, stale bytes can show up as garbage
    while (temp){
        temp = recv(_socket, recv_data + bytes_read, content_size - bytes_read, 0);
        if (temp > 0){
            bytes_read += temp;
        }
        else if (temp == SOCKET_ERROR){
            std::cout << "can't receive data!\n";
            free(recv_data);
            closesocket(_socket);
            return false;
        }
        // Double the buffer when little space remains
        if (content_size - bytes_read < 100){
            content_size *= 2;
            char* bigger = (char*)realloc(recv_data, content_size);
            if (!bigger){
                std::cout << "realloc failed!\n";
                free(recv_data);
                closesocket(_socket);
                return false;
            }
            recv_data = bigger;
        }
    }
    // Null-terminate and hand the malloc'd buffer to the caller
    recv_data[bytes_read] = '\0';
    respond = recv_data;
    closesocket(_socket);
    return true;
}
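A minimal usage sketch (www.example.com is just a placeholder; bytes_read must be 0 on entry because the function accumulates into it):
char* respond = nullptr;
int bytes_read = 0;                      // must start at 0
if (GetHttpRespond("http://www.example.com/", respond, bytes_read)){
    std::cout << "received " << bytes_read << " bytes\n";
    free(respond);                       // the caller owns the malloc'd buffer
}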
bool ParseUrl(const std::string& url, std::string& host, std::string& resource){
char p_host[100], p_resource[1000];
const char *p_url;
memset(p_host, 0, sizeof(p_host));
memset(p_resource, 0, sizeof(p_resource));
//去除http://协议部分
p_url = strstr(url.c_str(),"http://");
if (!p_url) return false;
else p_url += strlen("http://");
//域名错误处理
if (strstr(url.c_str(), "/") == 0)
return false;
if (strstr(url.c_str(), "\r") || strstr(url.c_str(), "\n"))
return false;
//分割域名,获取主机名和资源名
sscanf_s(p_url, "%[^/]%s", p_host, sizeof(p_host), p_resource, sizeof(p_resource));
host = p_host;
resource = p_resource;
return true;
}
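A quick check of how the split behaves (the URL is only an example):
// ParseUrl("http://www.baidu.com/img/bd_logo.png", host, resource)
//   -> host = "www.baidu.com", resource = "/img/bd_logo.png"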
bool GetPagesAndImages(std::string main_url, std::list<std::string>& images_url){
    std::queue<std::string> page_urls;
    page_urls.push(main_url);
    // Breadth-first traversal over the page queue; once more than 100 pages
    // are queued, stop collecting new page URLs
    int push_flag = 1;
    while (!page_urls.empty()){
        const char* a_ptr = nullptr;
        char* respond = nullptr;
        int bytes_read = 0;
        if (!GetHttpRespond(page_urls.front(), respond, bytes_read)){
            page_urls.pop();
            continue;
        }
        page_urls.pop();
        a_ptr = respond;
        if (page_urls.size() > 100)
            push_flag = 0;
        // Collect page links from <a href="..."> tags
        if (push_flag){
            while ((a_ptr = strstr(a_ptr, "<a href=\"")) != nullptr){
                a_ptr += strlen("<a href=\"");
                char temp[A_TAG_SIZE] = "";
                sscanf_s(a_ptr, "%[^\"]", temp, (unsigned)sizeof(temp));
                a_ptr++;
                page_urls.push(temp);   // relative links get skipped later by ParseUrl
            }
        }
        // Collect image links from <img src="..."> tags
        a_ptr = respond;
        while ((a_ptr = strstr(a_ptr, "<img src=\"")) != nullptr){
            a_ptr += strlen("<img src=\"");
            char temp[IMAGE_TAG_SIZE] = "";
            sscanf_s(a_ptr, "%[^\"]", temp, (unsigned)sizeof(temp));
            a_ptr++;
            images_url.push_back(temp);
        }
        free(respond);
    }
    return true;
}
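One easy improvement (my addition, not in the original code): the same page can end up in the queue many times, so a set of already-seen URLs would avoid refetching. A sketch with std::set:
#include <queue>
#include <set>
#include <string>
// Push a URL only the first time it is seen.
void PushUnique(std::queue<std::string>& q, std::set<std::string>& visited, const std::string& u){
    if (visited.insert(u).second)   // insert() reports whether u was new
        q.push(u);
}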
bool UrlDownloadToFile(std::list<std::string>& images_url){
    // Create the output folder via the Windows shell
    std::string path = "md d:\\CrawlerImages";
    system(path.c_str());
    // Download each image
    while (!images_url.empty()){
        std::ofstream out;
        const char* p_bak = nullptr;
        char* respond = nullptr;
        int bytes_read = 0;
        if (!GetHttpRespond(images_url.front(), respond, bytes_read)){
            std::cout << "failed to fetch image! url: " << images_url.front() << std::endl;
            images_url.pop_front();
            continue;
        }
        // The image bytes start right after the blank line that ends the HTTP header
        p_bak = respond;
        const char* p_find = strstr(p_bak, "\r\n\r\n");
        if (!p_find){
            std::cout << "can't find the end of the HTTP header\n";
            images_url.pop_front();
            free(respond);
            continue;
        }
        p_find += strlen("\r\n\r\n");
        std::string filename = GetFileName(images_url.front());
        out.open("D:\\CrawlerImages\\" + filename, std::ios::binary);
        out.write(p_find, bytes_read - (p_find - p_bak));
        out.close();
        images_url.pop_front();
        free(respond);
    }
    return true;
}
std::string GetFileName(const std::string url){
    std::string filename;
    char host_temp[TEMP_SIZE], filename_temp[TEMP_SIZE];
    const char* p_url = nullptr;
    memset(host_temp, 0, sizeof(host_temp));
    memset(filename_temp, 0, sizeof(filename_temp));
    // Strip the "http://" scheme
    p_url = strstr(url.c_str(), "http://");
    if (!p_url) return "";   // not an absolute http URL
    else p_url += strlen("http://");
    // Split off the host name; the resource path becomes the file name
    sscanf_s(p_url, "%[^/]%s", host_temp, (unsigned)sizeof(host_temp), filename_temp, (unsigned)sizeof(filename_temp));
    filename = filename_temp;
    for (size_t i = 0; i < filename.size(); i++)
        if (filename[i] == '/')
            filename[i] = '^';   // flatten the path into a valid file name
    return filename;
}
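For example, every '/' in the resource path becomes '^':
// GetFileName("http://www.baidu.com/img/bd_logo.png") -> "^img^bd_logo.png"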
main.cpp:
#include<iostream>
#include"functions.h"
using namespace std;
int main(){
string url;
std::cout << "plase input the url of web\n(like 'http://www.baidu.com/', the last '/' is important!):\n";
while (std::cin >> url){
std::list<string> images_url;
GetPagesAndImages(url, images_url);
UrlDownloadToFile(images_url);
std::cout << "\n\n\n\n\nEnd...\nPress any keys to continue...\n";
system("pause");
system("cls");
std::cout << "plase input the url of web\n(like 'http://www.baidu.com/', the last '/' is important!):\n";
}
};
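By the way, the code assumes MSVC (sscanf_s and #pragma comment(lib, ...) are Microsoft-specific), so a build from a Developer Command Prompt might look like:
cl /EHsc main.cpp functions.cpp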
Notes
For convenience, the images are not sorted into folders or anything, so the result is messy. They are stored in CrawlerImages on the D: drive, which is created automatically if it does not exist. So remember to clean it up after you are done.
Also, a question for the experts: why do some of the extracted links come out like this?
For example:
Original page link: http://www.meinanzi.com/
Link stored in the queue: http://www.mei\r\nnanzi.c\r\nom/
When the following check in ParseUrl is removed:
if (strstr(url.c_str(), "\r") || strstr(url.c_str(), "\n"))
return false;
the program crashes.
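A plausible explanation (my assumption, not verified against that site): the request is HTTP/1.1, so the server is allowed to answer with "Transfer-Encoding: chunked", in which case hex chunk-size lines framed by \r\n are spliced into the body and can land in the middle of a URL. A minimal sketch of stripping them (DecodeChunked is a hypothetical helper, not part of the code above; the caller would first check the response header for "Transfer-Encoding: chunked"):
#include <cstdlib>
#include <cstring>
#include <string>
// 'body' must point just past the "\r\n\r\n" that ends the HTTP header.
std::string DecodeChunked(const char* body){
    std::string out;
    const char* p = body;
    while (true){
        char* end = nullptr;
        long size = strtol(p, &end, 16);  // each chunk starts with a hex size
        if (size <= 0 || !end) break;     // "0\r\n\r\n" marks the end of the body
        p = strstr(end, "\r\n");          // skip the rest of the size line
        if (!p) break;
        p += 2;                           // chunk data starts here
        out.append(p, size);
        p += size + 2;                    // skip the data and its trailing "\r\n"
    }
    return out;
}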