// 参考网址: http://blog.jobbole.com/82628/
// 文件名: a.cpp
/*
功能: 演示了利用boost中的asio和regex提取网页中符合指定格式的网址
环境: Fedora20
编译: g++ -o a a.cpp -Wall -Os -std=c++11 -lboost_system -lboost_regex
说明: 默认Fedora20中未安装boost库, 可通过在终端执行"yum install boost-devel"来安装
*/
#include <exception>
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
#include <boost/regex.hpp> // 正则表达式库
#include <boost/asio.hpp> // asio网络库
using namespace std;
using namespace boost;
// Collects every distinct substring of `is` that matches `pat`.
//
// The stream is consumed line by line until getline fails, and each line is
// scanned for all non-overlapping matches, so several hits on the same line
// are all reported. A match never spans a line break.
//
// is:  input stream to scan; read until extraction fails.
// pat: pattern describing the text to extract.
// Returns the full-match text of every hit, de-duplicated by the set.
set<string> get_strings(istream& is, const regex& pat)
{
    set<string> res;
    const sregex_iterator last; // default-constructed iterator marks end-of-matches
    for(string line; getline(is, line); )
    {
        // A single regex_search would stop at the first hit on the line;
        // the iterator resumes searching after each previous match.
        for(sregex_iterator it(line.cbegin(), line.cend(), pat); it != last; ++it)
            res.insert(it->str());
    }
    return res;
}
// Sends an HTTP/1.0 GET request for `file` on `server` over `s`, then consumes
// the status line and the response headers so that the next read from `s`
// starts at the response body.
//
// s:      bidirectional stream to the server; must already be in a good state.
// server: host name, used in both the request URI and the Host header.
// file:   resource path appended after "http://<server>/".
// Side effect: prints the response's HTTP version and status code to cout.
// Throws runtime_error if `s` is not usable on entry, if the status line
// cannot be parsed or does not start with "HTTP/", or if the status code is
// not 200.
void connect_to_file(iostream& s, const string& server, const string& file)
{
    if(!s)
        throw runtime_error{"can't connect\n"};

    // Request: absolute URI, Host header, and "Connection: close" to ask the
    // server to close the connection once the response has been sent.
    s << "GET " << "http://" << server << "/" << file << " HTTP/1.0\r\n";
    s << "Host: " << server << "\r\n";
    s << "Accept: */*\r\n";
    s << "Connection: close\r\n\r\n";

    // Status line: "<version> <code> <reason phrase>".
    string http_version;
    unsigned int status_code = 0;
    s >> http_version >> status_code;
    cout << http_version << ", " << status_code << endl;

    string status_message;
    getline(s, status_message); // discard the rest of the status line
    if(!s || http_version.substr(0, 5) != "HTTP/")
        throw runtime_error{"Invalid response"};
    if(status_code != 200)
        throw runtime_error{"Response returned with status code: " + std::to_string(status_code)};

    // Skip header lines up to and including the blank separator line. getline
    // strips '\n' only, so a CRLF blank line arrives as "\r" and a bare-LF
    // blank line arrives as "".
    for(string header; getline(s, header); )
    {
        if(header.empty() || header == "\r")
            break;
    }
}
int main()
{
try
{
string server = "www.stroustrup.com";
asio::ip::tcp::iostream s {server, "http"};
connect_to_file(s, server, "C++.html");
regex pat{R"((http://)?www([./#\+-]\w*)+)"}; // 网址正则表达式
for (auto x: get_strings(s, pat))
cout << x << endl;
}
catch(std::exception& e) // boost中也有exception, 所以这里要写全
{
cout << "Exception: " << e.what() << endl;
return -1;
}
return 0;
}