boost c++ lib on linux(5) - 小练习——获取网页内容并提取有用信息

2013年来啦,新年伊始,写一篇博客,把之前学习boost的一个小练习晒晒。上班路上还自行车一直是我比较困扰和蛋疼的事,每次骑到公司,附近的还车点都满了,又不知道附近哪里还有空位,只能盲目的寻找,不知屌丝的狼狈你可懂。于是开始想办法怎么弄到自行车还车点的信息,好在杭州官方有在线查询的网站,就想分析一下,找到请求信息的url。PS:网站设计的相对简陋,被我找到请求的url了大笑

喏,Look!(url:http://www.hzbus.cn/map/cTJs.js)

可以得知要获取自行车还车点的信息,可以向url:http://www.hzbus.cn/Page/BicyleSquare.aspx发送http get请求,查询关键词参数则在url后面缀上?nm=xxx即可。于是就想利用这次机会练习一下之前boost库的掌握情况。使用boost发请求获取html返回数据,使用regex解析获取有用的信息。

上代码:

代码目录结构图如下:


http目录的代码负责请求http的能力的封装,parser目录下的代码负责解析html的能力的封装,test目录是做一些测试工作的目录(如单测等,以及一些小功能的测试代码),app目录是实际的处理程序,组装各个部分形成可用的程序。

http请求:

类声明如下:

主要实现部分是httpGet方法,借鉴了官方的sample的代码,实现代码如下:
bool CHttpRequest::httpGet(string& result, const string& host, const string& url) 
try
{
	using boost::asio::ip::tcp;
    boost::asio::io_service io_service;

    // Get a list of endpoints corresponding to the server name.
    tcp::resolver resolver(io_service);
    tcp::resolver::query query(host, "http");
    tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);

    // Try each endpoint until we successfully establish a connection.
    tcp::socket socket(io_service);
    boost::asio::connect(socket, endpoint_iterator);

    // Form the request. We specify the "Connection: close" header so that the
    // server will close the socket after transmitting the response. This will
    // allow us to treat all data up until the EOF as the content.
    boost::asio::streambuf request;
    std::ostream request_stream(&request);
    request_stream << "GET " << url << " HTTP/1.0\r\n";
    request_stream << "Host: " << host << "\r\n";
    request_stream << "Accept: */*\r\n";
    request_stream << "Connection: close\r\n\r\n";

    // Send the request.
    boost::asio::write(socket, request);

    // Read the response status line. The response streambuf will automatically
    // grow to accommodate the entire line. The growth may be limited by passing
    // a maximum size to the streambuf constructor.
    boost::asio::streambuf response;
    boost::asio::read_until(socket, response, "\r\n");

    // Check that response is OK.
    std::istream response_stream(&response);
    std::string http_version;
    response_stream >> http_version;
    unsigned int status_code;
    response_stream >> status_code;
    std::string status_message;
    std::getline(response_stream, status_message);
    if (!response_stream || http_version.substr(0, 5) != "HTTP/")
    {
      LOG(ERROR) << "Invalid response\n";
      return 1;
    }
    if (status_code != 200)
    {
      LOG(ERROR) << "Response returned with status code " << status_code << "\n";
      return 1;
    }

    // Read the response headers, which are terminated by a blank line.
    boost::asio::read_until(socket, response, "\r\n\r\n");

    // Process the response headers.
    std::string header;
    while (std::getline(response_stream, header) && header != "\r")
      LOG(INFO) << header << "\n";

    // Write whatever content we already have to output.
    if (response.size() > 0)
    {
		//std::cout << &response;
		unsigned int size = response.size();
		char* buf = new char[size+1];
		response.sgetn(buf, size+1);
		buf[size]='\0';
		this->response.append(buf);
		//std::cout << buf << std::endl;
	}

    // Read until EOF, writing data to output as we go.
    boost::system::error_code error;
    while (boost::asio::read(socket, response, boost::asio::transfer_at_least(1), error))
	{
		//std::cout << "streambuf ===================\n " << &response << std::endl;
		std::istream is(&response);
		unsigned int size = response.size();
		char* buf = new char[size+1];
		response.sgetn(buf, size+1);
		buf[size]='\0';
		this->response.append(buf);
	}
    if (error != boost::asio::error::eof)
      throw boost::system::system_error(error);
	return true;
}
catch(std::exception& e)
{
	LOG(ERROR) << e.what();
	return false;
}

bike info parser解析返回的html:
/*************************************************************************
	> File Name: HtmlParser.h
	> Author: Liu Xin
	> Mail: liu_x_0625@126.com 
	> Created Time: 2012年12月09日 星期日 14时36分15秒
 ************************************************************************/
#ifndef _BIKE_INFO_HTML_PARSER_H_
#define _BIKE_INFO_HTML_PARSER_H_

#include<iostream>
#include<map>
#include<vector>
#include<boost/regex.hpp>
using namespace std;

namespace parser
{
// Plain record describing one bike station as extracted from the result page.
// Every value is kept as the raw text taken from the HTML; nothing is converted
// to a numeric type here.
struct BikeInfo{
	string statId;           // station id text, assigned by parseBikeInfo
	string statName;         // station name, assigned by parseBikeInfo
	string avaliableNum;     // digits following the "可租" label in the page
	string nonavaliableNum;  // digits following the "可还" label in the page
	string serviceTime;      // service hours in "HH:MM-HH:MM" form
	string status;           // not assigned by the parser code visible in this file
	string location;         // station address text, assigned by parseBikeInfo
	string callNumber;       // not assigned by the parser code visible in this file
	string otherServices;    // not assigned by the parser code visible in this file
	string info;             // not assigned by the parser code visible in this file
	string lat;              // presumably latitude; not assigned by the visible parser code
	string lng;              // presumably longitude; not assigned by the visible parser code

	// Free-form key/value storage; no visible code reads or writes it.
	std::map<string, string> context;
};

// Extracts BikeInfo records from the HTML of the bike-station result page
// using boost::regex.
class CBikeInfoHtmlParser{
private:
	string html;               // page text handed to parseBikeInfo() by parse()
	// Stations found so far. The elements are allocated with `new` in
	// parseBikeInfo(). NOTE(review): the destructor is defined elsewhere;
	// confirm that it deletes them.
	vector<BikeInfo*> stats;

public:
	// NOTE(review): takes a non-const reference, so a temporary cannot be
	// passed; presumably the text is copied into `html` — confirm in the .cpp.
	CBikeInfoHtmlParser(string& str);
	~CBikeInfoHtmlParser();

	// Runs the extraction over the stored page text.
	void parse();
	// Returns a pointer to the internal station list. The pointer refers to a
	// member of this object, so it must not be used after the parser is
	// destroyed.
	vector<BikeInfo*>* getBikeInfo(){
		return &(this->stats);
	}

private:
	// Fills `stats` from the page fragment `ul`.
	bool parseBikeInfo(const string& ul);
	// Appends the matches of `pattern` found in `str` to `matches`.
	bool searchPattern(const char *str, vector<string>& matches, const boost::regex& pattern);
};

}

#endif

主要实现部分使用boost::regex提取信息:

bool CBikeInfoHtmlParser::searchPattern(const char *str, vector<string>& matches, const boost::regex &pattern)
{
	string s(str);
	boost::match_results<std::string::const_iterator> what;
	std::string::const_iterator start, end;
	boost::match_flag_type flags = boost::match_default;

	start = s.begin();
	end = s.end();
	while(boost::regex_search(start, end, what, pattern, flags))
	{
		for(int i=0; i<what.size(); i++)
		{
			string ss(what[i].first, what[i].second);
			//std::cout << "first: " << ss << std::endl;
			matches.push_back(ss);
		}
		start = what[0].second;
		flags |= boost::match_prev_avail;
		flags |= boost::match_not_bob;
	}
}

void CBikeInfoHtmlParser::parse()
{
	parseBikeInfo(this->html);
}

bool CBikeInfoHtmlParser::parseBikeInfo(const string& ul)
{
	boost::regex patServiceTime("</strong>\\d+:\\d+-\\d+:\\d+");
	boost::regex patStatId("№\\d+</span>");
	boost::regex patStatName("</span>[ ]*[\x80-\xFF]+");
	boost::regex patAvaliableNum("可租</span>[ ]*\\d+");
	boost::regex patNonAvaliableNum("可还</span>[ ]*\\d+");
	boost::regex patLocation("站点地址:</strong>[\x80-\xFF\\d]+");

	vector<string> matches;

	matches.clear();
	searchPattern(ul.c_str(), matches, patServiceTime);

	for(int i =0; i<matches.size(); i++)
	{
		if (stats.size() != matches.size())
		{
			BikeInfo *info = new BikeInfo;
			stats.push_back(info);
		}

		string serviceTime = matches[i].substr(9);
		stats[i]->serviceTime = serviceTime;
		//std::cout << stats[i]->serviceTime << std::endl;
	}
	//std::cout << std::endl;

	matches.clear();
	searchPattern(ul.c_str(), matches, patStatId);
	for(int i =0; i<matches.size(); i++)
	{
		size_t pos = matches[i].find("</span>");
		stats[i]->statId = matches[i].substr(0, pos);
		//std::cout << stats[i]->statId << std::endl;
	}
	//std::cout << std::endl;

	matches.clear();
	searchPattern(ul.c_str(), matches, patStatName);
	for(int i =0; i<matches.size(); i++)
	{
		size_t pos = matches[i].find_last_of(";");
		if (pos == string::npos)
			pos = matches[i].find_last_of(">");
		stats[i]->statName = matches[i].substr(pos+1);
		//std::cout << stats[i]->statName << std::endl;
	}
	//std::cout << std::endl;

	matches.clear();
	searchPattern(ul.c_str(), matches, patAvaliableNum);
	for(int i =0; i<matches.size(); i++)
	{
		size_t pos = matches[i].find_last_of(";");
		if (pos == string::npos)
			pos = matches[i].find_last_of(">");
		stats[i]->avaliableNum = matches[i].substr(pos+1);
		//std::cout << stats[i]->avaliableNum << std::endl;
	}
	//std::cout << std::endl;


	matches.clear();
	searchPattern(ul.c_str(), matches, patNonAvaliableNum);
	for(int i =0; i<matches.size(); i++)
	{
		size_t pos = matches[i].find_last_of(";");
		if (pos == string::npos)
			pos = matches[i].find_last_of(">");
		stats[i]->nonavaliableNum = matches[i].substr(pos+1);
		//std::cout << stats[i]->nonavaliableNum << std::endl;
	}
	//std::cout << std::endl;

	
	matches.clear();
	searchPattern(ul.c_str(), matches, patLocation);
	for(int i =0; i<matches.size(); i++)
	{
		size_t pos = matches[i].find_last_of(">");
		stats[i]->location = matches[i].substr(pos+1);
		//std::cout << stats[i]->location << std::endl;
	}
	//std::cout << std::endl;
}

GeoXmlParser是根据位置描述文本请求百度地图的geocoder接口获取位置坐标信息的:
/*************************************************************************
	> File Name: GeoXmlParser.h
	> Author: Liu Xin
	> Mail: liu_x_0625@126.com 
	> Created Time: 2013年01月02日 星期三 22时17分50秒
 ************************************************************************/
#ifndef _GEO_XML_PARSER_H_
#define _GEO_XML_PARSER_H_

#include<iostream>
#include<vector>
#include<boost/regex.hpp>

using namespace std;

namespace parser
{
// Pulls the <lat> and <lng> elements out of a geocoder XML response using
// boost::regex.
class CGeoXmlParser{
private:
	string xml;  // response text to scan
	string lat;  // result of the last parse() for the <lat> element
	string lng;  // result of the last parse() for the <lng> element

public:
	// Stores `str` as the text that parse() will scan.
	CGeoXmlParser(const string& str);
	~CGeoXmlParser();

	// Scans the stored text and updates the values returned by the getters.
	void parse();

	// Value stored for <lat> by parse(). Const so it can be called through a
	// const reference.
	string getLat() const {
		return lat;
	}

	// Value stored for <lng> by parse(). Const so it can be called through a
	// const reference.
	string getLng() const {
		return lng;
	}
private:
	// Appends the matches of `pattern` found in `str` to `matches`.
	bool searchPattern(const char* str, vector<string>& matches, const boost::regex& pattern);
};

}

#endif

其主要实现和之前的BikeInfoParser是类似的,使用boost::regex实现:
/*************************************************************************
	> File Name: GeoXmlParser.cpp
	> Author: Liu Xin
	> Mail: liu_x_0625@126.com 
	> Created Time: 2013年01月02日 星期三 22时20分25秒
 ************************************************************************/

#include<iostream>
#include<vector>

#include"GeoXmlParser.h"
#include<boost/regex.hpp>

using namespace std;
using namespace parser;

// Keeps a copy of `str` as the XML text to be scanned by parse().
CGeoXmlParser::CGeoXmlParser(const string& str)
	: xml(str)
{
}

// Nothing to release: the body is empty and the members clean up themselves.
CGeoXmlParser::~CGeoXmlParser()
{
}

/**
 * Looks up the first <lat> and <lng> elements in the stored XML and records
 * them in `lat` and `lng`.
 *
 * The stored value is the whole matched text, including the surrounding tags.
 * When an element is not found the corresponding member is set to an empty
 * string instead of reading from an empty match list.
 */
void CGeoXmlParser::parse()
{
	boost::regex latPat("<lat>\\d+.\\d+</lat>");
	boost::regex lngPat("<lng>\\d+.\\d+</lng>");

	vector<string> matches;

	searchPattern(xml.c_str(), matches, latPat);
	lat = matches.empty() ? string() : matches[0];

	matches.clear();
	searchPattern(xml.c_str(), matches, lngPat);
	lng = matches.empty() ? string() : matches[0];
}

bool CGeoXmlParser::searchPattern(const char *str, vector<string>& matches, const boost::regex &pattern)
{
	string s(str);
	boost::match_results<std::string::const_iterator> what;
	std::string::const_iterator start, end;
	boost::match_flag_type flags = boost::match_default;

	start = s.begin();
	end = s.end();
	while(boost::regex_search(start, end, what, pattern, flags))
	{
		for(int i=0; i<what.size(); i++)
		{
			string ss(what[i].first, what[i].second);
			//std::cout << "first: " << ss << std::endl;
			matches.push_back(ss);
		}
		start = what[0].second;
		flags |= boost::match_prev_avail;
		flags |= boost::match_not_bob;
	}
}

接下来模块的组装就相对比较简单的了,分别调用http和parser的相关模块就是了:
/*************************************************************************
	> File Name: Test.cpp
	> Author: Liu Xin
	> Mail: liu_x_0625@126.com 
	> Created Time: 2012年12月08日 星期六 20时19分03秒
 ************************************************************************/

#include<iostream>
#include<http/HttpRequest.h>
#include<parser/BikeInfoHtmlParser.h>
#include<parser/GeoXmlParser.h>
#include<glog/logging.h>

using namespace std;
using namespace http;
using namespace parser;

int main()
{
	CHttpRequest request("http://www.hzbus.cn/Page/BicyleSquare.aspx?nm=滨兴");
	request.send();
	string html = request.getResponseData();
	
	CBikeInfoHtmlParser parser(html);
	vector<BikeInfo*> *stats=NULL;
	parser.parse();
	stats = parser.getBikeInfo();

	for (int i=0; i<stats->size(); i++)
	{
		string address = ((*stats)[i])->location;
		CHttpRequest geoRequest("http://api.map.baidu.com/geocoder?address=" + address + "&output=xml&city=杭州");
		geoRequest.send();
		string geoXml = geoRequest.getResponseData();
		CGeoXmlParser geoParser(geoXml);
		geoParser.parse();
		std::cout << "lat: " << geoParser.getLat() << "\tlng: " << geoParser.getLng() << std::endl;
	}
	return 0;
}

整个工程还算是有点点复杂的啦,对于我这样的小菜鸟来说,其中编译需要依赖的库有boost,glog,Test.cpp的编译Makefile.am如下:
# Builds the `bike` program from Test.cpp and the two parser sources.
bin_PROGRAMS=bike
# AM_CPPFLAGS is the supported replacement for the deprecated INCLUDES variable.
AM_CPPFLAGS=-I. -I/usr/local/include -I$(top_srcdir)/src

bike_SOURCES=Test.cpp \
			 $(top_srcdir)/src/parser/BikeInfoHtmlParser.cpp \
			 $(top_srcdir)/src/parser/GeoXmlParser.cpp
# libhttp.a is presumably produced by the build in src/http, so it is looked up
# in the build tree; this is the same path as before for in-tree builds.
bike_LDADD=$(top_builddir)/src/http/libhttp.a \
		   -lglog \
		   -lpthread \
		   -lboost_regex

就写到此吧,基本上都是代码,其实代码也不复杂,主要是体会linux c++开发的这个过程。期间不断的编译,gdb调试,运行测试等等,渐渐的就会熟悉并且习惯linux这种命令行式的开发方式,渐渐体会到了其对于开发者支持的强大能力。新的一年继续学习linux c++开发,希望学习中有更多更大的新的收获,坚持学习,不断进步!
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页