可能是我还比较菜,感觉这个程序写起来好绕啊,调了好多次才调通,下面放完代码去跑步了。
main.cpp
#include <iostream.h>
#include <vector.h>
#include <string.h>
#include <cctype>
#include "find_urls.h"
using std::vector;
using std::string;
using std::cin;
using std::cout;
using std::endl;
using std::getline;
int main(){
string s;
while(getline(cin,s)){
vector<string> v = find_urls(s);
for(vector<string>::const_iterator i = v.begin(); i != v.end();++i)
cout<<*i<<endl;
}
return 0;
}
然后是find_urls()和其他几个函数体
#include <iostream.h>
#include <string.h>
#include <cctype>
#include <algorithm>
#include <vector.h>
#include "find_urls.h"
using std::string; using std::vector;
using std::find; using std::find_if;
using std::isalpha; using std::isalnum;
using std::search;
// Finds the start of the first URL in the range [b, e).
//
// A URL is recognised by the separator "://" that has at least one
// alphabetic character (the protocol name) immediately before it and at
// least one valid URL character immediately after it.
//
// Parameters:
//   b, e - iterators delimiting the text to search.
// Returns:
//   an iterator to the first letter of the protocol name, or e when the
//   range contains no URL.
string::const_iterator
url_begin(string::const_iterator b,string::const_iterator e){
    static const string sep = "://";
    typedef string::const_iterator iter;
    // i marks where the separator was found
    iter i = b;
    while((i = search(i,e,sep.begin(),sep.end())) != e){
        // the separator must be neither at the very start nor at the very end
        if(i != b && i + sep.size() != e){
            // walk backwards over the letters that form the protocol name
            iter begin = i;
            // <cctype> classifiers are undefined for negative values other than
            // EOF; a plain char holding a non-ASCII byte may be negative, so
            // convert to unsigned char before classifying it.
            while(begin != b && isalpha(static_cast<unsigned char>(begin[-1])))
                --begin;
            // need at least one letter before the separator and one URL
            // character after it; i[sep.size()] is *(i + sep.size())
            if(begin != i && !not_url_char(i[sep.size()]))
                return begin;
        }
        // this separator was not part of a URL; continue searching after it
        i += sep.size();
    }
    return e;
}
// Tells whether c is a character that can NOT appear in a URL.
//
// Letters, digits and the punctuation characters listed in url_ch are
// treated as URL characters; everything else is not.
//
// Returns:
//   true when c is not a URL character, false when it is.
bool not_url_char(char c){
    // characters, besides letters and digits, that may appear in a URL
    static const std::string url_ch = "~;/?:@=&$-_.+!*'(),";
    // <cctype> classifiers are undefined for negative values other than EOF;
    // a plain char holding a non-ASCII byte may be negative, so convert to
    // unsigned char before classifying it.
    const bool alnum = std::isalnum(static_cast<unsigned char>(c)) != 0;
    // std::find returns url_ch.end() when c is not one of the listed characters
    const bool listed = std::find(url_ch.begin(),url_ch.end(),c) != url_ch.end();
    return !(alnum || listed);
}
// Finds the end of a URL that starts at b.
//
// Scans [b, e) and stops at the first character for which not_url_char
// is true.
//
// Returns:
//   an iterator to that first non-URL character, or e when every
//   character in the range is a URL character.
string::const_iterator
url_end(string::const_iterator b,string::const_iterator e){
    string::const_iterator pos = b;
    // advance while the current character may still belong to the URL
    while(pos != e && !not_url_char(*pos))
        ++pos;
    return pos;
}
// Collects every URL contained in s, in order of appearance.
//
// Repeatedly locates the start of the next URL with url_begin and its end
// with url_end, then resumes the search right after that URL.
//
// Returns:
//   the URLs found; empty when s contains none.
vector<string> find_urls(const string& s){
    typedef string::const_iterator iter;
    vector<string> urls;
    const iter last = s.end();
    // cur is the start of the next URL, or last when none remains
    iter cur = url_begin(s.begin(),last);
    while(cur != last){
        const iter stop = url_end(cur,last);
        urls.push_back(string(cur,stop));
        // resume the search right after the URL just stored
        cur = url_begin(stop,last);
    }
    return urls;
}
最后就是头文件
#ifndef GUARD_find_urls_h
#define GUARD_find_urls_h
#include <string.h>
#include <vector.h>
using std::string; using std::vector;
// Returns an iterator to the first letter of the protocol name of the first
// URL in the given range (letters followed by "://" and a URL character),
// or the end of the range when no URL is present.
string::const_iterator
url_begin(string::const_iterator ,string::const_iterator);
// Returns true when the character can NOT be part of a URL, i.e. it is
// neither alphanumeric nor one of the accepted punctuation characters.
bool not_url_char(char);
// Returns an iterator to the first character in the given range that is
// not a URL character, or the end of the range when there is none.
string::const_iterator
url_end(string::const_iterator ,string::const_iterator);
// Returns every URL found in the string, in order of appearance.
vector<string> find_urls(const string&);
#endif
关于头文件,我看书上说头文件里最好不要用using,但我还是偷懒用了,编译居然通过了...em...好吧,暂时不知道会有什么副作用。
下面跑个例子来看看:
发现这个算法其实有个bug:如果http前面紧挨着字母或者符合url_char的字符,那么这些字符也会被程序查找进url中,这个我下次改改看。