前期准备
以下代码来源于vs2010sp1 路径:\VC\include\regex
文档参阅的是 ISO/IEC 14882 Information Technology - Programming Languages-C++ C++2011-9-1国际标准文档。
正则表达式结果类
// TEMPLATE CLASS match_results
// Read-only, container-like holder for the sub_match objects produced by a
// match. Besides the vector of submatches it keeps a prefix element, a
// suffix element, a "null" element that operator[] hands out for indices
// past size(), and _Org, the iterator that position() measures from.
// NOTE(review): only move construction/assignment are declared here; the
// copy operations are left to the compiler -- confirm that is what every
// toolchain targeted by this header is expected to provide.
template<class _BidIt,
    class _Alloc>
class match_results
{ // class to hold contents of all capture groups
public:
    typedef match_results<_BidIt, _Alloc> _MyT;
    typedef sub_match<_BidIt> _Elem;
    typedef _STD vector<_Elem, _Alloc> _MyCont;
    typedef _Elem value_type;
    typedef typename _Alloc::const_reference const_reference;
    typedef const_reference reference; // elements are never writable
    typedef typename _MyCont::const_iterator const_iterator;
    typedef const_iterator iterator; // iteration is read-only as well
    typedef _REGEX_DIFFT(_BidIt) difference_type;
    typedef typename _Alloc::size_type size_type;
    typedef _Alloc allocator_type;
    typedef _REGEX_VALT(_BidIt) char_type;
    typedef _STD basic_string<char_type> string_type;

    match_results()
    { // construct empty match_results
    }

    explicit match_results(const _Alloc& _Al)
        : _Matches(_Al)
    { // construct empty match_results with allocator
    }

    match_results(_MyT&& _Right)
    { // construct by moving _Right
        _Assign_rv(_STD move(_Right));
    }

    _MyT& operator=(_MyT&& _Right)
    { // assign by moving _Right
        _Assign_rv(_STD move(_Right));
        return (*this);
    }

    void _Assign_rv(_MyT&& _Right)
    { // assign by moving _Right
        if (this != &_Right)
        { // clear this and steal from _Right
            // _Org has to travel with the submatches: position() measures
            // from it, so leaving it behind would make position() compute
            // a distance from a stale or default-initialized iterator.
            _Org = _Right._Org;
            _Matches = _STD move(_Right._Matches);
            _Prefix = _Right._Prefix;
            _Suffix = _Right._Suffix;
            _Null_elem = _Right._Null_elem;
        }
    }

    size_type size() const
    { // return number of capture groups
        return (_Matches.size());
    }

    size_type max_size() const
    { // return maximum possible number of capture groups
        return (_Matches.max_size());
    }

    bool empty() const
    { // test if object is empty
        return (_Matches.empty());
    }

    difference_type length(size_type _Sub = 0) const
    { // return length of capture group _Sub
        return ((*this)[_Sub].length());
    }

    difference_type position(size_type _Sub = 0) const
    { // return offset of submatch _Sub, counted from _Org
        return (_STD distance(_Org, (*this)[_Sub].first));
    }

    string_type str(size_type _Sub = 0) const
    { // return contents of submatch _Sub
        return (string_type((*this)[_Sub]));
    }

    const_reference operator[](size_type _Sub) const
    { // return submatch _Sub, or the null element when _Sub is out of range
        return (_Matches.size() <= _Sub ? _Null_elem : _Matches[_Sub]);
    }

    const_reference prefix() const
    { // return text preceding match
        return (_Prefix);
    }

    const_reference suffix() const
    { // return text following match
        return (_Suffix);
    }

    const_iterator begin() const
    { // return iterator for beginning of sequence of submatches
        return (_Matches.begin());
    }

    const_iterator end() const
    { // return iterator for end of sequence of submatches
        return (_Matches.end());
    }

    template<class _OutIt>
        _OutIt _Format(_OutIt _Out,
            const string_type& _Fmt,
            regex_constants::match_flag_type _Flags) const
    { // format text, replacing matches; format_sed selects the sed rules
        return (_Flags & regex_constants::format_sed
            ? _Format_sed(*this, _Out, _Fmt.begin(), _Fmt.end(), _Flags)
            : _Format_default(*this, _Out, _Fmt.begin(), _Fmt.end(), _Flags));
    }

#if _ITERATOR_DEBUG_LEVEL == 0
    template<class _OutIt>
        _OutIt format(_OutIt _Out,
            const string_type& _Fmt,
            regex_constants::match_flag_type _Flags =
                regex_constants::format_default) const
    { // format text, replacing matches
        return (_Rechecked(_Out,
            _Format(_Unchecked(_Out), _Fmt, _Flags)));
    }

#else /* _ITERATOR_DEBUG_LEVEL == 0 */
    template<class _OutIt>
        _OutIt _Format(_OutIt _Out,
            const string_type& _Fmt,
            regex_constants::match_flag_type _Flags, _STD tr1::true_type) const
    { // format text, replacing matches, checked dest
        return (_Format(_Out, _Fmt, _Flags));
    }

    template<class _OutIt>
    _SCL_INSECURE_DEPRECATE
        _OutIt _Format(_OutIt _Out,
            const string_type& _Fmt,
            regex_constants::match_flag_type _Flags, _STD tr1::false_type) const
    { // format text, replacing matches, unchecked dest
        return (_Format(_Out, _Fmt, _Flags));
    }

    template<class _OutIt>
        _OutIt format(_OutIt _Out,
            const string_type& _Fmt,
            regex_constants::match_flag_type _Flags =
                regex_constants::format_default) const
    { // format text, replacing matches; dispatches on _Is_checked(_Out)
        _DEBUG_POINTER(_Out);
        return (_Format(_Out, _Fmt, _Flags, _Is_checked(_Out)));
    }

    template<class _OutTy,
        size_t _OutSize>
        _OutTy * format(_OutTy (&_Out)[_OutSize],
            const string_type& _Fmt,
            regex_constants::match_flag_type _Flags =
                regex_constants::format_default) const
    { // format text into a fixed-size array, replacing matches
        return (_Unchecked(format(
            _Array_iterator<_OutTy, _OutSize>(_Out), _Fmt, _Flags)));
    }
#endif /* _ITERATOR_DEBUG_LEVEL == 0 */

    string_type format(const string_type& _Fmt,
        regex_constants::match_flag_type _Flags =
            regex_constants::format_default) const
    { // format text, replacing matches, and return it as a new string
        string_type _Str;
        format(_STD back_inserter(_Str), _Fmt, _Flags);
        return (_Str);
    }

    allocator_type get_allocator() const
    { // return allocator object for submatches
        return (_Matches.get_allocator());
    }

    void swap(match_results& _Right)
    { // exchange contents with _Right
        _Swap_adl(_Org, _Right._Org);
        _Matches.swap(_Right._Matches);
        _STD swap(_Prefix, _Right._Prefix);
        _STD swap(_Suffix, _Right._Suffix);
        // The null element holds iterators too, so it is exchanged along
        // with everything else; otherwise an out-of-range operator[] would
        // keep returning iterators that belong to the other object.
        _STD swap(_Null_elem, _Right._Null_elem);
    }

    void _Resize(unsigned _Nx)
    { // allocate space for _Nx submatches
        _Matches.resize(_Nx);
    }

    _Elem& _Pfx()
    { // return modifiable pair of iterators to prefix
        return (_Prefix);
    }

    _Elem& _Sfx()
    { // return modifiable pair of iterators to suffix
        return (_Suffix);
    }

    _Elem& _Null()
    { // return modifiable pair of iterators for null element
        return (_Null_elem);
    }

    _Elem& _At(unsigned _Sub)
    { // unchecked access to element at _Sub
        return (_Matches[_Sub]);
    }

    _Elem _At(unsigned _Sub) const
    { // unchecked access to element at _Sub (returns a copy)
        return (_Matches[_Sub]);
    }

    _BidIt _Org; // origin iterator; position() reports offsets from here

private:
    _MyCont _Matches; // the submatches, indexed by operator[]
    _Elem _Prefix; // returned by prefix()
    _Elem _Suffix; // returned by suffix()
    _Elem _Null_elem; // returned by operator[] for out-of-range indices
};
这个类是正则匹配的最终结果存放类,一般情况下,我们使用regex_search 和 regex_match 两个算法来算出我们的最终结果。
如下两个图,来自于c++标准,第一个是regex_match操作,其运行后的结果放置示意。
下面的图显示的是regex的regex_search算法的结果示意图
从这两个表的设置方式可以看出,我们可以很方便地取出结果来处理。
然而我们喜欢深层次地挖掘,那么问题来了:为什么会是如上这种设置方式呢?
我们看看结果类就明白了,就是上面的代码,我们现在进行详细的分析
我们可以看到类中的成员变量,其中_Matches存放着核心的匹配结果。类中重载了operator[],使得m[0]返回的是_Matches中下标为0的元素(而不是_Matches变量本身)。
const_reference operator[](size_type _Sub) const
{ // look up submatch _Sub; indices past the end yield the null element
    if (_Sub < _Matches.size())
        return (_Matches[_Sub]);
    return (_Null_elem);
}
函数先判断用户给出的索引是否越界:如果越界,则返回空元素_Null_elem;如果没有越界,则返回该索引对应的子匹配结果。
我们继续深挖,看_Matches到底是神马高端货
// The data member that operator[] indexes into.
_MyCont _Matches;
// _MyCont is a vector of _Elem that uses the allocator type _Alloc.
typedef _STD vector<_Elem, _Alloc> _MyCont;
// _Elem is the sub_match specialization for the iterator type _BidIt.
typedef sub_match<_BidIt> _Elem;
哦,原来_Matches是一个stl中的vector,这个容器中存放的元素类型是sub_match。当我们使用m[0]时,返回的是vector中的第一项,而这个项的类型就是sub_match。
终究没有挖到底,我们继续看sub_match,即每个子结果类的实现。
// TEMPLATE CLASS sub_match
// One capture group: the inherited pair holds the iterators first/second
// delimiting the text, and `matched` records whether they are meaningful.
template<class _BidIt>
class sub_match
    : public _STD pair<_BidIt, _BidIt>
{ // class to hold contents of a capture group
public:
    typedef _REGEX_VALT(_BidIt) value_type;
    typedef _REGEX_DIFFT(_BidIt) difference_type;
    typedef _BidIt iterator;

    sub_match()
        : matched(false)
    { // a fresh object reports "not matched"
    }

    bool matched;

    difference_type length() const
    { // number of elements in [first, second), or 0 when not matched
        if (!matched)
            return (0);
        return (_STD distance(this->first, this->second));
    }

    operator _STD basic_string<value_type>() const
    { // implicit conversion: same result as str()
        return (str());
    }

    _STD basic_string<value_type> str() const
    { // copy the matched text into a string; empty string when not matched
        typedef _STD basic_string<value_type> _Str_t;
        if (matched)
            return (_Str_t(this->first, this->second));
        return (_Str_t());
    }

    int compare(const sub_match& _Right) const
    { // compare the text of *this with the text of _Right
        const _STD basic_string<value_type> _Mine(str());
        return (_Mine.compare(_Right.str()));
    }

    int compare(const _STD basic_string<value_type>& _Right) const
    { // compare the text of *this with the string _Right
        const _STD basic_string<value_type> _Mine(str());
        return (_Mine.compare(_Right));
    }

    int compare(_In_z_ const value_type *_Ptr) const
    { // compare the text of *this with the array pointed to by _Ptr
        const _STD basic_string<value_type> _Mine(str());
        return (_Mine.compare(_Ptr));
    }
};
看完sub_match我们就恍然大悟:哦,原来它是一个pair。貌似高大上,其实还是使用了stl的思想:用pair存放单个匹配串的首尾迭代器;sub_match继承pair,在存放单个匹配串的同时,用matched变量记录是否匹配,并提供一些处理函数;再用vector存放所有的sub_match;match_results的成员变量_Matches就是这个vector,因此match_results代表了整个匹配结果。当然,match_results还要设计一些函数来处理这些数据,以便更好地访问结果,重载操作符[]就应运而生了。
下面是标准C++11中的对于match_results的要求
下面是正确匹配时的结果填写
// Excerpt from the middle of a matching routine. The first line only shows
// the type of the _Matches pointer used below (it is not a statement here).
match_results<_BidIt, _Alloc> *_Matches
// Nothing is written to *_Matches when the match attempt fails.
if (!_Match(_Rep))
return (false);
// A null _Matches pointer skips the result-filling step entirely.
if (_Matches)
{ // copy results to _Matches
_Matches->_Resize(_Ncap);
for (unsigned int _Idx = 0; _Idx < _Ncap; ++_Idx)
{ // copy submatch _Idx
// Bit _Idx of _Grp_valid selects between the two branches.
// NOTE(review): `1 << _Idx` is a shift of a signed int; the type of
// _Grp_valid and the upper bound of _Ncap are not visible here -- confirm
// the shift cannot reach or exceed the width of int.
if (_Res._Grp_valid & (1 << _Idx))
{ // copy successful match
_Matches->_At(_Idx).matched = true;
_Matches->_At(_Idx).first = _Res._Grps[_Idx]._Begin;
_Matches->_At(_Idx).second = _Res._Grps[_Idx]._End;
}
else
{ // copy failed match
// Both iterators of an unmatched group are set to _End.
_Matches->_At(_Idx).matched = false;
_Matches->_At(_Idx).first = _End;
_Matches->_At(_Idx).second = _End;
}
}
// _Org is the origin that match_results::position() measures from.
_Matches->_Org = _Begin;
// Prefix spans [_Begin, start of submatch 0); it must be filled after the
// loop because it reads _At(0).
_Matches->_Pfx().matched = true;
_Matches->_Pfx().first = _Begin;
_Matches->_Pfx().second = _Matches->_At(0).first;
// Suffix spans [end of submatch 0, _End).
_Matches->_Sfx().matched = true;
_Matches->_Sfx().first = _Matches->_At(0).second;
_Matches->_Sfx().second = _End;
// The null element gets both iterators set to _End; its matched flag is
// not touched here.
_Matches->_Null().first = _End;
_Matches->_Null().second = _End;
}
从上面的程序中,我们就更加明白最终结果类的使用方式了:如同C++11标准中描述的那样,程序按照匹配或不匹配存储了不同的值。用户在调用搜索或匹配算法后得到的就是这个结果形式,通过访问这个结果就能得到已匹配的字符串。
下面是测试结果类的数据显示
正则表达式规则类
下面讲述的是正则表达式的规则类,也就是说用于存放我要求的给定规则串。
tr1提供的接口是
using tr1::wregex;
我们可以使用wregex来声明我们自己的unicode编码的规则串。
其实啊,wregex只是一个typedef:typedef basic_regex<wchar_t> wregex; 这里我们重点分析basic_regex。
// CLASS _Regex_base
// Thin layer over _Container_base. When iterator debugging is enabled it
// allocates the container proxy on construction and frees it on destruction;
// otherwise it adds nothing.
class _Regex_base
    : public _Container_base
{ // base class for basic_regex to construct and destroy proxy
public:
#if _ITERATOR_DEBUG_LEVEL > 0
    _Regex_base()
    { // allocate a proxy and make it refer back to this object
        _STD _Container_proxy *const _Proxy = new _STD _Container_proxy;
        _Proxy->_Mycont = this;
        _Myproxy = _Proxy;
    }

    ~_Regex_base()
    { // call _Orphan_all, then free the proxy and clear the pointer
        _Orphan_all();
        delete _Myproxy;
        _Myproxy = 0;
    }
#endif /* _ITERATOR_DEBUG_LEVEL > 0 */
};
// TEMPLATE CLASS basic_regex
template<class _Elem,
class _RxTraits = regex_traits<_Elem> >
class basic_regex
: public _Regex_base
{ // regular expression
public:
// Shorthand for this specialization; used by the copy and move members.
typedef basic_regex<_Elem, _RxTraits> _MyT;
// Character type of the pattern.
typedef _Elem value_type;
// Locale type, taken from the traits class.
typedef typename _RxTraits::locale_type locale_type;
// Type of the syntax flags accepted by the constructors.
typedef regex_constants::syntax_option_type flag_type;
enum {_EEN_VIS =
_ENHANCED_REGEX_VISUALIZER}; // helper for expression evaluator
// Class-scope copies of the regex_constants syntax options, so each one can
// also be spelled as a member of basic_regex.
static const flag_type icase = regex_constants::icase;
static const flag_type nosubs = regex_constants::nosubs;
static const flag_type optimize = regex_constants::optimize;
static const flag_type collate = regex_constants::collate;
static const flag_type ECMAScript = regex_constants::ECMAScript;
static const flag_type basic = regex_constants::basic;
static const flag_type extended = regex_constants::extended;
static const flag_type awk = regex_constants::awk;
static const flag_type grep = regex_constants::grep;
static const flag_type egrep = regex_constants::egrep;
basic_regex()
    : _Rep(0)
    {   // no pattern supplied: _Rep simply starts out as 0
    }
explicit basic_regex(_In_z_ const _Elem *_Ptr,
    flag_type _Flags = regex_constants::ECMAScript)
    : _Rep(0)
    {   // build from a null-terminated array; length comes from the traits
    const _Elem *_Last = _Ptr + _RxTraits::length(_Ptr);
    _Reset(_Ptr, _Last, _Flags, random_access_iterator_tag());
    }
basic_regex(_In_count_(_Count) const _Elem *_Ptr, size_t _Count,
    flag_type _Flags = regex_constants::ECMAScript)
    : _Rep(0)
    {   // build from the _Count elements starting at _Ptr
    if (!_Ptr)
        _Xbad(regex_constants::error_parse);    // null source is reported as a parse error

    const _Elem *_Last = _Ptr + _Count;
    _Reset(_Ptr, _Last, _Flags, random_access_iterator_tag());
    }
template<class _STtraits,
class _STalloc>
explicit basic_regex(
const _STD basic_string<_Elem, _STtraits, _STalloc>& _Str,
flag_type _Flags = regex_constants::ECMAScript)
: _Rep(0)
{ // construct from string object
_Reset(_Str.begin(), _Str.end(), _Flags, random_access_iterator_tag());
}
template<class _InIt>
    basic_regex(_InIt _First, _InIt _Last,
        flag_type _Flags)
    : _Rep(0)
    {   // build from [_First, _Last) using the caller's syntax flags
    _DEBUG_RANGE(_First, _Last);

    // the iterator category tag picks the matching _Reset overload
    _Reset(_First, _Last, _Flags, _Iter_cat(_First));
    }
template<class _InIt>
basic_regex(_InIt _First, _InIt _Last)
: _Rep(0)
{ // construct from pair of iterators
_DEBUG_RANGE(_First, _Last);
_Reset(_First, _Last, regex_constants::ECMAScript,
_Iter_cat(_First));
}
basic_regex(const _MyT& _Right)
    : _Rep(0)
#if _ENHANCED_REGEX_VISUALIZER
    , _Visualization(_Right._Visualization)
#endif /* _ENHANCED_REGEX_VISUALIZER */
    {   // copy _Right by resetting to its representation
    _Reset(_Right._Rep);
    }
basic_regex(_MyT&& _Right)
    : _Rep(0)
    {   // start with _Rep at 0, then let _Assign_rv take over _Right
    _Assign_rv(_STD move(_Right));
    }
_MyT& operator=(_MyT&& _Right)
    {   // move assignment: all the work is delegated to _Assign_rv
    _Assign_rv(_STD move(_Right));
    return *this;
    }
void _Assign_rv(_MyT&& _Right)
{ // assign by moving _Right
if (this != &_Right)
{ // clear this and steal from _Right