regex 源码分析

最新推荐文章于 2024-05-12 09:33:19 发布

青城梦远

最新推荐文章于 2024-05-12 09:33:19 发布

阅读量1.6k

点赞数

分类专栏： STL 文章标签： regex 源码分析 vs

本文链接：https://blog.csdn.net/feiniao251314/article/details/52198048

版权

本文深入分析了C++标准库中的regex模块，详细探讨了从正则表达式结果类match_results到正则表达式规则类basic_regex的实现细节，包括规则串的转换、编译过程及NFA自动机的构建。通过源码解析，揭示了正则表达式的内部工作原理，如编译时的元字符转换、状态机构建以及匹配接口Regex_search的核心功能。

摘要由CSDN通过智能技术生成

前期准备

以下代码来源于vs2010sp1 路径：\VC\include\regex
文档参阅的是 ISO/IEC 14882:2011（Information technology — Programming languages — C++，2011-09-01 发布）国际标准文档。

正则表达式结果类

 // TEMPLATE CLASS match_results
 // Stores the outcome of a regex match: a vector of sub_match objects
 // (_Matches), separate sub_match members for the text before and after
 // the match (_Prefix, _Suffix), and a spare element (_Null_elem) that
 // operator[] hands back for out-of-range indices.
 // _BidIt is the iterator type of the searched text; _Alloc is the
 // allocator used by the vector of sub-matches.
template<class _BidIt,
 class _Alloc>
 class match_results
 {  // class to hold contents of all capture groups
public:
 typedef match_results<_BidIt, _Alloc> _MyT;
 // each stored element is a sub_match over the same iterator type
 typedef sub_match<_BidIt> _Elem;
 typedef _STD vector<_Elem, _Alloc> _MyCont;
 typedef _Elem value_type;
 // reference and iterator are aliases of their const counterparts
 typedef typename _Alloc::const_reference const_reference;
 typedef const_reference reference;
 typedef typename _MyCont::const_iterator const_iterator;
 typedef const_iterator iterator;
 typedef _REGEX_DIFFT(_BidIt) difference_type;
 typedef typename _Alloc::size_type size_type;
 typedef _Alloc allocator_type;
 typedef _REGEX_VALT(_BidIt) char_type;
 typedef _STD basic_string<char_type> string_type;

 // Default constructor: every member is default-constructed; _Org gets
 // no explicit initializer.
 match_results()
  { // construct empty match_results
  }

 // Constructs an empty result whose sub-match vector uses allocator _Al.
 explicit match_results(const _Alloc& _Al)
  : _Matches(_Al)
  { // construct empty match_results with allocator
  }

 // Move constructor: delegates to _Assign_rv.
 match_results(_MyT&& _Right)
  { // construct by moving _Right
  _Assign_rv(_STD move(_Right));
  }

 // Move assignment: delegates to _Assign_rv and returns *this.
 _MyT& operator=(_MyT&& _Right)
  { // assign by moving _Right
  _Assign_rv(_STD move(_Right));
  return (*this);
  }

 // Shared implementation of move construction and move assignment.
 // The vector is moved; _Prefix, _Suffix and _Null_elem are copied.
 // NOTE(review): _Org is not transferred here although position() reads
 // it, so position() on the destination would use the destination's own
 // _Org -- confirm whether this is intended.
 void _Assign_rv(_MyT&& _Right)
  { // assign by moving _Right
  if (this != &_Right)
   {    // clear this and steal from _Right
   _Matches = _STD move(_Right._Matches);
   _Prefix = _Right._Prefix;
   _Suffix = _Right._Suffix;
   _Null_elem = _Right._Null_elem;
   }
  }

 // Number of elements currently stored in _Matches.
 size_type size() const
  { // return number of capture groups
  return (_Matches.size());
  }

 // Forwards to the underlying vector's max_size().
 size_type max_size() const
  { // return maximum possible number of capture groups
  return (_Matches.max_size());
  }

 // True when _Matches holds no elements.
 bool empty() const
  { // test if object is empty
  return (_Matches.empty());
  }

 // Length of sub-match _Sub (defaults to 0); goes through operator[],
 // so an out-of-range index yields the length of _Null_elem.
 difference_type length(size_type _Sub = 0) const
  { // return length of capture group _Sub
  return ((*this)[_Sub].length());
  }

 // Distance from _Org to the start of sub-match _Sub.
 difference_type position(size_type _Sub = 0) const
  { // return offset of submatch _Sub
  return (_STD distance(_Org, (*this)[_Sub].first));
  }

 // Builds a string_type from sub-match _Sub (presumably through the
 // sub_match conversion to basic_string -- verify).
 string_type str(size_type _Sub = 0) const
  { // return contents of submatch _Sub
  return (string_type((*this)[_Sub]));
  }

 // Bounds-checked element access: an index at or beyond size() returns
 // _Null_elem instead of indexing the vector.
 const_reference operator[](size_type _Sub) const
  { // return submatch _Sub
  return (_Matches.size() <= _Sub ? _Null_elem : _Matches[_Sub]);
  }

 const_reference prefix() const
  { // return text preceding match
  return (_Prefix);
  }

 const_reference suffix() const
  { // return text following match
  return (_Suffix);
  }

 const_iterator begin() const
  { // return iterator for beginning of sequence of submatches
  return (_Matches.begin());
  }

 const_iterator end() const
  { // return iterator for end of sequence of submatches
  return (_Matches.end());
  }

 // Core formatting helper: picks _Format_sed when the format_sed bit is
 // set in _Flags, otherwise _Format_default. Both receive *this, the
 // output iterator and the [begin, end) range of _Fmt.
 template<class _OutIt>
  _OutIt _Format(_OutIt _Out,
   const string_type& _Fmt,
   regex_constants::match_flag_type _Flags) const
  { // format text, replacing matches
  return (_Flags & regex_constants::format_sed
   ? _Format_sed(*this, _Out, _Fmt.begin(), _Fmt.end(), _Flags)
   : _Format_default(*this, _Out, _Fmt.begin(), _Fmt.end(), _Flags));
  }

 // With _ITERATOR_DEBUG_LEVEL == 0 there is a single iterator overload
 // of format(): it calls _Format on _Unchecked(_Out) and passes the
 // result back through _Rechecked.
 #if _ITERATOR_DEBUG_LEVEL == 0
 template<class _OutIt>
  _OutIt format(_OutIt _Out,
   const string_type& _Fmt,
   regex_constants::match_flag_type _Flags =
    regex_constants::format_default) const
  { // format text, replacing matches
  return (_Rechecked(_Out,
   _Format(_Unchecked(_Out), _Fmt, _Flags)));
  }
 #else /* _ITERATOR_DEBUG_LEVEL == 0 */
 // Otherwise format() dispatches on _Is_checked(_Out) to one of two
 // tagged _Format overloads. Both forward to the core _Format; the
 // false_type one is additionally marked _SCL_INSECURE_DEPRECATE.
 template<class _OutIt>
  _OutIt _Format(_OutIt _Out,
   const string_type& _Fmt,
   regex_constants::match_flag_type _Flags, _STD tr1::true_type) const
  { // format text, replacing matches, checked dest
  return (_Format(_Out, _Fmt, _Flags));
  }

 template<class _OutIt>
 _SCL_INSECURE_DEPRECATE
  _OutIt _Format(_OutIt _Out,
   const string_type& _Fmt,
   regex_constants::match_flag_type _Flags, _STD tr1::false_type) const
  { // format text, replacing matches, unchecked dest
  return (_Format(_Out, _Fmt, _Flags));
  }

 template<class _OutIt>
  _OutIt format(_OutIt _Out,
   const string_type& _Fmt,
   regex_constants::match_flag_type _Flags =
    regex_constants::format_default) const
  { // format text, replacing matches
  _DEBUG_POINTER(_Out);
  return (_Format(_Out, _Fmt, _Flags, _Is_checked(_Out)));
  }

 // Overload for a built-in array destination: wraps the array in an
 // _Array_iterator, calls the iterator overload, and returns
 // _Unchecked() of the result as a raw pointer.
 template<class _OutTy,
  size_t _OutSize>
  _OutTy * format(_OutTy (&_Out)[_OutSize],
   const string_type& _Fmt,
   regex_constants::match_flag_type _Flags =
    regex_constants::format_default) const
  { // format text, replacing matches
  return (_Unchecked(format(
   _Array_iterator<_OutTy, _OutSize>(_Out), _Fmt, _Flags)));
  }
 #endif /* _ITERATOR_DEBUG_LEVEL == 0 */

 // Convenience overload: formats into a local string through
 // back_inserter and returns that string by value.
 string_type format(const string_type& _Fmt,
  regex_constants::match_flag_type _Flags =
   regex_constants::format_default) const
  { // format text, replacing matches
  string_type _Str;
  format(_STD back_inserter(_Str), _Fmt, _Flags);
  return (_Str);
  }

 allocator_type get_allocator() const
  { // return allocator object for submatches
  return (_Matches.get_allocator());
  }

 // Exchanges _Org, _Matches, _Prefix and _Suffix with _Right.
 // NOTE(review): _Null_elem is not exchanged -- confirm whether that is
 // intended.
 void swap(match_results& _Right)
  { // exchange contents with _Right
  _Swap_adl(_Org, _Right._Org);
  _Matches.swap(_Right._Matches);
  _STD swap(_Prefix, _Right._Prefix);
  _STD swap(_Suffix, _Right._Suffix);
  }

 // The underscore-prefixed members below give mutable access to the
 // private data.
 void _Resize(unsigned _Nx)
  { // allocate space for _Nx submatches
  _Matches.resize(_Nx);
  }

 _Elem& _Pfx()
  { // return modifiable pair of iterators to prefix
  return (_Prefix);
  }

 _Elem& _Sfx()
  { // return modifiable pair of iterators to suffix
  return (_Suffix);
  }

 _Elem& _Null()
  { // return modifiable pair of iterators for null element
  return (_Null_elem);
  }

 // Unlike operator[], _At indexes the vector directly with no bounds
 // test against size().
 _Elem& _At(unsigned _Sub)
  { // unchecked access to element at _Sub
  return (_Matches[_Sub]);
  }

 // The const overload returns the element by value (a copy).
 _Elem _At(unsigned _Sub) const
  { // unchecked access to element at _Sub
  return (_Matches[_Sub]);
  }

 // Public iterator that position() measures offsets from.
 _BidIt _Org;

private:
 _MyCont _Matches;  // one sub_match per index
 _Elem _Prefix;     // returned by prefix()
 _Elem _Suffix;     // returned by suffix()
 _Elem _Null_elem;  // returned by operator[] for out-of-range indices
 };

这个类是正则匹配的最终结果存放类，一般情况下，我们使用regex_search 和 regex_match 两个算法来算出我们的最终结果。
如下两个图，来自于c++标准，第一个是regex_match操作，其运行后的结果放置示意。
这里写图片描述
下面的图显示的是regex的regex_search算法的结果示意图

从两个表的设置方式，我们可以便利的取出结果来处理。
然而这里的我们喜欢深层次挖掘，那么问题来了，为什么会是如上这个设置方式呢
我们看看结果类就明白了，就是上面的代码，我们现在进行详细的分析
我们可以看到类中的成员变量，其中_Matches存放着核心的结果数据；类中重载了operator[]，使得m[0]返回的就是_Matches中的第一个元素。

 // Bounds-checked element access: an index at or beyond _Matches.size()
 // returns _Null_elem instead of indexing the vector.
 const_reference operator[](size_type _Sub) const
  { // return submatch _Sub
  return (_Matches.size() <= _Sub ? _Null_elem : _Matches[_Sub]);
  }

函数先判断用户给出的索引是否越界：如果越界则返回空元素_Null_elem，否则返回该索引对应的子匹配结果。
我们继续深挖，看_Matches到底是神马高端货

 // The member that stores the sub-matches ...
 _MyCont _Matches;
 // ... is a vector of _Elem using the allocator _Alloc ...
 typedef _STD vector<_Elem, _Alloc> _MyCont;
 // ... and _Elem is a sub_match over the iterator type _BidIt.
 typedef sub_match<_BidIt> _Elem;

哦，原来_Matches是一个STL中的vector，这个容器中存放的元素是sub_match。当我们使用m[0]时，返回的是vector中的第一项，而这一项的类型是sub_match。
终究没有挖到底，我们继续看sub_match，即每个子结果类的实现。

 // TEMPLATE CLASS sub_match
 // A pair of iterators (inherited first/second) delimiting one captured
 // range, plus a flag recording whether the capture took part in the match.
template<class _BidIt>
 class sub_match
  : public _STD pair<_BidIt, _BidIt>
 {  // class to hold contents of a capture group
public:
 typedef _REGEX_VALT(_BidIt) value_type;
 typedef _REGEX_DIFFT(_BidIt) difference_type;
 typedef _BidIt iterator;
 // Default state is "not matched"; the pair base is default-constructed.
 sub_match()
  : matched(false)
  { // construct
  }
 // Public flag: true when [first, second) holds matched text.
 bool matched;
 // Distance from first to second when matched, otherwise 0.
 difference_type length() const
  { // return length of matched text
  return (matched ? _STD distance(this->first, this->second) : 0);
  }
 // Implicit conversion to basic_string; same result as str().
 operator _STD basic_string<value_type>() const
  { // convert matched text to string
  return (str());
  }
 // Copies [first, second) into a new string when matched; returns an
 // empty string otherwise.
 _STD basic_string<value_type> str() const
  { // convert matched text to string
  return (matched ?
   _STD basic_string<value_type>(this->first, this->second)
   : _STD basic_string<value_type>());
  }
 // The three compare overloads all build a temporary string via str()
 // and forward to basic_string::compare.
 int compare(const sub_match& _Right) const
  { // compare *this to _Right
  return (str().compare(_Right.str()));
  }
 int compare(const _STD basic_string<value_type>& _Right) const
  { // compare *this to _Right
  return (str().compare(_Right));
  }
 int compare(_In_z_ const value_type *_Ptr) const
  { // compare *this to array pointed to by _Ptr
  return (str().compare(_Ptr));
  }
 };

看完sub_match我们就恍然大悟：原来它继承自pair。看似高大上，其实还是沿用了STL的思想——用pair的first和second保存单个匹配串的起止迭代器，sub_match在继承pair的同时增加了表示是否匹配的成员变量matched，并提供了一些处理函数；match_results则用类型为vector的成员变量_Matches保存所有的sub_match，这样match_results就代表了整个匹配结果。当然match_results还要设计一些函数来处理这些数据，以便更好地访问结果，重载操作符[]就应运而生了。
下面是标准C++11中的对于match_results的要求
这里写图片描述
下面是正确匹配时的结果填写

match_results<_BidIt, _Alloc> *_Matches

  // Excerpt: after a successful _Match, copy the capture groups held in
  // _Res into the caller-supplied match_results (when one was passed).
  if (!_Match(_Rep))
   return (false);
  if (_Matches)
   {    // copy results to _Matches
   _Matches->_Resize(_Ncap);
   for (unsigned int _Idx = 0; _Idx < _Ncap; ++_Idx)
    {   // copy submatch _Idx
    // bit _Idx of _Grp_valid says whether group _Idx has a valid capture
    // NOTE(review): 1 << _Idx is an int shift; confirm _Ncap can never
    // reach the bit width of int.
    if (_Res._Grp_valid & (1 << _Idx))
     {  // copy successful match
     _Matches->_At(_Idx).matched = true;
     _Matches->_At(_Idx).first = _Res._Grps[_Idx]._Begin;
     _Matches->_At(_Idx).second = _Res._Grps[_Idx]._End;
     }
    else
     {  // copy failed match
     // unmatched groups get an empty range with both ends set to _End
     _Matches->_At(_Idx).matched = false;
     _Matches->_At(_Idx).first = _End;
     _Matches->_At(_Idx).second = _End;
     }
    }
   // _Begin/_End are presumably the bounds of the searched text -- the
   // enclosing function is not visible here, verify.
   _Matches->_Org = _Begin;
   // prefix: from _Begin up to the start of element 0
   _Matches->_Pfx().matched = true;
   _Matches->_Pfx().first = _Begin;
   _Matches->_Pfx().second = _Matches->_At(0).first;
   // suffix: from the end of element 0 up to _End
   _Matches->_Sfx().matched = true;
   _Matches->_Sfx().first = _Matches->_At(0).second;
   _Matches->_Sfx().second = _End;
   // null element: empty range at _End (its matched flag is not set here)
   _Matches->_Null().first = _End;
   _Matches->_Null().second = _End;
   }

从上面的程序中我们就更加明白最终结果类的使用方式了：如同C++11标准中规定的那样，按照匹配或不匹配分别存储不同的值。用户在调用搜索或匹配算法后得到的就是这种形式的结果，通过访问这个结果就能得到已匹配的字符串。
下面是测试结果类的数据显示
这里写图片描述

正则表达式规则类

下面讲述的是正则表达式的规则类，也就是说用于存放我要求的给定规则串。
tr1提供的接口是
using tr1::wregex;
我们可以使用wregex来声明我们自己的unicode编码的规则串。
其实啊，wregex只是一个typedef：typedef basic_regex<wchar_t> wregex; 这里我们重点分析basic_regex。

 // CLASS _Regex_base
 // When _ITERATOR_DEBUG_LEVEL is greater than 0 the constructor allocates
 // a _Container_proxy and the destructor releases it; otherwise the class
 // body is empty.
class _Regex_base
 : public _Container_base
 {  // base class for basic_regex to construct and destroy proxy
public:
 #if 0 < _ITERATOR_DEBUG_LEVEL
 _Regex_base()
   { // construct proxy
  // allocate the proxy with new and point it back at this object
  this->_Myproxy = new _STD _Container_proxy;
  this->_Myproxy->_Mycont = this;
   }
 ~_Regex_base()
  { // destroy proxy
  // call _Orphan_all() first, then free the proxy and zero the pointer
  this->_Orphan_all();
  delete this->_Myproxy;
  this->_Myproxy = 0;
  }
 #endif /* 0 < _ITERATOR_DEBUG_LEVEL */
 };
 // TEMPLATE CLASS basic_regex
template<class _Elem,
 class _RxTraits = regex_traits<_Elem> >
 class basic_regex
  : public _Regex_base
 {  // regular expression
public:
 typedef basic_regex<_Elem, _RxTraits> _MyT;
 typedef _Elem value_type;
 typedef typename _RxTraits::locale_type locale_type;
 typedef regex_constants::syntax_option_type flag_type;
 enum {_EEN_VIS =
  _ENHANCED_REGEX_VISUALIZER};  // helper for expression evaluator
 static const flag_type icase = regex_constants::icase;
 static const flag_type nosubs = regex_constants::nosubs;
 static const flag_type optimize = regex_constants::optimize;
 static const flag_type collate = regex_constants::collate;
 static const flag_type ECMAScript = regex_constants::ECMAScript;
 static const flag_type basic = regex_constants::basic;
 static const flag_type extended = regex_constants::extended;
 static const flag_type awk = regex_constants::awk;
 static const flag_type grep = regex_constants::grep;
 static const flag_type egrep = regex_constants::egrep;
 basic_regex()
  : _Rep(0)
  { // construct empty object
  }
 explicit basic_regex(_In_z_ const _Elem *_Ptr,
  flag_type _Flags = regex_constants::ECMAScript)
  : _Rep(0)
  { // construct from null terminated character sequence
  _Reset(_Ptr, _Ptr + _RxTraits::length(_Ptr),
   _Flags, random_access_iterator_tag());
  }
 basic_regex(_In_count_(_Count) const _Elem *_Ptr, size_t _Count,
  flag_type _Flags = regex_constants::ECMAScript)
  : _Rep(0)
  { // construct from character sequence
  if (_Ptr == 0)
   _Xbad(regex_constants::error_parse);
  _Reset(_Ptr, _Ptr + _Count, _Flags, random_access_iterator_tag());
  }
 template<class _STtraits,
  class _STalloc>
  explicit basic_regex(
   const _STD basic_string<_Elem, _STtraits, _STalloc>& _Str,
   flag_type _Flags = regex_constants::ECMAScript)
  : _Rep(0)
  { // construct from string object
  _Reset(_Str.begin(), _Str.end(), _Flags, random_access_iterator_tag());
  }
 template<class _InIt>
  basic_regex(_InIt _First, _InIt _Last,
   flag_type _Flags)
  : _Rep(0)
  { // construct from pair of iterators
  _DEBUG_RANGE(_First, _Last);
  _Reset(_First, _Last, _Flags, _Iter_cat(_First));
  }
 template<class _InIt>
  basic_regex(_InIt _First, _InIt _Last)
  : _Rep(0)
  { // construct from pair of iterators
  _DEBUG_RANGE(_First, _Last);
  _Reset(_First, _Last, regex_constants::ECMAScript,
   _Iter_cat(_First));
  }
 basic_regex(const _MyT& _Right)
 #if _ENHANCED_REGEX_VISUALIZER
  : _Rep(0), _Visualization(_Right._Visualization)
 #else /* _ENHANCED_REGEX_VISUALIZER */
  : _Rep(0)
 #endif /* _ENHANCED_REGEX_VISUALIZER */
  { // construct copy of _Right
  _Reset(_Right._Rep);
  }
 basic_regex(_MyT&& _Right)
  : _Rep(0)
  { // construct by moving _Right
  _Assign_rv(_STD move(_Right));
  }
 _MyT& operator=(_MyT&& _Right)
  { // assign by moving _Right
  _Assign_rv(_STD move(_Right));
  return (*this);
  }
 void _Assign_rv(_MyT&& _Right)
  { // assign by moving _Right
  if (this != &_Right)
   {    // clear this and steal from _Right

最低0.47元/天解锁文章

青城梦远

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
regex 源码分析

前期准备以下代码来源于vs2010sp1 路径：\VC\include\regex 文档参阅的是 ISO/IEC 14882 Information Technology - Programming Languages-C++ C++2011-9-1国际标准文档。正则表达式结果类 // TEMPLATE CLASS match_resultstemplate<class _BidIt,
复制链接

扫一扫