抓取网页上所有看得见的字符

最近老板让我实现一个功能:抓取指定页面中所有可见的字符串。因为以前没做过这方面的工作,实现起来遇到了一些问题,在这里做一下记录,希望以后再遇到这样的需求时能想起来。
对于网页来说,“可见的字符串”是什么概念?就是元素的样式不能是 display: none 或 visibility: hidden。“所有字符串”又是什么概念?它既包括容器标签中的文本,如
<td>hello</td> <div>hello</div> <span>hello</span>,也包括控件的标题,如 <input type="submit" value="确定"> 这样的字符串。我采用了两个函数来分别处理这两种情况:
 1 ExpandedBlockStart.gif ContractedBlock.gif /**/ /// <summary>
 2InBlock.gif        /// traverse the html tree and seek string which in the container
 3InBlock.gif        /// </summary>
 4InBlock.gif        /// <param name="container"></param>
 5ExpandedBlockEnd.gif        /// <param name="showIframeIndex"></param>

 6 None.gif          private   void  SeekStringsInContianer(mshtml.IHTMLElement container,  ref   int  showIframeIndex)
 7 ExpandedBlockStart.gifContractedBlock.gif         dot.gif {
 8InBlock.gif            try
 9ExpandedSubBlockStart.gifContractedSubBlock.gif            dot.gif{
10InBlock.gif                if (null == container || null == container.innerHTML)
11InBlock.gif                    return;
12InBlock.gif
13InBlock.gif                // if the container is hidden
14InBlock.gif                if (null != container.style &&
15InBlock.gif                    ((null != container.style.display &&  container.style.display.Equals("none")) ||
16InBlock.gif                     (null != container.style.visibility && container.style.visibility.Equals("hidden"))))
17InBlock.gif                    return;
18InBlock.gif
19InBlock.gif                if (container.tagName.ToLower().Equals("iframe"))
20InBlock.gif                    return;
21InBlock.gif                
22InBlock.gif                // if the container isn't a container
23InBlock.gif                if (null != container.innerText &&
24InBlock.gif                    container.innerHTML.IndexOf("<IFRAME"== -1 &&
25InBlock.gif                    container.innerHTML.IndexOf("<TABLE"== -1 &&
26InBlock.gif                    container.innerHTML.IndexOf("<TD"== -1 &&
27InBlock.gif                    container.innerHTML.IndexOf("<TR"== -1 &&
28InBlock.gif                    container.innerHTML.IndexOf("<DIV"== -1 )
29ExpandedSubBlockStart.gifContractedSubBlock.gif                dot.gif{
30InBlock.gif                    if (((mshtml.IHTMLElementCollection)container.children).length == 0)
31InBlock.gif                        mHuntedStringsList.Add(container.innerText);
32InBlock.gif                    else
33InBlock.gif                        mHuntedStringsList.Add(SpanFilters(container));
34ExpandedSubBlockEnd.gif                }

35InBlock.gif                else
36ExpandedSubBlockStart.gifContractedSubBlock.gif                dot.gif{
37InBlock.gif                    mshtml.IHTMLElementCollection collection = (mshtml.IHTMLElementCollection)container.children;
38InBlock.gif
39InBlock.gif                    // traverse container's childs
40InBlock.gif                    int i = 0;
41InBlock.gif                    foreach (mshtml.IHTMLElement elem in collection)
42ExpandedSubBlockStart.gifContractedSubBlock.gif                    dot.gif{
43InBlock.gif                        // jump over  the hidden iframes
44InBlock.gif                        if (elem.tagName.ToLower().Equals("iframe"))
45ExpandedSubBlockStart.gifContractedSubBlock.gif                        dot.gif{
46InBlock.gif                            if (null != elem.style &&
47InBlock.gif                                null != elem.style.cssText &&
48InBlock.gif                                elem.style.cssText.IndexOf("none"!= -1)
49InBlock.gif                                i++;
50InBlock.gif                            else
51InBlock.gif                                showIframeIndex = i;
52ExpandedSubBlockEnd.gif                        }

53InBlock.gif                        else
54ExpandedSubBlockStart.gifContractedSubBlock.gif                        dot.gif{
55InBlock.gif                            SeekStringsInContianer(elem, ref showIframeIndex);
56ExpandedSubBlockEnd.gif                        }

57ExpandedSubBlockEnd.gif                    }

58ExpandedSubBlockEnd.gif                }

59ExpandedSubBlockEnd.gif            }

60InBlock.gif            catch (ArgumentNullException e)
61ExpandedSubBlockStart.gifContractedSubBlock.gif            dot.gif{
62InBlock.gif                throw new Exception("null container!", e);
63ExpandedSubBlockEnd.gif            }

64ExpandedBlockEnd.gif        }

65 None.gif
66 ExpandedBlockStart.gifContractedBlock.gif         /**/ /// <summary>
67InBlock.gif        /// get the caption of the control , for example button
68InBlock.gif        /// </summary>
69ExpandedBlockEnd.gif        /// <param name="doc"></param>

70 None.gif          private   void  SeekStringsInControl(mshtml.IHTMLElement container)
71 ExpandedBlockStart.gifContractedBlock.gif         dot.gif {
72InBlock.gif            try
73ExpandedSubBlockStart.gifContractedSubBlock.gif            dot.gif{
74InBlock.gif                mshtml.IHTMLElementCollection collection = (mshtml.IHTMLElementCollection)container.all;
75InBlock.gif                foreach (mshtml.IHTMLElement elem in collection)
76ExpandedSubBlockStart.gifContractedSubBlock.gif                dot.gif{
77InBlock.gif                    if (elem.tagName.ToLower().Equals("input"))
78ExpandedSubBlockStart.gifContractedSubBlock.gif                    dot.gif{
79InBlock.gif                        string attr = (string)elem.getAttribute("type"0);
80InBlock.gif                        string id = (string)elem.getAttribute("id"0);
81InBlock.gif                        if ( attr.Equals("submit"|| ( null != id && id.IndexOf("txt"!= -1))
82InBlock.gif                            mHuntedStringsList.Add((string)elem.getAttribute("value"0));
83ExpandedSubBlockEnd.gif                    }

84ExpandedSubBlockEnd.gif                }

85ExpandedSubBlockEnd.gif            }

86InBlock.gif            catch (ArgumentNullException e)
87ExpandedSubBlockStart.gifContractedSubBlock.gif            dot.gif{
88InBlock.gif                throw new Exception("null document during seeking string in control!", e);
89ExpandedSubBlockEnd.gif            }

90ExpandedBlockEnd.gif        }

这两个函数主要用到了递归操作。由于 HTML 代码具有嵌套性,并且可以不规范地书写,所以在取得可见字符串的时候做了一些额外的工作。我使用下面的函数来完成这个工作:
 1 ExpandedBlockStart.gif ContractedBlock.gif   /**/ /// <summary>
 2InBlock.gif        /// filter the span which is hidden in the element
 3InBlock.gif        /// </summary>
 4InBlock.gif        /// <param name="element"></param>
 5ExpandedBlockEnd.gif        /// <returns></returns>

 6 None.gif          private   string  SpanFilters(mshtml.IHTMLElement element)
 7 ExpandedBlockStart.gifContractedBlock.gif         dot.gif {
 8InBlock.gif            string resultStr = element.innerText;
 9InBlock.gif
10InBlock.gif            if (((mshtml.IHTMLElementCollection)element.children).length == 0)
11InBlock.gif                return resultStr;
12InBlock.gif
13InBlock.gif            FilterHiddenText(element, ref resultStr);
14InBlock.gif            return resultStr;
15ExpandedBlockEnd.gif        }

16 None.gif
17 None.gif         private   void  FilterHiddenText(mshtml.IHTMLElement element,  ref   string  srcStr)
18 ExpandedBlockStart.gifContractedBlock.gif         dot.gif {
19InBlock.gif            if (null != element && 
20InBlock.gif                null != element.innerText &&
21InBlock.gif                null != element.style &&
22InBlock.gif                (( null != element.style.visibility && element.style.visibility.Equals("hidden") ) ||
23InBlock.gif                (null != element.style.display && element.style.display.Equals("none"))))
24ExpandedSubBlockStart.gifContractedSubBlock.gif            dot.gif{
25InBlock.gif                int pos = srcStr.IndexOf(element.innerText);
26InBlock.gif                srcStr = srcStr.Remove(pos, element.innerText.Length);
27InBlock.gif                return;
28ExpandedSubBlockEnd.gif            }

29InBlock.gif
30InBlock.gif            mshtml.IHTMLElementCollection collection = (mshtml.IHTMLElementCollection)element.children;
31InBlock.gif            if (collection.length != 0)
32ExpandedSubBlockStart.gifContractedSubBlock.gif            dot.gif{
33InBlock.gif                foreach (mshtml.IHTMLElement elem in collection)
34ExpandedSubBlockStart.gifContractedSubBlock.gif                dot.gif{
35InBlock.gif                    FilterHiddenText(elem, ref srcStr);
36ExpandedSubBlockEnd.gif                }

37ExpandedSubBlockEnd.gif            }

38ExpandedBlockEnd.gif        }

转载于:https://www.cnblogs.com/moonz-wu/archive/2007/07/11/814610.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值