利用 SgmlReader 获取网页源代码,并通过 XPath 提取所需内容

1. 根据 SgmlReader 类获得完整的、格式良好的 HTML 代码

   /// <summary>
        /// Downloads the page at <paramref name="uri"/> as UTF-8 text, parses it with
        /// <c>SgmlReader</c> (DocType "HTML") and re-serializes the parsed nodes as
        /// indented XML text.
        /// </summary>
        /// <param name="uri">Address of the page to download.</param>
        /// <returns>
        /// The indented XML text produced from the page;
        /// <c>null</c> when <paramref name="uri"/> is null or empty;
        /// or the exception message when downloading or converting fails.
        /// </returns>
        private string GetWellFormedHTML(string uri)
        {
            // Nothing to download: treat null the same way as the empty string.
            if (string.IsNullOrEmpty(uri))
            {
                return null;
            }

            try
            {
                // Fetch the raw page text; the client is released as soon as the download is done.
                string strWebContent;
                using (WebClient webclient = new WebClient())
                {
                    webclient.Encoding = Encoding.UTF8;
                    strWebContent = webclient.DownloadString(uri);
                }

                using (StringWriter sw = new StringWriter())
                using (SgmlReader reader = new SgmlReader())
                using (XmlTextWriter writer = new XmlTextWriter(sw))
                {
                    reader.DocType = "HTML";
                    reader.InputStream = new StringReader(strWebContent);
                    writer.Formatting = System.Xml.Formatting.Indented;

                    // WriteNode already leaves the reader on the node that follows the one
                    // it copied, so the reader is advanced explicitly only when a
                    // whitespace node is skipped. Calling Read() after every WriteNode
                    // would silently drop the next sibling.
                    reader.Read();
                    while (!reader.EOF)
                    {
                        if (reader.NodeType == XmlNodeType.Whitespace)
                        {
                            reader.Read();
                        }
                        else
                        {
                            writer.WriteNode(reader, true);
                        }
                    }

                    // Make sure everything has reached the StringWriter before reading it.
                    writer.Flush();
                    return sw.ToString();
                }
            }
            catch (Exception exp)
            {
                // Failures are reported to the caller as the exception message rather
                // than being rethrown. All resources are released by the using blocks
                // above, so no null-prone manual cleanup is needed here.
                return exp.Message;
            }
        }
View Code

2. 根据 XPath 规则,在上一步得到的源码中进行查找

  /// <summary>
        /// Loads the given markup as an XPath document and collects the content of
        /// every node selected by the XPath expression.
        /// </summary>
        /// <param name="htmlStr">Markup text to load; it must be parseable as XML.</param>
        /// <param name="xpath">XPath expression selecting the nodes of interest.</param>
        /// <returns>
        /// For each selected node, two lines: its inner XML followed by its text value.
        /// An empty string when nothing is selected.
        /// </returns>
        private string GetResult(string htmlStr, string xpath)
        {
            // Parse the markup and run the query in one go.
            StringReader source = new StringReader(htmlStr);
            XPathNodeIterator matches = new XPathDocument(source).CreateNavigator().Select(xpath);

            StringBuilder output = new StringBuilder();
            foreach (XPathNavigator match in matches)
            {
                output.AppendLine(match.InnerXml); // markup inside the node
                output.AppendLine(match.Value);    // text value of the node
            }

            return output.ToString();
        }
View Code

完!

 

转载于:https://www.cnblogs.com/wwz-wwz/p/7551477.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值