通过链接获取Html源码内容

       /// <summary>
        /// Fetches the resource at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Absolute URL of the page to fetch.</param>
        /// <returns>The complete response body, decoded as UTF-8.</returns>
        /// <remarks>
        /// The body is always decoded as UTF-8; the charset declared by the server or the
        /// page itself is not consulted, so pages in other encodings (e.g. gb2312) will
        /// come back garbled.
        /// </remarks>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI.</exception>
        public static string GetContenFrommUrl(string url)
        {
            Uri uri = new Uri(url);
            // WebRequest lives in the System.Net namespace.
            WebRequest request = WebRequest.Create(uri);

            // The using blocks release the response, its stream and the reader even when
            // reading throws, instead of only on the success path.
            using (WebResponse response = request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts a <c>&lt;div&gt;</c> element with a given id from an HTML string,
        /// keeping any nested <c>&lt;div&gt;</c> elements balanced.
        /// </summary>
        /// <param name="strHTML">HTML text to search.</param>
        /// <param name="name">
        /// Value of the <c>id</c> attribute to look for (written as <c>id="..."</c> with
        /// double quotes in the markup). Defaults to <c>listLeft</c>.
        /// </param>
        /// <returns>
        /// The first matching element, from its opening tag through its matching closing
        /// tag, or an empty string when no such element is found.
        /// </returns>
        public static string GetDivFromStr(string strHTML, string name = "listLeft")
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // The id is escaped so that regex metacharacters in it are matched literally.
            // The "o" balancing group counts nested <div>...</div> pairs so the match ends
            // at the closing tag that belongs to the opening tag, not at the first </div>.
            string pattern = @"<div[^>]*?id=""" + Regex.Escape(name) + @"""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>";
            Match m = Regex.Match(strHTML, pattern, RegexOptions.IgnoreCase);
            return m.Success ? m.Value : string.Empty;
        }

        /// <summary>
        /// Downloads an image and saves it to a fixed local directory.
        /// </summary>
        /// <param name="URL">Address of the image to download.</param>
        /// <returns>
        /// The fixed prefix <c>2016/12/22/</c> followed by the file name taken from
        /// <paramref name="URL"/>.
        /// </returns>
        /// <remarks>
        /// Both the save directory and the returned prefix are hard-coded, and the
        /// returned value is not the path the file was actually written to.
        /// </remarks>
        public static string DowmLoadImage(string URL)
        {
            const string saveDirectory = "D:/MyJob/HtmlToData/Images/";
            const string relativePrefix = "2016/12/22/";

            // The last path segment of the URL is used as the local file name.
            string fileName = System.IO.Path.GetFileName(URL);

            // WebClient lives in System.Net; it is disposable, so release it once the
            // download has finished or failed.
            using (WebClient client = new System.Net.WebClient())
            {
                client.DownloadFile(URL, saveDirectory + fileName);
            }

            return relativePrefix + fileName;
        }

        /// <summary>
        /// Replaces one specific separator image with a horizontal rule.
        /// </summary>
        /// <param name="Content">HTML markup to process.</param>
        /// <returns>
        /// The markup with the separator image tag replaced by <c>&lt;hr /&gt;</c>, or the
        /// input unchanged when no such image tag is present.
        /// </returns>
        public static string ReplaceImage(string Content)
        {
            // Regex and Match live in System.Text.RegularExpressions.
            // Walk the <img> tags one by one; the named group "imgUrl" holds the src value.
            Match tag = Regex.Match(Content, @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
            while (tag.Success)
            {
                string source = tag.Groups["imgUrl"].Value;
                if (source == "http://en.shio.gov.cn/file/images/split-e5.gif")
                {
                    // Only the first hit is handled; Replace swaps every occurrence of
                    // that exact tag text in the markup.
                    return Content.Replace(tag.Value, "<hr />");
                }
                tag = tag.NextMatch();
            }
            return Content;
        }

        /// <summary>
        /// Replaces the <c>&lt;div id="pages"&gt;</c> element with a horizontal rule.
        /// </summary>
        /// <param name="Content">HTML markup in which the replacement is made.</param>
        /// <param name="strHTML">HTML markup that is searched for the pages div.</param>
        /// <returns>
        /// <paramref name="Content"/> with every occurrence of the located div replaced by
        /// <c>&lt;hr /&gt;</c>, or <paramref name="Content"/> unchanged when
        /// <paramref name="strHTML"/> contains no such div.
        /// </returns>
        public static string ReplaceDiv(string Content, string strHTML)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // The "o" balancing group keeps nested <div> pairs balanced so the whole
            // pages element is captured.
            Match pagesDiv = Regex.Match(strHTML, @"<div[^>]*?id=""pages""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);

            // A failed match has an empty Value, and string.Replace throws on an empty
            // search string, so leave the content alone when nothing was found.
            if (!pagesDiv.Success)
            {
                return Content;
            }

            return Content.Replace(pagesDiv.Value, "<hr />");
        }

        /// <summary>
        /// Finds the first <c>&lt;img&gt;</c> tag in the markup, downloads the image it
        /// points to and returns the value produced by <c>DowmLoadImage</c>.
        /// </summary>
        /// <param name="strHTML">HTML markup to search.</param>
        /// <returns>
        /// The result of <c>DowmLoadImage</c> for the first image's <c>src</c>, or an empty
        /// string when there is no image tag or its <c>src</c> is empty.
        /// </returns>
        public static string GetImageSrc(string strHTML)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // IgnoreCase keeps this in line with ReplaceImage, which uses the same pattern,
            // so upper-case markup such as <IMG SRC=...> is recognised as well.
            Match imageTag = Regex.Match(strHTML, @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
            if (!imageTag.Success)
            {
                return "";
            }

            // The pattern allows an empty src; there is nothing to download in that case.
            string imageUrl = imageTag.Groups["imgUrl"].Value;
            if (imageUrl.Length == 0)
            {
                return "";
            }

            return DowmLoadImage(imageUrl);
        }

        /// <summary>
        /// Extracts the link target and inner content of the first <c>&lt;a&gt;</c>
        /// element in the markup.
        /// </summary>
        /// <param name="AStr">HTML markup to search.</param>
        /// <returns>
        /// A two-element array: index 0 is the anchor's inner content and index 1 is its
        /// <c>href</c> value. Both elements are null when no anchor with a quoted
        /// <c>href</c> is found.
        /// </returns>
        public static string[] GetHref(string AStr)
        {
            string[] ListStr = new string[2];
            // Match and Regex live in System.Text.RegularExpressions.
            // The content group is lazy so it stops at the first </a>; a greedy group
            // would run on to the last </a> and swallow any following anchors.
            Match ma = Regex.Match(AStr, @"(?is)<a[^>]+?href=(['""])([^'""]*)\1[^>]*>(.+?)</a>");
            if (ma.Success)
            {
                ListStr[0] = ma.Groups[3].Value; // inner content
                ListStr[1] = ma.Groups[2].Value; // href value
            }
            return ListStr;
        }

        /// <summary>
        /// Returns the inner content of the first <c>&lt;p class="auxiInfo"&gt;</c>
        /// element in the markup.
        /// </summary>
        /// <param name="PStr">HTML markup to search.</param>
        /// <returns>
        /// The text between the opening and closing tags of the matched paragraph, or an
        /// empty string when no such paragraph is found.
        /// </returns>
        public static string GetTargetPContent(string PStr)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // Capture group 1 is everything between the opening tag and its closing tag.
            Match paragraph = Regex.Match(
                PStr,
                @"<p[^>]*?class=""auxiInfo""[^>]*>((?>(?<o><p[^>]*>)|(?<-o></p>)|(?:(?!</?p)[\s\S]))*)(?(o)(?!))</p>",
                RegexOptions.IgnoreCase);

            if (!paragraph.Success)
            {
                return "";
            }

            return paragraph.Groups[1].Value;
        }

        /// <summary>
        /// Returns the inner content of the first attribute-less <c>&lt;p&gt;</c> element
        /// in the markup.
        /// </summary>
        /// <param name="PStr">HTML markup to search.</param>
        /// <returns>
        /// The text between the first <c>&lt;p&gt;</c> and the nearest following
        /// <c>&lt;/p&gt;</c>, or an empty string when there is none.
        /// </returns>
        public static string GetPContent(string PStr)
        {
            // Match and Regex live in System.Text.RegularExpressions.
            // (?is) makes the pattern case-insensitive and lets '.' span line breaks.
            Match paragraph = Regex.Match(PStr, @"(?is)<p>(.*?)</p>");
            return paragraph.Success ? paragraph.Groups[1].Value : "";
        }

  

转载于:https://www.cnblogs.com/suflowers1700218/p/11528046.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值