代码示例1：抓取源代码

最新推荐文章于 2018-12-30 14:51:50 发布

claymore1114

最新推荐文章于 2018-12-30 14:51:50 发布

阅读量814

点赞数

分类专栏： .net 代码示例 | 文章标签： string null exception url html 脚本

本文链接：https://blog.csdn.net/claymore1114/article/details/6409801

版权

.net 同时被 2 个专栏收录

24 篇文章 0 订阅

订阅专栏

代码示例

1 篇文章 0 订阅

订阅专栏

1、抓取源代码

/// <summary>
/// Requests the page at <paramref name="url"/> with HttpWebRequest /
/// HttpWebResponse and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The response body when the status is 200 OK. When the status is anything
/// else a fixed error message is returned, and when the request or the read
/// throws, the exception's message is returned instead.
/// </returns>
/// <remarks>
/// Exceptions raised inside the request/read sequence are caught rather than
/// propagated; failure is signalled by setting <c>flgPageRetrieved</c> to
/// <c>false</c> and by the returned text.
/// </remarks>
public string GetWholeHtmlCode(string url)
{
    string strHtml = string.Empty;
    StreamReader strReader = null;
    HttpWebResponse wrpContent = null;

    try
    {
        HttpWebRequest wrqContent = (HttpWebRequest)WebRequest.Create(url);
        wrqContent.Timeout = 300000; // milliseconds, i.e. five minutes
        wrpContent = (HttpWebResponse)wrqContent.GetResponse();

        if (wrpContent.StatusCode != HttpStatusCode.OK)
        {
            flgPageRetrieved = false;
            strHtml = "Sorry, the web page is not run successful";
        }
        else
        {
            // Read the body only on success. Previously this ran unconditionally
            // and overwrote the error message assigned in the branch above.
            strReader = new StreamReader(wrpContent.GetResponseStream(), Encoding.UTF8);
            strHtml = strReader.ReadToEnd();
        }
    }
    catch (Exception e)
    {
        // Best-effort contract: report the failure through the return value.
        flgPageRetrieved = false;
        strHtml = e.Message;
    }
    finally
    {
        if (strReader != null)
        {
            strReader.Close();
        }

        if (wrpContent != null)
        {
            wrpContent.Close();
        }
    }

    return strHtml;
}

2、匹配正文

// Download the page, isolate the <body>...</body> element, then strip every
// tag from it so that only the text content remains in strPureText.
strWholeHtml = this.GetWholeHtmlCode(strUrl);

// [\s\S] matches any character including line breaks, so the lazy quantifier
// spans a multi-line body. The earlier "(/w|/W)" only matched the literal
// text "/w" or "/W", so a real page body was never found.
string strRegexScript = @"<body[^>]*>[\s\S]*?</body[^>]*>";
// Any single markup tag.
string strRegex = @"<[^>]*>";
string strMatchScript = string.Empty;

Match matchText = Regex.Match(strWholeHtml, strRegexScript, RegexOptions.IgnoreCase);
// Groups[0] is the whole match; it is the empty string when nothing matched.
strMatchScript = matchText.Groups[0].Value;

string strPureText = Regex.Replace(strMatchScript, strRegex, string.Empty, RegexOptions.IgnoreCase);

3、匹配脚本

// Download the page and collect the content of every <script> element, one
// entry per line, into strbScriptList.
strWholeHtml = this.GetWholeHtmlCode(strUrl);

// [\s\S] matches any character including line breaks. The earlier "(/w|/W)"
// only matched the literal text "/w" or "/W", so script bodies were never found.
string strRegexScript = @"<script[^>]*>[\s\S]*?</script[^>]*>";
// Any single markup tag; used to drop the surrounding <script> / </script>.
string strRegex = @"<[^>]*>";

MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexScript, RegexOptions.IgnoreCase);
StringBuilder strbScriptList = new StringBuilder();
foreach (Match matchSingleScript in matchList)
{
    string strSingleScriptText = Regex.Replace(matchSingleScript.Value, strRegex, string.Empty, RegexOptions.IgnoreCase);
    // "\r\n" is a real CR LF pair; the earlier "/r/n" appended the four
    // literal characters '/', 'r', '/', 'n' after every script.
    strbScriptList.Append(strSingleScriptText).Append("\r\n");
}

4、匹配图片

// Download the page and collect every <img ...> tag, one per line, into
// strbImageList.
strWholeHtml = this.GetWholeHtmlCode(strUrl);

// (?is): case-insensitive, and '.' also matches line breaks, so a tag that is
// wrapped across lines is still captured up to its first '>'.
string strRegexImg = @"(?is)<img.*?>";

MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexImg, RegexOptions.IgnoreCase);
StringBuilder strbImageList = new StringBuilder();
foreach (Match matchSingleImage in matchList)
{
    // "\r\n" is a real CR LF pair; the earlier "/r/n" appended the four
    // literal characters '/', 'r', '/', 'n' after every tag.
    strbImageList.Append(matchSingleImage.Value).Append("\r\n");
}

5、匹配链接

// Download the page and collect every opening <a ...> tag, one per line, into
// strbLinkList.
strWholeHtml = this.GetWholeHtmlCode(strUrl);

// (?is): case-insensitive, and '.' also matches line breaks. The space after
// "<a" keeps other tags whose names start with 'a' from matching.
string strRegexLink = @"(?is)<a .*?>";

MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexLink, RegexOptions.IgnoreCase);
StringBuilder strbLinkList = new StringBuilder();
foreach (Match matchSingleLink in matchList)
{
    // "\r\n" is a real CR LF pair; the earlier "/r/n" appended the four
    // literal characters '/', 'r', '/', 'n' after every tag.
    strbLinkList.Append(matchSingleLink.Value).Append("\r\n");
}