/// <summary>
/// Small web-scraping helper: downloads pages over HTTP and cuts text out of
/// them using literal start/end marker strings taken from the page source.
/// </summary>
/// <remarks>
/// The string-array results follow a status-first convention: element 0 is
/// <c>"true"</c> on success or <c>"false,err:" + message</c> on failure, and
/// the extracted values follow from index 1.
/// </remarks>
class WebCollection
{
    /// <summary>
    /// Downloads a list page and collects every link found inside its list region.
    /// </summary>
    /// <param name="ListUrl">URL of the list page.</param>
    /// <param name="ListUrlStartCode">Page source text that immediately precedes the list region.</param>
    /// <param name="ListUrlEndCode">Page source text that immediately follows the list region.</param>
    /// <param name="LinkUrlStartCode">Page source text that immediately precedes each link.</param>
    /// <param name="LinkUrlEndCode">Page source text that immediately follows each link.</param>
    /// <param name="WebEncoding">Name of the page's text encoding, as accepted by <c>Encoding.GetEncoding</c>.</param>
    /// <returns>
    /// A string array whose first element is <c>"true"</c> followed by the links,
    /// or a single <c>"false,err:..."</c> element when anything fails.
    /// </returns>
    public string[] GetUrlList(string ListUrl, string ListUrlStartCode, string ListUrlEndCode, string LinkUrlStartCode, string LinkUrlEndCode, string WebEncoding)
    {
        List<string> ListStr = new List<string>();
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(ListUrl);
            req.Method = "GET";
            req.ContentType = "application/x-www-form-urlencoded";
            string respHtml = ReadResponseText(req, WebEncoding);

            // Narrow the page down to the list region first, then pull every link out of it.
            string listHtml = ExtractBetween(respHtml, ListUrlStartCode, ListUrlEndCode);
            int position = 0;
            string link;
            while (TryExtractBetween(listHtml, LinkUrlStartCode, LinkUrlEndCode, ref position, out link))
            {
                ListStr.Add(link);
            }

            ListStr.Insert(0, "true");
            return ListStr.ToArray();
        }
        catch (Exception er)
        {
            // Failures are reported through the status element instead of being thrown.
            ListStr.Insert(0, "false,err:" + er.Message);
            return ListStr.ToArray();
        }
    }

    /// <summary>
    /// Downloads a content page and extracts one content block per marker pair.
    /// </summary>
    /// <param name="ContentUrl">URL of the content page.</param>
    /// <param name="GetContentStartCode">Page source texts that immediately precede each wanted block.</param>
    /// <param name="GetContentEndCode">Page source texts that immediately follow each wanted block; paired by index with <paramref name="GetContentStartCode"/>.</param>
    /// <param name="WebEncoding">Name of the page's text encoding, as accepted by <c>Encoding.GetEncoding</c>.</param>
    /// <returns>
    /// A string array whose first element is <c>"true"</c> followed by the extracted blocks.
    /// On failure the first element is <c>"false,err:..."</c>, followed by any blocks
    /// that were extracted before the failure occurred.
    /// </returns>
    public string[] GetContent(string ContentUrl, string[] GetContentStartCode, string[] GetContentEndCode, string WebEncoding)
    {
        List<string> ListContent = new List<string>();
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(ContentUrl);
            req.Method = "GET";
            req.Timeout = 20000;
            req.AllowAutoRedirect = false;
            req.ContentType = "application/x-www-form-urlencoded";
            string respHtml = ReadResponseText(req, WebEncoding);

            for (int i = 0; i < GetContentStartCode.Length; i++)
            {
                ListContent.Add(ExtractBetween(respHtml, GetContentStartCode[i], GetContentEndCode[i]));
            }

            ListContent.Insert(0, "true");
            return ListContent.ToArray();
        }
        catch (Exception er)
        {
            // Failures are reported through the status element instead of being thrown.
            ListContent.Insert(0, "false,err:" + er.Message);
            return ListContent.ToArray();
        }
    }

    /// <summary>
    /// Strips HTML markup from a string and decodes a handful of common entities.
    /// </summary>
    /// <param name="Htmlstring">Source text that may contain HTML markup.</param>
    /// <returns>
    /// The text with script blocks, tags and stray angle brackets removed and the
    /// handled entities decoded; an empty string when the input is null or empty.
    /// The result is not HTML-encoded.
    /// </returns>
    public string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets '.' cross line breaks, so scripts
        // that span several lines are removed together with their body.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove tags, line breaks followed by whitespace, and comment leftovers.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Drop angle brackets left over from broken markup. Strings are immutable,
        // so the result of Replace must be assigned. This runs before entity
        // decoding so that decoded &lt; / &gt; characters are kept in the output.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // Decode the entities this helper knows about.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

        // Any other numeric entity is dropped rather than decoded.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
        return Htmlstring;
    }

    /// <summary>
    /// Sends an already configured request and returns the response body as text,
    /// disposing the response and reader even when reading fails.
    /// </summary>
    private static string ReadResponseText(HttpWebRequest req, string WebEncoding)
    {
        // Resolve the encoding first so an invalid name fails before any network traffic.
        Encoding encoding = Encoding.GetEncoding(WebEncoding);
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (StreamReader sr = new StreamReader(resp.GetResponseStream(), encoding))
        {
            return sr.ReadToEnd();
        }
    }

    /// <summary>
    /// Returns the text between the first start marker and the next end marker after it;
    /// throws when the start marker is missing so callers can report the failure.
    /// </summary>
    private static string ExtractBetween(string text, string startMarker, string endMarker)
    {
        int position = 0;
        string value;
        if (!TryExtractBetween(text, startMarker, endMarker, ref position, out value))
        {
            throw new InvalidOperationException("Start marker not found: " + startMarker);
        }
        return value;
    }

    /// <summary>
    /// Finds the next start marker at or after <paramref name="position"/>, outputs the text up to
    /// the following end marker, and advances <paramref name="position"/> past that end marker.
    /// </summary>
    private static bool TryExtractBetween(string text, string startMarker, string endMarker, ref int position, out string value)
    {
        if (string.IsNullOrEmpty(startMarker))
        {
            throw new ArgumentException("Start marker must not be null or empty.", "startMarker");
        }

        value = null;
        int start = text.IndexOf(startMarker, position, StringComparison.Ordinal);
        if (start < 0)
        {
            return false;
        }
        start += startMarker.Length;

        // The end marker is only searched for after the start marker, so an
        // earlier occurrence of it cannot shift the extracted segment.
        int end = string.IsNullOrEmpty(endMarker) ? -1 : text.IndexOf(endMarker, start, StringComparison.Ordinal);
        if (end < 0)
        {
            // No closing marker: take the rest of the text and finish the scan.
            value = text.Substring(start);
            position = text.Length;
        }
        else
        {
            value = text.Substring(start, end - start);
            position = end + endMarker.Length;
        }
        return true;
    }
}
c# 网页采集类 (方法)
最新推荐文章于 2020-06-03 22:08:22 发布