/// <summary>
/// Fetches the resource at the given URL and returns its body as text.
/// </summary>
/// <param name="url">Absolute URI of the page to fetch; it is parsed with <see cref="Uri"/>.</param>
/// <returns>The complete response body, read through a UTF-8 <see cref="StreamReader"/>.</returns>
public static string GetContenFrommUrl(string url)
{
    Uri uri = new Uri(url);
    // WebRequest lives in System.Net.
    WebRequest request = WebRequest.Create(uri);
    // The using blocks dispose the response, its stream and the reader even when
    // the read throws part-way through; the previous manual Close() calls only ran
    // on the success path.
    using (WebResponse response = request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, System.Text.Encoding.UTF8))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Extracts the first &lt;div&gt; element whose id attribute is "listLeft".
/// </summary>
/// <param name="strHTML">HTML text to search.</param>
/// <returns>
/// The whole matched element, from its opening tag through the closing tag that balances it,
/// or an empty string when nothing matches.
/// </returns>
public static string GetDivFromStr(string strHTML)
{
    // Match and Regex live in System.Text.RegularExpressions.
    // The balancing group "o" tracks nested <div> openings so the match stops at the
    // </div> that pairs with the listLeft div rather than at the first inner one.
    Regex listLeftDiv = new Regex(@"<div[^>]*?id=""listLeft""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);
    Match found = listLeftDiv.Match(strHTML);
    return found.Success ? found.Value : string.Empty;
}
/// <summary>
/// Downloads an image and saves it into the local image folder.
/// </summary>
/// <param name="URL">Address of the image to download.</param>
/// <returns>
/// The fixed prefix "2016/12/22/" followed by the file name taken from <paramref name="URL"/>.
/// </returns>
public static string DowmLoadImage(string URL)
{
    // Destination folder is hard-coded; the file keeps the name it has in the URL.
    string saveDirectory = "D:/MyJob/HtmlToData/Images/";
    string fileName = System.IO.Path.GetFileName(URL);
    // WebClient lives in System.Net and is IDisposable, so release it once the
    // download finishes or fails.
    using (WebClient client = new System.Net.WebClient())
    {
        client.DownloadFile(URL, saveDirectory + fileName);
    }
    return "2016/12/22/" + fileName;
}
/// <summary>
/// Replaces the separator image (split-e5.gif) with a horizontal rule.
/// </summary>
/// <param name="Content">HTML text to process.</param>
/// <returns>
/// The HTML with every occurrence of the first matching separator &lt;img&gt; tag text replaced by
/// "&lt;hr /&gt;", or the input unchanged when no such image is found.
/// </returns>
public static string ReplaceImage(string Content)
{
    // Regex lives in System.Text.RegularExpressions; the named group "imgUrl" captures the src value.
    Regex imgTag = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    // Walk the img tags in document order and stop at the first one that points at the separator image.
    for (Match tag = imgTag.Match(Content); tag.Success; tag = tag.NextMatch())
    {
        if (tag.Groups["imgUrl"].Value != "http://en.shio.gov.cn/file/images/split-e5.gif")
        {
            continue;
        }
        // string.Replace swaps every occurrence of that exact tag text for <hr />.
        return Content.Replace(tag.Value, "<hr />");
    }
    return Content;
}
/// <summary>
/// Replaces the &lt;div id="pages"&gt; element with a horizontal rule.
/// </summary>
/// <param name="Content">HTML text in which the replacement is performed.</param>
/// <param name="strHTML">HTML text that is searched for the &lt;div id="pages"&gt; element.</param>
/// <returns>
/// <paramref name="Content"/> with every occurrence of the matched div text replaced by "&lt;hr /&gt;",
/// or <paramref name="Content"/> unchanged when <paramref name="strHTML"/> contains no such div.
/// </returns>
public static string ReplaceDiv(string Content,string strHTML)
{
    // Match and Regex live in System.Text.RegularExpressions.
    // The balancing group "o" keeps nested <div> elements inside the match.
    Match pagesDiv = Regex.Match(strHTML, @"<div[^>]*?id=""pages""[^>]*>((?>(?<o><div[^>]*>)|(?<-o></div>)|(?:(?!</?div)[\s\S]))*)(?(o)(?!))</div>", RegexOptions.IgnoreCase);
    if (!pagesDiv.Success)
    {
        // A failed match has an empty Value, and string.Replace rejects an empty
        // search string with ArgumentException, so there is nothing to replace.
        return Content;
    }
    return Content.Replace(pagesDiv.Value, "<hr />");
}
/// <summary>
/// Finds the first &lt;img&gt; tag in the HTML and downloads the image its src points to.
/// </summary>
/// <param name="strHTML">HTML text to search.</param>
/// <returns>
/// The value returned by <c>DowmLoadImage</c> for the captured src, or an empty string when
/// no &lt;img&gt; tag with a non-empty src is found.
/// </returns>
public static string GetImageSrc(string strHTML)
{
    // Match and Regex live in System.Text.RegularExpressions.
    // IgnoreCase keeps this pattern in line with ReplaceImage, so <IMG SRC=...> is recognised too.
    Match imgTag = Regex.Match(strHTML, @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    if (!imgTag.Success)
    {
        return "";
    }
    string imageUrl = imgTag.Groups["imgUrl"].Value;
    // The pattern allows an empty src (e.g. src=""); there is nothing to download in that case.
    if (imageUrl.Length == 0)
    {
        return "";
    }
    return DowmLoadImage(imageUrl);
}
/// <summary>
/// Extracts the inner content and the href of the first &lt;a&gt; tag in the HTML.
/// </summary>
/// <param name="AStr">HTML text to search.</param>
/// <returns>
/// A two-element array: index 0 holds the anchor's inner content and index 1 its href value.
/// Both elements are null when no anchor with a quoted href is found.
/// </returns>
public static string[] GetHref(string AStr)
{
    string[] textAndHref = new string[2];
    // Match and Regex live in System.Text.RegularExpressions.
    // The inner content is captured lazily (.+?) so it ends at the first </a>; a greedy
    // capture would run on to the last </a> in the input and swallow any later anchors.
    Match anchor = Regex.Match(AStr, @"(?is)<a[^>]+?href=(['""])([^'""]*)\1[^>]*>(.+?)</a>");
    if (anchor.Success)
    {
        textAndHref[0] = anchor.Groups[3].Value; // inner content
        textAndHref[1] = anchor.Groups[2].Value; // href
    }
    return textAndHref;
}
/// <summary>
/// Extracts the inner content of the first &lt;p class="auxiInfo"&gt; element.
/// </summary>
/// <param name="PStr">HTML text to search.</param>
/// <returns>
/// The text between the opening tag and its balancing closing tag, or an empty string when
/// no such paragraph is found.
/// </returns>
public static string GetTargetPContent(string PStr)
{
    // Match and Regex live in System.Text.RegularExpressions.
    // Group 1 is everything inside the paragraph; the balancing group "o" pairs nested <p> tags.
    Match auxiInfo = Regex.Match(PStr, @"<p[^>]*?class=""auxiInfo""[^>]*>((?>(?<o><p[^>]*>)|(?<-o></p>)|(?:(?!</?p)[\s\S]))*)(?(o)(?!))</p>", RegexOptions.IgnoreCase);
    if (!auxiInfo.Success)
    {
        return "";
    }
    return auxiInfo.Groups[1].Value;
}
/// <summary>
/// Extracts the inner content of the first attribute-less &lt;p&gt; element.
/// </summary>
/// <param name="PStr">HTML text to search.</param>
/// <returns>
/// The text between the first "&lt;p&gt;" and the nearest following "&lt;/p&gt;" (matched
/// case-insensitively, across line breaks), or an empty string when there is none.
/// </returns>
public static string GetPContent(string PStr)
{
    // Regex lives in System.Text.RegularExpressions.
    // (?is) = ignore case + let '.' span newlines; the lazy .*? stops at the nearest </p>.
    Regex paragraph = new Regex(@"(?is)<p>(.*?)</p>");
    Match firstParagraph = paragraph.Match(PStr);
    return firstParagraph.Success ? firstParagraph.Groups[1].Value : "";
}