C#/.NET获取网页中全部图片链接
方法1:获取HTML中所有img标签的图片URL
/// <summary>
/// Matches an &lt;img&gt; tag and captures the value of its src attribute in the
/// named group "imgUrl". The attribute value may be unquoted, single-quoted or
/// double-quoted; the capture stops at the first whitespace, quote or angle bracket.
/// Built once and reused, since the pattern never changes between calls.
/// </summary>
private static readonly Regex HtmlImgSrcRegex = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the src URL of every &lt;img&gt; tag found in an HTML fragment.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan. May be null or empty.</param>
/// <returns>
/// One entry per matched &lt;img&gt; tag, in document order. An entry is an empty
/// string when the tag's src value is empty. Returns an empty array (never null)
/// when the input is null, empty, or contains no matching tag.
/// </returns>
private string[] GetHtmlImageUrlList(string sHtmlText)
{
    // Regex.Matches throws ArgumentNullException on null input; treat "no markup"
    // as "no images" instead.
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[0];
    }

    MatchCollection matches = HtmlImgSrcRegex.Matches(sHtmlText);
    string[] sUrlList = new string[matches.Count];

    // Copy the captured src value of each match, preserving document order.
    for (int i = 0; i < sUrlList.Length; i++)
    {
        sUrlList[i] = matches[i].Groups["imgUrl"].Value;
    }

    return sUrlList;
}
方法2:下载网页HTML源码
/// <summary>
/// Issues an HTTP request for <paramref name="url"/> and returns the response
/// body decoded as UTF-8 text. This is a best-effort helper: it never throws,
/// and reports failure through its return value.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The response body when the server answers 200 OK.
/// An empty string when the server answers with any other status, or when the
/// request fails with a <see cref="System.Net.WebException"/> other than HTTP 306.
/// Null when the URL does not produce an HTTP request, the server answers
/// HTTP 306, reading the body fails, or any other exception occurs.
/// </returns>
public static string DownloadHtml(string url)
{
    try
    {
        var request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returned a non-HTTP request type (e.g. file:// or ftp://).
            return null;
        }

        // HttpWebRequest.Timeout is in milliseconds; RequestModels.Timeout is
        // presumably configured in seconds (the original note mentioned 30s) - confirm.
        request.Timeout = RequestModels.Timeout * 1000;
        request.UserAgent = RequestModels.UserAgent;
        request.ContentType = RequestModels.ContentType;
        request.Headers.Add("Cookie", RequestModels.Cookie);

        using (var response = request.GetResponse() as HttpWebResponse)
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                // Previously the request URL itself was returned here, which callers
                // would mistake for page content. Report "no content" instead, the
                // same value used for other HTTP-level failures below.
                return string.Empty;
            }

            try
            {
                // The body is always decoded as UTF-8; pages served in another
                // charset (e.g. GB2312) will come back garbled.
                using (var reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                Log.Error(string.Format("DownloadHtml抓取{0}保存失败", url), ex);
                return null;
            }
        }
    }
    catch (System.Net.WebException ex)
    {
        // Detect HTTP 306 from the status code so the check does not depend on the
        // UI culture of the exception message. The localized message comparison is
        // kept as a fallback for the case where no response object is attached.
        bool isStatus306;
        using (var errorResponse = ex.Response as HttpWebResponse)
        {
            isStatus306 = errorResponse != null && (int)errorResponse.StatusCode == 306;
        }

        if (isStatus306 || ex.Message.Equals("远程服务器返回错误: (306)。"))
        {
            return null;
        }

        // Any other transport or protocol failure (timeout, 4xx, 5xx, DNS, ...)
        // yields an empty string, as it always has.
        return string.Empty;
    }
    catch (Exception)
    {
        // Invalid URL, configuration problems, etc.: swallow and signal failure.
        return null;
    }
}