using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace _03通过webclient来提取Email地址
{
// Console demo: downloads HTML pages with WebClient, then uses regular
// expressions to extract e-mail addresses, image URLs and hyperlink targets
// from the downloaded markup.
class Program
{
    // Page that is scanned for e-mail addresses.
    private const string EmailPageUrl = @"http://www.360doc.com/content/16/0622/21/6598516_569892154.shtml";

    // Page whose <img> tags are downloaded to disk.
    private const string ImagePageUrl = @"http://www.27270.com/ent/meinvtupian/";

    // Page whose <a href="..."> targets are listed.
    private const string LinkPageUrl = @"https://www.baidu.com/";

    // Folder that receives the downloaded images.
    private const string ImageDirectory = @"D:\img";

    // The domain part accepts digits so that addresses such as "name@163.com" are matched.
    private static readonly Regex EmailPattern =
        new Regex(@"[-a-zA-Z0-9_.]+@[-a-zA-Z0-9]+(\.[a-zA-Z0-9]+)+");

    // Group 1 is the value of the src attribute; src may appear anywhere inside the tag,
    // e.g. <img src="http://host/a.jpg" width="190" height="280" alt="">.
    private static readonly Regex ImagePattern =
        new Regex(@"<img\s(?:[^>]*?\s)?src=""([^""]+)""", RegexOptions.IgnoreCase);

    // Group 1 is the value of the href attribute. The tag interior is matched with a
    // lazy [^>]*? so several anchors on the same line each produce their own match.
    private static readonly Regex LinkPattern =
        new Regex(@"<a\s(?:[^>]*?\s)?href=""([^""]+)""", RegexOptions.IgnoreCase);

    // Entry point: runs the three extraction demos in order and then waits for a key press.
    // args is not used.
    static void Main(string[] args)
    {
        // One client is shared by every request and disposed when the demos finish.
        using (WebClient client = new WebClient())
        {
            PrintEmails(client);
            DownloadImages(client);
            PrintLinks(client);
        }

        Console.ReadKey();
    }

    // Downloads EmailPageUrl and prints every e-mail address found, followed by the match count.
    private static void PrintEmails(WebClient client)
    {
        string html = client.DownloadString(EmailPageUrl);
        MatchCollection matches = EmailPattern.Matches(html);

        foreach (Match match in matches)
        {
            Console.WriteLine(match.Value);
        }

        Console.WriteLine(matches.Count + "个");
    }

    // Downloads ImagePageUrl, prints the src of every <img> tag and saves each image
    // into ImageDirectory. A failed download is reported and skipped so that the
    // remaining images are still fetched.
    private static void DownloadImages(WebClient client)
    {
        string html = client.DownloadString(ImagePageUrl);
        Uri pageUri = new Uri(ImagePageUrl);

        // DownloadFile does not create missing folders, so make sure the target exists.
        Directory.CreateDirectory(ImageDirectory);

        int index = 0;
        foreach (Match match in ImagePattern.Matches(html))
        {
            string source = match.Groups[1].Value;
            Console.WriteLine(source);

            // Resolve relative and protocol-relative src values against the page address.
            Uri imageUri;
            if (!Uri.TryCreate(pageUri, source, out imageUri))
            {
                Console.Error.WriteLine("Skipping invalid image URL: " + source);
                continue;
            }

            // The running index keeps names unique even if two timestamps coincide.
            string fileName = DateTime.Now.ToFileTime() + "_" + index + ".jpg";
            index++;
            string targetPath = Path.Combine(ImageDirectory, fileName);

            try
            {
                client.DownloadFile(imageUri, targetPath);
            }
            catch (WebException ex)
            {
                Console.Error.WriteLine("Failed to download " + imageUri + ": " + ex.Message);
            }
        }
    }

    // Downloads LinkPageUrl and prints, for every anchor tag found, the matched tag
    // fragment followed by its href value.
    private static void PrintLinks(WebClient client)
    {
        string html = client.DownloadString(LinkPageUrl);

        foreach (Match match in LinkPattern.Matches(html))
        {
            Console.WriteLine(match.Value);
            Console.WriteLine(match.Groups[1].Value);
        }
    }
}
}