Demo源码如下:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
namespace _12提取html中的所有的Email地址
{
/// <summary>
/// Demo: extracts every e-mail address from an HTML file with a regular
/// expression and tallies how many addresses belong to each common provider.
/// </summary>
class Program
{
    /// <summary>
    /// Reads "1.htm", prints every e-mail address found (together with its
    /// capture groups), then prints per-provider statistics and waits for a key.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string html = File.ReadAllText("1.htm");

        // Extract e-mail addresses; the parentheses define capture groups:
        //   group 1: user name
        //   group 2: first label of the domain (e.g. "163", "gmail")
        //   group 3: a dotted suffix (e.g. ".com"); the group may repeat
        // The suffix group must be "\.[a-zA-Z0-9]+" (one or more characters
        // after the dot). With a single character per repetition, an address
        // such as "user@163.com" would be truncated to "user@163.c".
        string regEmail = @"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9]+)(\.[a-zA-Z0-9]+)+";
        MatchCollection mc = Regex.Matches(html, regEmail);

        // Tally the users of the common e-mail providers:
        // 163, 126, gmail, qq, sohu, sina, yahoo, hotmail.
        int count_163 = 0;
        int count_126 = 0;
        int count_gmail = 0;
        int count_qq = 0;
        int count_sohu = 0;
        int count_sina = 0;
        int count_yahoo = 0;
        int count_hotmail = 0;

        foreach (Match match in mc)
        {
            // match.Groups[0].Value holds the same value as match.Value:
            // the complete matched e-mail string.
            Console.WriteLine(match.Value);
            Console.WriteLine(match.Groups[0].Value); // 0: complete e-mail address
            Console.WriteLine(match.Groups[1].Value); // 1: user name
            Console.WriteLine(match.Groups[2].Value); // 2: domain name (first label)
            // 3: dotted suffix; when the group repeats, Value is the last capture.
            Console.WriteLine(match.Groups[3].Value);

            Console.WriteLine(match.Value); // print every e-mail address

            // Capture groups are read through match.Groups[]. Index 0 is the whole
            // match, so the real groups start at index 1.
            // ToLowerInvariant keeps the comparison culture-independent (with
            // ToLower, "SINA" would not become "sina" under a Turkish culture).
            switch (match.Groups[2].Value.ToLowerInvariant())
            {
                case "163":
                    count_163++;
                    break;
                case "126":
                    count_126++;
                    break;
                case "gmail":
                    count_gmail++;
                    break;
                case "qq":
                    count_qq++;
                    break;
                case "sohu":
                    count_sohu++;
                    break;
                case "sina":
                    count_sina++;
                    break;
                case "yahoo":
                    count_yahoo++;
                    break;
                case "hotmail":
                    count_hotmail++;
                    break;
            }
        }

        Console.WriteLine("=============统计信息============");
        Console.WriteLine("邮箱总数:{0}", mc.Count);
        Console.WriteLine("网易163邮箱用户数:{0}", count_163);
        Console.WriteLine("网易126邮箱用户数:{0}", count_126);
        Console.WriteLine("gmail邮箱用户数:{0}", count_gmail);
        Console.WriteLine("qq邮箱用户数:{0}", count_qq);
        Console.WriteLine("sohu邮箱用户数:{0}", count_sohu);
        Console.WriteLine("sina邮箱用户数:{0}", count_sina);
        Console.WriteLine("yahoo邮箱用户数:{0}", count_yahoo);
        Console.WriteLine("hotmail邮箱用户数:{0}", count_hotmail);

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
}
}
“1.htm”截图如下:
通过调试控制,查看获取的全部网页字符串,截图如下:
输出结果如下: