以下是我写的一个从网页中抓取EMail的方法,可以处理带分页的link。用这个程序,我一下子从一个网页中提取到3000多个EMail(哈哈,发垃圾邮件的人是不是也这样做的??)
1
// Caller: starts e-mail extraction for the given page on a new thread,
// then goes on to process the links found in that page.
private void GetAllURL(string urlStr)
{
    // GetEmailAddress takes an object argument so it can serve as a ParameterizedThreadStart target.
    ParameterizedThreadStart extractEmails = new ParameterizedThreadStart(GetEmailAddress);
    Thread worker = new Thread(extractEmails);
    worker.Start(urlStr);
    ... // process the links in the page (elided in the original listing)
}
5![](/Images/OutliningIndicators/ExpandedBlockStart.gif)
/// <summary>
/// Extracts the e-mail addresses found in a web page and hands each one,
/// followed by a CRLF, to AppendText through Invoke.
/// </summary>
/// <param name="urlStr">Address of the web page; must be a string (typed as object so the
/// method can be started through ParameterizedThreadStart).</param>
private void GetEmailAddress(object urlStr)
{
    string pageUrl = (string)urlStr;

    // The named group "EmailStr" is what GetWebInfo pulls out of every match.
    ArrayList addresses = GetWebInfo(pageUrl, @"(?<EmailStr>\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)");

    // NOTE(review): Invoke is presumably Control.Invoke, marshalling the call to the UI thread - confirm.
    AppendTextDelegate appender = new AppendTextDelegate(AppendText);
    for (int i = 0; i < addresses.Count; i++)
    {
        object[] args = new object[] { string.Concat(addresses[i], "\r\n") };
        Invoke(appender, args);
    }
}
17
18
/// <summary>
/// Downloads the page at <paramref name="URlStr"/> with an HTTP GET and returns the text
/// captured by a named group of <paramref name="RegExpress"/> for every match in the page.
/// </summary>
/// <param name="URlStr">Absolute URL of the page to download.</param>
/// <param name="RegExpress">Regular expression, matched case-insensitively. The name between the
/// first "(?&lt;" and the following "&gt;" selects the group to return; when the pattern contains
/// no such marker, the whole match is returned instead.</param>
/// <returns>An ArrayList of trimmed strings, one per match, in match order.</returns>
private ArrayList GetWebInfo(string URlStr, string RegExpress)
{
    // Open the requested page.
    HttpWebRequest webRequest1 = (HttpWebRequest)WebRequest.Create(new Uri(URlStr));
    webRequest1.Method = "GET";

    // Dispose the response, its stream and the reader deterministically; otherwise the
    // connection stays checked out until finalization, and this method runs once per page.
    string textData;
    using (HttpWebResponse response = (HttpWebResponse)webRequest1.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
    {
        // NOTE(review): Encoding.Default ignores the charset declared by the server; kept
        // unchanged so existing pages decode exactly as before.
        textData = reader.ReadToEnd();
    }

    // Extract the requested content with the regular expression (one named group).
    Regex r = new Regex(RegExpress, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Find the group name in the pattern text. "0" addresses the whole match and is used
    // when no "(?<name>" marker can be located, instead of failing on a negative index.
    string DestionKey = "0";
    int pos1 = RegExpress.IndexOf("(?<", StringComparison.Ordinal);
    if (pos1 >= 0)
    {
        int pos2 = RegExpress.IndexOf(">", pos1, StringComparison.Ordinal);
        if (pos2 > pos1 + 3)
        {
            DestionKey = RegExpress.Substring(pos1 + 3, pos2 - pos1 - 3);
        }
    }

    string replacement = "${" + DestionKey + "}";
    ArrayList Result = new ArrayList();
    for (Match m = r.Match(textData); m.Success; m = m.NextMatch())
    {
        Result.Add(m.Result(replacement).Trim());
    }
    return Result;
}
44
![](/Images/OutliningIndicators/None.gif)
2
![](/Images/OutliningIndicators/None.gif)
3
![](/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](/Images/OutliningIndicators/ContractedBlock.gif)
![](https://www.cnblogs.com/Images/dot.gif)
4
![](/Images/OutliningIndicators/ExpandedBlockEnd.gif)
... //处理页面中的Link
}
5
![](/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](/Images/OutliningIndicators/ContractedBlock.gif)
6
![](/Images/OutliningIndicators/InBlock.gif)
7
![](/Images/OutliningIndicators/InBlock.gif)
8
![](/Images/OutliningIndicators/ExpandedBlockEnd.gif)
9
![](/Images/OutliningIndicators/None.gif)
10
![](/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](/Images/OutliningIndicators/ContractedBlock.gif)
![](https://www.cnblogs.com/Images/dot.gif)
11
![](/Images/OutliningIndicators/InBlock.gif)
12
![](/Images/OutliningIndicators/InBlock.gif)
13
![](/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
![](/Images/OutliningIndicators/ContractedSubBlock.gif)
![](https://www.cnblogs.com/Images/dot.gif)
14
![](/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
![](/Images/OutliningIndicators/ContractedSubBlock.gif)
![](https://www.cnblogs.com/Images/dot.gif)
15
![](/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
16
![](/Images/OutliningIndicators/ExpandedBlockEnd.gif)
17
![](/Images/OutliningIndicators/None.gif)
18
![](/Images/OutliningIndicators/None.gif)
19
![](/Images/OutliningIndicators/ExpandedBlockStart.gif)
![](/Images/OutliningIndicators/ContractedBlock.gif)
![](https://www.cnblogs.com/Images/dot.gif)
20
![](/Images/OutliningIndicators/InBlock.gif)
21
![](/Images/OutliningIndicators/InBlock.gif)
22
![](/Images/OutliningIndicators/InBlock.gif)
23
![](/Images/OutliningIndicators/InBlock.gif)
24
![](/Images/OutliningIndicators/InBlock.gif)
25
![](/Images/OutliningIndicators/InBlock.gif)
26
![](/Images/OutliningIndicators/InBlock.gif)
27
![](/Images/OutliningIndicators/InBlock.gif)
28
![](/Images/OutliningIndicators/InBlock.gif)
29
![](/Images/OutliningIndicators/InBlock.gif)
30
![](/Images/OutliningIndicators/InBlock.gif)
31
![](/Images/OutliningIndicators/InBlock.gif)
32
![](/Images/OutliningIndicators/InBlock.gif)
33
![](/Images/OutliningIndicators/InBlock.gif)
34
![](/Images/OutliningIndicators/InBlock.gif)
35
![](/Images/OutliningIndicators/InBlock.gif)
36
![](/Images/OutliningIndicators/InBlock.gif)
37
![](/Images/OutliningIndicators/InBlock.gif)
38
![](/Images/OutliningIndicators/ExpandedSubBlockStart.gif)
![](/Images/OutliningIndicators/ContractedSubBlock.gif)
![](https://www.cnblogs.com/Images/dot.gif)
39
![](/Images/OutliningIndicators/InBlock.gif)
40
![](/Images/OutliningIndicators/InBlock.gif)
41
![](/Images/OutliningIndicators/ExpandedSubBlockEnd.gif)
42
![](/Images/OutliningIndicators/InBlock.gif)
43
![](/Images/OutliningIndicators/ExpandedBlockEnd.gif)
44
![](/Images/OutliningIndicators/None.gif)
上述代码中的关键是书写提取EMail的表达式:
@"(?<EmailStr>\b[A-Z0-9._%-]+@[A-Z0-9._%-]+\.[A-Z]{2,4}\b)"
以下是我写的一个程序界面及运行结果:
![](https://i-blog.csdnimg.cn/blog_migrate/a13cc0cb3cbc620e4c7a6a851f9fb2b5.jpeg)