**此代码用于提取百度贴吧中某个帖子里的所有邮箱地址。**
![运行测试:](https://img-blog.csdn.net/20170103152859778?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvbW9vbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
![这里写图片描述](https://img-blog.csdn.net/20170103153126548?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvbW9vbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
![这里写图片描述](https://img-blog.csdn.net/20170103153136873?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvbW9vbXg=/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/SouthEast)
package regex;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.filechooser.FileSystemView;
/**
 * Command-line crawler that collects the e-mail addresses posted in one Baidu Tieba thread
 * and writes them to a text file.
 *
 * <p>Workflow:
 * <ol>
 *   <li>Read the thread URL from standard input.</li>
 *   <li>Download the thread and extract its {@code <title>}.</li>
 *   <li>On the first page, locate the "total pages" marker
 *       (e.g. {@code </span>回复贴,共<span class="red">30</span>页</li>}) and parse the page count.</li>
 *   <li>Walk every page ({@code pn=1..N}) and collect everything matching the e-mail regex
 *       into a set, which removes duplicates.</li>
 *   <li>Write the set to {@code <home dir>/E-mailFile/<yyyy_MM_dd><title>.txt}, where the home
 *       directory is whatever {@link FileSystemView#getHomeDirectory()} reports.</li>
 * </ol>
 */
public class NetSpider2 {

    /** Charset used to decode every downloaded page. */
    private static final String PAGE_CHARSET = "UTF-8";

    /** Regex matching one e-mail address inside a line of HTML. */
    private static final String MAIL_REGEX = "\\w+@\\w+(\\.\\w+)+";

    /** Regex matching the line that carries the total page count; group 1 is the count. */
    private static final String PAGE_COUNT_REGEX =
            ".*</span>回复贴,共<span class=\"red\">(\\d++)</span>页</li>.*";

    /** Matches a line containing a complete {@code <title>} element; group 1 is the title text. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(".*<title>(.*)</title>.*");

    /** Characters that are not allowed in file names on common file systems. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS =
            Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]+");

    /** Name of the output folder created below the directory reported by FileSystemView. */
    private static final String OUTPUT_DIR_NAME = "E-mailFile";

    /**
     * Program entry point: asks for a thread URL, crawls it and stores the result.
     *
     * @param args ignored
     * @throws IOException if reading the input, downloading a page or writing the file fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("请输入地址:");
        String input = new BufferedReader(new InputStreamReader(System.in)).readLine();
        // readLine() yields null on end-of-stream; there is nothing to crawl without a URL.
        if (input == null || input.trim().isEmpty()) {
            System.out.println("地址不能为空!");
            return;
        }
        String threadUrl = input.trim();

        // Start the clock only after the user finished typing, so that the reported
        // duration covers the crawl itself and not the time spent waiting for input.
        long begin = System.currentTimeMillis();

        String title = getTitleForNet(threadUrl);
        System.out.println("帖子标题: "+title);
        Set<String> mails = getEmilForNet(threadUrl, MAIL_REGEX, PAGE_COUNT_REGEX);
        toDesktopFile(mails, threadUrl, title);

        long elapsedMillis = System.currentTimeMillis() - begin;
        System.out.println("耗时:" + elapsedMillis/1000 + "秒");
    }

    /**
     * Downloads the given page and returns the text of its {@code <title>} element.
     *
     * <p>Only a title whose opening and closing tag sit on the same line is recognised.
     *
     * @param url_str URL of the page to read
     * @return the title text, or {@code null} when no line contains a complete title element
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static String getTitleForNet(String url_str) throws IOException {
        URL url = new URL(url_str);
        try (InputStream in = url.openConnection().getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, PAGE_CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher titleMatcher = TITLE_PATTERN.matcher(line);
                if (titleMatcher.matches()) {
                    return titleMatcher.group(1);
                }
            }
        }
        return null;
    }

    /**
     * Writes every entry of the set, one per line, to
     * {@code <home dir>/E-mailFile/<yyyy_MM_dd><title>.txt}, creating the folder when needed.
     * The home directory is the one returned by {@link FileSystemView#getHomeDirectory()}.
     *
     * @param set_mail addresses to store
     * @param url_str  URL the addresses came from (currently not used)
     * @param title    thread title used in the file name; characters that are illegal in file
     *                 names are replaced by {@code _}, and {@code null} or a blank title
     *                 becomes {@code untitled}
     * @throws IOException if the folder cannot be created or the file cannot be written
     */
    public static void toDesktopFile(Set<String> set_mail, String url_str,
            String title) throws IOException {
        File desktopDir = FileSystemView.getFileSystemView().getHomeDirectory();
        // File(parent, child) inserts the platform's own separator instead of a hard-coded "\".
        File outputDir = new File(desktopDir, OUTPUT_DIR_NAME);
        if (!outputDir.isDirectory()) {
            System.out.println("目录不存在,正在创建");
            if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
                throw new IOException("Cannot create directory: " + outputDir.getAbsolutePath());
            }
            System.out.println("创建成功!");
        }

        DateFormat format = new SimpleDateFormat("yyyy_MM_dd");
        String time = format.format(new Date());
        File outputFile = new File(outputDir, time + toSafeFileName(title) + ".txt");
        System.out.println("文件地址:" + outputFile.getAbsolutePath());

        // try-with-resources closes (and thereby flushes) the writer even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
            for (String mail : set_mail) {
                writer.write(mail);
                writer.newLine();
            }
        }
        System.out.println("存储完毕!");
    }

    /**
     * Crawls every page of a thread and collects all substrings matching the e-mail regex.
     *
     * <p>Pages are requested as {@code url_str?pn=N} (or {@code url_str&pn=N} when the URL
     * already has a query string). The total number of pages is read from the first line of
     * page 1 that fully matches {@code pn_regex}. If no such line exists, only page 1 is crawled.
     *
     * @param url_str  URL of the thread
     * @param str_mail regex describing one e-mail address
     * @param pn_regex regex that must match the whole line holding the page count and expose
     *                 the count as capturing group 1
     * @return the distinct addresses found; never {@code null}
     * @throws IOException if a URL is malformed or a page cannot be read
     */
    public static Set<String> getEmilForNet(String url_str, String str_mail,
            String pn_regex) throws IOException {
        Set<String> mails = new HashSet<String>();
        Pattern mailPattern = Pattern.compile(str_mail);
        Pattern pageCountPattern = Pattern.compile(pn_regex);
        String pageParamPrefix = (url_str.indexOf('?') >= 0 ? "&" : "?") + "pn=";

        // Until the marker is found we only know that page 1 exists. Defaulting to a huge
        // page count would send tens of thousands of requests whenever the marker is missing.
        int lastPage = 1;
        boolean lastPageKnown = false;

        for (int page = 1; page <= lastPage; page++) {
            URL url = new URL(url_str + pageParamPrefix + page);
            try (InputStream in = url.openConnection().getInputStream();
                 BufferedReader reader = new BufferedReader(new InputStreamReader(in, PAGE_CHARSET))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (!lastPageKnown) {
                        Matcher pageCountMatcher = pageCountPattern.matcher(line);
                        if (pageCountMatcher.matches()) {
                            lastPage = Integer.parseInt(pageCountMatcher.group(1));
                            lastPageKnown = true;
                        }
                    }
                    Matcher mailMatcher = mailPattern.matcher(line);
                    while (mailMatcher.find()) {
                        mails.add(mailMatcher.group());
                    }
                }
            }
            System.out.println("正在爬取第 "+page+" 页,共 "+lastPage+" 页.");
        }
        return mails;
    }

    /** Turns a thread title into a string that is safe to embed in a file name. */
    private static String toSafeFileName(String title) {
        if (title == null) {
            return "untitled";
        }
        String cleaned = ILLEGAL_FILE_NAME_CHARS.matcher(title).replaceAll("_").trim();
        return cleaned.isEmpty() ? "untitled" : cleaned;
    }
}