看完了马士兵老师的正则表达式视频,想到了贴吧各种留邮箱,于是写了这个小工具。
输入网址(要加 http://,也可以单击“网址”标签直接粘贴剪贴板中的网址),程序会提取该网页中匹配到的邮箱地址,包括使用全角“＠”和“．”书写的邮箱。
截图:
代码:
//import java.awt.*;
import java.awt.datatransfer.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
/**
 * Small Swing tool that downloads a web page and lists the e-mail addresses found in it.
 *
 * <p>For Baidu Tieba thread URLs (prefix {@code http://tieba.baidu.com/p/}) every page of the
 * thread is scanned; for any other URL only the given page is scanned. Addresses written with
 * the full-width characters U+FF20 (at sign) and U+FF0E (full stop) are recognised and
 * normalised to their ASCII forms.
 *
 * <p>The instance is the listener for its own widgets: a left click on the URL label pastes
 * the clipboard into the URL field, Enter in the URL field or the extract button starts a
 * scan, and the second button copies the result area to the clipboard.
 */
public class EmailSpider extends MouseAdapter implements ActionListener, KeyListener {
    private static final String INIT_URL = "http://tieba.baidu.com/p/2286659953?pn=4";
    private static final String URL_PREFIX = "http://tieba.baidu.com/p/"; // Tieba thread URL prefix
    private static final String WORDS = "\n\n 说明:\n 百度贴吧帖子获取所有页面的邮箱,其他的获取当前页。\n 匹配不全肯定是有的,只做了粗略匹配。\n 也有可能只匹配到一段,有些人邮箱输入方式有点奇葩。\n\n 单击“网址”可以粘贴剪贴板的网址。\n 右键菜单没有写,反正可以Ctrl+C Ctrl+V\n 提取时请不要进行操作。";

    /** Rough e-mail matcher; also accepts the full-width at sign and full stop. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[\\w.-[\uff0e]]+[@[\uff20]][\\w-]+[\\.[\uff0e]]?[\\w.-[\uff0e]]*\\w");
    /** Tieba thread URL; group 1 is the numeric thread id. */
    private static final Pattern TIEBA_URL_PATTERN =
            Pattern.compile(Pattern.quote(URL_PREFIX) + "([0-9]{1,10})");
    /** Page-count marker looked for in the page source; group 1 is the count. */
    private static final Pattern TOTAL_PAGES_PATTERN = Pattern.compile("all_page_num:([0-9]{1,6})");
    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    private static final String DEFAULT_CHARSET = "UTF-8";
    private static final int UNKNOWN_PAGE_COUNT = -1;
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 20000;

    /** Addresses collected during the current scan, de-duplicated, in order of discovery. */
    private final Set<String> emails = new LinkedHashSet<String>();
    private final JFrame frame;
    private final JTextArea textArea;
    private final JTextField textField;
    private final JButton button1, button2;
    private final JCheckBox checkBox;
    private final JLabel label1, label2;
    private final Clipboard clipboard; // system clipboard
    /** True while a worker started by {@link #startAnalyse()} is running. */
    private volatile boolean running;

    /** Builds the fixed-size window with absolute positioning and shows it. */
    public EmailSpider() {
        frame = new JFrame("网页Email提取v1.0 by kyda");
        frame.setLayout(null);
        frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        clipboard = frame.getToolkit().getSystemClipboard();

        label1 = new JLabel("<html><font size=4 color=green>网址:</font></html>");
        label1.setBounds(20, 10, 50, 30);
        label1.addMouseListener(this); // click-to-paste

        textField = new JTextField(INIT_URL);
        textField.setBounds(70, 12, 310, 26);
        textField.addKeyListener(this); // Enter starts the scan

        button1 = new JButton("<html><font size=4 color=blue>提取</font></html>");
        button1.setBounds(400, 10, 60, 30);
        button1.addActionListener(this);

        textArea = new JTextArea(WORDS);
        textArea.setLineWrap(true);
        JScrollPane scrollPane = new JScrollPane(textArea);
        scrollPane.setBounds(10, 50, 390, 290);
        scrollPane.setBorder(BorderFactory.createLoweredSoftBevelBorder());

        // Plain-text check box with an enlarged font (the original author noted that an
        // HTML label rendered slightly off-position).
        checkBox = new JCheckBox("换行");
        checkBox.setFont(checkBox.getFont().deriveFont(16f));
        checkBox.setSelected(true);
        checkBox.setBounds(410, 100, 90, 30);

        button2 = new JButton("<html><font size=3>复制到<br>剪贴板</font></html>");
        button2.setBounds(410, 160, 70, 40);
        button2.addActionListener(this);

        label2 = new JLabel("<html><font size=4 color=green>请选择操作:</font></html>");
        label2.setBounds(10, 340, 480, 30);

        frame.add(label1);
        frame.add(textField);
        frame.add(button1);
        frame.add(scrollPane);
        frame.add(checkBox);
        frame.add(button2);
        frame.add(label2);
        frame.setSize(500, 400);
        frame.setLocationRelativeTo(null); // centre on screen
        frame.setResizable(false);
        frame.setVisible(true);
    }

    /** Starts a scan (extract button) or copies the result area to the clipboard. */
    @Override
    public void actionPerformed(ActionEvent e) {
        Object source = e.getSource();
        if (source == button1) {
            startAnalyse();
        } else if (source == button2) {
            String text = textArea.getText();
            if (text.length() > 0) {
                StringSelection selection = new StringSelection(text);
                clipboard.setContents(selection, selection);
                label2.setText("<html><font color=red>已复制到剪贴板。</font></html>");
            }
        }
    }

    /** Left click on the URL label: paste the clipboard's string content into the URL field. */
    @Override
    public void mouseClicked(MouseEvent e) {
        if (e.getButton() != MouseEvent.BUTTON1) {
            return;
        }
        try {
            Transferable contents = clipboard.getContents(frame);
            if (contents == null) { // getContents may return null for an empty clipboard
                label2.setText("剪贴板无内容!");
                return;
            }
            String pasted = contents.getTransferData(DataFlavor.stringFlavor).toString();
            if (pasted.length() == 0) {
                label2.setText("剪贴板无内容!");
            } else {
                textField.setText(pasted);
                label2.setText("网址粘贴成功!");
            }
        } catch (UnsupportedFlavorException ex) {
            label2.setText("剪贴板内容不是字符串!");
        } catch (IOException | IllegalStateException ex) {
            // Clipboard data vanished or the clipboard is busy: report instead of swallowing.
            label2.setText("剪贴板读取失败!");
        }
    }

    /** Pressing Enter in the URL field starts a scan. */
    @Override
    public void keyPressed(KeyEvent e) {
        if (e.getKeyCode() == KeyEvent.VK_ENTER) {
            startAnalyse();
        }
    }

    @Override
    public void keyReleased(KeyEvent arg0) {
    }

    @Override
    public void keyTyped(KeyEvent arg0) {
    }

    /**
     * Runs the scan on a background thread so the UI keeps repainting while pages download.
     * A request made while a scan is still running is ignored. Intended to be called from
     * the event dispatch thread (the check-then-set on {@code running} relies on that).
     */
    public void startAnalyse() {
        if (running) {
            return;
        }
        running = true;
        final String address = textField.getText(); // read widget state on the calling thread
        new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    analyse(address);
                } finally {
                    running = false;
                }
            }
        }, "email-spider-worker").start();
    }

    /** Scans the URL currently in the URL field on the calling thread and shows the result. */
    public void analyse() {
        analyse(textField.getText());
    }

    /**
     * Adds every e-mail address found in {@code str} to the current result set.
     *
     * @param str one line of page source; {@code null} is treated as containing no address
     */
    public void getEmailAddr(String str) {
        emails.addAll(extractEmails(str));
    }

    /** Installs the platform look and feel and opens the window on the event dispatch thread. */
    public static void main(String[] args) {
        try {
            // Resolves to the Windows look and feel on Windows, the native one elsewhere.
            UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
        } catch (ReflectiveOperationException | UnsupportedLookAndFeelException e) {
            System.err.println("Falling back to the default look and feel: " + e);
        }
        SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                new EmailSpider();
            }
        });
    }

    /**
     * Returns all e-mail addresses matched in {@code text}, in order of appearance, with
     * full-width at signs / full stops replaced by their ASCII forms. Duplicates are kept.
     * Package-private so it can be tested without opening a window.
     */
    static List<String> extractEmails(String text) {
        List<String> found = new ArrayList<String>();
        if (text == null) {
            return found;
        }
        Matcher m = EMAIL_PATTERN.matcher(text); // fresh matcher: safe on any thread
        while (m.find()) {
            found.add(m.group().replace('\uff20', '@').replace('\uff0e', '.'));
        }
        return found;
    }

    /** Joins addresses, terminating each with ';' and, if {@code newline}, a line break. */
    static String formatResult(Collection<String> addresses, boolean newline) {
        String terminator = newline ? ";\n" : ";";
        StringBuilder out = new StringBuilder();
        for (String address : addresses) {
            out.append(address).append(terminator);
        }
        return out.toString();
    }

    /**
     * Picks the charset named in a Content-Type header value, or UTF-8 when the header is
     * missing, names no charset, or names one this JVM does not support.
     */
    static String charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find()) {
                String name = m.group(1);
                try {
                    if (java.nio.charset.Charset.isSupported(name)) {
                        return name;
                    }
                } catch (IllegalArgumentException ignored) {
                    // Syntactically illegal charset name: fall through to the default.
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Scans {@code rawUrl} (all pages for a Tieba thread, otherwise one page). */
    private void analyse(String rawUrl) {
        setStatus("邮箱地址提取中……");
        emails.clear();
        try {
            String address = rawUrl == null ? "" : rawUrl.trim();
            if (address.isEmpty()) {
                setStatus("请输入正确的网址!");
                return;
            }
            Matcher tieba = TIEBA_URL_PATTERN.matcher(address);
            boolean tiebaThread = tieba.find();
            String threadId = tiebaThread ? tieba.group(1) : null;
            // Tieba: page count is read from page 1. Anything else: just the given page.
            int totalPages = tiebaThread ? UNKNOWN_PAGE_COUNT : 1;

            for (int page = 1; page == 1 || page <= totalPages; ++page) {
                String pageUrl = tiebaThread ? URL_PREFIX + threadId + "?pn=" + page : address;
                List<String> lines = fetchLines(pageUrl);
                if (totalPages == UNKNOWN_PAGE_COUNT) {
                    totalPages = findTotalPages(lines);
                    if (totalPages == UNKNOWN_PAGE_COUNT) {
                        setStatus("貌似出错了~你真确定有这个帖子?");
                        return;
                    }
                }
                setStatus("正在提取页面:" + page + " 共" + totalPages + "页 地址:" + pageUrl);
                for (String pageLine : lines) {
                    getEmailAddr(pageLine);
                }
            }
            showResult(new ArrayList<String>(emails));
        } catch (MalformedURLException | IllegalArgumentException | UnknownHostException e) {
            setStatus("请输入正确的网址!");
        } catch (IOException e) {
            setStatus("读取网页失败:" + e);
        } finally {
            emails.clear(); // never leak addresses from this run into the next one
        }
    }

    /** Downloads {@code address} once and returns its lines; the stream is always closed. */
    private static List<String> fetchLines(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        String charset = charsetOf(connection.getContentType());
        List<String> lines = new ArrayList<String>();
        try (InputStream in = connection.getInputStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {
            String current;
            while ((current = reader.readLine()) != null) {
                lines.add(current);
            }
        }
        return lines;
    }

    /** Returns the page count announced in the page source, or UNKNOWN_PAGE_COUNT. */
    private static int findTotalPages(List<String> lines) {
        for (String pageLine : lines) {
            Matcher m = TOTAL_PAGES_PATTERN.matcher(pageLine);
            if (m.find()) {
                return Integer.parseInt(m.group(1));
            }
        }
        return UNKNOWN_PAGE_COUNT;
    }

    /** Renders the final result; the line-break check box is read on the EDT. */
    private void showResult(final List<String> found) {
        runOnEdt(new Runnable() {
            @Override
            public void run() {
                textArea.setText(formatResult(found, checkBox.isSelected()));
                label2.setText("提取完成! 邮箱总计:" + found.size());
            }
        });
    }

    /** Updates the status label from any thread. */
    private void setStatus(final String text) {
        runOnEdt(new Runnable() {
            @Override
            public void run() {
                label2.setText(text);
            }
        });
    }

    /** Runs {@code task} now if already on the event dispatch thread, else queues it there. */
    private static void runOnEdt(Runnable task) {
        if (SwingUtilities.isEventDispatchThread()) {
            task.run();
        } else {
            SwingUtilities.invokeLater(task);
        }
    }
}