原理很简单:用输入流读取网页内容,再用正则表达式逐行匹配,凡是符合邮箱格式的字符串就记录下来。当然,用同样的办法也可以爬取其他内容,并且可以做得更复杂——例如遇到超链接或更深层的页面时继续进入并爬取,这其实就是一次搜索。这里的示例比较简单,只爬取单个网页。
package cn.hncu.dage.Spider;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Test;
/**
 * Minimal "spider" demo: reads text line by line from a local file or from a
 * web page and prints every substring that looks like an e-mail address.
 */
public class SpiderDemo {
    /**
     * Shape of an e-mail address as used by this demo: word characters, an
     * '@', then a domain made of at least two dot-separated word-character
     * labels. Compiled once and shared by both tests.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Scans the local file {@code Mails.html} (resolved against the working
     * directory) and prints each e-mail address found.
     *
     * @throws IOException if the file cannot be opened or read
     */
    @Test
    public void getMails() throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader("Mails.html"));
        try {
            printMails(reader);
        } finally {
            // Always release the file handle, even if reading fails.
            reader.close();
        }
    }

    /**
     * Downloads {@code http://www.sina.com} and prints each e-mail address
     * found in the response body.
     *
     * @throws IOException if the URL cannot be opened or read
     */
    @Test
    public void getMailsBynet() throws IOException {
        URL url = new URL("http://www.sina.com");
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
        try {
            printMails(reader);
        } finally {
            // Closing the reader also closes the underlying URL stream.
            reader.close();
        }
    }

    /**
     * Reads {@code reader} to the end and prints every match of
     * {@link #MAIL_PATTERN} on its own line. The caller owns the reader and
     * is responsible for closing it.
     *
     * @param reader source of the text to scan
     * @throws IOException if reading fails
     */
    private static void printMails(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher matcher = MAIL_PATTERN.matcher(line);
            while (matcher.find()) {
                System.out.println(matcher.group());
            }
        }
    }
}
改进版:增加了一个图形界面。
/*
* SpiderFrame.java
*
* Created on __DATE__, __TIME__
*/
package cn.hncu.dage.Spider.v;
/**
*
* @author __USER__
*/
/**
 * Top-level window of the spider GUI. It carries a File/Edit/Help menu bar
 * and shows a {@code SpiderPanel} as its content pane.
 */
public class SpiderFrame extends javax.swing.JFrame {
    /**
     * Builds the menu bar, positions the window at (400, 100) with a size of
     * 500x400 and installs a new {@code SpiderPanel} as the content pane.
     */
    public SpiderFrame() {
        initComponents();
        setBounds(400, 100, 500, 400);
        setContentPane(new SpiderPanel());
    }

    /**
     * Creates the menus and their items, attaches the menu bar, gives the
     * initial content pane a placeholder layout and packs the frame.
     * Only the "Exit" item has a listener; the other items are inert.
     * The original header warned that this method is regenerated by the
     * form editor.
     */
    //GEN-BEGIN:initComponents
    // <editor-fold defaultstate="collapsed" desc="Generated Code">
    private void initComponents() {
        // Frame-level settings.
        setDefaultCloseOperation(javax.swing.WindowConstants.EXIT_ON_CLOSE);
        setMinimumSize(new java.awt.Dimension(300, 400));

        // "File" menu: Open, Save, Save As ..., Exit.
        fileMenu = new javax.swing.JMenu("File");
        openMenuItem = new javax.swing.JMenuItem("Open");
        fileMenu.add(openMenuItem);
        saveMenuItem = new javax.swing.JMenuItem("Save");
        fileMenu.add(saveMenuItem);
        saveAsMenuItem = new javax.swing.JMenuItem("Save As ...");
        fileMenu.add(saveAsMenuItem);
        exitMenuItem = new javax.swing.JMenuItem("Exit");
        java.awt.event.ActionListener exitListener = new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                exitMenuItemActionPerformed(evt);
            }
        };
        exitMenuItem.addActionListener(exitListener);
        fileMenu.add(exitMenuItem);

        // "Edit" menu: Cut, Copy, Paste, Delete.
        editMenu = new javax.swing.JMenu("Edit");
        cutMenuItem = new javax.swing.JMenuItem("Cut");
        editMenu.add(cutMenuItem);
        copyMenuItem = new javax.swing.JMenuItem("Copy");
        editMenu.add(copyMenuItem);
        pasteMenuItem = new javax.swing.JMenuItem("Paste");
        editMenu.add(pasteMenuItem);
        deleteMenuItem = new javax.swing.JMenuItem("Delete");
        editMenu.add(deleteMenuItem);

        // "Help" menu: Contents, About.
        helpMenu = new javax.swing.JMenu("Help");
        contentsMenuItem = new javax.swing.JMenuItem("Contents");
        helpMenu.add(contentsMenuItem);
        aboutMenuItem = new javax.swing.JMenuItem("About");
        helpMenu.add(aboutMenuItem);

        // Assemble the bar in display order and attach it to the frame.
        menuBar = new javax.swing.JMenuBar();
        menuBar.add(fileMenu);
        menuBar.add(editMenu);
        menuBar.add(helpMenu);
        setJMenuBar(menuBar);

        // Placeholder layout on the initial content pane: one flexible gap
        // per axis (preferred 400 wide, 279 high). The constructor replaces
        // this pane afterwards.
        java.awt.Container pane = getContentPane();
        javax.swing.GroupLayout layout = new javax.swing.GroupLayout(pane);
        pane.setLayout(layout);

        javax.swing.GroupLayout.ParallelGroup horizontal =
                layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING);
        horizontal.addGap(0, 400, Short.MAX_VALUE);
        layout.setHorizontalGroup(horizontal);

        javax.swing.GroupLayout.ParallelGroup vertical =
                layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING);
        vertical.addGap(0, 279, Short.MAX_VALUE);
        layout.setVerticalGroup(vertical);

        pack();
    }// </editor-fold>
    //GEN-END:initComponents

    /**
     * Handler for the "Exit" menu item: terminates the JVM with status 0.
     *
     * @param evt the action event (unused)
     */
    private void exitMenuItemActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_exitMenuItemActionPerformed
        System.exit(0);
    }//GEN-LAST:event_exitMenuItemActionPerformed

    /**
     * Application entry point: schedules creation and display of the frame
     * on the AWT event queue.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String args[]) {
        Runnable showFrame = new Runnable() {
            public void run() {
                SpiderFrame frame = new SpiderFrame();
                frame.setVisible(true);
            }
        };
        java.awt.EventQueue.invokeLater(showFrame);
    }

    //GEN-BEGIN:variables
    // Variables declaration - do not modify
    private javax.swing.JMenuItem aboutMenuItem;
    private javax.swing.JMenuItem contentsMenuItem;
    private javax.swing.JMenuItem copyMenuItem;
    private javax.swing.JMenuItem cutMenuItem;
    private javax.swing.JMenuItem deleteMenuItem;
    private javax.swing.JMenu editMenu;
    private javax.swing.JMenuItem exitMenuItem;
    private javax.swing.JMenu fileMenu;
    private javax.swing.JMenu helpMenu;
    private javax.swing.JMenuBar menuBar;
    private javax.swing.JMenuItem openMenuItem;
    private javax.swing.JMenuItem pasteMenuItem;
    private javax.swing.JMenuItem saveAsMenuItem;
    private javax.swing.JMenuItem saveMenuItem;
    // End of variables declaration//GEN-END:variables
}
/*
* SpiderPanel.java
*
* Created on __DATE__, __TIME__
*/
package cn.hncu.dage.Spider.v;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.JOptionPane;
/**
*
* @author __USER__
*/
/**
 * Main panel of the spider GUI: a title, a text field for the web address,
 * a list that shows the e-mail addresses found on that page, and
 * "search" / "exit" buttons.
 */
public class SpiderPanel extends javax.swing.JPanel {
    /**
     * Accepts input that contains {@code www.<name>.<tld>} for a fixed set of
     * top-level domains. The dots are escaped so that only a literal '.'
     * separates the labels.
     */
    private static final Pattern WEB_ADDRESS_PATTERN =
            Pattern.compile("www\\.\\w+\\.(net|com|cn|org|cc|tv)");

    /**
     * Shape of an e-mail address: word characters, '@', then a domain of at
     * least two dot-separated word-character labels.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /** Creates new form SpiderPanel */
    public SpiderPanel() {
        initComponents();
        this.setBounds(400, 100, 500, 400);
    }

    /** This method is called from within the constructor to
     * initialize the form.
     * WARNING: Do NOT modify this code. The content of this method is
     * always regenerated by the Form Editor.
     */
    //GEN-BEGIN:initComponents
    // <editor-fold defaultstate="collapsed" desc="Generated Code">
    private void initComponents() {
        jLabel1 = new javax.swing.JLabel();
        jLabel2 = new javax.swing.JLabel();
        tfdMail = new javax.swing.JTextField();
        jScrollPane1 = new javax.swing.JScrollPane();
        ListShow = new javax.swing.JList();
        btnDfs = new javax.swing.JButton();
        btnExit = new javax.swing.JButton();
        // Title label.
        jLabel1.setFont(new java.awt.Font("黑体", 0, 24));
        jLabel1.setForeground(new java.awt.Color(102, 102, 0));
        jLabel1.setText("\u7f51\u7edc\u8718\u86db");
        // Caption of the address field.
        jLabel2.setFont(new java.awt.Font("黑体", 0, 18));
        jLabel2.setText("\u641c\u7d22\u7f51\u5740\uff1a");
        tfdMail.setFont(new java.awt.Font("黑体", 0, 24));
        tfdMail.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                tfdMailActionPerformed(evt);
            }
        });
        // Result list; starts with a single empty entry.
        ListShow.setFont(new java.awt.Font("黑体", 0, 18));
        ListShow.setForeground(new java.awt.Color(0, 153, 153));
        ListShow.setModel(new javax.swing.AbstractListModel() {
            String[] strings = { "" };
            public int getSize() {
                return strings.length;
            }
            public Object getElementAt(int i) {
                return strings[i];
            }
        });
        jScrollPane1.setViewportView(ListShow);
        // "Search" button.
        btnDfs.setFont(new java.awt.Font("黑体", 0, 18));
        btnDfs.setText("\u641c\u7d22");
        btnDfs.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                btnDfsActionPerformed(evt);
            }
        });
        // "Exit" button.
        btnExit.setFont(new java.awt.Font("黑体", 0, 18));
        btnExit.setText("\u9000\u51fa");
        btnExit.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                btnExitActionPerformed(evt);
            }
        });
        javax.swing.GroupLayout layout = new javax.swing.GroupLayout(this);
        this.setLayout(layout);
        layout.setHorizontalGroup(layout
                .createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                .addGroup(
                        javax.swing.GroupLayout.Alignment.TRAILING,
                        layout.createSequentialGroup()
                                .addContainerGap(139, Short.MAX_VALUE)
                                .addComponent(jLabel1,
                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                        134,
                                        javax.swing.GroupLayout.PREFERRED_SIZE)
                                .addGap(127, 127, 127))
                .addGroup(
                        layout.createSequentialGroup()
                                .addGroup(
                                        layout.createParallelGroup(
                                                javax.swing.GroupLayout.Alignment.TRAILING,
                                                false)
                                                .addGroup(
                                                        layout.createSequentialGroup()
                                                                .addGap(20, 20,
                                                                        20)
                                                                .addComponent(
                                                                        jScrollPane1,
                                                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                                                        231,
                                                                        javax.swing.GroupLayout.PREFERRED_SIZE)
                                                                .addPreferredGap(
                                                                        javax.swing.LayoutStyle.ComponentPlacement.RELATED,
                                                                        62,
                                                                        Short.MAX_VALUE)
                                                                .addGroup(
                                                                        layout.createParallelGroup(
                                                                                javax.swing.GroupLayout.Alignment.TRAILING)
                                                                                .addComponent(
                                                                                        btnDfs)
                                                                                .addComponent(
                                                                                        btnExit)))
                                                .addGroup(
                                                        javax.swing.GroupLayout.Alignment.LEADING,
                                                        layout.createSequentialGroup()
                                                                .addGap(31, 31,
                                                                        31)
                                                                .addComponent(
                                                                        jLabel2,
                                                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                                                        100,
                                                                        javax.swing.GroupLayout.PREFERRED_SIZE)
                                                                .addPreferredGap(
                                                                        javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                                                                .addComponent(
                                                                        tfdMail,
                                                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                                                        246,
                                                                        javax.swing.GroupLayout.PREFERRED_SIZE)))
                                .addContainerGap(18, Short.MAX_VALUE)));
        layout.setVerticalGroup(layout
                .createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                .addGroup(
                        layout.createSequentialGroup()
                                .addContainerGap()
                                .addComponent(jLabel1,
                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                        51,
                                        javax.swing.GroupLayout.PREFERRED_SIZE)
                                .addPreferredGap(
                                        javax.swing.LayoutStyle.ComponentPlacement.UNRELATED)
                                .addGroup(
                                        layout.createParallelGroup(
                                                javax.swing.GroupLayout.Alignment.BASELINE)
                                                .addComponent(
                                                        jLabel2,
                                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                                        43,
                                                        javax.swing.GroupLayout.PREFERRED_SIZE)
                                                .addComponent(
                                                        tfdMail,
                                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                                        33,
                                                        javax.swing.GroupLayout.PREFERRED_SIZE))
                                .addPreferredGap(
                                        javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                                .addGroup(
                                        layout.createParallelGroup(
                                                javax.swing.GroupLayout.Alignment.LEADING)
                                                .addGroup(
                                                        javax.swing.GroupLayout.Alignment.TRAILING,
                                                        layout.createSequentialGroup()
                                                                .addComponent(
                                                                        btnDfs)
                                                                .addGap(30, 30,
                                                                        30)
                                                                .addComponent(
                                                                        btnExit))
                                                .addComponent(
                                                        jScrollPane1,
                                                        javax.swing.GroupLayout.Alignment.TRAILING,
                                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                                        128,
                                                        javax.swing.GroupLayout.PREFERRED_SIZE))
                                .addContainerGap(53, Short.MAX_VALUE)));
    }// </editor-fold>
    //GEN-END:initComponents

    /**
     * Handler for the "exit" button: terminates the JVM with status 0.
     *
     * @param evt the action event (unused)
     */
    private void btnExitActionPerformed(java.awt.event.ActionEvent evt) {
        System.exit(0);
    }

    /**
     * Handler for the "search" button. Validates the text in the address
     * field, downloads that page, and shows every e-mail address found in
     * the result list. Invalid input or a failed download is reported in a
     * message dialog; in those cases the list keeps its previous content.
     *
     * @param evt the action event (unused)
     */
    private void btnDfsActionPerformed(java.awt.event.ActionEvent evt) {
        // 1. Collect the parameter; surrounding blanks would break the URL.
        String address = tfdMail.getText().trim();
        // 2. Reject input that does not look like a web address.
        if (!isWebAddress(address)) {
            JOptionPane.showMessageDialog(this, "请输入正确的网址格式");
            return;
        }
        // NOTE(review): the download runs synchronously on the thread that
        // delivers the button's action event, so the UI does not respond
        // until it finishes. Consider moving it to a background worker.
        try {
            List<String> mails = fetchMails(toUrlString(address));
            ListShow.setListData(mails.toArray());
        } catch (MalformedURLException e) {
            JOptionPane.showMessageDialog(this, "请输入正确的网址格式");
        } catch (IOException e) {
            // Keep the trace for diagnosis, but also tell the user.
            e.printStackTrace();
            JOptionPane.showMessageDialog(this, "无法读取该网页：" + e.getMessage());
        }
    }

    /**
     * Tells whether {@code text} contains something shaped like
     * {@code www.<name>.<tld>} with one of the supported top-level domains.
     * Package-private and static so it can be exercised without a GUI.
     *
     * @param text the user input; {@code null} is treated as invalid
     * @return {@code true} if the input passes the check
     */
    static boolean isWebAddress(String text) {
        return text != null && WEB_ADDRESS_PATTERN.matcher(text).find();
    }

    /**
     * Turns the user input into a URL string. Input that already starts with
     * {@code http://} or {@code https://} (case-insensitive) is returned
     * unchanged; otherwise {@code http://} is prepended.
     *
     * @param address the validated, trimmed user input
     * @return the string to pass to {@link URL#URL(String)}
     */
    static String toUrlString(String address) {
        boolean hasScheme = address.regionMatches(true, 0, "http://", 0, 7)
                || address.regionMatches(true, 0, "https://", 0, 8);
        return hasScheme ? address : "http://" + address;
    }

    /**
     * Appends every e-mail address found in {@code line} to {@code sink},
     * in order of appearance. Duplicates are kept.
     *
     * @param line one line of text to scan
     * @param sink the list that receives the matches
     */
    static void collectMails(String line, List<String> sink) {
        Matcher matcher = MAIL_PATTERN.matcher(line);
        while (matcher.find()) {
            sink.add(matcher.group());
        }
    }

    /**
     * Downloads the page at {@code urlString} and returns all e-mail
     * addresses found in it. The stream is closed before returning, whether
     * or not reading succeeded.
     *
     * @param urlString the absolute URL to read
     * @return the addresses found, possibly empty, never {@code null}
     * @throws MalformedURLException if {@code urlString} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    private static List<String> fetchMails(String urlString) throws IOException {
        List<String> mails = new ArrayList<String>();
        URL url = new URL(urlString);
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                url.openStream()));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                collectMails(line, mails);
            }
        } finally {
            reader.close();
        }
        return mails;
    }

    /**
     * Handler for the action event of the address field. Currently does
     * nothing.
     *
     * @param evt the action event (unused)
     */
    private void tfdMailActionPerformed(java.awt.event.ActionEvent evt) {
        // TODO add your handling code here:
    }

    //GEN-BEGIN:variables
    // Variables declaration - do not modify
    private javax.swing.JList ListShow;
    private javax.swing.JButton btnDfs;
    private javax.swing.JButton btnExit;
    private javax.swing.JLabel jLabel1;
    private javax.swing.JLabel jLabel2;
    private javax.swing.JScrollPane jScrollPane1;
    private javax.swing.JTextField tfdMail;
    // End of variables declaration//GEN-END:variables
}