// Entry class
import java.awt.Dimension;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import javax.swing.JButton;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenuBar;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.ScrollPaneConstants;
import javax.swing.SwingUtilities;
/**
 * Main window and entry point of the broken-link checker.
 * <p>
 * Run this class directly, paste the address of the page to analyse into the
 * text field and press "Begin". A {@link Spider} then crawls the site on a
 * background thread and reports every link it finds back to this window
 * through the {@link ISpiderReportable} callbacks implemented here.
 *
 * @author wangxu
 */
public class CheckLinks extends JFrame implements Runnable, ISpiderReportable {
    private static final long serialVersionUID = 1L;

    /** Used by {@link #addNotify()} so the inset adjustment is applied only once. */
    boolean frameSizeAdjusted = false;
    JLabel label1 = new JLabel();
    JButton begin = new JButton();
    JTextField url = new JTextField();
    JScrollPane errorScroll = new JScrollPane();
    JTextArea errors = new JTextArea();
    JLabel current = new JLabel();
    JLabel goodLinksLabel = new JLabel();
    JLabel badLinksLabel = new JLabel();

    /** Worker thread of the running crawl; {@code null} while idle. Only touched on the event dispatch thread. */
    protected Thread backgroundThread;
    /** Written by the worker thread in {@link #run()}, read by the event dispatch thread when cancelling. */
    protected volatile Spider spider;
    protected URL base;
    protected int badLinksCount = 0;
    protected int goodLinksCount = 0;

    /**
     * Builds the (absolutely positioned) UI and shows the window.
     */
    public CheckLinks() {
        setTitle("Find Broken Links");
        getContentPane().setLayout(null); // components are placed with explicit bounds
        setSize(405, 288);

        label1.setText("Enter a URL:");
        getContentPane().add(label1);
        label1.setBounds(12, 12, 84, 12);

        begin.setText("Begin");
        begin.setActionCommand("Begin");
        getContentPane().add(begin);
        begin.setBounds(12, 36, 84, 24);

        getContentPane().add(url);
        url.setBounds(108, 36, 288, 24);

        errorScroll.setAutoscrolls(true);
        errorScroll.setHorizontalScrollBarPolicy(ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
        errorScroll.setVerticalScrollBarPolicy(ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
        errorScroll.setOpaque(true);
        getContentPane().add(errorScroll);
        errorScroll.setBounds(12, 120, 384, 156);

        errors.setEditable(false);
        errorScroll.getViewport().add(errors); // the text area is the scroll pane's view
        errors.setBounds(0, 0, 366, 138);

        current.setText("Currently Processing: ");
        getContentPane().add(current);
        current.setBounds(12, 72, 384, 12);

        goodLinksLabel.setText("Good Links: 0");
        getContentPane().add(goodLinksLabel);
        goodLinksLabel.setBounds(12, 96, 192, 12);

        badLinksLabel.setText("Bad Links: 0");
        getContentPane().add(badLinksLabel);
        badLinksLabel.setBounds(216, 96, 96, 12);

        begin.addActionListener(new SymAction());

        // Show the frame only once it is fully populated; with a null layout,
        // components added after the frame is visible are not painted until
        // something else triggers a repaint.
        setVisible(true);
    }

    /**
     * Program entry point: creates the window on the event dispatch thread.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(new Runnable() {
            public void run() {
                CheckLinks frame = new CheckLinks();
                // JFrame's default is to merely hide the window, which would
                // leave the process running after the user closes it.
                frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
            }
        });
    }

    /**
     * Enlarges the frame by its insets (and menu bar height, if any) the first
     * time it becomes displayable, so that the size passed to
     * {@code setSize} describes the content area.
     */
    @Override
    public void addNotify() {
        // Record the size of the window prior to calling parent's addNotify.
        Dimension size = getSize();
        super.addNotify();
        if (frameSizeAdjusted) {
            return;
        }
        frameSizeAdjusted = true;
        // Adjust size of frame according to the insets and menu bar
        Insets insets = getInsets();
        JMenuBar menuBar = getRootPane().getJMenuBar();
        int menuBarHeight = 0;
        if (menuBar != null) {
            menuBarHeight = menuBar.getPreferredSize().height;
        }
        setSize(insets.left + insets.right + size.width,
                insets.top + insets.bottom + size.height + menuBarHeight);
    }

    /** Routes button events to the matching handler method. */
    class SymAction implements ActionListener {
        public void actionPerformed(ActionEvent event) {
            Object object = event.getSource();
            if (object == begin) {
                begin_actionPerformed(event);
            }
        }
    }

    /**
     * Handles the Begin/Cancel button: starts a crawl when idle, otherwise
     * asks the running spider to stop.
     *
     * @param event the button event (unused)
     */
    void begin_actionPerformed(ActionEvent event) {
        if (backgroundThread == null) {
            begin.setText("Cancel");
            // Reset the counters before the worker starts so a fast first
            // callback cannot be wiped out by the reset.
            goodLinksCount = 0;
            badLinksCount = 0;
            backgroundThread = new Thread(this);
            backgroundThread.start(); // executes run()
        } else {
            // The worker may not have created its spider yet.
            Spider running = spider;
            if (running != null) {
                running.cancel();
            }
        }
    }

    /**
     * Body of the background thread: crawls the site whose address is in the
     * URL field, then puts the window back into its idle state.
     */
    @Override
    public void run() {
        try {
            // Swing components must be updated on the event dispatch thread.
            // Queued first, so it runs before any error line of this crawl.
            SwingUtilities.invokeLater(new Runnable() {
                public void run() {
                    errors.setText("");
                }
            });
            spider = new Spider(this); // this window receives the spider's callbacks
            spider.clear();
            base = new URL(url.getText());
            spider.addURL(base);
            spider.begin(); // blocks until the workload is empty or the crawl is cancelled
        } catch (MalformedURLException e) {
            UpdateErrors err = new UpdateErrors();
            err.msg = "Bad address.";
            SwingUtilities.invokeLater(err);
        } finally {
            // Always return to the idle state, also after a bad address or an
            // unexpected failure; otherwise the button would stay on "Cancel"
            // and no further crawl could ever be started.
            SwingUtilities.invokeLater(new Runnable() {
                public void run() {
                    begin.setText("Begin");
                    backgroundThread = null; // ready to accept the next URL
                }
            });
        }
    }

    /**
     * Called by the spider for every link found on a page. Checks the link,
     * updates the counters and the display, and tells the spider whether the
     * link should be crawled as well.
     *
     * @param base the page on which the link was found
     * @param url  the resolved link
     * @return {@code true} only if the link is reachable and points to the
     *         same host as {@code base}
     */
    @Override
    public boolean spiderFoundURL(URL base, URL url) {
        String link = url.toString();
        postStats(link); // show which link is being checked right now

        if (!checkLink(url)) {
            badLinksCount++;
            UpdateErrors err = new UpdateErrors();
            err.msg = url + "(on page " + base + ")\n";
            SwingUtilities.invokeLater(err);
            postStats(link); // refresh the counters now that the link is classified
            return false;
        }

        goodLinksCount++;
        postStats(link); // refresh the counters now that the link is classified
        return url.getHost().equalsIgnoreCase(base.getHost());
    }

    /** Queues a refresh of the status labels on the event dispatch thread. */
    private void postStats(String currentUrl) {
        UpdateCurrentStats stats = new UpdateCurrentStats();
        stats.msg = currentUrl;
        SwingUtilities.invokeLater(stats);
    }

    /**
     * Called by the spider when a page could not be downloaded.
     *
     * @param url the page that failed
     */
    @Override
    public void spiderURLError(URL url) {
        System.out.println("没找到的URL:" + url);
    }

    /**
     * Tests whether a link can be retrieved.
     * <p>
     * For HTTP(S) links the response status is inspected, so that pages
     * answering with an error status (for example 404) are reported as
     * broken; for other protocols a successful connect is sufficient.
     *
     * @param url the link to test
     * @return {@code true} if the link is reachable
     */
    protected boolean checkLink(URL url) {
        HttpURLConnection http = null;
        try {
            URLConnection connection = url.openConnection();
            if (connection instanceof HttpURLConnection) {
                http = (HttpURLConnection) connection;
                int status = http.getResponseCode(); // -1 if the reply is not valid HTTP
                return status >= HttpURLConnection.HTTP_OK
                        && status < HttpURLConnection.HTTP_BAD_REQUEST;
            }
            connection.connect();
            return true;
        } catch (IOException e) {
            return false;
        } finally {
            if (http != null) {
                http.disconnect(); // the body is not needed; release the connection
            }
        }
    }

    /**
     * Called by the spider for every e-mail link it finds.
     *
     * @param email the link text as reported by the spider
     */
    @Override
    public void spiderFoundEMail(String email) {
        System.out.println("获得Email:" + email);
    }

    /** Appends {@link #msg} to the error area; meant to be run on the event dispatch thread. */
    class UpdateErrors implements Runnable {
        public String msg;

        public void run() {
            errors.append(msg);
        }
    }

    /** Refreshes the status labels; meant to be run on the event dispatch thread. */
    class UpdateCurrentStats implements Runnable {
        public String msg;

        public void run() {
            current.setText("Currently Processing: " + msg);
            goodLinksLabel.setText("Good Links: " + goodLinksCount);
            badLinksLabel.setText("Bad Links: " + badLinksCount);
        }
    }
}
import javax.swing.text.html.*;
/**
 * Thin {@link HTMLEditorKit} subclass whose only purpose is to make the kit's
 * HTML parser obtainable from outside the class.
 * <p>
 * Background: Swing's {@code JEditorPane} supports different content types
 * through pluggable {@code EditorKit}s, and {@code HTMLEditorKit} is the one
 * shipped for HTML (version 3.2 with some extensions). The
 * {@code <applet>} tag is not supported; {@code <object>} is supported to
 * some extent.
 *
 * @author wangxu
 */
public class HTMLParse extends HTMLEditorKit {
    private static final long serialVersionUID = 1L;

    /**
     * Returns the parser provided by {@link HTMLEditorKit}, redeclared with
     * public visibility.
     *
     * @return the parser supplied by the superclass
     */
    @Override
    public HTMLEditorKit.Parser getParser() {
        final HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}
import java.net.*;
/**
 * Callbacks through which a spider reports what it encounters while crawling.
 */
public interface ISpiderReportable {

    /**
     * Reports a link found on a page.
     *
     * @param base the page on which the link was found
     * @param url  the link itself
     * @return {@code true} if the spider should add {@code url} to its
     *         workload, {@code false} to leave it alone
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Reports a URL that could not be processed.
     *
     * @param url the URL that caused the error
     */
    void spiderURLError(URL url);

    /**
     * Reports an e-mail link.
     *
     * @param email the e-mail link as found by the spider
     */
    void spiderFoundEMail(String email);
}
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * A small single-threaded web spider.
 * <p>
 * URLs move between three workloads: <em>waiting</em> (still to be
 * downloaded), <em>processed</em> (done) and <em>error</em> (download
 * failed). Every link found while parsing a page is handed to the
 * {@link ISpiderReportable} given at construction time, which decides
 * whether the link is added to the waiting workload.
 * <p>
 * {@link #begin()} runs the crawl on the calling thread;
 * {@link #cancel()} may be called from another thread to stop it.
 */
public class Spider {
    // Insertion-ordered sets: the spider only needs membership tests, removal
    // and in-order iteration, all of which were linear scans on the former lists.

    /** URLs whose download failed. */
    protected Collection<URL> workloadError = new LinkedHashSet<URL>();
    /** URLs waiting to be processed. */
    protected Collection<URL> workloadWaiting = new LinkedHashSet<URL>();
    /** URLs that have been processed. */
    protected Collection<URL> workloadProcessed = new LinkedHashSet<URL>();

    protected ISpiderReportable report;

    /**
     * Set by {@link #cancel()} and polled by the crawl loop. Volatile because
     * the two may run on different threads.
     */
    protected volatile boolean cancel = false;

    /**
     * @param report receiver of the spider's callbacks
     */
    public Spider(ISpiderReportable report) {
        this.report = report;
    }

    /** @return the URLs whose download failed */
    public Collection<URL> getWorkloadError() {
        return workloadError;
    }

    /** @return the URLs still waiting to be processed */
    public Collection<URL> getWorkloadWaiting() {
        return workloadWaiting;
    }

    /** @return the URLs that have been processed */
    public Collection<URL> getWorkloadProcessed() {
        return workloadProcessed;
    }

    /** Empties all three workloads. */
    public void clear() {
        getWorkloadError().clear();
        getWorkloadWaiting().clear();
        getWorkloadProcessed().clear();
    }

    /** Asks a running {@link #begin()} to stop as soon as possible. */
    public void cancel() {
        cancel = true;
    }

    /**
     * Queues a URL for processing unless it is already known in any workload.
     *
     * @param url the URL to queue
     */
    public void addURL(URL url) {
        if (getWorkloadWaiting().contains(url)) {
            return;
        }
        if (getWorkloadError().contains(url)) {
            return;
        }
        if (getWorkloadProcessed().contains(url)) {
            return;
        }
        log("Adding to workload: " + url);
        getWorkloadWaiting().add(url);
    }

    /**
     * Downloads one URL and, if it is textual, parses it for links. Afterwards
     * the URL is moved from the waiting workload to either the processed or
     * the error workload.
     *
     * @param url the URL to process
     */
    public void processURL(URL url) {
        InputStream is = null;
        try {
            log("Processing: " + url);
            // get the URL's contents
            URLConnection connection = url.openConnection();
            String contentType = connection.getContentType();
            System.out.println(contentType + "++++++++++++++++====");
            // Only text content is parsed; an unknown (null) type is given a try.
            if ((contentType != null) && !contentType.regionMatches(true, 0, "text/", 0, 5)) {
                getWorkloadWaiting().remove(url);
                getWorkloadProcessed().add(url);
                log("Not processing because content type is: " + contentType);
                // The body is never read on this path, so drop the connection explicitly.
                if (connection instanceof HttpURLConnection) {
                    ((HttpURLConnection) connection).disconnect();
                }
                return;
            }
            // read the URL (decoded with the platform default charset)
            is = connection.getInputStream();
            Reader r = new InputStreamReader(is);
            // parse the URL; the callback receives every tag of the document.
            // The last argument (ignoreCharSet) is true, so charset
            // declarations inside the document are ignored.
            HTMLEditorKit.Parser parse = new HTMLParse().getParser();
            parse.parse(r, new Parser(url), true);
        } catch (IOException e) {
            getWorkloadWaiting().remove(url);
            getWorkloadError().add(url);
            log("Error: " + url);
            report.spiderURLError(url);
            return;
        } finally {
            closeQuietly(is);
        }
        // mark URL as complete
        getWorkloadWaiting().remove(url);
        getWorkloadProcessed().add(url);
        log("Complete: " + url);
    }

    /** Closes the stream if it was opened; a failure to close is not actionable here. */
    private void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // nothing useful can be done; the page has already been handled
        }
    }

    /**
     * Runs the crawl: keeps processing waiting URLs until the waiting workload
     * is empty or {@link #cancel()} has been called.
     */
    public void begin() {
        cancel = false;
        while (!getWorkloadWaiting().isEmpty() && !cancel) {
            // Work on a snapshot: processURL() modifies the waiting workload.
            URL[] batch = getWorkloadWaiting().toArray(new URL[0]);
            for (int i = 0; (i < batch.length) && !cancel; i++) {
                processURL(batch[i]);
            }
        }
    }

    /** Parser callback that extracts the links of one page. */
    protected class Parser extends HTMLEditorKit.ParserCallback {
        /** The page being parsed; relative links are resolved against it. */
        protected URL base;

        public Parser(URL base) {
            this.base = base;
        }

        /**
         * Extracts the link of a tag, if it has one: the {@code href}
         * attribute, or {@code src} for {@code <frame>} tags. E-mail links
         * are reported as such; everything else goes to
         * {@link #handleLink(URL, String)}.
         */
        @Override
        public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet mutableAttributeSet, int pos) {
            String href = (String) mutableAttributeSet.getAttribute(HTML.Attribute.HREF);
            if ((href == null) && (tag == HTML.Tag.FRAME)) {
                href = (String) mutableAttributeSet.getAttribute(HTML.Attribute.SRC);
            }
            if (href == null) {
                return;
            }
            // drop the fragment part
            int i = href.indexOf('#');
            if (i != -1) {
                href = href.substring(0, i);
            }
            // Case-insensitive prefix test that does not depend on the default
            // locale (toLowerCase() maps 'I' differently under e.g. Turkish).
            if (href.regionMatches(true, 0, "mailto:", 0, 7)) {
                report.spiderFoundEMail(href);
                return;
            }
            if (tag == HTML.Tag.META) {
                String title = (String) mutableAttributeSet.getAttribute(HTML.Attribute.NAME);
                System.out.println("title:" + title);
            }
            handleLink(base, href);
        }

        /** Start tags are handled exactly like simple tags. */
        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos); // handle the same way
        }

        /**
         * Resolves a link against the page it was found on, reports it, and
         * queues it if the report receiver asks for that.
         *
         * @param base the page on which the link was found
         * @param str  the link as written in the page
         */
        protected void handleLink(URL base, String str) {
            if (cancel) {
                // The parser cannot be interrupted mid-page; at least stop
                // reporting (and thereby checking) the remaining links.
                return;
            }
            try {
                URL url = new URL(base, str);
                if (report.spiderFoundURL(base, url)) {
                    addURL(url);
                }
            } catch (MalformedURLException e) {
                log("Found malformed URL: " + str);
            }
        }
    }

    /**
     * Writes a timestamped line to standard output.
     *
     * @param entry the text to log
     */
    public void log(String entry) {
        System.out.println((new Date()) + ":" + entry);
    }
}