网络爬虫之Spider

网络爬虫是搜索引擎的一个重要组成部分。爬虫的基本原理就是下载页面,然后进行解析。Web上存储着海量数据,怎样才能将这些数据尽快地下载到本机?本文给出的网络爬虫设计方案是采用多线程技术。以下代码实现了将网页的数据存储到XML文档,希望大家能提出更好的方案。

 

import java.awt.*;

import java.awt.event.*;

import javax.swing.*;

import java.util.*;

import java.io.*;



import com.heaton.bot.*;



import org.w3c.dom.*;

import org.cyberneko.html.parsers.*;

import org.xml.sax.*;

import org.apache.html.dom.*;

import javax.xml.parsers.*;

import javax.xml.transform.*;

import javax.xml.transform.dom.*;

import javax.xml.transform.stream.*;



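/**
 * Swing front end for a web crawler built on com.heaton.bot: starting from a URL it
 * downloads pages, parses the HTML ones and collects their title, body text and URL
 * in an XML document. The author describes the crawl as depth-first and reports that
 * the program was tested and runs correctly.
 */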

public class mySpider extends JFrame implements ISpiderReportable {



	/**
	 * Crawler for the run in progress. Assigned in Go_actionPerformed and reset to
	 * {@code null} in spiderComplete, so {@code null} means no crawl is running.
	 */

	Spider _Spider = null;

	// XML document that receives one HTMLPAPER element per stored page (see addElementNode).
	Document doc = null;

	

	// Pages reported through processPage; reset to 0 each time a crawl is started.
	int _pagesCount;

	

	/**
	 * Builds the crawler window and the empty XML result document.
	 *
	 * <p>Widgets are positioned with absolute bounds on a null layout, the GO button and
	 * the window get their listeners, and {@link #doc} is created with a {@code spider}
	 * root element. The frame is left hidden; callers show it with {@code setVisible(true)}.
	 */
	public mySpider(){
		setTitle("网络爬虫之Spider");
		final Container pane = getContentPane();
		pane.setLayout(null);
		setSize(405,268);
		setVisible(false);
		// A JFrame merely hides itself on close by default, and windowClosed is only
		// delivered after dispose(). Disposing on close lets SymWindow receive the event
		// and terminate the JVM instead of leaving an invisible process behind.
		setDefaultCloseOperation(WindowConstants.DISPOSE_ON_CLOSE);

		// Caption row.
		D.setHorizontalTextPosition(SwingConstants.LEFT);
		D.setVerticalTextPosition(SwingConstants.TOP);
		D.setVerticalAlignment(SwingConstants.TOP);
		D.setText("下载的页面数目: ");
		pane.add(D);
		D.setBounds(12,12,384,24);

		// Start URL row.
		JLabel2.setText("URL:");
		pane.add(JLabel2);
		JLabel2.setBounds(12,36,36,24);
		pane.add(_url);
		_url.setBounds(48,36,348,24);

		// Output directory row.
		JLabel3.setText("选择存储XML文档的目录:");
		pane.add(JLabel3);
		JLabel3.setBounds(12,72,384,24);
		pane.add(_save);
		_save.setBounds(12,96,384,24);

		// Start/cancel button and progress read-outs.
		_go.setText("GO!");
		pane.add(_go);
		_go.setBounds(96,228,216,24);
		pane.add(_current);
		_current.setBounds(12,204,384,12);
		JLabel4.setText("Number of pages:");
		pane.add(JLabel4);
		JLabel4.setBounds(12,180,120,12);
		_pages.setText("0");
		pane.add(_pages);
		_pages.setBounds(120,180,108,12);

		// Log file row.
		JLabel6.setText("选择Log,当前需要保存的日志:");
		pane.add(JLabel6);
		JLabel6.setBounds(12,120,384,24);
		_logPath.setText("./Spider.log");
		pane.add(_logPath);
		_logPath.setBounds(12,144,384,24);

		_go.setActionCommand("jbutton");
		_go.addActionListener(new SymAction());
		this.addWindowListener(new SymWindow());

		// Result document: <spider> is the root that addElementNode appends to.
		try{
			DOMImplementation domImpl = DocumentBuilderFactory
					.newInstance().newDocumentBuilder().getDOMImplementation();
			doc = domImpl.createDocument(null,"spider",null);
		}catch(ParserConfigurationException e){
			e.printStackTrace();
		}catch(DOMException e){
			e.printStackTrace();
		}
	}

	

	/**
	 * Shows or hides the frame; when it is about to be shown it is first moved to (50,50).
	 *
	 * @param b {@code true} to show the frame, {@code false} to hide it
	 */
	public void setVisible(boolean b){
		if(b){
			setLocation(50,50);
		}
		super.setVisible(b);
	}

	

	/**
	 * Program entry point: creates the crawler window and shows it.
	 *
	 * <p>Swing components must be created and shown on the event dispatch thread, so the
	 * construction is handed to {@code SwingUtilities.invokeLater} instead of running on
	 * the main thread.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		SwingUtilities.invokeLater(new Runnable(){
			public void run(){
				(new mySpider()).setVisible(true);
			}
		});
	}

	

	/**
	 * Grows the frame by its insets (and menu bar height, if any) the first time it is
	 * realized, so that the size requested before realization applies to the content area.
	 * Later calls only delegate to the superclass.
	 */
	public void addNotify(){
		// Must be read before super.addNotify(), exactly as the original did.
		final Dimension requested = getSize();

		super.addNotify();

		if(!frameSizeAdjusted){
			frameSizeAdjusted = true;

			final Insets border = getInsets();
			final JMenuBar bar = getRootPane().getJMenuBar();
			final int barHeight = (bar == null) ? 0 : bar.getPreferredSize().height;

			final int width = border.left + border.right + requested.width;
			final int height = border.top + border.bottom + requested.height + barHeight;
			setSize(width, height);
		}
	}



	// Set by addNotify once the frame has been enlarged by its insets, so it happens only once.
	boolean frameSizeAdjusted = false;

	

	// Caption at the top of the window; its text is set in the constructor.
	JLabel D = new JLabel();

	// "URL:" caption in front of the start-URL field.
	JLabel JLabel2 = new JLabel();

	

	// Start URL typed by the user; read when GO is pressed.
	JTextField _url = new JTextField();

	// Caption above the output-directory field.
	JLabel JLabel3 = new JLabel();

	

	// Directory that receives the XML file; pages are only collected when this is non-empty.
	JTextField _save = new JTextField();

	// Start button; its text is switched to "Cancel"/"Canceling..." while a crawl runs.
	JButton _go = new JButton();

	

	// Status line showing the URL of the page currently being processed.
	JLabel _current = new JLabel();

	// "Number of pages:" caption.
	JLabel JLabel4 = new JLabel();

	

	// Displays the value of _pagesCount.
	JLabel _pages = new JLabel();

	// Caption above the log-path field.
	JLabel JLabel6 = new JLabel();

	

	// Path of the log file; preset to "./Spider.log" in the constructor.
	JTextField _logPath = new JTextField();

	

	/** Action listener that forwards clicks on the GO button to {@code Go_actionPerformed}. */
	class SymAction implements ActionListener{
		public void actionPerformed(ActionEvent event){
			if(event.getSource() == _go){
				Go_actionPerformed(event);
			}
		}
	}

	

	/**
	 * Extracts the title and body text of a downloaded HTML page and records them,
	 * together with the page URL, in {@link #doc}.
	 *
	 * <p>Nothing is recorded when the output-directory field is empty, when the URL
	 * contains no {@code '/'}, when the URL does not end in {@code .html}, {@code .htm}
	 * or {@code .shtml} (compared case-insensitively), or when either the title or the
	 * body text turns out empty. Any exception is logged through {@code Log} rather than
	 * propagated.
	 *
	 * @param file the downloaded page
	 */
	protected void processFile(HTTP file){
		try{
			if(_save.getText().length()==0){
				return;
			}
			final String url = file.getURL();
			if(url.lastIndexOf('/')==-1 || !hasHtmlExtension(url)){
				return;
			}

			// The body is re-decoded as GBK from its ISO-8859-1 form, as the original did.
			// NOTE(review): this assumes every crawled page is GBK-encoded - confirm.
			final String pageCharset = "GBK";
			String fileBody = new String(file.getBody().getBytes("iso-8859-1"),pageCharset);

			DOMFragmentParser parser = new DOMFragmentParser();
			DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
			try {
				parser.setProperty("http://cyberneko.org/html/properties/default-encoding",pageCharset);
				// Encode with the same charset the parser is told to expect. The bare
				// getBytes() used the platform default and garbled text on non-GBK systems.
				parser.parse(new InputSource(
						new ByteArrayInputStream(fileBody.getBytes(pageCharset))), node);
			}catch (IOException e) {
				// Best effort: keep whatever was parsed before the failure.
				Log.logException("Can't parse page "+url+": ",e);
			}catch (SAXException e) {
				Log.logException("Can't parse page "+url+": ",e);
			}

			StringBuffer sb = new StringBuffer();
			getText(sb, node, "title");
			String title = sb.toString();

			sb.setLength(0);
			getText(sb, node,"body");
			// The former replaceAll("<","<").replaceAll(">",">") chain replaced each
			// character with itself and has been dropped; the text is stored unchanged.
			String text = sb.toString();

			if(title.length()!=0 && text.length()!=0){
				addElementNode(doc,title,text,url);
			}
		}catch(Exception e){
			Log.logException("Can't save output file: ",e);
		}
	}

	/** Tells whether the text after the last '.' of the URL is html, htm or shtml. */
	private static boolean hasHtmlExtension(String url){
		String extendName = url.substring(url.lastIndexOf('.')+1);
		return extendName.equalsIgnoreCase("html")
				|| extendName.equalsIgnoreCase("htm")
				|| extendName.equalsIgnoreCase("shtml");
	}

	

	/**
	 * Creates a {@code TITLE} element owned by {@code docs} whose text content is the page title.
	 *
	 * @param docs  document that creates the element
	 * @param title text to store
	 * @return the new, still unattached element
	 */
	private Element createTitleElement(Document docs,String title){
		final Element element = docs.createElement("TITLE");
		element.setTextContent(title);
		return element;
	}

	

	/**
	 * Creates a {@code BODY} element owned by {@code docs} whose text content is the page text.
	 *
	 * @param docs document that creates the element
	 * @param body text to store
	 * @return the new, still unattached element
	 */
	private Element createBodyElement(Document docs,String body){
		final Element element = docs.createElement("BODY");
		element.setTextContent(body);
		return element;
	}

	

	/**
	 * Creates a {@code URL} element owned by {@code docs} whose text content is the page address.
	 *
	 * @param docs document that creates the element
	 * @param URL  address to store
	 * @return the new, still unattached element
	 */
	private Element createURLElement(Document docs,String URL){
		final Element element = docs.createElement("URL");
		element.setTextContent(URL);
		return element;
	}

	

	/**
	 * Appends one page record to the root element of {@code docs}: an {@code HTMLPAPER}
	 * element holding {@code TITLE}, {@code BODY} and {@code URL} children, in that order.
	 *
	 * @param docs  target document; its document element receives the record
	 * @param title page title
	 * @param body  page text
	 * @param URL   page address
	 */
	public void addElementNode(Document docs,String title,String body,String URL){
		final Element record = docs.createElement("HTMLPAPER");
		final Element[] parts = {
				createTitleElement(docs,title),
				createBodyElement(docs,body),
				createURLElement(docs,URL)
		};
		for(int k = 0; k < parts.length; k++){
			record.appendChild(parts[k]);
		}
		final Element root = docs.getDocumentElement();
		root.appendChild(record);
	}

	

	/**
	 * Appends the value of every text node in the subtree rooted at {@code node}
	 * (the node itself included) to {@code sb}, in document order.
	 *
	 * @param sb   buffer that receives the text
	 * @param node root of the subtree to read
	 */
	private void getText(StringBuffer sb, Node node) {
		if (Node.TEXT_NODE == node.getNodeType()) {
			sb.append(node.getNodeValue());
		}
		final NodeList kids = node.getChildNodes();
		if (kids == null) {
			return;
		}
		for (int k = 0, count = kids.getLength(); k < count; k++) {
			getText(sb, kids.item(k));
		}
	}



	  /**
	   * Finds the first element named {@code element} (case-insensitive) in the subtree
	   * rooted at {@code node}, searching depth-first, and appends all of its text to
	   * {@code sb}.
	   *
	   * <p>The search stops at the first match. Previously the method never returned
	   * {@code true}, so the result was meaningless and the walk kept descending after a
	   * match, appending the text of any further element with the same name as well.
	   *
	   * @param sb      buffer that receives the text of the matched element
	   * @param node    root of the subtree to search
	   * @param element name of the element to look for
	   * @return {@code true} if a matching element was found, {@code false} otherwise
	   */
	  private boolean getText(StringBuffer sb, Node node, String element) {
	    if (node.getNodeType() == Node.ELEMENT_NODE
	        && element.equalsIgnoreCase(node.getNodeName())) {
	      getText(sb, node);
	      return true;
	    }
	    NodeList children = node.getChildNodes();
	    if (children != null) {
	      int len = children.getLength();
	      for (int i = 0; i < len; i++) {
	        if (getText(sb, children.item(i), element)) {
	          return true;
	        }
	      }
	    }
	    return false;
	  }

	

	/**
	 * Handles the GO/Cancel button.
	 *
	 * <p>If a crawl is running it is asked to halt and nothing else happens. Otherwise
	 * the start URL is probed with a test request (a failure is shown and aborts the
	 * start), logging is pointed at the chosen log file, and a new {@code Spider} is
	 * created and started with this frame as its report target.
	 *
	 * @param event the button event (unused)
	 */
	void Go_actionPerformed(ActionEvent event){
		// Read the field once: spiderComplete() resets it to null, and a second read
		// after the null check could then fail.
		final Spider running = _Spider;
		if(running!=null){
			SwingUtilities.invokeLater(new Runnable(){
				public void run(){
					_go.setText("Canceling...");
				}
			});
			running.halt();
			return;
		}

		// Probe the start URL so an unreachable site is reported before the crawl starts.
		try{
			if(_url.getText().length()>0){
				HTTPSocket http = new HTTPSocket();
				http.send(_url.getText(),null);
			}else{
				_current.setText("<<distributed mode>>");
			}
		}catch(Exception e){
			// The fourth argument is a message type; OK_CANCEL_OPTION is an option type.
			JOptionPane.showMessageDialog(this,e,"Error",JOptionPane.ERROR_MESSAGE,null);
			return;
		}

		SwingUtilities.invokeLater(new Runnable(){
			public void run(){
				_go.setText("Cancel");
				_current.setText("Loading...");
			}
		});
		_pagesCount = 0;

		if(_logPath.getText().length()>0){
			// Start each run with a fresh log file.
			File file = new File(_logPath.getText());
			file.delete();
			Log.setLevel(Log.LOG_LEVEL_NORMAL);
			Log.setFile(true);
			Log.setConsole(false);
			Log.setPath(_logPath.getText());
		}

		// Prefer the SQL-backed workload; keep the in-memory one if it cannot be opened.
		// NOTE(review): this relies on the JDBC-ODBC bridge driver and a data source
		// named WORKLOAD - confirm both exist on the target JVM, otherwise this dialog
		// appears on every start.
		IWorkloadStorable wl = new SpiderInternalWorkload();
		try{
			wl = new SpiderSQLWorkload(
					"sun.jdbc.odbc.JdbcOdbcDriver",
					"jdbc:odbc:WORKLOAD");
		}catch(Exception e){
			// Not fatal, so it is shown as a warning (the icon the old constant produced).
			JOptionPane.showMessageDialog(this,
					e,
					"Error",
					JOptionPane.WARNING_MESSAGE,
					null);
		}

		_Spider = new Spider(this,
				_url.getText(),
				new HTTPSocket(),
				100,
				wl);
		_Spider.setMaxBody(200);
		_Spider.start();
	}

	

	/**
	 * Spider callback for a link classified as internal. Always answers {@code true}.
	 * NOTE(review): presumably {@code true} asks the spider to queue the link - confirm
	 * against ISpiderReportable.
	 *
	 * @param url the link that was found
	 * @return always {@code true}
	 */
	public boolean foundInternalLink(String url) {
		return true;
	}

	

	/**
	 * Spider callback for a link classified as external. Always answers {@code false}.
	 * NOTE(review): presumably {@code false} tells the spider to skip the link - confirm
	 * against ISpiderReportable.
	 *
	 * @param url the link that was found
	 * @return always {@code false}
	 */
	public boolean foundExternalLink(String url) {
		return false;
	}

	

	/**
	 * Spider callback for a link that is neither internal nor external. Always answers
	 * {@code false}.
	 * NOTE(review): presumably {@code false} tells the spider to skip the link - confirm
	 * against ISpiderReportable.
	 *
	 * @param url the link that was found
	 * @return always {@code false}
	 */
	public boolean foundOtherLink(String url) {
		return false;
	}

	

	/**
	 * Runnable that refreshes the status labels: shows {@code _t} as the current URL and
	 * the current value of {@code _pagesCount} as the page count.
	 */
	class UpdateTarget implements Runnable{
		/** URL to display in the status line. */
		public String _t;

		public void run(){
			_current.setText(_t);
			_pages.setText(String.valueOf(_pagesCount));
		}
	}

	

	/**
	 * Spider callback for a downloaded page: counts it, schedules a refresh of the status
	 * labels with the page URL, then passes the page to {@code processFile}.
	 *
	 * @param page the downloaded page
	 */
	public void processPage(HTTP page){
		_pagesCount += 1;

		final UpdateTarget refresh = new UpdateTarget();
		refresh._t = page.getURL();
		SwingUtilities.invokeLater(refresh);

		processFile(page);
	}

	

	public void completePage(HTTP page, boolean error){

		

	}

	

	/**
	 * Spider configuration callback. Always answers {@code true}.
	 * NOTE(review): by its name this presumably makes the spider strip query strings
	 * from URLs - confirm against ISpiderReportable.
	 *
	 * @return always {@code true}
	 */
	public boolean getRemoveQuery() {
		return true;
	}

	

	/**
	 * Spider callback for the end of a crawl: tells the user whether the crawl finished
	 * or was cancelled, writes the collected pages to {@code HTMLPraser.xml} in the
	 * chosen output directory, clears {@code _Spider} and resets the button text.
	 *
	 * <p>Nothing is written when the output-directory field is empty. No pages are
	 * collected in that case, and an empty parent would make {@code File} resolve the
	 * name against the system default directory (the root directory on Unix).
	 */
	public void spiderComplete(){
		final boolean halted = _Spider.isHalted();
		final String outcome = halted ? "下载正在被取消...... " : "下载完成..... ";
		// The fourth argument is a message type; OK_CANCEL_OPTION is an option type.
		// A cancelled crawl keeps the warning icon that constant used to produce.
		JOptionPane.showMessageDialog(this,
				outcome+"请检查日志的错误!.",
				"完成下载!",
				halted ? JOptionPane.WARNING_MESSAGE : JOptionPane.INFORMATION_MESSAGE,
				null);

		if(_save.getText().length()>0){
			writeDocument(new File(_save.getText(),"HTMLPraser.xml"));
		}

		_Spider = null;
		SwingUtilities.invokeLater(new Runnable(){
			public void run(){
				_go.setText("GO!!");
			}
		});
	}

	/** Serializes {@link #doc} to the given file as indented, GBK-encoded XML. */
	private void writeDocument(File target){
		try{
			Transformer t = TransformerFactory.newInstance().newTransformer();
			t.setOutputProperty(OutputKeys.ENCODING,"GBK");
			t.setOutputProperty(OutputKeys.INDENT,"yes");
			t.transform(new DOMSource(doc),new StreamResult(target));
		}catch(TransformerConfigurationException tce){
			tce.printStackTrace();
		}catch(TransformerException te){
			// "\n" replaces the literal "/n" typo so the separator starts on a new line.
			System.out.println("转换错误....\n-------");
			te.printStackTrace();
		}
	}

	

	/** Window listener that forwards this frame's windowClosed event to {@code GetSite_windowClosed}. */
	class SymWindow extends WindowAdapter{
		public void windowClosed(WindowEvent event){
			if(event.getSource() == mySpider.this){
				GetSite_windowClosed(event);
			}
		}
	}

	

	/**
	 * Terminates the JVM with exit status 0 once the window has been closed.
	 *
	 * @param event the window event (unused)
	 */
	void GetSite_windowClosed(WindowEvent event) {
		System.exit(0);
	}

}
以上是网络爬虫的完整代码。运行前需要导入第三方包(com.heaton.bot 和 NekoHTML),可以在Eclipse环境下开发;本人则是配置好环境变量后,直接用记事本编写的。
第三方包可以在网上下载,也可以向我索取,
留下个人E-mail即可。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值