Java之------socket网络蜘蛛

网络蜘蛛

网络蜘蛛可以通过网页超链接进入到更多的网页,将需要的信息提取出来。

本篇蜘蛛将从新浪主页开始爬,提取网页中的邮箱地址。


本篇主要采用了递归的算法,层层深入搜索,但我在这里设置了一个限制,就是只递归到50层就返回回去,你想无限搜索的就把限制去掉。

本篇将搜索到的网址和邮箱地址都用IO写到了文件当中去,采纳者可以自己指定一个路径或者按本例在classpath下建立相同的文件。


利用正则表达式将网页中的超链接提取出来:

package cn.hncu.bs;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MySpider {

	/** Current recursion depth of the crawl. */
	private static int ply = 0;

	/** Maximum recursion depth before a branch is abandoned. */
	private static final int MAX_PLY = 50;

	/**
	 * @param args unused
	 * Internet Spider: starts crawling from the Sina home page.
	 */
	public static void main(String[] args) {
		String address = "http://www.sina.com";
		search1(address);
	}

	/**
	 * Recursively fetches {@code address}, extracts e-mail addresses and
	 * hyperlinks with regular expressions, records each new find in
	 * "emailAddress.txt" / "linkAddress.txt", and follows every newly
	 * discovered link. Pages that cannot be opened are silently skipped.
	 *
	 * @param address the URL to crawl
	 */
	private static void search1(String address) {
		if (ply >= MAX_PLY) { // 递归限制 — depth limit: stop recursing
			return;
		}
		ply++;
		// Compile each pattern once per page instead of relying on repeated literals.
		Pattern emailPattern = Pattern.compile("\\w+@\\w+(\\.\\w+)+");
		Pattern linkPattern = Pattern.compile("http://([\\w-]+\\.)+[\\w-]+(/[\\w- .%&=]*)?");
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new URL(address).openStream()))) {
			String line;
			while ((line = br.readLine()) != null) {
				Matcher emailMatcher = emailPattern.matcher(line);
				while (emailMatcher.find()) {
					String email = emailMatcher.group();
					if (!alreadyRecorded("emailAddress.txt", email)) {
						System.out.println(email);
						record("emailAddress.txt", email);
					}
				}
				Matcher linkMatcher = linkPattern.matcher(line);
				while (linkMatcher.find()) {
					String link = linkMatcher.group();
					if (!alreadyRecorded("linkAddress.txt", link)) {
						System.out.println(link);
						record("linkAddress.txt", link);
						search1(link); // follow the new link, one level deeper
					}
				}
			}
		} catch (Exception e) {
			// 无法进入该网址就会抛出该异常 — unreachable page: abandon this branch.
		} finally {
			// Always unwind the depth counter, even when the page failed;
			// the original only decremented on success, so failures leaked depth.
			ply--;
		}
	}

	/**
	 * Returns true if {@code value} already appears as a line of {@code file}
	 * (case-insensitive). A missing file means nothing has been recorded yet.
	 */
	private static boolean alreadyRecorded(String file, String value) {
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file)))) {
			String existing;
			while ((existing = reader.readLine()) != null) {
				if (value.equalsIgnoreCase(existing)) {
					return true;
				}
			}
		} catch (Exception e) {
			// File not created yet: treat as "not recorded".
		}
		return false;
	}

	/**
	 * Appends {@code value} as a new CRLF-terminated line at the end of
	 * {@code file}, closing the stream afterwards.
	 */
	private static void record(String file, String value) throws Exception {
		try (DataOutputStream out = new DataOutputStream(
				new BufferedOutputStream(new FileOutputStream(file, true)))) {
			out.writeBytes(value + "\r\n");
		}
	}
}

还可以自己写算法将网页超链接提取出来:

(有些许的缺陷,待改善)

package cn.hncu.bs;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class MySpider {

	/** Current recursion depth of the crawl. */
	private static int ply = 0;

	/** Maximum recursion depth before a branch is abandoned. */
	private static final int MAX_PLY = 50;

	/**
	 * @param args unused
	 * Internet Spider: starts crawling from the Sina home page.
	 */
	public static void main(String[] args) {
		String address = "http://www.sina.com";
		search2(address);
	}

	/**
	 * Recursively fetches {@code address}, extracts e-mail addresses with a
	 * regular expression and hyperlinks with a hand-written scanner, records
	 * each new find in "emailAddress2.txt" / "linkAddress2.txt", and follows
	 * every newly discovered link. Pages that cannot be opened are skipped.
	 *
	 * @param address the URL to crawl
	 */
	private static void search2(String address) {
		if (ply >= MAX_PLY) { // 递归限制 — depth limit: stop recursing
			return;
		}
		ply++;
		Pattern emailPattern = Pattern.compile("\\w+@\\w+(\\.\\w+)+");
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new URL(address).openStream()))) {
			String line;
			while ((line = br.readLine()) != null) {
				Matcher emailMatcher = emailPattern.matcher(line);
				while (emailMatcher.find()) {
					String email = emailMatcher.group();
					if (!alreadyRecorded("emailAddress2.txt", email)) {
						System.out.println(email);
						record("emailAddress2.txt", email);
					}
				}
				String link = extractLink(line);
				if (link != null && !link.isEmpty()
						&& !alreadyRecorded("linkAddress2.txt", link)) {
					System.out.println(link);
					record("linkAddress2.txt", link);
					search2(link); // follow the new link, one level deeper
				}
			}
		} catch (Exception e) {
			// 无法进入该网址就会抛出该异常 — unreachable page: abandon this branch.
		} finally {
			// Always unwind the depth counter, even when the page failed;
			// the original only decremented on success, so failures leaked depth.
			ply--;
		}
	}

	/**
	 * Extracts the first "http://..." URL from {@code line} by scanning from
	 * the prefix until an HTML delimiter character. Returns {@code null} when
	 * the line contains no URL.
	 *
	 * <p>The original version used {@code continue} on the delimiters
	 * {@code "}, {@code '}, {@code ;}, space and {@code <}, which skipped the
	 * delimiter but kept appending the text after it, producing garbled links
	 * (the acknowledged "缺陷"). Scanning must stop at the first delimiter.
	 */
	private static String extractLink(String line) {
		int index = line.indexOf("http://");
		if (index == -1) {
			return null;
		}
		StringBuilder link = new StringBuilder();
		for (int i = index; i < line.length(); i++) {
			char c = line.charAt(i);
			if (c == '\"' || c == '\'' || c == ';' || c == ' '
					|| c == '<' || c == '>') {
				break; // end of the URL token
			}
			link.append(c);
		}
		return link.toString();
	}

	/**
	 * Returns true if {@code value} already appears as a line of {@code file}
	 * (case-insensitive). A missing file means nothing has been recorded yet.
	 */
	private static boolean alreadyRecorded(String file, String value) {
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file)))) {
			String existing;
			while ((existing = reader.readLine()) != null) {
				if (value.equalsIgnoreCase(existing)) {
					return true;
				}
			}
		} catch (Exception e) {
			// File not created yet: treat as "not recorded".
		}
		return false;
	}

	/**
	 * Appends {@code value} as a new CRLF-terminated line at the end of
	 * {@code file}, closing the stream afterwards.
	 */
	private static void record(String file, String value) throws Exception {
		try (DataOutputStream out = new DataOutputStream(
				new BufferedOutputStream(new FileOutputStream(file, true)))) {
			out.writeBytes(value + "\r\n");
		}
	}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值