网络蜘蛛
网络蜘蛛可以通过网页超链接进入到更多的网页,将需要的信息提取出来。
本篇蜘蛛将从新浪主页开始爬,提取网页中的邮箱地址。
本篇主要采用了递归的算法,层层深入搜索,但我在这里设置了一个限制:只递归到50层就返回,你想无限搜索的话就把限制去掉。
本篇将搜索到的网址和邮箱地址都用IO写到了文件当中去,采纳者可以自己指定一个路径或者按本例在classpath下建立相同的文件。
利用正则表达式将网页中的超链接提取出来:
package cn.hncu.bs;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A simple recursive web spider: starting from a seed URL, it extracts e-mail
 * addresses and hyperlinks (via regex) from each page, appends the new ones to
 * emailAddress.txt / linkAddress.txt, and recurses into each new link.
 *
 * Fixes over the original:
 * - streams are always closed (readers/writers previously leaked);
 * - dedup is done with in-memory sets instead of re-reading the output file on
 *   every match (which was O(n^2) and crashed on first run when the file did
 *   not exist yet, aborting the whole page);
 * - the depth counter is decremented in a finally block, so failed pages no
 *   longer leak depth and permanently exhaust the limit;
 * - regex patterns are compiled once.
 */
public class MySpider {
    /** Maximum recursion depth before backing out. */
    private static final int MAX_DEPTH = 50;
    /** Current recursion depth (incremented on entry, decremented in finally). */
    private static int ply = 0;

    // Compile patterns once; Pattern.compile is expensive inside loops.
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("\\w+@\\w+(\\.\\w+)+");
    private static final Pattern LINK_PATTERN =
            Pattern.compile("http://([\\w-]+\\.)+[\\w-]+(/[\\w- .%&=]*)?");

    // In-memory dedup (case-insensitive, matching the original equalsIgnoreCase
    // comparison). Works even before the output files exist.
    private static final java.util.Set<String> seenEmails = new java.util.HashSet<String>();
    private static final java.util.Set<String> seenLinks = new java.util.HashSet<String>();

    /**
     * @param args unused
     * Internet Spider
     */
    public static void main(String[] args) {
        search1("http://www.sina.com");
    }

    /**
     * Fetches {@code address}, records every new e-mail and link found on it,
     * and recurses into each new link up to MAX_DEPTH levels deep.
     */
    private static void search1(String address) {
        if (ply >= MAX_DEPTH) { // recursion limit
            return;
        }
        ply++;
        BufferedReader br = null;
        try {
            URL url = new URL(address);
            br = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m1 = EMAIL_PATTERN.matcher(line);
                while (m1.find()) {
                    String email = m1.group();
                    // Set.add returns false for duplicates -> skip already-seen addresses.
                    if (seenEmails.add(email.toLowerCase())) {
                        System.out.println(email);
                        appendLine("emailAddress.txt", email);
                    }
                }
                Matcher m2 = LINK_PATTERN.matcher(line);
                while (m2.find()) {
                    String link = m2.group();
                    if (seenLinks.add(link.toLowerCase())) {
                        System.out.println(link);
                        appendLine("linkAddress.txt", link);
                        search1(link); // descend into the newly found page
                    }
                }
            }
        } catch (Exception e) {
            // Unreachable URL or I/O failure: skip this page and back out of recursion.
        } finally {
            ply--; // always undo the depth increment, even on failure
            closeQuietly(br);
        }
    }

    /** Appends one line of text to the given file, creating it if absent (best-effort). */
    private static void appendLine(String file, String text) {
        DataOutputStream out = null;
        try {
            out = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(file, true)));
            out.writeBytes(text + "\r\n");
        } catch (Exception e) {
            // Persistence is best-effort; keep crawling even if the write fails.
        } finally {
            closeQuietly(out);
        }
    }

    /** Closes a stream, swallowing any close-time exception. */
    private static void closeQuietly(java.io.Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (Exception ignored) {
                // nothing sensible to do on close failure
            }
        }
    }
}
还可以自己写算法将网页超链接提取出来:
(有些许的缺陷,待改善)
package cn.hncu.bs;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A simple recursive web spider (variant 2): extracts e-mail addresses with a
 * regex, but hyperlinks with a hand-written character scan. Results are
 * appended to emailAddress2.txt / linkAddress2.txt and each new link is
 * followed recursively.
 *
 * Fixes over the original:
 * - removed HTML markup accidentally pasted into the source (the listing did
 *   not compile);
 * - the link scan now STOPS at delimiters (quote, apostrophe, semicolon,
 *   space, angle brackets); the original `continue`d past them and glued
 *   unrelated trailing text onto the extracted link;
 * - streams are always closed; dedup uses in-memory sets (the file-based
 *   re-read crashed on first run and was O(n^2));
 * - the depth counter is decremented in a finally block so failed pages no
 *   longer leak depth.
 */
public class MySpider {
    /** Maximum recursion depth before backing out. */
    private static final int MAX_DEPTH = 50;
    /** Current recursion depth (incremented on entry, decremented in finally). */
    private static int ply = 0;

    // Compile once; Pattern.compile is expensive inside loops.
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    // In-memory dedup (case-insensitive, matching the original equalsIgnoreCase
    // comparison). Works even before the output files exist.
    private static final java.util.Set<String> seenEmails = new java.util.HashSet<String>();
    private static final java.util.Set<String> seenLinks = new java.util.HashSet<String>();

    /**
     * @param args unused
     * Internet Spider
     */
    public static void main(String[] args) {
        search2("http://www.sina.com");
    }

    /**
     * Fetches {@code address}, records every new e-mail and the first link of
     * each line, and recurses into each new link up to MAX_DEPTH levels deep.
     */
    private static void search2(String address) {
        if (ply >= MAX_DEPTH) { // recursion limit
            return;
        }
        ply++;
        BufferedReader br = null;
        try {
            URL url = new URL(address);
            br = new BufferedReader(new InputStreamReader(url.openStream()));
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m1 = EMAIL_PATTERN.matcher(line);
                while (m1.find()) {
                    String email = m1.group();
                    // Set.add returns false for duplicates -> skip already-seen addresses.
                    if (seenEmails.add(email.toLowerCase())) {
                        System.out.println(email);
                        appendLine("emailAddress2.txt", email);
                    }
                }
                // Manual link extraction: take everything from "http://" up to
                // the first delimiter. (Only the first link per line is found.)
                int index = line.indexOf("http://");
                if (index != -1) {
                    String link = extractLink(line, index);
                    if (link.length() > 0 && seenLinks.add(link.toLowerCase())) {
                        System.out.println(link);
                        appendLine("linkAddress2.txt", link);
                        search2(link); // descend into the newly found page
                    }
                }
            }
        } catch (Exception e) {
            // Unreachable URL or I/O failure: skip this page and back out of recursion.
        } finally {
            ply--; // always undo the depth increment, even on failure
            closeQuietly(br);
        }
    }

    /**
     * Returns the URL substring of {@code line} starting at {@code start},
     * terminated by the first quote, apostrophe, semicolon, space or angle
     * bracket (or end of line).
     */
    private static String extractLink(String line, int start) {
        StringBuilder sb = new StringBuilder();
        for (int i = start; i < line.length(); i++) {
            char c = line.charAt(i);
            // Any of these ends the URL in HTML attribute/text context.
            if (c == '\"' || c == '\'' || c == ';' || c == ' ' || c == '<' || c == '>') {
                break;
            }
            sb.append(c);
        }
        return sb.toString();
    }

    /** Appends one line of text to the given file, creating it if absent (best-effort). */
    private static void appendLine(String file, String text) {
        DataOutputStream out = null;
        try {
            out = new DataOutputStream(
                    new BufferedOutputStream(new FileOutputStream(file, true)));
            out.writeBytes(text + "\r\n");
        } catch (Exception e) {
            // Persistence is best-effort; keep crawling even if the write fails.
        } finally {
            closeQuietly(out);
        }
    }

    /** Closes a stream, swallowing any close-time exception. */
    private static void closeQuietly(java.io.Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (Exception ignored) {
                // nothing sensible to do on close failure
            }
        }
    }
}