import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line entry point: downloads one page and runs both link scanners
 * ({@code strMatch} and {@code strMatchKMP}) over it, printing a timestamp
 * before and after the two scans.
 */
public class Spider
{
    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_START_URL = "http://www.hust.edu.cn";

    /**
     * Fetches the start page and scans it for {@code href=} attributes.
     *
     * @param args optional; {@code args[0]} overrides the default start URL
     * @throws IOException if the URL is malformed, the download fails, or the
     *                     link file cannot be written
     */
    public static void main(String[] args) throws IOException
    {
        String startUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_START_URL;

        // Timestamp format, millisecond resolution.
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss-SSS");

        Html html = new Html(new URL(startUrl));
        String html_str = html.getHtml();
        HtmlPro htp = new HtmlPro(html_str);

        System.out.println(df.format(new Date()));
        htp.strMatch("href=");
        htp.strMatchKMP("href=");
        System.out.println(df.format(new Date()));
    }
}
/**
 * Downloads the document behind a URL and returns it as a single string.
 */
class Html
{
    /** Location to download; set by the constructor or {@link #setUrl(URL)}. */
    URL url = null;

    /**
     * @param sourse_url the location to download
     */
    Html(URL sourse_url)
    {
        this.url = sourse_url;
    }

    /** Creates an instance without a URL; call {@link #setUrl(URL)} before {@link #getHtml()}. */
    Html()
    {
    }

    /**
     * Replaces the location to download.
     *
     * @param sourse_url the new location
     */
    public void setUrl(URL sourse_url)
    {
        this.url = sourse_url;
    }

    /**
     * Reads the whole document line by line and concatenates the lines.
     * Line terminators are not kept, so the result is one long line.
     * Bytes are decoded with the platform default charset.
     *
     * @return the document text with line terminators removed
     * @throws IOException           if the connection cannot be opened or read
     * @throws IllegalStateException if no URL has been set
     */
    public String getHtml() throws IOException
    {
        if (url == null)
        {
            throw new IllegalStateException("No URL set; pass one to the constructor or call setUrl first");
        }

        // Open exactly one connection and read from it; works for any protocol
        // the JDK has a handler for, not only HTTP.
        URLConnection conn = url.openConnection();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream())))
        {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                page.append(line);
            }
            return page.toString();
        }
        finally
        {
            // Release the HTTP connection even when reading failed.
            if (conn instanceof HttpURLConnection)
            {
                ((HttpURLConnection) conn).disconnect();
            }
        }
    }
}
/**
 * Small text-file helper: append text to a file and read text back.
 * Files are read and written with the platform default charset, and every
 * method closes the file before returning.
 */
class HtmlText
{
    HtmlText()
    {
    }

    /**
     * Appends text to the end of a file, creating the file if necessary.
     *
     * @param File_Path path of the file to append to
     * @param Msg_text  text to append; {@code null} is written as "null"
     * @throws IOException if the file cannot be opened or written
     */
    public void appendText(String File_Path, String Msg_text) throws IOException
    {
        // FileWriter (unlike PrintWriter) reports write failures as IOException.
        try (FileWriter out = new FileWriter(File_Path, true))
        {
            out.write(String.valueOf(Msg_text));
        }
    }

    /**
     * Appends text followed by the platform line separator to a file,
     * creating the file if necessary.
     *
     * @param File_Path path of the file to append to
     * @param Msg_text  text to append; {@code null} is written as "null"
     * @throws IOException if the file cannot be opened or written
     */
    public void appendLine(String File_Path, String Msg_text) throws IOException
    {
        try (FileWriter out = new FileWriter(File_Path, true))
        {
            out.write(String.valueOf(Msg_text));
            out.write(System.lineSeparator());
        }
    }

    /**
     * Reads a whole file and concatenates its lines.
     * Line terminators are not kept in the result.
     *
     * @param File_Path path of the file to read
     * @return the file content with line terminators removed
     * @throws IOException if the file cannot be opened or read
     */
    public String readText(String File_Path) throws IOException
    {
        try (BufferedReader fin = new BufferedReader(new FileReader(File_Path)))
        {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = fin.readLine()) != null)
            {
                content.append(line);
            }
            return content.toString();
        }
    }

    /**
     * Reads the first line of a file.
     *
     * @param File_Path path of the file to read
     * @return the first line without its terminator, or {@code null} if the file is empty
     * @throws IOException if the file cannot be opened or read
     */
    public String readLine(String File_Path) throws IOException
    {
        try (BufferedReader fin = new BufferedReader(new FileReader(File_Path)))
        {
            return fin.readLine();
        }
    }
}
/**
 * String-processing class: scans a page for an attribute name (for example
 * {@code href=}), extracts the quoted value that follows each occurrence,
 * filters it with {@link #checkUrl(String)} and appends the accepted values,
 * one per line, to an output file.
 */
class HtmlPro
{
    /** Output file used when no path is passed to the constructor. */
    private static final String DEFAULT_OUTPUT_PATH = "D:\\Url.txt";

    private final String HtmlStr;
    private final String outputPath;

    /**
     * @param Html_Str the page text to scan; {@code null} is treated as empty
     */
    HtmlPro(String Html_Str)
    {
        this(Html_Str, DEFAULT_OUTPUT_PATH);
    }

    /**
     * @param Html_Str   the page text to scan; {@code null} is treated as empty
     * @param outputPath file that accepted links are appended to
     * @throws IllegalArgumentException if {@code outputPath} is {@code null}
     */
    HtmlPro(String Html_Str, String outputPath)
    {
        if (outputPath == null)
        {
            throw new IllegalArgumentException("outputPath must not be null");
        }
        this.HtmlStr = (Html_Str == null) ? "" : Html_Str;
        this.outputPath = outputPath;
    }

    /**
     * Finds every non-overlapping occurrence of {@code sub_str} with a
     * brute-force search, records the quoted value after each occurrence and
     * prints the number of recorded links.
     *
     * @param sub_str the attribute name to look for, e.g. {@code "href="}
     * @return the number of links recorded
     * @throws IOException if the output file cannot be written
     */
    public int strMatch(String sub_str) throws IOException
    {
        StringBuilder links = new StringBuilder();
        int matchnum = 0;
        int subLen = sub_str.length();
        if (subLen > 0)
        {
            int from = 0;
            int hit;
            while ((hit = naiveIndexOf(sub_str, from)) >= 0)
            {
                // Resume right after the attribute name, not after the value.
                from = hit + subLen;
                if (collectLink(from, links))
                {
                    matchnum++;
                }
            }
        }
        return finish(links, matchnum);
    }

    /**
     * Same contract as {@link #strMatch(String)}, but locates the occurrences
     * with the KMP algorithm.
     *
     * @param sub_str the attribute name to look for, e.g. {@code "href="}
     * @return the number of links recorded
     * @throws IOException if the output file cannot be written
     */
    public int strMatchKMP(String sub_str) throws IOException
    {
        StringBuilder links = new StringBuilder();
        int matchnum = 0;
        int subLen = sub_str.length();
        if (subLen > 0)
        {
            Integer[] next = new Integer[subLen];
            getNext(sub_str, next);
            int from = 0;
            int hit;
            while ((hit = kmpIndexOf(sub_str, next, from)) >= 0)
            {
                from = hit + subLen;
                if (collectLink(from, links))
                {
                    matchnum++;
                }
            }
        }
        return finish(links, matchnum);
    }

    /**
     * Fills the KMP "next" table for a pattern: {@code next[0]} is -1 and
     * {@code next[i]} is the length of the longest proper prefix of the
     * pattern that is also a suffix of its first {@code i} characters.
     * An empty pattern leaves the array untouched.
     *
     * @param sub_str the pattern
     * @param next    output array; must have at least {@code sub_str.length()} slots
     */
    public static void getNext(String sub_str, Integer next[])
    {
        int len = sub_str.length();
        if (len == 0)
        {
            return;
        }
        next[0] = -1;
        int i = 0;
        int j = -1;
        while (i < len - 1)
        {
            if (j == -1 || sub_str.charAt(i) == sub_str.charAt(j))
            {
                i++;
                j++;
                next[i] = j;
            }
            else
            {
                j = next[j];
            }
        }
    }

    /**
     * URL filter. Rejects values containing {@code #}, and (case-insensitively)
     * {@code javascript}, {@code .png}, {@code .gif}, {@code css} or {@code zip}.
     *
     * @param url the candidate link; {@code null} is rejected
     * @return {@code true} if the link should be recorded
     */
    public Boolean checkUrl(String url)
    {
        if (url == null || url.contains("#"))
        {
            return false;
        }
        // Locale.ROOT keeps the comparison stable under locales with special
        // case mappings (e.g. Turkish dotless i).
        String lower = url.toLowerCase(java.util.Locale.ROOT);
        if (lower.contains("javascript"))
        {
            return false;
        }
        if (lower.contains(".png") || lower.contains(".gif"))
        {
            return false;
        }
        if (lower.contains("css") || lower.contains("zip"))
        {
            return false;
        }
        return true;
    }

    /**
     * Prints the elements of an array on one line, each followed by a space.
     *
     * @param Intval the array to print
     */
    public static void printArry(Integer[] Intval)
    {
        for (int i = 0; i < Intval.length; i++)
        {
            System.out.print(Intval[i] + " ");
        }
    }

    /** Brute-force search: index of the first occurrence of pattern at or after from, or -1. */
    private int naiveIndexOf(String pattern, int from)
    {
        int patLen = pattern.length();
        int lastStart = HtmlStr.length() - patLen;
        for (int start = from; start <= lastStart; start++)
        {
            int k = 0;
            while (k < patLen && HtmlStr.charAt(start + k) == pattern.charAt(k))
            {
                k++;
            }
            if (k == patLen)
            {
                return start;
            }
        }
        return -1;
    }

    /** KMP search: index of the first occurrence of pattern at or after from, or -1. */
    private int kmpIndexOf(String pattern, Integer[] next, int from)
    {
        int textLen = HtmlStr.length();
        int patLen = pattern.length();
        int i = from;
        int j = 0;
        while (i < textLen && j < patLen)
        {
            if (j == -1 || HtmlStr.charAt(i) == pattern.charAt(j))
            {
                i++;
                j++;
            }
            else
            {
                j = next[j];
            }
        }
        return (j == patLen) ? i - patLen : -1;
    }

    /**
     * Extracts the quoted value starting at valueStart and, if it passes the
     * filter, appends it as one line to links. Returns whether a link was added.
     */
    private boolean collectLink(int valueStart, StringBuilder links)
    {
        if (valueStart >= HtmlStr.length())
        {
            return false; // attribute name is the last thing on the page
        }
        char quote = HtmlStr.charAt(valueStart);
        if (quote != '"' && quote != '\'')
        {
            return false; // unquoted values are not handled
        }
        int close = HtmlStr.indexOf(quote, valueStart + 1);
        if (close < 0)
        {
            return false; // unterminated quote
        }
        String link = HtmlStr.substring(valueStart + 1, close);
        if (!checkUrl(link))
        {
            return false;
        }
        links.append(link).append(System.lineSeparator());
        return true;
    }

    /** Appends the collected lines to the output file (opened once), prints the count and returns it. */
    private int finish(StringBuilder links, int matchnum) throws IOException
    {
        if (links.length() > 0)
        {
            try (FileWriter out = new FileWriter(outputPath, true))
            {
                out.append(links);
            }
        }
        System.out.println("本页面共有"+matchnum+"条链接");
        return matchnum;
    }
}
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line entry point: downloads one page and runs both link scanners
 * ({@code strMatch} and {@code strMatchKMP}) over it, printing a timestamp
 * before and after the two scans.
 */
public class Spider
{
    /** Page fetched when no URL is given on the command line. */
    private static final String DEFAULT_START_URL = "http://www.hust.edu.cn";

    /**
     * Fetches the start page and scans it for {@code href=} attributes.
     *
     * @param args optional; {@code args[0]} overrides the default start URL
     * @throws IOException if the URL is malformed, the download fails, or the
     *                     link file cannot be written
     */
    public static void main(String[] args) throws IOException
    {
        String startUrl = (args != null && args.length > 0) ? args[0] : DEFAULT_START_URL;

        // Timestamp format, millisecond resolution.
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss-SSS");

        Html html = new Html(new URL(startUrl));
        String html_str = html.getHtml();
        HtmlPro htp = new HtmlPro(html_str);

        System.out.println(df.format(new Date()));
        htp.strMatch("href=");
        htp.strMatchKMP("href=");
        System.out.println(df.format(new Date()));
    }
}
/**
 * Downloads the document behind a URL and returns it as a single string.
 */
class Html
{
    /** Location to download; set by the constructor or {@link #setUrl(URL)}. */
    URL url = null;

    /**
     * @param sourse_url the location to download
     */
    Html(URL sourse_url)
    {
        this.url = sourse_url;
    }

    /** Creates an instance without a URL; call {@link #setUrl(URL)} before {@link #getHtml()}. */
    Html()
    {
    }

    /**
     * Replaces the location to download.
     *
     * @param sourse_url the new location
     */
    public void setUrl(URL sourse_url)
    {
        this.url = sourse_url;
    }

    /**
     * Reads the whole document line by line and concatenates the lines.
     * Line terminators are not kept, so the result is one long line.
     * Bytes are decoded with the platform default charset.
     *
     * @return the document text with line terminators removed
     * @throws IOException           if the connection cannot be opened or read
     * @throws IllegalStateException if no URL has been set
     */
    public String getHtml() throws IOException
    {
        if (url == null)
        {
            throw new IllegalStateException("No URL set; pass one to the constructor or call setUrl first");
        }

        // Open exactly one connection and read from it; works for any protocol
        // the JDK has a handler for, not only HTTP.
        URLConnection conn = url.openConnection();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream())))
        {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                page.append(line);
            }
            return page.toString();
        }
        finally
        {
            // Release the HTTP connection even when reading failed.
            if (conn instanceof HttpURLConnection)
            {
                ((HttpURLConnection) conn).disconnect();
            }
        }
    }
}
/**
 * Small text-file helper: append text to a file and read text back.
 * Files are read and written with the platform default charset, and every
 * method closes the file before returning.
 */
class HtmlText
{
    HtmlText()
    {
    }

    /**
     * Appends text to the end of a file, creating the file if necessary.
     *
     * @param File_Path path of the file to append to
     * @param Msg_text  text to append; {@code null} is written as "null"
     * @throws IOException if the file cannot be opened or written
     */
    public void appendText(String File_Path, String Msg_text) throws IOException
    {
        // FileWriter (unlike PrintWriter) reports write failures as IOException.
        try (FileWriter out = new FileWriter(File_Path, true))
        {
            out.write(String.valueOf(Msg_text));
        }
    }

    /**
     * Appends text followed by the platform line separator to a file,
     * creating the file if necessary.
     *
     * @param File_Path path of the file to append to
     * @param Msg_text  text to append; {@code null} is written as "null"
     * @throws IOException if the file cannot be opened or written
     */
    public void appendLine(String File_Path, String Msg_text) throws IOException
    {
        try (FileWriter out = new FileWriter(File_Path, true))
        {
            out.write(String.valueOf(Msg_text));
            out.write(System.lineSeparator());
        }
    }

    /**
     * Reads a whole file and concatenates its lines.
     * Line terminators are not kept in the result.
     *
     * @param File_Path path of the file to read
     * @return the file content with line terminators removed
     * @throws IOException if the file cannot be opened or read
     */
    public String readText(String File_Path) throws IOException
    {
        try (BufferedReader fin = new BufferedReader(new FileReader(File_Path)))
        {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = fin.readLine()) != null)
            {
                content.append(line);
            }
            return content.toString();
        }
    }

    /**
     * Reads the first line of a file.
     *
     * @param File_Path path of the file to read
     * @return the first line without its terminator, or {@code null} if the file is empty
     * @throws IOException if the file cannot be opened or read
     */
    public String readLine(String File_Path) throws IOException
    {
        try (BufferedReader fin = new BufferedReader(new FileReader(File_Path)))
        {
            return fin.readLine();
        }
    }
}
/**
 * String-processing class: scans a page for an attribute name (for example
 * {@code href=}), extracts the quoted value that follows each occurrence,
 * filters it with {@link #checkUrl(String)} and appends the accepted values,
 * one per line, to an output file.
 */
class HtmlPro
{
    /** Output file used when no path is passed to the constructor. */
    private static final String DEFAULT_OUTPUT_PATH = "D:\\Url.txt";

    private final String HtmlStr;
    private final String outputPath;

    /**
     * @param Html_Str the page text to scan; {@code null} is treated as empty
     */
    HtmlPro(String Html_Str)
    {
        this(Html_Str, DEFAULT_OUTPUT_PATH);
    }

    /**
     * @param Html_Str   the page text to scan; {@code null} is treated as empty
     * @param outputPath file that accepted links are appended to
     * @throws IllegalArgumentException if {@code outputPath} is {@code null}
     */
    HtmlPro(String Html_Str, String outputPath)
    {
        if (outputPath == null)
        {
            throw new IllegalArgumentException("outputPath must not be null");
        }
        this.HtmlStr = (Html_Str == null) ? "" : Html_Str;
        this.outputPath = outputPath;
    }

    /**
     * Finds every non-overlapping occurrence of {@code sub_str} with a
     * brute-force search, records the quoted value after each occurrence and
     * prints the number of recorded links.
     *
     * @param sub_str the attribute name to look for, e.g. {@code "href="}
     * @return the number of links recorded
     * @throws IOException if the output file cannot be written
     */
    public int strMatch(String sub_str) throws IOException
    {
        StringBuilder links = new StringBuilder();
        int matchnum = 0;
        int subLen = sub_str.length();
        if (subLen > 0)
        {
            int from = 0;
            int hit;
            while ((hit = naiveIndexOf(sub_str, from)) >= 0)
            {
                // Resume right after the attribute name, not after the value.
                from = hit + subLen;
                if (collectLink(from, links))
                {
                    matchnum++;
                }
            }
        }
        return finish(links, matchnum);
    }

    /**
     * Same contract as {@link #strMatch(String)}, but locates the occurrences
     * with the KMP algorithm.
     *
     * @param sub_str the attribute name to look for, e.g. {@code "href="}
     * @return the number of links recorded
     * @throws IOException if the output file cannot be written
     */
    public int strMatchKMP(String sub_str) throws IOException
    {
        StringBuilder links = new StringBuilder();
        int matchnum = 0;
        int subLen = sub_str.length();
        if (subLen > 0)
        {
            Integer[] next = new Integer[subLen];
            getNext(sub_str, next);
            int from = 0;
            int hit;
            while ((hit = kmpIndexOf(sub_str, next, from)) >= 0)
            {
                from = hit + subLen;
                if (collectLink(from, links))
                {
                    matchnum++;
                }
            }
        }
        return finish(links, matchnum);
    }

    /**
     * Fills the KMP "next" table for a pattern: {@code next[0]} is -1 and
     * {@code next[i]} is the length of the longest proper prefix of the
     * pattern that is also a suffix of its first {@code i} characters.
     * An empty pattern leaves the array untouched.
     *
     * @param sub_str the pattern
     * @param next    output array; must have at least {@code sub_str.length()} slots
     */
    public static void getNext(String sub_str, Integer next[])
    {
        int len = sub_str.length();
        if (len == 0)
        {
            return;
        }
        next[0] = -1;
        int i = 0;
        int j = -1;
        while (i < len - 1)
        {
            if (j == -1 || sub_str.charAt(i) == sub_str.charAt(j))
            {
                i++;
                j++;
                next[i] = j;
            }
            else
            {
                j = next[j];
            }
        }
    }

    /**
     * URL filter. Rejects values containing {@code #}, and (case-insensitively)
     * {@code javascript}, {@code .png}, {@code .gif}, {@code css} or {@code zip}.
     *
     * @param url the candidate link; {@code null} is rejected
     * @return {@code true} if the link should be recorded
     */
    public Boolean checkUrl(String url)
    {
        if (url == null || url.contains("#"))
        {
            return false;
        }
        // Locale.ROOT keeps the comparison stable under locales with special
        // case mappings (e.g. Turkish dotless i).
        String lower = url.toLowerCase(java.util.Locale.ROOT);
        if (lower.contains("javascript"))
        {
            return false;
        }
        if (lower.contains(".png") || lower.contains(".gif"))
        {
            return false;
        }
        if (lower.contains("css") || lower.contains("zip"))
        {
            return false;
        }
        return true;
    }

    /**
     * Prints the elements of an array on one line, each followed by a space.
     *
     * @param Intval the array to print
     */
    public static void printArry(Integer[] Intval)
    {
        for (int i = 0; i < Intval.length; i++)
        {
            System.out.print(Intval[i] + " ");
        }
    }

    /** Brute-force search: index of the first occurrence of pattern at or after from, or -1. */
    private int naiveIndexOf(String pattern, int from)
    {
        int patLen = pattern.length();
        int lastStart = HtmlStr.length() - patLen;
        for (int start = from; start <= lastStart; start++)
        {
            int k = 0;
            while (k < patLen && HtmlStr.charAt(start + k) == pattern.charAt(k))
            {
                k++;
            }
            if (k == patLen)
            {
                return start;
            }
        }
        return -1;
    }

    /** KMP search: index of the first occurrence of pattern at or after from, or -1. */
    private int kmpIndexOf(String pattern, Integer[] next, int from)
    {
        int textLen = HtmlStr.length();
        int patLen = pattern.length();
        int i = from;
        int j = 0;
        while (i < textLen && j < patLen)
        {
            if (j == -1 || HtmlStr.charAt(i) == pattern.charAt(j))
            {
                i++;
                j++;
            }
            else
            {
                j = next[j];
            }
        }
        return (j == patLen) ? i - patLen : -1;
    }

    /**
     * Extracts the quoted value starting at valueStart and, if it passes the
     * filter, appends it as one line to links. Returns whether a link was added.
     */
    private boolean collectLink(int valueStart, StringBuilder links)
    {
        if (valueStart >= HtmlStr.length())
        {
            return false; // attribute name is the last thing on the page
        }
        char quote = HtmlStr.charAt(valueStart);
        if (quote != '"' && quote != '\'')
        {
            return false; // unquoted values are not handled
        }
        int close = HtmlStr.indexOf(quote, valueStart + 1);
        if (close < 0)
        {
            return false; // unterminated quote
        }
        String link = HtmlStr.substring(valueStart + 1, close);
        if (!checkUrl(link))
        {
            return false;
        }
        links.append(link).append(System.lineSeparator());
        return true;
    }

    /** Appends the collected lines to the output file (opened once), prints the count and returns it. */
    private int finish(StringBuilder links, int matchnum) throws IOException
    {
        if (links.length() > 0)
        {
            try (FileWriter out = new FileWriter(outputPath, true))
            {
                out.append(links);
            }
        }
        System.out.println("本页面共有"+matchnum+"条链接");
        return matchnum;
    }
}