// Java网络爬虫 (Java web crawler)

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;




public class Spider 
{
public static void main(String args[]) throws IOException
{                                    
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss-SSS");//设置日期格式
HtmlText htmltext=new HtmlText();
//String text=htmltext.readText("D:\\test.htm");
Html html=new Html(new URL("http://www.hust.edu.cn"));
String html_str=html.getHtml();
HtmlPro htp=new HtmlPro(html_str);
System.out.println(df.format(new Date()));
htp.strMatch("href=");
htp.strMatchKMP("href=");
System.out.println(df.format(new Date()));
}


}


/**
 * Downloads the text served at a URL.
 */
class Html
{
    /** Location to download; {@code null} until supplied through a constructor or {@link #setUrl}. */
    URL url = null;

    /**
     * @param sourse_url location to download
     */
    Html(URL sourse_url)
    {
        this.url = sourse_url;
    }

    /** Creates an instance without a URL; call {@link #setUrl} before {@link #getHtml}. */
    Html()
    {
    }

    /**
     * Replaces the location to download.
     *
     * @param sourse_url location to download
     */
    public void setUrl(URL sourse_url)
    {
        this.url = sourse_url;
    }

    /**
     * Reads the whole resource behind the current URL into one string.
     *
     * <p>The content is read line by line and the lines are concatenated
     * without their line terminators. Bytes are decoded with the platform
     * default charset.
     *
     * <p>A single connection is opened and it is always released: the reader
     * is closed when reading ends or fails, and an HTTP connection is
     * disconnected afterwards. Non-HTTP URLs (for example {@code file:}) are
     * read the same way.
     *
     * @return the content of the resource with line terminators removed
     * @throws IOException           if the connection cannot be opened or read
     * @throws IllegalStateException if no URL has been set
     */
    public String getHtml() throws IOException
    {
        if (url == null)
        {
            throw new IllegalStateException("no URL set; pass one to the constructor or call setUrl first");
        }

        URLConnection connection = url.openConnection();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream())))
        {
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null)
            {
                page.append(line);
            }
            return page.toString();
        }
        finally
        {
            // Only HTTP connections hold a socket that needs an explicit release.
            if (connection instanceof HttpURLConnection)
            {
                ((HttpURLConnection) connection).disconnect();
            }
        }
    }
}




/**
 * Small helpers for appending text to a file and reading it back.
 * Every method opens the file, does its work and closes the file again,
 * including when the operation fails.
 */
class HtmlText
{
    HtmlText()
    {
    }

    /**
     * Appends text to the end of a file, creating the file if needed.
     *
     * @param File_Path path of the file to append to
     * @param Msg_text  text to append; {@code null} is written as the string "null"
     * @throws IOException if the file cannot be opened or written
     */
    public void appendText(String File_Path, String Msg_text) throws IOException
    {
        // FileWriter is used directly so that write failures reach the caller.
        try (FileWriter out = new FileWriter(File_Path, true))
        {
            out.write(String.valueOf(Msg_text));
        }
    }

    /**
     * Appends text followed by the platform line separator to the end of a
     * file, creating the file if needed.
     *
     * @param File_Path path of the file to append to
     * @param Msg_text  text to append; {@code null} is written as the string "null"
     * @throws IOException if the file cannot be opened or written
     */
    public void appendLine(String File_Path, String Msg_text) throws IOException
    {
        try (FileWriter out = new FileWriter(File_Path, true))
        {
            out.write(String.valueOf(Msg_text));
            out.write(System.lineSeparator());
        }
    }

    /**
     * Reads a whole file into one string. The lines are concatenated without
     * their line terminators.
     *
     * @param File_Path path of the file to read
     * @return the file content with line terminators removed; empty for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    public String readText(String File_Path) throws IOException
    {
        try (BufferedReader fin = new BufferedReader(new FileReader(File_Path)))
        {
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = fin.readLine()) != null)
            {
                content.append(line);
            }
            return content.toString();
        }
    }

    /**
     * Reads the first line of a file.
     *
     * @param File_Path path of the file to read
     * @return the first line without its terminator, or {@code null} if the file is empty
     * @throws IOException if the file cannot be opened or read
     */
    public String readLine(String File_Path) throws IOException
    {
        try (BufferedReader fin = new BufferedReader(new FileReader(File_Path)))
        {
            return fin.readLine();
        }
    }
}


/**
 * String-processing helper that finds quoted attribute values (for example
 * the targets of {@code href=}) in a page and appends the accepted ones to a
 * text file, one per line.
 */
class HtmlPro
{
    /** File that receives the links when the caller does not name one. */
    private static final String DEFAULT_URL_FILE = "D:\\Url.txt";

    /** The page text being scanned; never {@code null}. */
    private String HtmlStr;

    /**
     * @param Html_Str page text to scan; {@code null} is treated as an empty page
     */
    HtmlPro(String Html_Str)
    {
        this.HtmlStr = (Html_Str == null) ? "" : Html_Str;
    }

    /**
     * Scans the page with the naive matcher and appends the accepted links to
     * {@code D:\Url.txt}.
     *
     * @param sub_str pattern that precedes a quoted link, e.g. {@code "href="}
     * @return number of links written
     * @throws IOException if the link file cannot be written
     */
    public int strMatch(String sub_str) throws IOException
    {
        return strMatch(sub_str, DEFAULT_URL_FILE);
    }

    /**
     * Scans the page with the naive matcher. Every time the pattern is found,
     * the quoted value that follows it is extracted; values accepted by
     * {@link #checkUrl} are appended to the given file and counted. The count
     * is also printed to standard output.
     *
     * @param sub_str  pattern that precedes a quoted link, e.g. {@code "href="};
     *                 an empty pattern matches nothing
     * @param url_file file the links are appended to; it is not touched when
     *                 no link is accepted
     * @return number of links written
     * @throws IOException if the link file cannot be written
     */
    public int strMatch(String sub_str, String url_file) throws IOException
    {
        int main_index = 0, sub_index = 0, matchnum = 0;
        int main_len = HtmlStr.length();
        int sub_len = sub_str.length();
        StringBuilder links = new StringBuilder();

        while (sub_index < sub_len && main_index < main_len)
        {
            if (HtmlStr.charAt(main_index) == sub_str.charAt(sub_index))
            {
                main_index++;
                sub_index++;
                if (sub_index == sub_len)
                {
                    // Pattern complete: main_index now sits on the opening quote, if any.
                    if (recordLink(main_index, links))
                    {
                        matchnum++;
                    }
                    sub_index = 0;
                }
            }
            else
            {
                // Restart one character after the position where this attempt began.
                main_index = main_index - sub_index + 1;
                sub_index = 0;
            }
        }

        saveLinks(url_file, links);
        System.out.println("本页面共有" + matchnum + "条链接");
        return matchnum;
    }

    /**
     * Scans the page with the KMP matcher and appends the accepted links to
     * {@code D:\Url.txt}.
     *
     * @param sub_str pattern that precedes a quoted link, e.g. {@code "href="}
     * @return number of links written
     * @throws IOException if the link file cannot be written
     */
    public int strMatchKMP(String sub_str) throws IOException
    {
        return strMatchKMP(sub_str, DEFAULT_URL_FILE);
    }

    /**
     * Same contract as {@link #strMatch(String, String)}, but a mismatch
     * resumes from the pattern's next table instead of rescanning the page.
     *
     * @param sub_str  pattern that precedes a quoted link, e.g. {@code "href="};
     *                 an empty pattern matches nothing
     * @param url_file file the links are appended to; it is not touched when
     *                 no link is accepted
     * @return number of links written
     * @throws IOException if the link file cannot be written
     */
    public int strMatchKMP(String sub_str, String url_file) throws IOException
    {
        int main_index = 0, sub_index = 0, matchnum = 0;
        int main_len = HtmlStr.length();
        int sub_len = sub_str.length();
        Integer next[] = new Integer[sub_len];
        getNext(sub_str, next);
        StringBuilder links = new StringBuilder();

        while (sub_index < sub_len && main_index < main_len)
        {
            // sub_index == -1 means no prefix can be reused: step to the next page character.
            if (sub_index == -1 || HtmlStr.charAt(main_index) == sub_str.charAt(sub_index))
            {
                sub_index++;
                main_index++;
                if (sub_index == sub_len)
                {
                    // Pattern complete: main_index now sits on the opening quote, if any.
                    if (recordLink(main_index, links))
                    {
                        matchnum++;
                    }
                    sub_index = 0;
                }
            }
            else
            {
                sub_index = next[sub_index];
            }
        }

        saveLinks(url_file, links);
        System.out.println("本页面共有" + matchnum + "条链接");
        return matchnum;
    }

    /**
     * Extracts the quoted value starting at {@code quote_index} and, if
     * {@link #checkUrl} accepts it, queues it as one line in {@code links}.
     * Nothing is queued when the position is past the end of the page, is not
     * a single or double quote, or the quote is never closed. Relative links
     * are queued exactly as they appear in the page.
     *
     * @return {@code true} if a link was queued
     */
    private boolean recordLink(int quote_index, StringBuilder links)
    {
        if (quote_index >= HtmlStr.length())
        {
            return false;
        }
        char quote = HtmlStr.charAt(quote_index);
        if (quote != '"' && quote != '\'')
        {
            return false;
        }
        int closing = HtmlStr.indexOf(quote, quote_index + 1);
        if (closing < 0)
        {
            return false;
        }
        String link = HtmlStr.substring(quote_index + 1, closing);
        if (!checkUrl(link))
        {
            return false;
        }
        links.append(link).append(System.lineSeparator());
        return true;
    }

    /** Appends the queued lines to the file in one write; does nothing when no line was queued. */
    private static void saveLinks(String url_file, StringBuilder links) throws IOException
    {
        if (links.length() == 0)
        {
            return;
        }
        try (FileWriter out = new FileWriter(url_file, true))
        {
            out.write(links.toString());
        }
    }

    /**
     * Fills the KMP next table for a pattern: {@code next[0]} is -1 and
     * {@code next[i]} is the pattern index to resume from after a mismatch at
     * index {@code i}.
     *
     * @param sub_str pattern; may be empty, in which case nothing is written
     * @param next    table to fill; must have at least {@code sub_str.length()} slots
     */
    public static void getNext(String sub_str, Integer next[])
    {
        int len = sub_str.length();
        if (len == 0)
        {
            return;
        }
        next[0] = -1;
        if (len == 1)
        {
            return;
        }
        next[1] = 0;
        int i = 1, j = 0;
        // The last slot written is next[len - 1], so stop once i reaches it.
        while (i < len - 1)
        {
            if (j == -1 || sub_str.charAt(i) == sub_str.charAt(j))
            {
                i++;
                j++;
                next[i] = j;
            }
            else
            {
                j = next[j];
            }
        }
    }

    /**
     * Link filter. Rejects links containing {@code #}, and links containing
     * "javascript", ".png", ".gif", "css" or "zip" in any letter case. The
     * "css" and "zip" checks match anywhere in the link, not only as a file
     * extension.
     *
     * @param url link text to check; {@code null} is rejected
     * @return {@code true} if the link should be kept
     */
    public Boolean checkUrl(String url)
    {
        if (url == null || url.contains("#"))
        {
            return false;
        }
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String lower = url.toLowerCase(java.util.Locale.ROOT);
        if (lower.contains("javascript") || lower.contains(".png") || lower.contains(".gif"))
        {
            return false;
        }
        if (lower.contains("css") || lower.contains("zip"))
        {
            return false;
        }
        return true;
    }

    /**
     * Prints the elements of an array on one line, each followed by a space.
     *
     * @param Intval array to print
     */
    public static void printArry(Integer[] Intval)
    {
        for (int i = 0; i < Intval.length; i++)
        {
            System.out.print(Intval[i] + " ");
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值