基于 Java 正则表达式的网页解析

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

 

public class LinkParseManager extends AbstractPageManageStategy {

 private final Pattern TEXT_PATTERN;
 private final Pattern URL_PATTERN;
 //private List<URLFormat> urlQueue;
 List<String> urlHistory =null;

 public LinkParseManager(Reader reader,
   List<String> urlHistory) {
  this.urlHistory=urlHistory;
  this.links = new LinkedList<String>();
  //this.urlQueue = urlQueue;
  this.reader = reader;
  this.TEXT_PATTERN = Pattern.compile(
    "<a//b([^>]*)>(?:<[^>]+>)*(.*?)(?:<[^>]+>)*</a>",
    Pattern.CASE_INSENSITIVE);
  this.URL_PATTERN = Pattern.compile(
    "//bHREF//s*=//s*(/"([^/"]*)/"|'([^']*)'|([^'/">//s]+))",
    Pattern.CASE_INSENSITIVE);
 }

 public List<String> parseLinks() {
  //urlHistory.add(downloadingURL.getUrlStr());
  if(reader==null){return null;}
  try {
   String lineWords;
   BufferedReader br = new BufferedReader(reader);
   while ((lineWords = br.readLine()) != null) {
    this.parseLink(lineWords);
   }

  } catch (FileNotFoundException e) {
   e.printStackTrace();
  } catch (IOException e) {
   e.printStackTrace();
  }
  return links;
 }

 public void parseLink(String lineWords) {
  String sss="163";
   CharSequence cs =sss.subSequence(0, sss.length()-1);
  String  linkURL;
  Matcher m = TEXT_PATTERN.matcher(lineWords);
  if (m.find() && m.groupCount() > 1) {
   String url = m.group(1);
   Matcher m2 = URL_PATTERN.matcher(url);
   if (m2.find()) {//&& m2.group(1).contains("http")&& m2.group(1).contains("163.com")

    linkURL = m2.group(1).replaceAll("/"", "").replaceAll("'", "");
    linkURL=linkURL.trim();
    if(linkURL.startsWith("http://")&& linkURL.contains(cs)){
     int i=0;
     for(i=0;i<urlHistory.size();i++){
      String st=urlHistory.get(i);
      if(st.equals(linkURL)){
       break;
      }
     }
     if(i>=urlHistory.size()){
      urlHistory.add(linkURL);
      links.add(linkURL);
      System.out.println(linkURL);
     }
     
    }
   }
  }
 }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值