基于 Java 正则表达式的网页解析

 package testData.collect; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.Reader; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern;    public class LinksParse extends AbstractPageManageStategy {   private final Pattern TEXT_PATTERN;  private final Pattern URL_PATTERN;     public LinksParse(Reader reader) {     this.links = new LinkedList<String>();   //this.urlQueue = urlQueue;   this.reader = reader;   this.TEXT_PATTERN = Pattern.compile(     "<a//b([^>]*)>(?:<[^>]+>)*(.*?)(?:<[^>]+>)*</a>",     Pattern.CASE_INSENSITIVE);   this.URL_PATTERN = Pattern.compile(     "//s">//bHREF//s*=//s*(/"([^/"]*)/"|'([^']*)'|([^'/">//s]+))",     Pattern.CASE_INSENSITIVE);  }   public List<String> parseLinks() {   //urlHistory.add(downloadingURL.getUrlStr());   if(reader==null){return null;}   try {    String lineWords;    BufferedReader br = new BufferedReader(reader);    while ((lineWords = br.readLine()) != null) {     //System.out.println("<><><><><><>");     //if(lineWords.length()){}     this.parseLink(lineWords);    }    } catch (FileNotFoundException e) {    e.printStackTrace();   } catch (IOException e) {    e.printStackTrace();   }   return links;  }   public void parseLink(String lineWords) {      String  linkURL;   Matcher m = TEXT_PATTERN.matcher(lineWords);   if (m.find() && m.groupCount() > 1) {    String url = m.group(1);    Matcher m2 = URL_PATTERN.matcher(url);    if (m2.find()) {//&& m2.group(1).contains("http")&& m2.group(1).contains("163.com")      linkURL = m2.group(1).replaceAll("/"", "").replaceAll("'", "");     linkURL=linkURL.trim();     if(linkURL.startsWith("http://")){      String[] urls =linkURL.split("http");      for(int j=0;j<urls.length;j++){       String s="http"+urls[j];       s.trim();       links.add(s);             }                     //System.out.println(linkURL);     }    }   }  }}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值