package testData.collect; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.Reader; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class LinksParse extends AbstractPageManageStategy { private final Pattern TEXT_PATTERN; private final Pattern URL_PATTERN; public LinksParse(Reader reader) { this.links = new LinkedList<String>(); //this.urlQueue = urlQueue; this.reader = reader; this.TEXT_PATTERN = Pattern.compile( "<a//b([^>]*)>(?:<[^>]+>)*(.*?)(?:<[^>]+>)*</a>", Pattern.CASE_INSENSITIVE); this.URL_PATTERN = Pattern.compile( "//s">//bHREF//s*=//s*(/"([^/"]*)/"|'([^']*)'|([^'/">//s]+))", Pattern.CASE_INSENSITIVE); } public List<String> parseLinks() { //urlHistory.add(downloadingURL.getUrlStr()); if(reader==null){return null;} try { String lineWords; BufferedReader br = new BufferedReader(reader); while ((lineWords = br.readLine()) != null) { //System.out.println("<><><><><><>"); //if(lineWords.length()){} this.parseLink(lineWords); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return links; } public void parseLink(String lineWords) { String linkURL; Matcher m = TEXT_PATTERN.matcher(lineWords); if (m.find() && m.groupCount() > 1) { String url = m.group(1); Matcher m2 = URL_PATTERN.matcher(url); if (m2.find()) {//&& m2.group(1).contains("http")&& m2.group(1).contains("163.com") linkURL = m2.group(1).replaceAll("/"", "").replaceAll("'", ""); linkURL=linkURL.trim(); if(linkURL.startsWith("http://")){ String[] urls =linkURL.split("http"); for(int j=0;j<urls.length;j++){ String s="http"+urls[j]; s.trim(); links.add(s); } //System.out.println(linkURL); } } } }}
基于 Java 正则表达式的网页解析
最新推荐文章于 2024-09-08 13:15:25 发布