// assume urlMatcher instance as in the previous example while (urlMatcher.find()) { int startIndex = urlMatcher.start(); int endIndex = urlMatcher.end(); String currentMatch = data.substring(startIndex, endIndex); // the brute force approach, using a new pattern! Pattern restricted = Pattern.compile(".*(abc|cbs|nbc)//.com.*"); Matcher restrictMatcher = restricted.matcher(currentMatch); if (!restrictMatcher.matches()) { System.out.println(currentMatch); } }
String data = getStringData(); // load the document String urlString = "(http|https|ftp)://([a-zA-Z0-9-//.]+)[///w//.//-//+//?%=&;:,#]*"; Pattern urlPattern = Pattern.compile(urlString); Matcher urlMatcher = urlPattern.matcher(data); // print out the domain from each URL while (urlMatcher.find()) { String domain = urlMatcher.group(2); // 2nd group is the domain System.out.println(domain); }
// Reports accidentally doubled words ("of of", "the the", ...) in the document.
String data = getStringData();

// Reading left to right: one whitespace character, one of the listed words
// (captured as group 1), one or more whitespace characters, the same word
// again via the \1 backreference, then whitespace or one of ". , ;" so that
// a longer word starting with the same letters does not count as a repeat.
String patternStr = "\\s(of|or|the|to)\\s+\\1[\\s\\.,;]";
Pattern wordPattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
Matcher wordMatcher = wordPattern.matcher(data);

while (wordMatcher.find()) {
    // start() is the offset of the whole match, which begins at the
    // whitespace character preceding the first occurrence of the word.
    int start = wordMatcher.start();
    String word = wordMatcher.group(1);
    // print the index location of the repeated word
    System.out.println("Repeated " + word + " starting at " + start);
}