使用正则表达式及字符串操作,抽取网页信息
-
- /* 去script */
- public static String trimScript(String content) {
- String regEx = "<script[^>]*>[^<]+</script>";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(content);
- String result = content;
- if (m.find()) {
- result = m.replaceAll("");
- }
- return result;
- }
- /* 去除注释*/
- public static String trimComment(String content) {
- String regEx = "<!--[^-]*-->";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(content);
- String result = content;
- if (m.find()) {
- result = m.replaceAll("");
- }
- return result;
- }
- /* 去除标签 */
- public static String trimTag(String content) {
- String regEx = "<[^>]+>";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(content);
- String result = content;
- if (m.find()) {
- result = m.replaceAll("");
- }
- result = result.replace(" ", "").replace(">", "").replace(
- ">", "");
- return result;
- }
- /* 根据起始位置和结束位置,截取字符串 */
- public static String subString(String start, String end, String content) {
- int iStart = content.indexOf(start);
- int iEnd = content.indexOf(end);
- if (iStart < iEnd) {
- return content.substring(iStart, iEnd);
- }
- return null;
- }