使用正则表达式提取 HTML 列表(<li>)中的数据
适用于类似 <li>中文English</li> 这种有规律的数据
package com.regex;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demonstrates extracting structured data from HTML list items with a regular expression.
 *
 * <p>Reads {@code src/com/regex/sample.html} as UTF-8 and, for every {@code <li>} element whose
 * text is 1-9 CJK ideographs (U+4E00..U+9FA5) immediately followed by one or more ASCII letters,
 * prints the whole match and then the two captured parts joined by {@code -}.
 */
public class Regex {

    /**
     * Location of the sample file. The path is relative, so it is resolved against the JVM
     * working directory (when launched from an IDE this is normally the project root).
     */
    private static final String SAMPLE_PATH = "src/com/regex/sample.html";

    /**
     * Matches one list item. Group 1 captures the 1-9 CJK ideographs, group 2 the ASCII letters
     * that follow them. Compiled once because a {@link Pattern} is immutable and reusable.
     */
    private static final Pattern LIST_ITEM =
            Pattern.compile("<li>([\\u4e00-\\u9fa5]{1,9})([a-zA-Z]+)</li>");

    /**
     * Program entry point: loads the sample file and prints every list item that matches.
     *
     * <p>If the file is missing or cannot be read, a message is written to standard error and
     * nothing is printed to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        final String html;
        try {
            html = readAll(SAMPLE_PATH);
        } catch (IOException e) {
            // Also covers FileNotFoundException. Without the file content there is nothing
            // meaningful to scan, so report the problem and stop instead of matching on
            // empty or partial input.
            System.err.println("Cannot read " + SAMPLE_PATH + ": " + e);
            return;
        }
        printMatches(html);
    }

    /**
     * Reads a whole text file as UTF-8, terminating every line with {@code '\n'}.
     *
     * @param path file to read
     * @return the file content with line terminators normalised to {@code '\n'}
     * @throws IOException if the file cannot be opened or read
     */
    private static String readAll(String path) throws IOException {
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader and the underlying stream even when
        // readLine() fails part-way through.
        try (FileInputStream in = new FileInputStream(path);
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

    /**
     * Prints, for each list item found in {@code html}, the full match on one line and
     * {@code <ideographs>-<letters>} on the next.
     *
     * @param html text to scan
     */
    private static void printMatches(String html) {
        Matcher matcher = LIST_ITEM.matcher(html);
        while (matcher.find()) {
            // group(0) always exists after a successful find(): it is the entire match.
            System.out.println(matcher.group(0));
            // The captured parts are what a real program would store or process further.
            String chinese = matcher.group(1);
            String english = matcher.group(2);
            System.out.println(chinese + "-" + english);
        }
    }
}
提取结果: