- /**
- * 得到文件所在的磁盘目录
- * @param file
- * @return
- */
- public static String getFileDirectory(String file){
- String regEx = "[a-zA-z]{1,4}:.*[\\\\/]";
- String dir = "";
- Pattern p=Pattern.compile(regEx);
- Matcher m=p.matcher(file);
- if(m.find()){
- dir = m.group(m.groupCount());
- }
- return dir;
- }
- /**
- * 得到文件名
- * @param file
- * @return
- */
- public static String getFileName(String file){
- String regEx =".+[\\\\|/](.+)$";
- String fileName = "";
- Pattern p=Pattern.compile(regEx);
- Matcher m=p.matcher(file);
- if(m.find()){
- fileName = m.group(m.groupCount());
- }
- return fileName;
- }
- /**
- * 得到文件扩展名
- * @param file
- * @return
- */
- public static String getFileExtName(String file){
- String regEx = ".*\\.";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(file);
- String extName = m.replaceAll("");
- return extName;
- }
- /**
- * 当在模式中存在用括号括起来的组时,可以分别检索每个组的匹配值。从最左边的组开始编为1,
- * 然后依次对每对括号相对应的组进行编号。在下面的模式中,第一组是协议(如http),第二组是域名。
- * 为了在匹配的字符串中访问组,可以使用Matcher的group方法。
- */
- public static void getMatchGroup(String data) {
- String urlString = "(http|https|ftp)://([a-zA-Z0-9-\\.]+)[/\\w\\.\\-\\+\\?%=&;:,#]*";
- Pattern urlPattern = Pattern.compile(urlString);
- Matcher urlMatcher = urlPattern.matcher(data);
- while (urlMatcher.find()) {
- String domain = urlMatcher.group(2); // 2nd group is the domain
- System.out.println(domain);
- }
- }
- /**
- * 在一个模式内引用一个以前的匹配组称为逆向引用(backreference),简化模式书写。
- * 为了对第三个组进行逆向引用,在模式中包括\3即可。这将会只匹配一个与以前的组相匹配的严格重复的数据。
- * eg.String data = " The the water molecules are made of of hydrogen and oxygen";
- */
- public static void getBackReferencesGroup(String data) {
- //该模式匹配情况如下:一个空白字符、特殊的单词列表中的一个单词、更多的空白、
- //再次重复的相同的单词(使用\1对第一个组进行逆向引用)以及空白符或标点符号。
- String patternStr = "\\s(of|or|the|to)\\s+\\1[\\s\\.,;]";
- Pattern wordPattern = Pattern.compile(patternStr,
- Pattern.CASE_INSENSITIVE);//不区分大小写
- Matcher wordMatcher = wordPattern.matcher(data);
- while (wordMatcher.find()) {
- int start = wordMatcher.start();
- String word = wordMatcher.group(1);
- System.out.println("Repeated " + word + " starting at " + start);
- }
- }
- /**
- * 取大括号内的内容
- * @param inputStr
- * @return
- */
- public static String getBraceContent(String inputStr){
- Pattern pattern = Pattern.compile("(?<=\\{)[^\\{\\}]*(?=\\})", 2);
- Matcher matcher = pattern.matcher(inputStr);
- StringBuffer sb = new StringBuffer();
- String temp;
- while(matcher.find()){
- temp = inputStr.substring(matcher.start(), matcher.end());
- sb.append(temp+"\n");
- }
- return sb.toString();
- }
- /**
- * 得到html标签的属性
- * @param html 文件内容
- * @param label 要提取属性的标签名称,如:font ,img...
- */
- public static void getHtmlAttribute(String html,String label){
- Map mapAttrib = new HashMap();
- String regEx = "<"+label+"\\s*([^>]*)\\s*>";
- String regEx2 = "([a-z]+)\\s*=\\s*\"([^\"]+)\"";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(html);
- if(m.find()){
- String attribs = m.group(1);
- p = Pattern.compile(regEx2);
- m = p.matcher(attribs);
- while(m.find()){
- mapAttrib.put(m.group(1), m.group(2));
- }
- }
- printMapData(mapAttrib);
- }
- public static void printMapData(Map map){
- Set entries = map.entrySet();
- Iterator iter = entries.iterator();
- while(iter.hasNext())
- {
- Map.Entry entry = (Map.Entry)iter.next();
- System.out.println(entry.getKey()+"="+entry.getValue());
- }
- }
- /**
- * 使用Jacob工具包完成word到html的转换
- * @param absPath 文件绝对路径
- */
- public static boolean wordFormatToHtml(String absPath) throws ProgramException{
- String FileFormat = "";
- FileFormat = getFileExtName(absPath);//文件类型
- if(FileFormat.equalsIgnoreCase("doc"))
- {
- String DocFile = absPath;
- //word文件的完整路径
- String HtmlFile = DocFile.substring(0, (DocFile.length() - 4)) + ".htm";
- //html文件的完整路径
- ActiveXComponent app = new ActiveXComponent("Word.Application");
- //启动word
- try{
- app.setProperty("Visible", new Variant(false));
- //设置word程序非可视化运行
- Dispatch docs = app.getProperty("Documents").toDispatch();
- Dispatch doc = Dispatch.invoke(docs,"Open", Dispatch.Method, new Object[]{DocFile,new Variant(false), new Variant(true)}, new int[1]).toDispatch();
- //打开word文件
- Dispatch oWordBasic = (Dispatch) Dispatch.call(app, "WordBasic").getDispatch();
- Dispatch.call(oWordBasic, "AcceptAllChangesInDoc");
- Dispatch.invoke(doc,"SaveAs",Dispatch.Method, new Object[]{HtmlFile,new Variant(8)}, new int[1]);
- //作为htm格式保存文件
- Dispatch.call(doc, "Close",new Variant(false));
- //关闭文件
- }
- catch (Exception e)
- {
- throw new ProgramException("error$Word转换为HTML时出错!");
- }
- finally
- {
- app.invoke("Quit", new Variant[] {});
- //退出word程序
- }
- //转化完毕
- return true;
- }
- return false;
- }
- /**
- * 逐行读取HTML文件内容
- * @param filePath HTML文件的路径
- * @return
- * @throws ProgramException
- */
- public static String getHTMLContent(String filePath) throws ProgramException{
- StringBuffer sb=new StringBuffer();
- try{
- String line="";
- File file=new File(filePath);
- InputStreamReader read = new InputStreamReader (new FileInputStream(file));
- BufferedReader br=new BufferedReader(read);
- while((line=br.readLine())!=null){
- sb.append(line);
- sb.append('\n');//注意换行符写入
- }
- }catch(FileNotFoundException e){
- throw new ProgramException("error$读HTML文件时,文件没有找到");
- }catch(IOException e){
- throw new ProgramException("error$读HTML文件时,出现IO异常");
- }
- String temp=sb.toString();
- //不管图片
- String regEx = "<img\\s*([^>]*)\\s*>";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(temp);
- temp=m.replaceAll("");
- String regEx2 = "<v:imagedata\\s*([^>]*)\\s*>";
- Pattern p2 = Pattern.compile(regEx2);
- Matcher m2 = p2.matcher(temp);
- temp=m2.replaceAll("");
- temp = temp.replace("\'", "\"");
- return temp;
- }
/**
 * Returns the directory part of a drive-qualified path, including the
 * trailing separator.
 *
 * <p>The match begins at a drive-like prefix (one to four ASCII letters
 * followed by a colon) and extends through the last {@code \} or {@code /}
 * found on the same line.
 *
 * @param file path to inspect, e.g. {@code C:\docs\a.txt}
 * @return the directory with its trailing separator, or an empty string when
 *         no drive prefix followed by a separator is found
 */
public static String getFileDirectory(String file){
    // The drive prefix must be letters only. The previous class [a-zA-z]
    // spanned the ASCII gap between 'Z' and 'a' and so also accepted
    // '[', '\', ']', '^', '_' and '`'.
    String regEx = "[a-zA-Z]{1,4}:.*[\\\\/]";
    Matcher m = Pattern.compile(regEx).matcher(file);
    // The pattern has no capturing groups, so the whole match is the result.
    return m.find() ? m.group() : "";
}
/**
 * Returns the file-name part of a path, i.e. everything after the last
 * {@code \} or {@code /}.
 *
 * @param file path to inspect, e.g. {@code C:\docs\a.txt} or {@code /tmp/a.txt}
 * @return the text following the last separator, or an empty string when the
 *         path contains no separator
 */
public static String getFileName(String file){
    // Only '\' and '/' are separators. The previous class [\\|/] also treated
    // a literal '|' as one, and the leading ".+" rejected paths that start
    // with the separator (e.g. "/a.txt").
    String regEx = ".*[\\\\/](.+)$";
    Matcher m = Pattern.compile(regEx).matcher(file);
    // Group 1 is the only capturing group: the text after the last separator.
    return m.find() ? m.group(1) : "";
}
/**
 * Returns the extension of a file name or path, without the leading dot.
 *
 * <p>Only a dot that appears after the last {@code \} or {@code /} counts, so
 * dots in directory names are ignored.
 *
 * @param file file name or path, e.g. {@code C:\docs\a.doc}
 * @return the text after the last dot of the final path segment, or an empty
 *         string when that segment has no dot
 */
public static String getFileExtName(String file){
    int lastDot = file.lastIndexOf('.');
    int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
    // No dot at all, or the last dot belongs to a directory name: the previous
    // regex-based version returned the whole input (or "dir\file") here.
    if (lastDot < 0 || lastDot < lastSeparator) {
        return "";
    }
    return file.substring(lastDot + 1);
}
/**
 * Demonstrates capturing groups: prints the host part of every URL found in
 * the input, one per line.
 *
 * <p>Groups are numbered from 1 by the position of their opening parenthesis,
 * left to right. In the pattern used here group 1 is the protocol
 * (http, https or ftp) and group 2 is the domain; {@code Matcher.group(int)}
 * retrieves a group's text from the current match.
 *
 * @param data text to scan for URLs
 */
public static void getMatchGroup(String data) {
    final Matcher urls = Pattern
            .compile("(http|https|ftp)://([a-zA-Z0-9-\\.]+)[/\\w\\.\\-\\+\\?%=&;:,#]*")
            .matcher(data);
    for (boolean found = urls.find(); found; found = urls.find()) {
        // Group 2 holds the domain of the URL just matched.
        System.out.println(urls.group(2));
    }
}
/**
 * Demonstrates backreferences: reports immediately repeated words
 * ("of", "or", "the", "to") in the input.
 *
 * <p>Referring to an earlier capturing group from inside the same pattern is
 * called a backreference; for example {@code \3} refers to the third group and
 * matches only an exact repetition of what that group captured.
 *
 * <p>Example input:
 * {@code " The the water molecules are made of of hydrogen and oxygen"}
 *
 * @param data text to scan for doubled words
 */
public static void getBackReferencesGroup(String data) {
    // The pattern reads: one whitespace character, one word from the fixed
    // list, more whitespace, the same word again (\1 refers back to group 1),
    // then whitespace or a punctuation mark. Matching ignores case.
    final Matcher doubled = Pattern
            .compile("\\s(of|or|the|to)\\s+\\1[\\s\\.,;]", Pattern.CASE_INSENSITIVE)
            .matcher(data);
    for (boolean found = doubled.find(); found; found = doubled.find()) {
        final int offset = doubled.start();
        final String repeated = doubled.group(1);
        System.out.println("Repeated " + repeated + " starting at " + offset);
    }
}
/**
 * Extracts the text enclosed in curly braces.
 *
 * <p>Each match is a run of characters containing neither brace that is
 * directly preceded by <code>{</code> and directly followed by
 * <code>}</code>; the braces themselves are not part of the match. For nested
 * braces only the innermost content qualifies.
 *
 * @param inputStr text to scan
 * @return every match in order of appearance, each followed by a newline
 *         character; an empty string when nothing matches
 */
public static String getBraceContent(String inputStr){
    // The flag value is the same one the pattern has always been compiled with.
    final Matcher braces = Pattern
            .compile("(?<=\\{)[^\\{\\}]*(?=\\})", Pattern.CASE_INSENSITIVE)
            .matcher(inputStr);
    final StringBuilder collected = new StringBuilder();
    while (braces.find()) {
        collected.append(braces.group()).append("\n");
    }
    return collected.toString();
}
/**
 * Prints the attributes of the first occurrence of an HTML tag.
 *
 * <p>Only attributes written as {@code name="value"} are recognised: the name
 * must consist of lower-case ASCII letters and the value must be a non-empty
 * double-quoted string. The collected pairs are handed to
 * {@code printMapData}.
 *
 * @param html  HTML content to search
 * @param label name of the tag whose attributes are extracted, e.g.
 *              {@code font} or {@code img}
 */
public static void getHtmlAttribute(String html,String label){
    Map<String, String> mapAttrib = new HashMap<String, String>();
    // The label is quoted so that any regex metacharacter in it is matched
    // literally, and \b keeps a short label such as "p" from matching a longer
    // tag name such as "<pre ...>".
    Pattern tagPattern = Pattern.compile("<" + Pattern.quote(label) + "\\b\\s*([^>]*)\\s*>");
    Pattern attributePattern = Pattern.compile("([a-z]+)\\s*=\\s*\"([^\"]+)\"");
    Matcher tag = tagPattern.matcher(html);
    // Only the first matching tag is examined.
    if (tag.find()) {
        Matcher attribute = attributePattern.matcher(tag.group(1));
        while (attribute.find()) {
            mapAttrib.put(attribute.group(1), attribute.group(2));
        }
    }
    printMapData(mapAttrib);
}
/**
 * Prints every entry of the map to standard output as {@code key=value},
 * one entry per line, in the map's own iteration order.
 *
 * @param map map whose entries are printed
 */
public static void printMapData(Map map){
    for (Object item : map.entrySet()) {
        Map.Entry pair = (Map.Entry) item;
        System.out.println(pair.getKey() + "=" + pair.getValue());
    }
}
/**
 * Converts a Word document to HTML using the Jacob toolkit.
 *
 * <p>Only paths whose extension, as reported by {@code getFileExtName}, equals
 * "doc" (ignoring case) are processed. The output path is the input path with
 * its last four characters replaced by ".htm". All tracked changes are
 * accepted before saving, and Word is asked to quit whether or not the
 * conversion succeeds.
 *
 * @param absPath absolute path of the Word file
 * @return {@code true} when a conversion was performed, {@code false} when the
 *         extension is not "doc"
 * @throws ProgramException if any step of the conversion throws
 */
public static boolean wordFormatToHtml(String absPath) throws ProgramException{
    final String extension = getFileExtName(absPath); // file type
    if (!extension.equalsIgnoreCase("doc")) {
        return false;
    }

    // Full path of the HTML file: strip the four trailing characters
    // (presumably ".doc") and append ".htm".
    final String htmlPath = absPath.substring(0, (absPath.length() - 4)) + ".htm";

    // Start Word.
    final ActiveXComponent word = new ActiveXComponent("Word.Application");
    try {
        // Run Word without showing its window.
        word.setProperty("Visible", new Variant(false));

        // Open the Word file.
        // NOTE(review): the two boolean arguments look like Word's
        // ConfirmConversions=false / ReadOnly=true - confirm against the Word API.
        final Dispatch documents = word.getProperty("Documents").toDispatch();
        final Object[] openArgs = {absPath, new Variant(false), new Variant(true)};
        final Dispatch document =
                Dispatch.invoke(documents, "Open", Dispatch.Method, openArgs, new int[1]).toDispatch();

        final Dispatch wordBasic = (Dispatch) Dispatch.call(word, "WordBasic").getDispatch();
        Dispatch.call(wordBasic, "AcceptAllChangesInDoc");

        // Save the file in htm format (format code 8).
        final Object[] saveArgs = {htmlPath, new Variant(8)};
        Dispatch.invoke(document, "SaveAs", Dispatch.Method, saveArgs, new int[1]);

        // Close the file.
        Dispatch.call(document, "Close", new Variant(false));
    } catch (Exception e) {
        throw new ProgramException("error$Word转换为HTML时出错!");
    } finally {
        // Quit Word.
        word.invoke("Quit", new Variant[] {});
    }

    // Conversion finished.
    return true;
}
/**
 * Reads an HTML file line by line and returns its content with image tags
 * removed.
 *
 * <p>The file is decoded with the platform default charset. Every line is
 * terminated with {@code '\n'} in the result. After reading, all
 * {@code <img ...>} and {@code <v:imagedata ...>} tags are deleted and every
 * single quote is replaced by a double quote.
 *
 * @param filePath path of the HTML file
 * @return the processed file content
 * @throws ProgramException if the file does not exist or cannot be read
 */
public static String getHTMLContent(String filePath) throws ProgramException{
    StringBuffer sb = new StringBuffer();
    BufferedReader br = null;
    try {
        File file = new File(filePath);
        br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append('\n'); // readLine() strips the terminator, so add it back
        }
    } catch (FileNotFoundException e) {
        throw new ProgramException("error$读HTML文件时,文件没有找到");
    } catch (IOException e) {
        throw new ProgramException("error$读HTML文件时,出现IO异常");
    } finally {
        // Always release the file handle; previously the reader was never closed.
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // Best effort: the content has already been read or a more
                // relevant exception is already propagating.
            }
        }
    }

    String temp = sb.toString();
    // Images are not wanted: drop <img> tags and <v:imagedata> tags.
    temp = Pattern.compile("<img\\s*([^>]*)\\s*>").matcher(temp).replaceAll("");
    temp = Pattern.compile("<v:imagedata\\s*([^>]*)\\s*>").matcher(temp).replaceAll("");
    // Normalise single quotes to double quotes.
    temp = temp.replace("\'", "\"");
    return temp;
}
说明:
特殊构造(非捕获)
(?:X) X,作为非捕获组
(?idmsux-idmsux) Nothing,但是将连字符前列出的匹配标志(i d m s u x)打开,将连字符后列出的匹配标志关闭
(?idmsux-idmsux:X) X,作为带有给定标志 on - off 的非捕获组
(?=X) X,通过零宽度的正 lookahead
(?!X) X,通过零宽度的负 lookahead
(?<=X) X,通过零宽度的正 lookbehind
(?<!X) X,通过零宽度的负 lookbehind
(?>X) X,作为独立的非捕获组
public static Pattern compile(String regex,int flags);
参数:
regex - 要编译的表达式。
flags - 匹配标志,可能包括 CASE_INSENSITIVE、MULTILINE、DOTALL、UNICODE_CASE、CANON_EQ、UNIX_LINES、LITERAL 和 COMMENTS 的位掩码。
Pattern中的定义如下:
- public static final int UNIX_LINES = 0x01;
- public static final int CASE_INSENSITIVE = 0x02;
- public static final int COMMENTS = 0x04;
- public static final int MULTILINE = 0x08;
- public static final int LITERAL = 0x10;
- public static final int DOTALL = 0x20;
- public static final int UNICODE_CASE = 0x40;
- public static final int CANON_EQ = 0x80;
public static final int UNIX_LINES = 0x01;
public static final int CASE_INSENSITIVE = 0x02;
public static final int COMMENTS = 0x04;
public static final int MULTILINE = 0x08;
public static final int LITERAL = 0x10;
public static final int DOTALL = 0x20;
public static final int UNICODE_CASE = 0x40;
public static final int CANON_EQ = 0x80;
资源:
1.java.util.regex类 Pattern