需求:使用正则表达式,将问答式对话文本切分为"问题"和"回答"两部分。
txt文件内容:
代码如下:
package com.test;
import com.hankcs.hanlp.HanLP;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DataPreProcess {
/**
* 封装正则表达式处理的方法
* @param line
* @param regex
* @return
* @throws IOException
*/
private static ArrayList<String> lineProcess(String line,String regex) throws IOException {
Pattern pattern=Pattern.compile(regex);
Matcher matcher=pattern.matcher(line);
ArrayList<String> str=new ArrayList<>();
while (matcher.find()){
str.add(line.substring(matcher.start(),matcher.end()));
}
return str;
}
/**
* 封装读取TXT文件内容的方法
* @param filePath
* @return
*/
public static String readTxtFile(String filePath) {
String encoding = "GBK";
String result="";
try{
File file = new File(filePath);
if (file.isFile() && file.exists()) {//判断文件是否存在
InputStreamReader read = new InputStreamReader(new FileInputStream(file), encoding);
BufferedReader bufferedReader = new BufferedReader(read);
String lineTxt = "";
while ((lineTxt = bufferedReader.readLine()) != null) {
result+=lineTxt;
}
read.close();
} else {
result="找不到指定文件";
}
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
public static void main(String[] args)throws Exception{
String line=readTxtFile("C:\\Users\\Administrator\\Desktop\\bilu.txt");
/*方式一(第一时间想到的,比较不通用):
String regexAsk="(?<=问:)([^ ]+?)(?=答:)";
ArrayList<String> askList=lineProcess(line,regexAsk);
String regexAnwser="(?<=答:)([^ ]+?)[(?<=问:)(?<=。)]";
ArrayList<String> anwserList=lineProcess(line,regexAnwser);
for (String s:askList){
System.out.println(s);
}
for (String s:anwserList){
System.out.println(s);
}*/
//方式二:
String regx="问:([^答]*)答:([^问]*)";
ArrayList<String> askList=new ArrayList<>();
ArrayList<String> anwserList=new ArrayList<>();
Matcher matcher=Pattern.compile(regx).matcher(line);
while(matcher.find()){
askList.add(matcher.group(1));
anwserList.add(matcher.group(2));
}
for (String s:askList){
System.out.println(s);
}
for (String s:anwserList){
System.out.println(s);
}
}
}
切割效果: