Set<String> textList = new HashSet<>();
Pattern pattern = Pattern.compile("[a-zA-Z]+"); // 匹配英文字母
String folderPath = "C:/english-wordlists-master/english-wordlists-master/"; //文件夹目录
File folder = new File(folderPath);
if (folder.exists() && folder.isDirectory()) {
File[] files = folder.listFiles();
if (files != null){
for (File file : files) {
//判断是否是txt文件
if (file.isFile() && file.getName().endsWith(".txt")) {
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
String line;
while ((line = reader.readLine()) != null) {
Matcher matcher = pattern.matcher(line);
while (matcher.find()) {
textList.add(matcher.group());
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
//判断是否是csv文件
if (file.isFile() && file.getName().endsWith(".csv")){
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
String line;
while ((line = reader.readLine()) != null) {
// 分割CSV行
String[] parts = line.split(",");
// 提取每个部分中的英文单词
for (String part : parts) {
Matcher matcher = pattern.matcher(part);
while (matcher.find()) {
String word = matcher.group();
textList.add(word);
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
//判断是否是docx文件
if (file.isFile() && file.getName().endsWith(".docx")){
try (FileInputStream fis = new FileInputStream(file)) {
XWPFDocument document = new XWPFDocument(fis);
// 遍历文档中的段落
for (XWPFParagraph paragraph : document.getParagraphs()) {
String text = paragraph.getText();
Matcher matcher = pattern.matcher(text);
// 提取段落中的英文单词
while (matcher.find()) {
String word = matcher.group();
textList.add(word);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
//判断是否是pdf文件
if (file.isFile() && file.getName().endsWith(".pdf")){
try (PDDocument document = PDDocument.load(new FileInputStream(file))) {
PDFTextStripper textStripper = new PDFTextStripper();
// 获取PDF文档中的文本内容
String text = textStripper.getText(document);
Matcher matcher = pattern.matcher(text);
// 提取文本中的英文单词
while (matcher.find()) {
String word = matcher.group();
textList.add(word);
}
} catch (Exception e) {
e.printStackTrace();
}
}
//判断是否是xlsl文件
if (file.isFile() && file.getName().endsWith(".xlsx")){
try (FileInputStream fis = new FileInputStream(file)) {
Workbook workbook = WorkbookFactory.create(fis);
for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
Sheet sheet = workbook.getSheetAt(i);
// 遍历行
for (Row row : sheet) {
// 遍历单元格
for (Cell cell : row) {
// 获取单元格内容并提取英文单词
String cellValue = cell.toString();
if (cellValue != null) {
Matcher matcher = pattern.matcher(cellValue);
while (matcher.find()) {
String word = matcher.group();
textList.add(word);
}
}
}
}
}
} catch (IOException | InvalidFormatException e) {
e.printStackTrace();
}
}
}
}
}else{
System.out.println("指定的文件夹路径不存在或不是一个文件夹。");
}
//入库
englishWordlistsMapper.batchEnglishWordlists(textList);
11-24
03-12
729
![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)
10-24
1万+
![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)
01-27
“相关推荐”对你有帮助么?
-
非常没帮助
-
没帮助
-
一般
-
有帮助
-
非常有帮助
提交