Java程序：读取学校大纲Word文档并存储到SQL数据库-CSDN博客

本文链接：https://blog.csdn.net/LiShiJi12345/article/details/149416044

下面是一个完整的Java程序，用于读取包含学校大纲结构的Word文档，并按模块标题（专业基本信息、培养目标等）提取内容，最后存储到SQL数据库中。

1. 系统设计

功能需求：

识别Word文档中的特定模块标题
提取每个模块下的文本内容和表格
将提取的内容存储到SQL数据库

数据库设计：

-- One row per syllabus module (section heading) extracted from the Word document.
-- utf8mb4 is declared explicitly so Chinese text is stored correctly regardless of
-- the server's default character set.
CREATE TABLE syllabus_modules (
    id INT AUTO_INCREMENT PRIMARY KEY,
    module_name VARCHAR(100) NOT NULL,
    -- MEDIUMTEXT instead of TEXT: TEXT is limited to 64 KB, which a long module can exceed.
    content MEDIUMTEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- One row per Word table, linked to the module it appeared under.
-- table_index is the 1-based position of the table within its module.
CREATE TABLE syllabus_tables (
    id INT AUTO_INCREMENT PRIMARY KEY,
    module_id INT NOT NULL,
    table_index INT NOT NULL,
    -- Tab/newline-delimited dump of the table; MEDIUMTEXT because large schedule
    -- tables can exceed the 64 KB limit of TEXT.
    table_content MEDIUMTEXT,
    FOREIGN KEY (module_id) REFERENCES syllabus_modules(id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

2. 完整Java实现

import org.apache.poi.xwpf.usermodel.*;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;

import java.io.FileInputStream;
import java.io.IOException;
import java.sql.*;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Reads a school syllabus from a Word (.docx) document, splits it into modules
 * by matching paragraph text against a fixed list of module titles, and stores
 * each module's text and tables in a MySQL database.
 *
 * <p>Module titles are assumed to be unique within a document: if the same
 * title is matched twice, the later occurrence replaces the earlier one.
 */
public class SyllabusParser {

    /**
     * Titles that start a new module. Some titles are prefixes of others
     * (e.g. "毕业要求" and "毕业要求与主要课程关系矩阵图"), so matching always
     * prefers the longest title; see {@link #matchModuleTitle(String)}.
     */
    private static final String[] MODULE_TITLES = {
            "专业基本信息", "培养目标", "毕业要求", "主干学科",
            "专业核心课程及简介", "课程体系", "毕业条件",
            "课程体系配置流程图", "专业课程设置及教学计划进度表",
            "毕业要求与主要课程关系矩阵图"
    };

    // Database connection settings; adjust to the target environment.
    private static final String DB_URL = "jdbc:mysql://localhost:3306/syllabus_db";
    private static final String DB_USER = "username";
    private static final String DB_PASSWORD = "password";

    /**
     * Entry point: parses the document and stores the result.
     *
     * @param args optional; {@code args[0]} is the path of the .docx file.
     *             Defaults to {@code school_syllabus.docx} in the working directory.
     */
    public static void main(String[] args) {
        String filePath = args.length > 0 ? args[0] : "school_syllabus.docx";

        try {
            // 1. Parse the Word document
            Map<String, ModuleContent> syllabusData = parseWordDocument(filePath);

            // 2. Persist to the database
            storeToDatabase(syllabusData);

            System.out.println("大纲文档解析并存储完成！");
        } catch (Exception e) {
            e.printStackTrace();
            // Signal failure to the caller/shell instead of exiting with status 0.
            System.exit(1);
        }
    }

    /**
     * Parses the Word document into modules.
     *
     * <p>Body elements are visited in document order. A paragraph that matches a
     * module title starts a new module; following paragraphs and tables are
     * attached to that module until the next title. Content that appears before
     * the first recognized title is ignored.
     *
     * @param filePath path of the .docx file
     * @return modules keyed by title, in the order they were first encountered
     * @throws IOException if the file cannot be opened or read
     */
    private static Map<String, ModuleContent> parseWordDocument(String filePath) throws IOException {
        Map<String, ModuleContent> result = new LinkedHashMap<>();

        try (FileInputStream fis = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(fis)) {

            String currentModule = null;
            StringBuilder currentContent = new StringBuilder();
            List<XWPFTable> currentTables = new ArrayList<>();

            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    String text = ((XWPFParagraph) element).getText().trim();
                    String matchedTitle = matchModuleTitle(text);

                    if (matchedTitle != null) {
                        // A new title closes the module collected so far.
                        if (currentModule != null) {
                            result.put(currentModule,
                                    new ModuleContent(currentContent.toString(), currentTables));
                            currentContent = new StringBuilder();
                            currentTables = new ArrayList<>();
                        }
                        currentModule = matchedTitle;
                    } else if (currentModule != null) {
                        // Ordinary paragraph: append to the current module, one per line.
                        if (currentContent.length() > 0) {
                            currentContent.append("\n");
                        }
                        currentContent.append(text);
                    }
                } else if (element instanceof XWPFTable && currentModule != null) {
                    currentTables.add((XWPFTable) element);
                }
            }

            // Flush the last module, which has no following title to close it.
            if (currentModule != null) {
                result.put(currentModule,
                        new ModuleContent(currentContent.toString(), currentTables));
            }
        }

        return result;
    }

    /**
     * Returns the module title that the given paragraph text starts with, or
     * {@code null} if it starts with none of them.
     *
     * <p>When several titles match, the longest one wins. Taking the first
     * match in array order would classify "毕业要求与主要课程关系矩阵图" as
     * "毕业要求" and "课程体系配置流程图" as "课程体系".
     */
    private static String matchModuleTitle(String text) {
        String best = null;
        for (String title : MODULE_TITLES) {
            if (text.startsWith(title) && (best == null || title.length() > best.length())) {
                best = title;
            }
        }
        return best;
    }

    /**
     * Replaces the database contents with the parsed modules.
     *
     * <p>Existing rows are deleted and the new rows inserted inside a single
     * transaction, so a failure leaves the previous data untouched.
     * {@code DELETE} is used rather than {@code TRUNCATE}: MySQL/InnoDB refuses
     * to truncate a table that is referenced by a foreign key, and TRUNCATE
     * commits implicitly and therefore cannot be rolled back. As a consequence
     * the AUTO_INCREMENT counters are not reset between runs.
     *
     * @param syllabusData modules to store, keyed by module title
     * @throws SQLException if any statement fails, or if the driver returns no
     *                      generated key for an inserted module
     */
    private static void storeToDatabase(Map<String, ModuleContent> syllabusData) throws SQLException {
        String moduleSql = "INSERT INTO syllabus_modules (module_name, content) VALUES (?, ?)";
        String tableSql = "INSERT INTO syllabus_tables (module_id, table_index, table_content) VALUES (?, ?, ?)";

        try (Connection conn = DriverManager.getConnection(DB_URL, DB_USER, DB_PASSWORD)) {
            conn.setAutoCommit(false);

            try (Statement clearStmt = conn.createStatement();
                 PreparedStatement moduleStmt =
                         conn.prepareStatement(moduleSql, Statement.RETURN_GENERATED_KEYS);
                 PreparedStatement tableStmt = conn.prepareStatement(tableSql)) {

                // Child table first so the foreign key is never violated.
                clearStmt.executeUpdate("DELETE FROM syllabus_tables");
                clearStmt.executeUpdate("DELETE FROM syllabus_modules");

                for (Map.Entry<String, ModuleContent> entry : syllabusData.entrySet()) {
                    String moduleName = entry.getKey();
                    ModuleContent content = entry.getValue();

                    moduleStmt.setString(1, moduleName);
                    moduleStmt.setString(2, content.getTextContent());
                    moduleStmt.executeUpdate();

                    int moduleId;
                    try (ResultSet keys = moduleStmt.getGeneratedKeys()) {
                        if (!keys.next()) {
                            // Without the id the module's tables cannot be linked;
                            // fail loudly rather than silently dropping them.
                            throw new SQLException(
                                    "No generated key returned for module: " + moduleName);
                        }
                        moduleId = keys.getInt(1);
                    }

                    List<XWPFTable> tables = content.getTables();
                    if (tables.isEmpty()) {
                        continue;
                    }

                    int tableIndex = 1; // 1-based position of the table within its module
                    for (XWPFTable table : tables) {
                        tableStmt.setInt(1, moduleId);
                        tableStmt.setInt(2, tableIndex++);
                        tableStmt.setString(3, convertTableToString(table));
                        tableStmt.addBatch();
                    }
                    tableStmt.executeBatch();
                }

                conn.commit();
            } catch (SQLException | RuntimeException e) {
                try {
                    conn.rollback();
                } catch (SQLException rollbackFailure) {
                    e.addSuppressed(rollbackFailure);
                }
                throw e;
            }
        }
    }

    /**
     * Flattens a table to text: each cell is followed by a tab and each row by
     * a newline.
     *
     * <p>Line breaks and tabs inside a cell are replaced with spaces so they
     * cannot be confused with the row and column delimiters.
     *
     * @param table the table to convert
     * @return the tab/newline-delimited text of the table
     */
    private static String convertTableToString(XWPFTable table) {
        StringBuilder sb = new StringBuilder();

        for (XWPFTableRow row : table.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                String cellText = cell.getText()
                        .replace('\n', ' ')
                        .replace('\r', ' ')
                        .replace('\t', ' ');
                sb.append(cellText).append("\t");
            }
            sb.append("\n");
        }

        return sb.toString();
    }

    /**
     * Holds the text and the tables collected for one module.
     */
    static class ModuleContent {
        private final String textContent;
        private final List<XWPFTable> tables;

        /**
         * @param textContent paragraphs of the module joined by newlines
         * @param tables      tables of the module in document order
         */
        public ModuleContent(String textContent, List<XWPFTable> tables) {
            this.textContent = textContent;
            this.tables = tables;
        }

        /** Returns the module's paragraph text, one paragraph per line. */
        public String getTextContent() {
            return textContent;
        }

        /** Returns the module's tables in document order. */
        public List<XWPFTable> getTables() {
            return tables;
        }
    }
}

3. 程序说明

3.1 主要功能

文档解析：
- 使用Apache POI读取Word文档
- 识别预定义的模块标题
- 收集每个模块下的文本内容和表格
数据库存储：
- 使用JDBC连接MySQL数据库
- 将文本内容存入syllabus_modules表
- 将表格内容存入syllabus_tables表
表格处理：
- 将表格转换为制表符分隔的文本格式存储
- 保留表格的行列结构信息

3.2 使用方法

准备MySQL数据库，执行提供的SQL创建表结构
修改数据库连接配置（DB_URL, DB_USER, DB_PASSWORD）
将Word文档放在项目目录下，或修改文件路径
运行程序

3.3 扩展功能

如果需要更复杂的处理，可以考虑以下扩展：

表格结构化存储：

-- Optional finer-grained storage: one row per table cell, addressed by row and column.
CREATE TABLE syllabus_table_cells (
    id INT AUTO_INCREMENT PRIMARY KEY,
    table_id INT NOT NULL,
    row_num INT NOT NULL,
    col_num INT NOT NULL,
    cell_content TEXT,
    FOREIGN KEY (table_id) REFERENCES syllabus_tables(id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

样式信息提取：

// Read style information from a paragraph (para is an XWPFParagraph):
// its style, alignment and indentation.
para.getStyle();
para.getAlignment();
para.getIndentation();

图片提取：

// Iterate over every picture embedded in the document
for (XWPFPictureData picture : document.getAllPictures()) {
    // Raw bytes of the picture
    byte[] data = picture.getData();
    // Save the picture to a file here
}

4. 注意事项

程序通过段落文本的开头来匹配预定义的模块标题，并不依赖Word的“标题”样式；如果文档中的标题带有编号前缀（如“一、培养目标”），或正文段落恰好以某个模块标题开头，则需要调整标题识别逻辑
对于复杂的表格结构，可能需要特殊处理合并单元格等情况
程序假设模块标题是唯一的，如果文档中有重复标题需要额外处理
数据库操作应考虑使用连接池提高性能（如HikariCP）

这个程序提供了完整的从Word文档解析到数据库存储的解决方案，可以根据实际需求进行调整和扩展。