使用 Open XML 获取 Word 文件标题及所需的关键字

最新推荐文章于 2023-09-23 13:19:41 发布

since

最新推荐文章于 2023-09-23 13:19:41 发布

阅读量760

点赞数

分类专栏： c#从入门到放弃-入门篇

本文链接：https://blog.csdn.net/h5_since/article/details/107399934

版权

c#从入门到放弃-入门篇专栏收录该内容

3 篇文章 0 订阅

订阅专栏

最近有个需求：使用 C# 操作 Word 文件，获取其中的标题以及指定引导词后面的内容。如下图所示，需要获取文件中的标题和各引导词后面的内容。
在这里插入图片描述
采用的方案是用 Open XML 读取 Word 文件对应的 XML（Open XML 只支持 docx 格式；如果不是 docx 格式，可以先用 Word 另存为 docx），这样可以获取每个段落中的所有文字和标签样式。已知标题的字体最大，因此根据标签样式属性找出字号最大的段落，即为标题；其他内容可以通过 IndexOf 查找关键词来获取。
先看看完成效果
在这里插入图片描述
代码如下（初学C#一周，如果有写的不好的地方，望大佬指正）：

所需插件：
在这里插入图片描述
引入

using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Xml;
using DocumentFormat.OpenXml.Packaging;

定义

 /// <summary>
        /// Result holder returned by GetContentFromWord: the detected title plus the
        /// text extracted for each guide word. Members keep their original lower-case
        /// names so existing callers (for example button1_Click) continue to compile.
        /// </summary>
        public class objList
        {
            /// <summary>Inner text of the paragraph chosen as the document title.</summary>
            public string title { get; set; }

            /// <summary>Text extracted for the "标记1" guide word; null when nothing was extracted.</summary>
            public string biaoji1 { get; set; }

            /// <summary>Text extracted for the "号码" guide word; null when nothing was extracted.</summary>
            public string number { get; set; }

            /// <summary>Text extracted for the "标记3" guide word; null when nothing was extracted.</summary>
            public string biaoji3 { get; set; }
        }
        /// <summary>
        /// Snapshot of one non-empty paragraph of the document. Members keep their
        /// original lower-case names so existing code that assigns them still compiles.
        /// </summary>
        public class textObjectAll
        {
            /// <summary>Position of the paragraph in the list of w:p nodes.</summary>
            public int index { get; set; }

            /// <summary>Inner XML of the paragraph node.</summary>
            public string innerXml { get; set; }

            /// <summary>Plain text of the paragraph node.</summary>
            public string innerText { get; set; }

            /// <summary>Number associated with the paragraph by the parser (style or font-size value).</summary>
            public int indexof { get; set; }
        }
        /// <summary>
        /// A paragraph that is a candidate for being the document title, together with
        /// the number it is ranked by. Members keep their original lower-case names so
        /// existing code (including LINQ ordering on <c>indexof</c>) still compiles.
        /// </summary>
        public class textObjectCh
        {
            /// <summary>Position of the paragraph in the list of w:p nodes.</summary>
            public int index { get; set; }

            /// <summary>Inner XML of the paragraph node.</summary>
            public string innerXml { get; set; }

            /// <summary>Plain text of the paragraph node; used as the title when this candidate wins.</summary>
            public string innerText { get; set; }

            /// <summary>Rank value parsed from the paragraph's w:pStyle or w:sz attribute.</summary>
            public int indexof { get; set; }
        }
        /// <summary>
        /// Items that can be requested from a document. The member names are converted
        /// with ToString() by the caller and matched as strings by GetContentFromWord,
        /// so they must not be renamed.
        /// </summary>
        public enum WordKind
        {
            /// <summary>The document title.</summary>
            Title = 0,

            /// <summary>The text belonging to the "标记1" guide word.</summary>
            biaoji1 = 1,

            /// <summary>The text belonging to the "号码" guide word.</summary>
            number = 2,

            /// <summary>The text belonging to the "标记3" guide word.</summary>
            biaoji3 = 3,
        }

方法：（这是测试文件，所以定义的少，实际文件中会有很多引导词）

/// <summary>
        /// Opens a .docx file read-only and extracts its title plus the text that
        /// follows a fixed set of guide words ("标记1", "号码", "标记3").
        /// </summary>
        /// <remarks>
        /// Title detection ranks every non-empty paragraph that carries a usable number:
        /// either the digits that follow the first two characters of its paragraph style
        /// id (w:pStyle, e.g. a "heading N" style), or its explicit font size (w:sz).
        /// Style-ranked documents pick the smallest number, size-ranked documents the
        /// largest. When several paragraphs tie, the first one in the document wins.
        /// NOTE(review): when a document mixes both kinds, the kind of the last ranked
        /// paragraph decides the sort direction, as in the original implementation;
        /// confirm whether heading styles should always take precedence instead.
        /// </remarks>
        /// <param name="docPath">Path of the .docx file to read.</param>
        /// <param name="Kind">
        /// Names of the guide-word items to extract: "biaoji1", "number" and/or "biaoji3".
        /// Other values (including "Title") are ignored; the title is always computed.
        /// </param>
        /// <returns>
        /// An <see cref="objList"/> (typed as object). <c>title</c> is null when no paragraph
        /// could be ranked; each guide-word member is null when nothing was extracted for it.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="Kind"/> is null.</exception>
        /// <exception cref="InvalidOperationException">The package has no main document part.</exception>
        public static object GetContentFromWord(string docPath, string[] Kind)
        {
            if (Kind == null)
            {
                throw new ArgumentNullException("Kind");
            }

            const string wordmlNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

            string biaoji1 = "";
            string number = "";
            string biaoji3 = "";

            // True when the most recently ranked paragraph took its rank from w:pStyle,
            // false when it came from w:sz. Decides the sort direction at the end.
            bool rankedByHeadingStyle = false;
            List<textObjectCh> titleCandidates = new List<textObjectCh>();

            using (WordprocessingDocument wdDoc = WordprocessingDocument.Open(docPath, false))
            {
                if (wdDoc.MainDocumentPart == null)
                {
                    throw new InvalidOperationException("The document has no main document part: " + docPath);
                }

                NameTable nt = new NameTable();
                XmlNamespaceManager nsManager = new XmlNamespaceManager(nt);
                nsManager.AddNamespace("w", wordmlNamespace);

                XmlDocument xdoc = new XmlDocument(nt);
                // Dispose the part stream as soon as the XML has been loaded.
                using (var partStream = wdDoc.MainDocumentPart.GetStream())
                {
                    xdoc.Load(partStream);
                }

                XmlNodeList paragraphNodes = xdoc.SelectNodes("//w:p ", nsManager);
                for (int i = 0; i < paragraphNodes.Count; i++)
                {
                    XmlNode paragraph = paragraphNodes[i];
                    string paragraphText = paragraph.InnerText;
                    if (string.IsNullOrEmpty(paragraphText))
                    {
                        continue;
                    }

                    // Guide-word extraction: a later matching paragraph overwrites an earlier one.
                    foreach (string kind in Kind)
                    {
                        string extracted;
                        switch (kind)
                        {
                            case "biaoji1":
                                // "标记1" runs up to the following "号码" guide word.
                                extracted = TextAfterGuideWord(paragraphText, "标记1", "号码");
                                if (extracted != null)
                                {
                                    biaoji1 = extracted;
                                }
                                break;

                            case "number":
                                extracted = TextAfterGuideWord(paragraphText, "号码", null);
                                if (extracted != null)
                                {
                                    number = extracted;
                                }
                                break;

                            case "biaoji3":
                                // The original sliced at IndexOf("DOI") here although it tested for
                                // "标记3"; slice at the guide word that was actually matched.
                                extracted = TextAfterGuideWord(paragraphText, "标记3", null);
                                if (extracted != null)
                                {
                                    biaoji3 = extracted;
                                }
                                break;
                        }
                    }

                    // Title ranking: evaluated per paragraph, so a paragraph without a usable
                    // number no longer inherits the rank of the paragraph before it.
                    int rank;
                    bool fromHeadingStyle;
                    if (TryGetTitleRank(paragraph, nsManager, out rank, out fromHeadingStyle))
                    {
                        rankedByHeadingStyle = fromHeadingStyle;
                        titleCandidates.Add(new textObjectCh
                        {
                            index = i,
                            innerText = paragraphText,
                            innerXml = paragraph.InnerXml,
                            indexof = rank
                        });
                    }
                }
            }

            objList result = new objList();

            if (titleCandidates.Count > 0)
            {
                // OrderBy/OrderByDescending are stable, so ties resolve to document order.
                IEnumerable<textObjectCh> ordered = rankedByHeadingStyle
                    ? titleCandidates.OrderBy(candidate => candidate.indexof)
                    : titleCandidates.OrderByDescending(candidate => candidate.indexof);
                result.title = ordered.First().innerText;
            }

            if (biaoji1.Length > 0)
            {
                result.biaoji1 = biaoji1;
            }
            if (number.Length > 0)
            {
                result.number = number;
            }
            if (biaoji3.Length > 0)
            {
                result.biaoji3 = biaoji3;
            }

            return result;
        }

        /// <summary>
        /// Returns the text that follows <paramref name="guideWord"/> (and the single character
        /// after it), optionally cut off where <paramref name="terminator"/> begins.
        /// Returns null when the guide word, or a required terminator, cannot be found.
        /// </summary>
        private static string TextAfterGuideWord(string text, string guideWord, string terminator)
        {
            int guideWordAt = text.IndexOf(guideWord, StringComparison.Ordinal);
            if (guideWordAt < 0)
            {
                return null;
            }

            // The original offsets (+4 after the 3-character "标记1", +3 after the 2-character
            // "号码") skip the guide word plus one more character, presumably a colon
            // separator - TODO confirm against real documents.
            int start = guideWordAt + guideWord.Length + 1;
            if (start > text.Length)
            {
                // Guide word is the last thing in the paragraph: nothing to extract.
                return null;
            }

            if (terminator == null)
            {
                return text.Substring(start);
            }

            // Search from 'start' so a terminator that appears before the guide word
            // cannot produce a negative length.
            int end = text.IndexOf(terminator, start, StringComparison.Ordinal);
            if (end < 0)
            {
                return null;
            }

            return text.Substring(start, end - start);
        }

        /// <summary>
        /// Tries to derive the number a paragraph is ranked by when looking for the title.
        /// A paragraph style (w:pStyle) takes precedence over an explicit font size (w:sz).
        /// </summary>
        private static bool TryGetTitleRank(XmlNode paragraph, XmlNamespaceManager nsManager, out int rank, out bool fromHeadingStyle)
        {
            fromHeadingStyle = false;
            string digits = null;

            XmlNode styleNode = paragraph.SelectSingleNode(".//w:pStyle", nsManager);
            if (styleNode != null)
            {
                fromHeadingStyle = true;
                string styleId = ReadValAttribute(styleNode, nsManager);
                // Same window as the original offset arithmetic: ids of 3 to 6 characters,
                // with the first two characters dropped and the remainder parsed as a number.
                if (styleId.Length >= 3 && styleId.Length <= 6)
                {
                    digits = styleId.Substring(2);
                }
            }
            else
            {
                XmlNode sizeNode = paragraph.SelectSingleNode(".//w:sz", nsManager);
                if (sizeNode != null)
                {
                    string size = ReadValAttribute(sizeNode, nsManager);
                    // Same window as the original offset arithmetic: 1 to 10 characters.
                    if (size.Length >= 1 && size.Length <= 10)
                    {
                        digits = size;
                    }
                }
            }

            // int.TryParse returns false (and sets rank to 0) for null or non-numeric text.
            return int.TryParse(digits, out rank);
        }

        /// <summary>
        /// Reads the w:val attribute of a WordprocessingML element; returns an empty
        /// string when the attribute is missing.
        /// </summary>
        private static string ReadValAttribute(XmlNode element, XmlNamespaceManager nsManager)
        {
            if (element.Attributes == null)
            {
                return "";
            }

            XmlAttribute attribute = element.Attributes["val", nsManager.LookupNamespace("w")];
            return attribute == null ? "" : attribute.Value;
        }

调用方法：

 /// <summary>
        /// Lets the user pick a .docx file, runs GetContentFromWord on it and shows the
        /// extracted title and guide-word values in the form's text controls.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Items to extract, passed to GetContentFromWord by name.
            string[] strText = {
                        WordKind.Title.ToString(),
                        WordKind.biaoji1.ToString(),
                        WordKind.number.ToString(),
                        WordKind.biaoji3.ToString(),
                    };

            // OpenFileDialog is IDisposable; the using block releases it on every path.
            using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
            {
                openFileDialog1.InitialDirectory = "c:\\";
                // Open XML only reads .docx, so offer exactly that (the original listed the
                // same *.docx pattern twice under the labels "txt files" and "All files").
                openFileDialog1.Filter = "Word documents (*.docx)|*.docx";
                openFileDialog1.FilterIndex = 1;
                openFileDialog1.RestoreDirectory = true;

                if (openFileDialog1.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                this.FilePath.Text = openFileDialog1.FileName;   // show the chosen path

                try
                {
                    objList objList = (objList)GetContentFromWord(openFileDialog1.FileName, strText);
                    this.Title.Text = objList.title;
                    this.biaoji1.Text = objList.biaoji1;
                    this.number.Text = objList.number;
                    this.biaoji3.Text = objList.biaoji3;
                }
                catch (Exception ex)
                {
                    // UI boundary: a locked, missing or malformed file must not take down
                    // the application, so report the problem instead of letting it escape.
                    MessageBox.Show(
                        "Could not read the selected document: " + ex.Message,
                        "Error",
                        MessageBoxButtons.OK,
                        MessageBoxIcon.Error);
                }
            }
        }

since

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
使用 Open XML 获取 Word 文件标题及所需的关键字

最近有个需求：使用 C# 操作 Word 文件，获取其中的标题以及指定引导词后面的内容。采用的方案是用 Open XML 读取 Word 文件对应的 XML（Open XML 只支持 docx 格式；如果不是 docx 格式，可以先用 Word 另存为 docx），这样可以获取每个段落中的所有文字和标签样式。已知标题的字体最大，因此根据标签样式属性找出字号最大的段落，即为标题；其他内容可以通过 IndexOf 查找关键词来获取。代码如下（初学 C# 一周，如果有写得不好的地方，望大……
复制链接

扫一扫