Python实现|多文件夹多文档内容爬取和正则表达式的使用

这是一个Python脚本,用于批量提取多个Word文档中的ICD(国际疾病分类)相关数据,并将其整理成Excel表格。脚本首先创建一个Excel工作簿,然后逐层遍历文件夹(最多两级子文件夹)中的.doc或.docx文件,使用正则表达式匹配ICD-10和ICD-9-CM-3的名称及编码。匹配到的数据经过处理后写入Excel表格,便于后续分析。
摘要由CSDN通过智能技术生成

注:目前有多个文件夹,每个文件夹下有多个.doc或.docx后缀的文件。如果想快速爬取这些文档中需要的内容,本文可以为你提供参考,具体代码如下:

文件的存放路径类似如下,现要爬取Word文件中对应的有效内容:

 

 

# encoding:utf-8
print(1)
import re
import os
import sys
import xlrd
import xlwt
import docx
from docx import Document #导入库
from win32com.client import Dispatch
import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')

# Create the workbook and the sheet that will hold everything extracted below,
# then fill row 0 with the column titles (one title per column, left to right).
work_book = xlwt.Workbook(encoding='utf-8')
sheet = work_book.add_sheet("sheet1")
for _col, _title in enumerate((
        '序号',
        '临床路径名称',
        'ICD_10_Name',
        'ICD_10',
        'ICD_9_Name',
        "ICD_9",
        '备注',
        '描述',
)):
    sheet.write(0, _col, _title)



# addsheet normalises extracted names/codes before they are written to the sheet.

# Characters that separate several codes inside one extracted string.  Both the
# ASCII and the full-width comma are accepted.
_ICD_CODE_SEPARATORS = re.compile(r'[/、或,,\\]')


def addsheet(ICD_Name,ICD_ID):
    """Pair every code with a name and expand compound code strings.

    Args:
        ICD_Name: list of extracted names (strings).
        ICD_ID: list of extracted code strings; one string may hold several
            codes joined by ``/``, ``、``, ``或``, a comma or a backslash.

    Returns:
        A tuple ``(names, codes)`` of two new lists of equal length, in the
        same order as the input.  Rules:

        * no names at all -> ``([], [])`` (codes alone are not reported);
        * more codes than names -> the last name is reused for the extras;
        * more names than codes -> the extra names get an empty code ``''``;
        * a compound code is split into one entry per code, each carrying the
          name of the original entry, unless the fragment after the last
          separator is a single character (e.g. ``'K35.1/9'``), in which case
          the string is kept whole.

    The input lists are not modified.
    """
    if not ICD_Name:
        return [], []

    names_out, ids_out = [], []
    for idx in range(max(len(ICD_Name), len(ICD_ID))):
        # Pad the shorter list so every position has both a name and a code.
        name = ICD_Name[idx] if idx < len(ICD_Name) else ICD_Name[-1]
        code = ICD_ID[idx] if idx < len(ICD_ID) else ''

        fragments = _ICD_CODE_SEPARATORS.split(code)
        if len(fragments[-1]) == 1:
            # A one-character tail is treated as part of the code, not as a
            # separate code, so the string is left unsplit.
            pieces = [code]
        else:
            # Drop empty fragments produced by leading/trailing/double
            # separators; fall back to the raw string if nothing is left.
            pieces = [frag for frag in fragments if frag] or [code]

        for piece in pieces:
            names_out.append(name)
            ids_out.append(piece)
    return names_out, ids_out


# Write the extracted content of one document into the sheet.
def put_into_excel(k,CPW,ICD_Name,ICD_ID,ICD_9_Name,ICD_9_ID,sheet,path,C):
    """Write the rows for one document and return the next free row index.

    Args:
        k: index of the first row to write; also used as the value of
            column 0 for each written row.
        CPW: clinical-pathway name written to column 1.
        ICD_Name, ICD_ID: ICD-10 names and codes (columns 2 and 3).
        ICD_9_Name, ICD_9_ID: ICD-9 names and codes (columns 4 and 5).
        sheet: object exposing ``write(row, col, value)``.
        path: document path, written to column 6 when no data row could be
            produced for the document.
        C: the source paragraphs, written to column 7 of row ``k``.

    Returns:
        The first row index not used by this document.  It is always greater
        than ``k``, so two documents never share a row.

    When both ICD-10 and ICD-9 entries exist, one row is written for every
    (ICD-10, ICD-9) combination; otherwise one row per available entry.
    """
    sheet.write(k, 7, C)

    # Nothing was extracted at all: leave a note and move on.  Returning here
    # also keeps this path independent of addsheet().
    if not (ICD_Name or ICD_ID or ICD_9_Name or ICD_9_ID):
        sheet.write(k, 6, path)
        return k + 1

    first_row = k
    try:
        ICD_Name, ICD_ID = addsheet(ICD_Name, ICD_ID)
        ICD_9_Name, ICD_9_ID = addsheet(ICD_9_Name, ICD_9_ID)

        icd10 = list(zip(ICD_Name, ICD_ID))
        icd9 = list(zip(ICD_9_Name, ICD_9_ID))
        if icd10 and icd9:
            rows = [(ten, nine) for ten in icd10 for nine in icd9]
        elif icd9:
            rows = [(None, nine) for nine in icd9]
        else:
            rows = [(ten, None) for ten in icd10]

        for ten, nine in rows:
            sheet.write(k, 0, k)
            sheet.write(k, 1, CPW)
            if ten is not None:
                sheet.write(k, 2, ten[0])
                sheet.write(k, 3, ten[1])
            if nine is not None:
                sheet.write(k, 4, nine[0])
                sheet.write(k, 5, nine[1])
            k += 1
    except Exception as err:
        # Best effort: report the problem, skip the row being written and
        # carry on with the next document.
        print('这是写入失败的文件:', path, err)
        return k + 1

    if k == first_row:
        # Something was extracted but nothing usable remained (e.g. codes
        # without any name).  Record the document and advance, because xlwt
        # rejects a second write to an already written cell by default.
        sheet.write(k, 6, path)
        k += 1
    return k

# Regular-expression extraction rules.
#
# The documents mix ASCII and full-width punctuation, so every pattern accepts
# both forms of the dash, the parentheses and the colon.
_DASH = r"[-–—]"      # hyphen, en dash, em dash (hyphen first = literal)
_OPEN = r"[((]"
_CLOSE = r"[))]"
_COLON = r"[::]"
_UNTIL_CLOSE = r".+?(?=" + _CLOSE + r")"   # shortest text up to a closing parenthesis

_ICD10_GUARD = re.compile(r"ICD" + _DASH + r"10")
_CM3_GUARD = re.compile(r"CM" + _DASH + r"3")


def _compile_rules(rules):
    """Compile the pattern of every ``(target, guard, pattern)`` rule once."""
    return tuple((target, guard, re.compile(pattern)) for target, guard, pattern in rules)


# Each rule is (target, guard, pattern), tried in order:
#   target  "name" or "id": which result list receives the matches
#   guard   "always"  - always applied
#           "no_id"   - only while no code has been collected yet
#           "no_name" - only while no name has been collected yet
#           "id_only" - only when codes exist but no name does
_ICD10_RULES = _compile_rules((
    ("name", "always", r"(?<=[为|,,、::或]).+?(?=" + _OPEN + r"ICD" + _DASH + r"10)"),
    ("id", "always", r"(?<=ICD" + _DASH + r"10" + _COLON + r")" + _UNTIL_CLOSE),
    ("id", "no_id", r"(?<=" + _OPEN + r"ICD" + _DASH + r"10[︰/\s::])" + _UNTIL_CLOSE),
    # NOTE(review): this name fallback runs up to the CLOSING parenthesis, so
    # the match includes the "(ICD-10..." part.  It mirrors the published
    # pattern; it may have been meant to stop at the opening parenthesis.
    ("name", "id_only", r"(?<=\w)" + _UNTIL_CLOSE),
    ("id", "no_id", r"(?<=ICD" + _DASH + r")" + _UNTIL_CLOSE),
    ("id", "no_id", r"(?<=ICD" + _DASH + r"10\s)" + _UNTIL_CLOSE),
    ("id", "no_id", r"(?<=ICD10[/\s::码])" + _UNTIL_CLOSE),
    ("name", "no_name", r"(?<=[\w为或]).+?(?=" + _OPEN + r"ICD10)"),
    ("id", "no_id", r"(?<=ICD编码[/\s::])" + _UNTIL_CLOSE),
    ("name", "no_name", r"(?<=[\w为或]).+?(?=" + _OPEN + r"ICD编码)"),
    ("id", "no_id", r"(?<=ICD\s10" + _COLON + r")" + _UNTIL_CLOSE),
    # The published script repeated the "ICD编码" name pattern here, which can
    # never add anything; pairing it with the "ICD 10" code rule above is an
    # assumption - TODO confirm against real documents.
    ("name", "no_name", r"(?<=[\w为或]).+?(?=" + _OPEN + r"ICD\s10)"),
))

_ICD9_RULES = _compile_rules((
    ("name", "always", r"(?<=[因行,,、]).+?(?=" + _OPEN + r"ICD" + _DASH + r"9)"),
    ("id", "always", r"(?<=CM" + _DASH + r"3" + _COLON + r")" + _UNTIL_CLOSE),
    ("name", "id_only", r".+?(?=" + _OPEN + r"ICD" + _DASH + r"9)"),
    ("name", "no_name", r"(?<=[因行,,、]).+?(?=" + _OPEN + r"CM)"),
    ("id", "no_id", r"(?<=CM3" + _COLON + r")" + _UNTIL_CLOSE),
    ("name", "no_name", r"(?<=[因行,,、]).+?(?=" + _OPEN + r"ICD9CM)"),
    ("id", "no_id", r"(?<=CM" + _DASH + r"3编码" + _COLON + r")" + _UNTIL_CLOSE),
))


def _apply_rules(rules, text, names, ids):
    """Run *rules* over *text*, extending *names* and *ids* in place."""
    for target, guard, pattern in rules:
        if guard == "no_id" and ids:
            continue
        if guard == "no_name" and names:
            continue
        if guard == "id_only" and (names or not ids):
            continue
        (names if target == "name" else ids).extend(pattern.findall(text))


def Standize(A):
    """Extract ICD-10 and ICD-9-CM-3 names and codes from paragraphs.

    Args:
        A: iterable of paragraph strings.

    Returns:
        ``(ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID)``: four lists of matched
        substrings.  ICD-10 rules run on paragraphs containing ``ICD-10``
        and ICD-9 rules on paragraphs containing ``CM-3`` (any dash
        variant).  Within a family the primary name/code patterns always
        run; each fallback runs only while the corresponding list is still
        empty.  That emptiness is judged on the lists accumulated over all
        paragraphs seen so far, not per paragraph.

    The lists are independent: their lengths may differ and codes may still
    contain several codes joined by a separator.
    """
    ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID = [], [], [], []
    for A_i in A:
        if _ICD10_GUARD.search(A_i):
            _apply_rules(_ICD10_RULES, A_i, ICD_Name, ICD_ID)
        if _CM3_GUARD.search(A_i):
            _apply_rules(_ICD9_RULES, A_i, ICD_9_Name, ICD_9_ID)
    return ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID


# Location of the documents to process.

path = "C:/Users/Administrator/Desktop/224个病种临床路径(2019年版)"  # root folder
# Folder that receives the .docx copies written by the Word fallback.
CONVERTED_DIR = "C:/Users/Administrator/Desktop/224个病种临床路径"
# Value passed to Word's SaveAs; 12 selects the docx format.
WD_FORMAT_DOCX = 12


def _clinical_path_name(f):
    """Derive the clinical-pathway name from a document file name.

    The name is the text before the first marker found, checked in this
    order: '临床路径', '(', '(', '.'.  Without any marker the whole file
    name is returned.
    """
    for marker in ('临床路径', '(', '(', '.'):
        if marker in f:
            return f.split(marker)[0]
    return f


def _icd_paragraphs(docx_path):
    """Return the paragraphs mentioning 'ICD' or 'CM' before '诊断依据'.

    Reading stops at the first paragraph containing '诊断依据'.  Raises
    whatever ``Document`` raises when the file cannot be opened.
    """
    found = []
    for paragraph in Document(docx_path).paragraphs:
        text = paragraph.text
        if '诊断依据' in text:
            break
        if 'ICD' in text or 'CM' in text:
            found.append(text)
    return found


def _convert_to_docx(src, dst):
    """Open *src* in Word through COM and save a docx copy at *dst*."""
    wd = Dispatch("Word.application")
    doc = wd.Documents.Open(src)
    try:
        doc.SaveAs(dst, WD_FORMAT_DOCX)
    finally:
        # Close the document even when saving fails so it is not left open.
        doc.Close()


def _process_file(k, folder, f):
    """Extract one document into the sheet.

    Args:
        k: next free row index.
        folder: folder of the document, relative to ``path``.
        f: file name of the document.

    Returns:
        ``(next_row, ok)``; ``ok`` is False when the document could not be
        processed, in which case its relative path is noted in column 6.
    """
    rel_path = folder + '/' + f
    file_abs_path = path + "/" + rel_path
    converted_path = CONVERTED_DIR + '/' + f + r"x"
    CP = _clinical_path_name(f)
    try:
        try:
            C = _icd_paragraphs(file_abs_path)
        except Exception:
            # Could not be read directly: let Word convert the file and
            # read the converted copy instead.
            _convert_to_docx(file_abs_path, converted_path)
            C = _icd_paragraphs(converted_path)
        print(C)
        ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID = Standize(C)
        print(ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID)
        return put_into_excel(k, CP, ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID,
                              sheet, rel_path, C), True
    except Exception:
        # Note the document on a row of its own, then advance so the next
        # document does not write into the same row.
        sheet.write(k, 6, rel_path)
        print('这是未处理的文件:', f, )
        print(converted_path)
        return k + 1, False


k = 1   # next free row in the sheet (row 0 holds the titles)
M = 0   # number of documents processed successfully
for file in os.listdir(path):   # first-level folders
    top_dir = path + "/" + file
    if not os.path.isdir(top_dir):
        continue
    for entry in os.listdir(top_dir):
        # A first-level folder may hold the documents directly or hold one
        # more level of sub-folders; support both, entry by entry.
        if os.path.isdir(top_dir + "/" + entry):
            batch = [(file + "/" + entry, name) for name in os.listdir(top_dir + "/" + entry)]
        else:
            batch = [(file, entry)]
        for folder, f in batch:
            if '~' in f or '$' in f:
                # Temporary/lock file of a document that is currently open.
                print('这是打开的文件:', f)
                continue
            k, ok = _process_file(k, folder, f)
            if ok:
                M += 1
                print(M)
work_book.save("临床路径表.xls")
print(M,k)

得到结果如下:

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

码丽莲梦露

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值