注:目前有多个文件夹,每个文件夹下有多个 .doc 或 .docx 后缀的文件。如果想快速爬取这些文件中需要的内容,本文可为你提供参考,具体代码如下:
类似的文件,路径如下,现要爬取word文件中对应的有效内容
# encoding:utf-8
print(1)
import re
import os
import sys
import xlrd
import xlwt
import docx
from docx import Document #导入库
from win32com.client import Dispatch
import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
# Create the workbook and the single sheet that collect everything extracted
# from the Word files further down.
work_book = xlwt.Workbook(encoding='utf-8')
sheet = work_book.add_sheet("sheet1")

# Header row: one title per output column, written left to right into row 0.
_HEADER_TITLES = (
    '序号',
    '临床路径名称',
    'ICD_10_Name',
    'ICD_10',
    'ICD_9_Name',
    "ICD_9",
    '备注',
    '描述',
)
for _column, _title in enumerate(_HEADER_TITLES):
    sheet.write(0, _column, _title)
# Characters that may join several codes inside one extracted ID string:
# "/", "\", the ideographic comma, "或" (or), "," and the full-width comma.
# The full-width comma is spelled as an escape so the pattern stays intact
# even if this file is passed through Unicode width normalisation.
_ID_SEPARATOR_RE = re.compile(r"[/\\、或,\uFF0C]")


def addsheet(ICD_Name, ICD_ID):
    """Pair extracted names with extracted codes, one code per entry.

    Args:
        ICD_Name: list of extracted disease/procedure names.
        ICD_ID: list of extracted code strings; one string may hold several
            codes joined by a separator (for example ``"K35.1/K35.9"``).

    Returns:
        A ``(names, ids)`` tuple of two equally long lists.  Entries whose
        code was not split come first, in input order; the fragments of
        split codes follow, each paired with the name of the code it came
        from.  Both lists are empty when ``ICD_Name`` is empty or when there
        are more names than codes, because no pairing is built in that case.

    The input lists are never modified.
    """
    if not ICD_Name or len(ICD_Name) > len(ICD_ID):
        return [], []

    # Fewer names than codes: the last name is reused for the surplus codes.
    names = list(ICD_Name)
    names.extend([ICD_Name[-1]] * (len(ICD_ID) - len(ICD_Name)))

    kept_names, kept_ids = [], []
    split_names, split_ids = [], []
    for name, code in zip(names, ICD_ID):
        fragments = _ID_SEPARATOR_RE.split(code)
        # A one-character tail (e.g. "K35.0/1") is left unsplit: the tail is
        # not a complete code on its own.
        if len(fragments) == 1 or len(fragments[-1]) == 1:
            kept_names.append(name)
            kept_ids.append(code)
            continue
        for fragment in fragments:
            split_names.append(name)
            split_ids.append(fragment)

    return kept_names + split_names, kept_ids + split_ids
# Write the values extracted from one document into the result sheet.
def put_into_excel(k, CPW, ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID, sheet, path, C):
    """Write one document's extraction result starting at sheet row ``k``.

    Args:
        k: index of the first row to write.
        CPW: clinical pathway name, written to column 1 of every data row.
        ICD_Name, ICD_ID: extracted ICD-10 names and code strings.
        ICD_9_Name, ICD_9_ID: extracted ICD-9 names and code strings.
        sheet: worksheet object providing ``write(row, col, value)``.
        path: label of the source file; written to column 6 when all four
            input lists are empty.
        C: the source paragraphs, written to column 7 of row ``k``.

    Returns:
        The index of the next unused row.  It is always greater than the
        ``k`` passed in, so two documents never share a row.

    Both name/code pairs are first normalised with ``addsheet``.  When both
    ICD-10 and ICD-9 entries exist, one row is written for every combination
    of the two; otherwise one row per entry of the non-empty side.
    """
    first_row = k
    sheet.write(k, 7, C)
    if not (ICD_Name or ICD_ID or ICD_9_Name or ICD_9_ID):
        # Nothing was extracted: note the file so it can be checked by hand.
        sheet.write(k, 6, path)
    try:
        ICD_Name, ICD_ID = addsheet(ICD_Name, ICD_ID)
        ICD_9_Name, ICD_9_ID = addsheet(ICD_9_Name, ICD_9_ID)
        icd10 = list(zip(ICD_Name, ICD_ID))
        icd9 = list(zip(ICD_9_Name, ICD_9_ID))

        if icd10 and icd9:
            rows = [(ten, nine) for ten in icd10 for nine in icd9]
        elif icd9:
            rows = [(None, nine) for nine in icd9]
        else:
            rows = [(ten, None) for ten in icd10]

        for ten, nine in rows:
            sheet.write(k, 0, k)
            sheet.write(k, 1, CPW)
            if ten is not None:
                sheet.write(k, 2, ten[0])
                sheet.write(k, 3, ten[1])
            if nine is not None:
                sheet.write(k, 4, nine[0])
                sheet.write(k, 5, nine[1])
            k += 1
    except Exception as exc:
        # Best effort: report the failure, skip the row being written and let
        # the caller carry on with the next document.
        print('写入表格失败:', path, exc)
        k += 1
    else:
        if k == first_row:
            # No data row was produced, but row k already holds column 7 (and
            # possibly column 6), so it must not be handed to the next file.
            k += 1
    return k
# Regular-expression rules used to pull names and codes out of a paragraph.
#
# The punctuation may be ASCII or full-width.  Full-width characters are
# written as \u escapes so the patterns stay valid even if this file is passed
# through Unicode width normalisation (a class such as "[–--]" is rejected by
# ``re`` as a bad character range).
_DASH = r"[\-\u2013\u2014\uFF0D]"   # hyphen, en dash, em dash, full-width hyphen
_OPEN = r"[(\uFF08]"                # "(" and its full-width form
_CLOSE = r"[)\uFF09]"               # ")" and its full-width form
_COLON = r"[:\uFF1A]"               # ":" and its full-width form
_UNTIL_CLOSE = r".+?(?=" + _CLOSE + r")"

_ICD10_TAG = re.compile(r"ICD" + _DASH + r"10")
_CM3_TAG = re.compile(r"CM" + _DASH + r"3")

# --- ICD-10, hyphenated spelling -------------------------------------------
_ICD10_NAME = re.compile(
    r"(?<=[为|,\uFF0C、:\uFF1A或]).+?(?=" + _OPEN + r"ICD" + _DASH + r"10)")
_ICD10_ID = re.compile(r"(?<=ICD" + _DASH + r"10" + _COLON + r")" + _UNTIL_CLOSE)
_ICD10_ID_IN_PAREN = re.compile(
    r"(?<=" + _OPEN + r"ICD-10[\uFE30\s:\uFF1A])" + _UNTIL_CLOSE)
_TEXT_BEFORE_PAREN = re.compile(r"(?<=\w).+?(?=" + _OPEN + r")")
_ICD10_ID_AFTER_SPACE = re.compile(r"(?<=ICD" + _DASH + r"10\s)" + _UNTIL_CLOSE)
_ICD_DASH_ID = re.compile(r"(?<=ICD" + _DASH + r")" + _UNTIL_CLOSE)

# --- ICD-10, other spellings ("ICD10", "ICD编码", "ICD 10") -----------------
_ICD10_PLAIN_ID = re.compile(r"(?<=ICD10[\s:\uFF1A码])" + _UNTIL_CLOSE)
_ICD10_PLAIN_NAME = re.compile(r"(?<=[\w为或]).+?(?=" + _OPEN + r"ICD10)")
_ICD_CODE_ID = re.compile(r"(?<=ICD编码[\s:\uFF1A])" + _UNTIL_CLOSE)
_ICD_CODE_NAME = re.compile(r"(?<=[\w为或]).+?(?=" + _OPEN + r"ICD编码)")
_ICD_SPACE10_ID = re.compile(r"(?<=ICD\s10" + _COLON + r")" + _UNTIL_CLOSE)
_ICD_SPACE10_NAME = re.compile(r"(?<=[\w为或]).+?(?=" + _OPEN + r"ICD\s10)")

# --- ICD-9-CM-3 --------------------------------------------------------------
_ICD9_NAME = re.compile(
    r"(?<=[因行,\uFF0C、]).+?(?=" + _OPEN + r"ICD" + _DASH + r"9)")
_CM3_ID = re.compile(r"(?<=CM" + _DASH + r"3" + _COLON + r")" + _UNTIL_CLOSE)
_ICD9_NAME_FROM_START = re.compile(r".+?(?=" + _OPEN + r"ICD" + _DASH + r"9)")
_CM_NAME = re.compile(r"(?<=[因行,\uFF0C、]).+?(?=" + _OPEN + r"CM)")
_CM3_PLAIN_ID = re.compile(r"(?<=CM3" + _COLON + r")" + _UNTIL_CLOSE)
_ICD9CM_NAME = re.compile(r"(?<=[因行,\uFF0C、]).+?(?=" + _OPEN + r"ICD9CM)")
_CM3_CODE_ID = re.compile(
    r"(?<=CM" + _DASH + r"3编码" + _COLON + r")" + _UNTIL_CLOSE)


def Standize(A):
    """Extract ICD-10 and ICD-9-CM-3 names and code strings from paragraphs.

    Args:
        A: iterable of paragraph texts.

    Returns:
        ``(ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID)``, four lists of the
        matched substrings accumulated over all paragraphs.  Code strings are
        returned as found, so one string may still hold several codes.

    Each list is filled by a primary pattern and then by a chain of fallback
    patterns.  A fallback only runs while its target list is still empty,
    and that test looks at what has been collected from all paragraphs so
    far, not only at the current one.

    NOTE(review): the source this was rebuilt from had lost its indentation.
    The fallbacks for the hyphenated spellings are placed inside the
    "ICD-10" / "CM-3" checks and those for other spellings outside them;
    confirm against the original layout.
    """
    ICD_Name, ICD_ID = [], []
    ICD_9_Name, ICD_9_ID = [], []

    for text in A:
        if _ICD10_TAG.search(text):
            ICD_Name.extend(_ICD10_NAME.findall(text))
            ICD_ID.extend(_ICD10_ID.findall(text))
            if not ICD_ID:
                ICD_ID.extend(_ICD10_ID_IN_PAREN.findall(text))
            if ICD_ID and not ICD_Name:
                # NOTE(review): the look-behind needs one preceding word
                # character, so a name at the very start of the paragraph
                # loses its first character.  Kept as in the original rule.
                ICD_Name.extend(_TEXT_BEFORE_PAREN.findall(text))
            if not ICD_ID:
                # Tried before the broader "ICD-" rule below, which would
                # otherwise always win and keep the leading "10 " in the code.
                ICD_ID.extend(_ICD10_ID_AFTER_SPACE.findall(text))
            if not ICD_ID:
                ICD_ID.extend(_ICD_DASH_ID.findall(text))

        # Spellings without a hyphen never pass the check above.
        if not ICD_ID:
            ICD_ID.extend(_ICD10_PLAIN_ID.findall(text))
        if not ICD_Name:
            ICD_Name.extend(_ICD10_PLAIN_NAME.findall(text))
        if not ICD_ID:
            ICD_ID.extend(_ICD_CODE_ID.findall(text))
        if not ICD_Name:
            ICD_Name.extend(_ICD_CODE_NAME.findall(text))
        if not ICD_ID:
            ICD_ID.extend(_ICD_SPACE10_ID.findall(text))
        if not ICD_Name:
            ICD_Name.extend(_ICD_SPACE10_NAME.findall(text))

        if _CM3_TAG.search(text):
            ICD_9_Name.extend(_ICD9_NAME.findall(text))
            ICD_9_ID.extend(_CM3_ID.findall(text))
            if ICD_9_ID and not ICD_9_Name:
                ICD_9_Name.extend(_ICD9_NAME_FROM_START.findall(text))

        if not ICD_9_Name:
            ICD_9_Name.extend(_CM_NAME.findall(text))
        if not ICD_9_ID:
            ICD_9_ID.extend(_CM3_PLAIN_ID.findall(text))
        if not ICD_9_Name:
            ICD_9_Name.extend(_ICD9CM_NAME.findall(text))
        if not ICD_9_ID:
            ICD_9_ID.extend(_CM3_CODE_ID.findall(text))

    return ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID
# Root folder holding the Word files to read, and the folder that receives the
# .docx copies Word produces for files python-docx cannot open directly.
path = "C:/Users/Administrator/Desktop/224个病种临床路径(2019年版)"
converted_dir = "C:/Users/Administrator/Desktop/224个病种临床路径"


def _pathway_name(filename):
    """Return the part of a file name that precedes '临床路径', a bracket or the dot."""
    if '临床路径' in filename:
        return filename.split('临床路径')[0]
    # ASCII and full-width opening bracket; the latter is written as an escape
    # so it cannot be folded into the ASCII one by width normalisation.
    for bracket in ('(', '\uFF08'):
        if bracket in filename:
            return filename.split(bracket)[0]
    return filename.split('.')[0]


def _code_paragraphs(docx_path):
    """Return the paragraphs mentioning 'ICD' or 'CM' that precede '诊断依据'."""
    selected = []
    for paragraph in Document(docx_path).paragraphs:
        text = paragraph.text
        if '诊断依据' in text:
            break
        if 'ICD' in text or 'CM' in text:
            selected.append(text)
    return selected


def _convert_with_word(word_app, source_path, target_path):
    """Open source_path in Word and save a copy to target_path in docx format."""
    doc = word_app.Documents.Open(source_path)
    try:
        doc.SaveAs(target_path, 12)  # 12 selects the docx format
    finally:
        # Close even when SaveAs fails so the document is not left open.
        doc.Close()


def _iter_files(root):
    """Yield (relative_path, filename) for every file below root, '/'-separated."""
    for folder, subfolders, filenames in os.walk(root):
        subfolders.sort()
        for filename in sorted(filenames):
            relative = os.path.relpath(os.path.join(folder, filename), root)
            yield relative.replace(os.sep, '/'), filename


k = 1  # next free sheet row; row 0 holds the header
M = 0  # number of files processed successfully
word_app = None  # started on first use and shared by all conversions
try:
    for rel_path, f in _iter_files(path):
        if '~' in f or '$' in f:
            # Temporary/lock files that Word creates next to an open document.
            print('这是打开的文件:', f)
            continue
        CP = _pathway_name(f)
        file_abs_path = path + "/" + rel_path
        converted_path = converted_dir + '/' + f + r"x"
        try:
            try:
                C = _code_paragraphs(file_abs_path)
            except Exception:
                # python-docx could not read the file (typically a legacy
                # .doc): let Word convert it and read the converted copy.
                if word_app is None:
                    word_app = Dispatch("Word.application")
                _convert_with_word(word_app, file_abs_path, converted_path)
                C = _code_paragraphs(converted_path)
            print(C)
            ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID = Standize(C)
            print(ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID)
            k = put_into_excel(k, CP, ICD_Name, ICD_ID, ICD_9_Name, ICD_9_ID,
                               sheet, rel_path, C)
            M += 1
            print(M)
        except Exception:
            # Row k may already hold cells from the failed attempt, so the
            # remark goes on the following row; k then moves past that row so
            # the next file does not share it.
            k += 1
            sheet.write(k, 6, rel_path)
            k += 1
            print('这是未处理的文件:', f, )
            print(converted_path)
finally:
    # Only shut Word down when no other document is open in that instance.
    if word_app is not None and word_app.Documents.Count == 0:
        word_app.Quit()
work_book.save("临床路径表.xls")
print(M, k)
得到结果如下: