#-*- coding:gbk -*-
import os

import docx
import xlsxwriter
import xlwt
from win32com import client as wc


# Collect every file under the `filepath` folder
def getfilelist(filepath):
    """Recursively collect the paths of all files below ``filepath``.

    Args:
        filepath: Directory to scan.

    Returns:
        A flat list of paths, one for every entry that is not a directory.
        Sub-directories are descended into and do not appear in the result
        themselves. The order follows ``os.listdir``.

    Raises:
        OSError: If ``filepath`` or one of its sub-directories cannot be
            listed.
    """
    files = []
    for name in os.listdir(filepath):
        # os.path.join uses the platform separator (a backslash on Windows)
        # and does not double it when ``filepath`` already ends with one.
        child = os.path.join(filepath, name)
        if os.path.isdir(child):
            files.extend(getfilelist(child))
        else:
            files.append(child)
    return files


# Extract the text of a Word file
def getDocx(fileName):
    """Return the text of the Word document stored at ``fileName``.

    Args:
        fileName: Path of the ``.docx`` file to read.

    Returns:
        The value produced by ``docx.getdocumenttext`` for the opened
        document. The script in this file indexes it and concatenates the
        items as strings (presumably one item per paragraph).
    """
    # NOTE(review): opendocx/getdocumenttext look like the legacy python-docx
    # (0.2.x) module-level API -- confirm the installed version provides them.
    document = docx.opendocx(fileName)
    return docx.getdocumenttext(document)


# Convert a .doc file to .docx
def doc2Docx(fileName):
    """Convert a ``.doc`` file to ``.docx`` through Word COM automation.

    The converted document is saved as ``fileName + "x"`` and the original
    file is deleted afterwards.

    Args:
        fileName: Path of the ``.doc`` file to convert.

    Raises:
        Any error raised by the Word automation calls or by ``os.remove``
        propagates to the caller. The document is closed and Word is quit
        before the error propagates, and the source file is left in place
        when the conversion itself failed.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # 12 selects the .docx file format (wdFormatXMLDocument); the
            # remaining positional arguments are passed exactly as before.
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        # Always shut Word down so a failed conversion does not leave a
        # background Word process running.
        word.Quit()
    # Delete the source last: only after the save succeeded and the document
    # has been closed.
    os.remove(fileName)
# Root folder that is scanned recursively for Word documents.
filepath = "C:\\xxx\\xx\\xx\\xx\\数据集"
filelist = getfilelist(filepath)

# If the folder still holds legacy .doc files, convert them to .docx first:
#     for path in filelist:
#         doc2Docx(path)

# Only paths ending in "docx" are exported.
docx_files = [path for path in filelist if path.endswith("docx")]

# xlwt was tried first, but it fails on large documents with
# "Exception: String longer than 32767 characters", so the workbook is
# written with xlsxwriter instead.
# NOTE(review): Excel itself caps a cell at 32,767 characters; xlsxwriter is
# expected to truncate longer text instead of raising -- confirm.

# At most 21 cells (columns 0-20) are written per row.
MAX_COLUMNS = 21

workbook = xlsxwriter.Workbook(u'数据.xlsx')
worksheet = workbook.add_worksheet(u"数据")
for row, fileName in enumerate(docx_files):
    # One cell per path component, followed by the document text with every
    # item terminated by a newline.
    cells = fileName.split("\\")
    cells.append("".join(item + "\n" for item in getDocx(fileName)))
    for col, value in enumerate(cells[:MAX_COLUMNS]):
        worksheet.write(row, col, value)
workbook.close()