python rtftotxt_如何使用任何库在Python中将RTF字符串转换为纯文本[重复]

# -*- coding: utf-8 -*-
"""Extract text in RTF Files.

Refactored to use with Python 3.x

Source: http://stackoverflow.com/a/188877
Code created by Markus Jarderot: http://mizardx.blogspot.com
"""
import re

# Tokenizer for RTF input. Exactly one capture group is set per match:
#   1. control word   (\foo)     2. its optional numeric argument
#   3. hex escape     (\'xx)     4. control symbol (\x, x not a letter)
#   5. group brace    ({ or })   6. any other single character
# Bare CR/LF runs match the group-less alternative and are thereby dropped.
_RTF_TOKEN = re.compile(
    r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
    re.I,
)

# Control words which specify a "destination". Everything inside a group
# introduced by one of these is metadata, not document text, and is skipped.
_DESTINATIONS = frozenset((
    'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', 'atndate',
    'atnicn', 'atnid', 'atnparent', 'atnref', 'atntime', 'atrfend',
    'atrfstart', 'author', 'background', 'bkmkend', 'bkmkstart', 'blipuid',
    'buptim', 'category', 'colorschememapping', 'colortbl', 'comment',
    'company', 'creatim', 'datafield', 'datastore', 'defchp', 'defpap', 'do',
    'doccomm', 'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
    'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat',
    'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field', 'file', 'filetbl',
    'fldinst', 'fldrslt', 'fldtype', 'fname', 'fontemb', 'fontfile',
    'fonttbl', 'footer', 'footerf', 'footerl', 'footerr', 'footnote',
    'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator', 'gridtbl',
    'header', 'headerf', 'headerl', 'headerr', 'hl', 'hlfr', 'hlinkbase',
    'hlloc', 'hlsrc', 'hsv', 'htmltag', 'info', 'keycode', 'keywords',
    'latentstyles', 'lchars', 'levelnumbers', 'leveltext', 'lfolevel',
    'linkval', 'list', 'listlevel', 'listname', 'listoverride',
    'listoverridetable', 'listpicture', 'liststylename', 'listtable',
    'listtext', 'lsdlockedexcept', 'macc', 'maccPr', 'mailmerge', 'maln',
    'malnScr', 'manager', 'margPr', 'mbar', 'mbarPr', 'mbaseJc', 'mbegChr',
    'mborderBox', 'mborderBoxPr', 'mbox', 'mboxPr', 'mchr', 'mcount',
    'mctrlPr', 'md', 'mdeg', 'mdegHide', 'mden', 'mdiff', 'mdPr', 'me',
    'mendChr', 'meqArr', 'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc',
    'mfuncPr', 'mgroupChr', 'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft',
    'mhideRight', 'mhideTop', 'mhtmltag', 'mlim', 'mlimloc', 'mlimlow',
    'mlimlowPr', 'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname', 'mmath',
    'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr',
    'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource', 'mmheadersource',
    'mmmailsubject', 'mmodso', 'mmodsofilter', 'mmodsofldmpdata',
    'mmodsomappedname', 'mmodsoname', 'mmodsorecipdata', 'mmodsosort',
    'mmodsosrc', 'mmodsotable', 'mmodsoudl', 'mmodsoudldata',
    'mmodsouniquetag', 'mmPr', 'mmquery', 'mmr', 'mnary', 'mnaryPr',
    'mnoBreak', 'mnum', 'mobjDist', 'moMath', 'moMathPara', 'moMathParaPr',
    'mopEmu', 'mphant', 'mphantPr', 'mplcHide', 'mpos', 'mr', 'mrad',
    'mradPr', 'mrPr', 'msepChr', 'mshow', 'mshp', 'msPre', 'msPrePr',
    'msSub', 'msSubPr', 'msSubSup', 'msSubSupPr', 'msSup', 'msSupPr',
    'mstrikeBLTR', 'mstrikeH', 'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide',
    'msup', 'msupHide', 'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml',
    'mvtof', 'mvtol', 'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops',
    'nextfile', 'nonesttables', 'objalias', 'objclass', 'objdata', 'object',
    'objname', 'objsect', 'objtime', 'oldcprops', 'oldpprops', 'oldsprops',
    'oldtprops', 'oleclsid', 'operator', 'panose', 'password',
    'passwordhash', 'pgp', 'pgptbl', 'picprop', 'pict', 'pn', 'pnseclvl',
    'pntext', 'pntxta', 'pntxtb', 'printim', 'private', 'propname',
    'protend', 'protstart', 'protusertbl', 'pxe', 'result', 'revtbl',
    'revtim', 'rsidtbl', 'rxe', 'shp', 'shpgrp', 'shpinst', 'shppict',
    'shprslt', 'shptxt', 'sn', 'sp', 'staticval', 'stylesheet', 'subject',
    'sv', 'svb', 'tc', 'template', 'themedata', 'title', 'txe', 'ud', 'upr',
    'userprops', 'wgrffmtfilter', 'windowcaption', 'writereservation',
    'writereservhash', 'xe', 'xform', 'xmlattrname', 'xmlattrvalue',
    'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen',
))

# Translation of some special characters (control word -> output text).
_SPECIAL_CHARS = {
    'par': '\n',
    'sect': '\n\n',
    'page': '\n\n',
    'line': '\n',
    'tab': '\t',
    'emdash': '\u2014',
    'endash': '\u2013',
    'emspace': '\u2003',
    'enspace': '\u2002',
    'qmspace': '\u2005',
    'bullet': '\u2022',
    'lquote': '\u2018',
    'rquote': '\u2019',
    # Was '\201C' (an octal escape + "C"); the left double quote is U+201C.
    'ldblquote': '\u201C',
    'rdblquote': '\u201D',
}


def striprtf(text):
    """Return the plain text contained in an RTF document.

    Args:
        text: The RTF source, either as ``str`` or as ``bytes``/``bytearray``.
            Byte input is decoded with the default codec (UTF-8) first.

    Returns:
        A ``str`` with control words, groups belonging to known
        destinations, and ``\\*``-marked groups removed, and with the special
        characters, ``\\uN`` escapes and ``\\'xx`` escapes translated.

    Raises:
        UnicodeDecodeError: If byte input is not valid UTF-8.
        ValueError: If a ``\\uN`` escape is outside the range ``chr`` accepts.
    """
    # Accept str as well as bytes; calling .decode() on str fails on Python 3.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode()

    stack = []         # Saved (ucskip, ignorable) for each open group.
    ignorable = False  # Whether this group (and all inside it) are "ignorable".
    ucskip = 1         # Number of ASCII characters to skip after a unicode character.
    curskip = 0        # Number of ASCII characters left to skip.
    out = []           # Output buffer.

    for match in _RTF_TOKEN.finditer(text):
        word, arg, hex_code, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == '{':
                # Push state.
                stack.append((ucskip, ignorable))
            elif stack:
                # Pop state. A stray "}" with no open group is ignored
                # instead of raising IndexError on malformed input.
                ucskip, ignorable = stack.pop()
        elif char:  # \x (not a letter)
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append('\xA0')
            elif char in '{}\\':
                if not ignorable:
                    out.append(char)
            elif char == '*':
                ignorable = True
        elif word:  # \foo
            curskip = 0
            if word in _DESTINATIONS:
                ignorable = True
            elif ignorable:
                pass
            elif word in _SPECIAL_CHARS:
                out.append(_SPECIAL_CHARS[word])
            elif word == 'uc':
                # A bare \uc without a number is malformed; keep the old value.
                if arg is not None:
                    ucskip = int(arg)
            elif word == 'u':
                # A bare \u without a number carries no character; skip it.
                if arg is not None:
                    code = int(arg)
                    if code < 0:
                        # Negative arguments wrap around into 0..0xFFFF.
                        code += 0x10000
                    out.append(chr(code))
                    curskip = ucskip
        elif hex_code:  # \'xx
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(chr(int(hex_code, 16)))
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    return ''.join(out)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值