# Source: https://gitee.com/linjinpeng/python_case
# -*- coding: utf-8 -*-
from collections import OrderedDict
import re
import time
def doMain():
f=open('a.txt', 'r');
content= f.read()
f.close()
content = re.sub(r'(Sample, \d{3,} of \d{3,5})', double, content)
#1.把所有[SEP]换成“空格”
content=content.replace('[SEP]',' ')
#2.把所有的换行都取消
content=content.replace('\n','')
#3.再次把所有[SEP]换成“空格”,取消换行之后会形成新的[SEP]
content=content.replace('[SEP]',' ')
#4.两个或两个以上标点符号链接,只留一个,优先句号,其次问号,再次分号,再次感叹号,再次逗号(这个是重点,难点)
newcontent=[]
content=re.split(r'([.。!!??;;,,、:\s+])', content)
content.append('')
content = [''.join(i) for i in zip(content[0::2], content[1::2])]#根据标点符号进行分割,分隔符放在句子后面
for item in content:
if len(item)>1:
newcontent.append(item)
content=''.join(newcontent)
#5.两种括号里面的句子,包括括号全部删掉
content=re.sub(u'\\(.*?\\)|\\(.*?)|\\{.*?}|\\[.*?]|\\【.*?】','', content)
#6.“泻药”都换为“谢邀”;“~”换成“。”
content=content.replace('泻药','谢邀').replace('~','。').replace('~','')
#7.删除“(二维码自动识别)” “进群” “备注” “图片来源于网络,侵删”
content=content.replace('(二维码自动识别)','').replace('进群','').replace('图片来源于网络,侵删','').replace('备注','')
#8.删除划线
content=content.replace('——','')
#9.特别长的句子,没有标点符号的,删掉(30字符以上算特别长)
newcontent = []
content = re.split(r"([.。!!??;;,,、:\s+])", content)
content.append('')
content = [''.join(i) for i in zip(content[0::2], content[1::2])]#根据标点符号进行分割,分隔符放在句子后面
for item in content:
if len(item)<30 and re.search(u'[.。!!??;;,,、:]',item)!=None:
newcontent.append(item)
content = ''.join(newcontent)
# 10.带“淘宝”“某宝”“公众号”“某东”“关注”“傻x”“知乎”“卖家”“谢邀”“大众点评”“傻逼”“转发收藏”
# “2018”“2019”“2020”“题主”“回答”“微信”“微博”“朋友圈”“知友”“下单”“旗舰店”“搜索”“楼主”“
# ”“直播”“领取”“百度”“教程”“小红书”“福利”“b站”“B站”“私信”“理财”“干货”“链接”“搞毛”“店铺”
# “水印”“你妈”“二维码”“全裸”“卖肉”“瞎几把”“微商”“广告”“天猫”“商城”“博主”“公主号”“官网”“wx”
# “vx”“投资”“股票”“基金”“股市”“A股”的,整句话全部去掉
newcontent = []
content = re.split(r"([.。!!??;;,,\s+])", content)
content.append('')
content = [''.join(i) for i in zip(content[0::2], content[1::2])]#根据标点符号进行分割,分隔符放在句子后面
for item in content:
if re.search(u"淘宝|\\某宝|\\公众号|\\某东|\\关注|\\傻x|\\知乎|\\卖家|"
u"\\谢邀|\\大众点评|傻逼|\\转发收藏|\\大众点评|\\傻逼|\\转发收藏|"
u"\\2018|\\2019|\\2020|\\题主|\\回答|\\微信|\\微博|\\朋友圈|"
u"\\知友|\\下单|\\旗舰店|\\搜索|\\楼主|\\直播|\\直播|\\领取|"
u"\\百度|\\教程|\\小红书|\\福利|\\b站|\\B站|\\私信|\\理财|"
u"\\干货|\\链接|\\搞毛|\\店铺|\\水印|\\你妈|\\二维码|\\全裸"
u"\\卖肉|\\瞎几把|\\微商|\\广告|\\天猫|\\商城|\\博主|\\公主号"
u"\\官网|\\wx|\\vx|\\投资|\\股票|\\基金|\\股市|\\A股",item)==None:
newcontent.append(item)
content="".join(newcontent)
#两个标点符号连接到一起时候,去掉其他,只剩下一个(有时候两个标点间有空格,也要去掉)
content = re.sub(u"\\!{1,}", "!", content)
content = re.sub(u"\\!{1,}", "!", content)
content = re.sub(u"\\。{1,}", "。", content)
content = re.sub(u"\\,{1,}", ",", content)
content = re.sub(u"\\?{1,}", "?", content)
content = re.sub(u"\\?{1,}", "?", content)
content = re.sub(u"\\.{1,}", "", content)
content = re.sub(u"\\。{1,}", "。", content)
#11.最后一句话结尾如果没有标点符号,删除这句话(前面可能是句号,问号,感叹号,或者分号)
newcontent=[]
content = content.split("Sample")#先根据Sample对自然段进行分割
for item in content:
item = re.split(r"([.。!!??;;,,、:\s+])", item)
item.append("")
item = ["".join(i) for i in zip(item[0::2], item[1::2])]#根据标点符号进行分割,分隔符放在句子后面
if(len(item))>1:
if re.search(r'[。!!??]', item[-1])==None:
item=rm(item)
item="".join(item)
newcontent.append("Sample"+item + "\n")
newcontent.remove(newcontent[0])
content = "".join(newcontent)
#去除杂质
content = content.replace("(_)","").replace(')','').replace('(','')
content = content.replace("▼","").replace("(⊙_)","")
content = content.replace("——", "").replace('▲', '').replace(" a (⊙o)", "").replace("(⊙o)", "")
content = content.replace("】", "").replace("]", "").replace('(⊙_⊙)', '').replace('【', '')
content = content.replace('~',"").replace(" 【", "").replace("[", "")
content = content.replace("。侵删", "").replace(",侵删", "").replace("(﹏)", "")
content = content.replace('泻药', '').replace('+', '').replace(')', '').replace('(', '')
#保存文件
fh = open(time.strftime('%Y%m%d%H%M%S')+'.txt', 'w', encoding='utf-8')
fh.write(content)
fh.close()
# Drop trailing segments until the last one ends a sentence.
def rm(item):
    """Remove unterminated segments from the end of *item*, in place.

    Segments are deleted from the end while the last one contains none of the
    terminal marks (。 ! ? and their full-width forms).  The first segment is
    never removed.  Implemented as a loop: the earlier recursive version hit
    Python's recursion limit on paragraphs with many unterminated segments.

    NOTE(review): the original class listed '!' and '?' twice; the second copy
    is assumed to have been the full-width mark -- confirm.

    Args:
        item: list of text segments; it is modified in place.

    Returns:
        The same list object, after trimming.
    """
    while len(item) > 1 and re.search(r'[。!!??]', item[-1]) is None:
        del item[-1]
    return item
# Rewrite a match: surround its first capture group with commas.
def double(matched):
    """Return capture group 1 of *matched* with a comma on each side.

    Intended as a replacement callback for re.sub.
    """
    captured = matched.group(1)
    return ','.join(('', captured, ''))
# Run the cleaning job only when this file is executed as a script.
if __name__ == "__main__":
    doMain()
#12.删除重复,只留一个(单字重复2次以上算重复,单次以上重复2次算重复,look这个单词不算重复,不要变成lok)
# newcontent=[]
# content = re.split(r"([.。!!??;;,,、:\s+])", content)
# for item in content:
# item=OrderedDict.fromkeys(item)
# item="".join(item)
# item=re.sub(' |\n','',item)#删除空格/空行
# if(len(item))>0:
# newcontent.append(item)
# content="".join(newcontent)
#整段文字处理后少于490字的全部删掉
# newcontent = []
# content = content.split("Sample")
# for item in content:
# if len(item) > 490:
# newcontent.append("Sample" + item + "\n")
# content = "".join(newcontent)
#带英文整段全部删掉
# newcontent=[]
# content = content.split("Sample")#先根据Sample对自然段进行分割
# for item in content:
# #begin---根据标点符号进行分割,分隔符放在句子后面
# item=re.split(r"([.。!!??;;,,、:\s+])", item)
# item.append("")
# item = ["".join(i) for i in zip(item[0::2], item[1::2])]
# #end
# if len(item)>1:#已经根据符合分割,并,分隔符放在句子后面。长度为1的可能是符合或者空格
# newitem = []
# for childen in item:
# check=len(re.findall(re.compile(r'[A-EG-NP-Za-eg-np-z]', re.S), childen)) #匹配字母
# if(len(childen))>1 and check==False:
# newitem.append(childen)
# item = newitem
# item="".join(item)
# newcontent.append("Sample"+item+"\n")
# newcontent.remove(newcontent[0])
# content="".join(newcontent)
#删除相邻的两个符号
# newcontent=[]
# content = content.split("Sample")
# for item in content:
# search = re.search(r'[,.!?,,。!?]{2,}', item)
# if search:
# begin=search.span()[0]
# end=search.span()[1]
# if item[begin+1]==",":
# item = item[0:begin] + item[end:-1]
# else:
# item=item[0:begin]+"。"+item[end:-1]
# if (len(item)) > 0:
# newcontent.append("Sample" + item+"\n")
# else:
# if (len(item)) > 0:
# newcontent.append("Sample"+item+"\n")
# content="".join(newcontent)