导出银行字典警综
import pandas as pd
import numpy as np
import os
import re
# Build a single-column Excel sheet of "<digits>-<name>" entries scraped from
# the raw text exports found under the source directory.

# Step 1: collect every file below the raw directory whose name ends in
# "s.txt".
text_list = []
for root, dirs, files in os.walk('F:\\案件数据\\警综\\raw'):
    for name in files:
        if name.endswith('s.txt'):
            text_list.append(os.path.join(root, name))

# Step 2: read all files into one string.  Each file is decoded as GBK first
# and re-read as UTF-8 only when GBK decoding fails; any other error (missing
# file, permissions, ...) propagates instead of being masked by the retry.
# NOTE(review): a UTF-8 file can occasionally decode under GBK without an
# error and yield mojibake; if the inputs are mixed, consider trying UTF-8
# first.
text_parts = []
for path in text_list:
    try:
        with open(path, encoding='gbk') as f:
            text_parts.append(f.read())
    except UnicodeDecodeError:
        with open(path, encoding='utf8') as f:
            text_parts.append(f.read())
# Join once rather than concatenating inside the loop (avoids quadratic
# copying on many/large files).
words = ''.join(text_parts)

# Step 3: pull out the part of each tag between `c=` and `" p`.  This
# definition was previously commented out, so the loop below failed with a
# NameError on `words_list`.
words_list = re.findall(r'<.*c=(.*d=.*)" p', words)

# Step 4: turn each captured fragment into "<first run of digits>-<text after
# d=">".  An entry with no digits or no `d="` raises IndexError, so malformed
# input is not silently dropped.
# presumably the digits are the dictionary code from the `c` attribute and the
# `d` attribute is the bank name; verify against a sample export.
list_data = []
for entry in words_list:
    code = re.findall(r'\d+', entry)[0]
    label = re.findall(r'd="(.*)', entry)[0]
    list_data.append(code + '-' + label)

# Step 5: write the entries to an Excel file in the current working directory
# under a single column ("bank name"), without the index column.
data = {'银行名称': list_data}
DF = pd.DataFrame(data)
DF.to_excel("警综字典.xlsx", index=False)