一些基础的python程序,写出来避免搬砖
包含中文句子去重、中文字符统计、非法字符过滤、字典存取、Excel 多表读写等;后续待添加。
1、句子去重:
def uniqueInfo(textList, score):
    """Deduplicate sentences, dropping ones that overlap too much with kept ones.

    A candidate is dropped when it is a substring of (or contains) an
    already-kept sentence, or when the fraction of its characters that also
    appear in a kept sentence exceeds ``score`` (checked in both directions).

    Args:
        textList: list of sentence strings, in priority order (earlier wins).
        score: overlap-ratio threshold; a ratio strictly greater than this
            causes the candidate to be discarded.

    Returns:
        A new list with the kept sentences, in original order.
    """
    # BUG FIX: the original indexed textList[0] unconditionally and raised
    # IndexError on an empty input list.
    if not textList:
        return []
    res = [textList[0]]
    for cand in textList[1:]:
        keep = True
        for kept in res:
            # Direct containment: one sentence fully repeats the other.
            if cand in kept or kept in cand:
                keep = False
                break
            # Character-level overlap, measured against both lengths.
            # (Division is safe: an empty string would have been caught
            # by the containment test above.)
            shared = sum(1 for ch in cand if ch in kept)
            if shared / len(cand) > score or shared / len(kept) > score:
                keep = False
                break
        if keep:
            res.append(cand)
    return res
2、统计中文数量:
def chnWordCnt(text):
    """Count the CJK unified ideographs (common Chinese characters) in *text*.

    Characters are counted when they fall inside the U+4E00..U+9FFF range.
    """
    total = 0
    for ch in text:
        if '\u4e00' <= ch <= '\u9fff':
            total += 1
    return total
3、非法字符过滤:
import re

# Characters illegal in XML/Excel cells: all ASCII control characters except
# tab (\011), newline (\012), and carriage return (\015).
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
# BUG FIX: re.sub requires (replacement, string); the original called
# .sub(text) with only one argument, which raises TypeError.  Strip the
# illegal characters by replacing them with the empty string.
# NOTE(review): `text` is not defined in this file — this is a fragment
# that assumes `text` exists in the surrounding script.
textNew = ILLEGAL_CHARACTERS_RE.sub('', text)
4、删除括号中的字符
# Remove any text inside ASCII or fullwidth brackets: (), {}, [], 【】, ().
# BUG FIX: the original first alternative was "\(.*?)" — the unescaped ")"
# is an unbalanced parenthesis, so re.sub raised re.error at runtime.
# NOTE(review): `text` is assumed to be defined by the surrounding script.
text = re.sub(r"\(.*?\)|\{.*?\}|\[.*?\]|【.*?】|(.*?)", "", text)
5、字典保存为npy文件与读取
import numpy as np

# Save a plain dict to a .npy file.  NumPy wraps the dict in a 0-d object
# array and serializes it via pickle.  (Renamed from `dict` to avoid
# shadowing the builtin.)
data = {'test': 'test', 'test2': 'test2', 'test3': 'test3'}
np.save('my_file.npy', data)  # the .npy suffix is appended automatically if missing
# BUG FIX: since NumPy 1.16.3, allow_pickle defaults to False, so loading an
# object array raises ValueError without it.  .item() unwraps the 0-d object
# array back into the original dict.
load_dict = np.load('my_file.npy', allow_pickle=True).item()
5.1、保存为pkl文件
import pickle  # BUG FIX: pickle was used below without ever being imported

# Save a dict to a .pkl file and read it back.  Using `with` guarantees the
# file handles are closed even if dump/load raises.  (Renamed from `dict`
# to avoid shadowing the builtin.)
data = {'test': 'test', 'test2': 'test2', 'test3': 'test3'}
with open('dict_file.pkl', 'wb') as f_save:
    pickle.dump(data, f_save)
with open('dict_file.pkl', 'rb') as f_read:
    dict2 = pickle.load(f_read)
print(dict2)
参考风雨潇潇一书生,感谢
6、读写多个sheet的excel
# Read multiple sheets from one Excel workbook.
# NOTE(review): this section is a fragment — `pd` (presumably pandas),
# `KeyWord`, `sheetNames`, `i`, `df1`, and `df2` are not defined anywhere in
# this file; they must come from the surrounding script.  Verify before reuse.
configFilePath = './newsClassification.xlsx'
configFileWriter = pd.ExcelFile(configFilePath)
# header=None means the sheet has no header row; remove this argument if the
# sheet does have one.
df = configFileWriter.parse(sheet_name=KeyWord, header=None)
writer1 = pd.ExcelWriter('./newsClassification.xlsx',engine='xlsxwriter')
for name in sheetNames:
    # NOTE(review): `df` above is a single DataFrame, so `df[name]` selects a
    # column, not a sheet — this loop looks like it expects a dict of
    # DataFrames keyed by sheet name; confirm against the original script.
    df[name].to_excel(writer1, sheet_name=name, index=False)
    worksheet1 = writer1.sheets[i]  # NOTE(review): `i` is undefined here — likely meant `name`
writer1.close()
# Alternatively, use ExcelWriter as a context manager (saves on exit):
with pd.ExcelWriter('./newsClassification.xlsx',engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name='df1')
    df2.to_excel(writer, sheet_name='df2')