一、list保存为Excel:
import xlrd
import xlwt
import re
import numpy as np
def lists_to_excel(listname1, listname2, listname3, listname4, filename):
    """Write four parallel lists into columns 0-3 of a new .xls workbook.

    Row ``i`` of the sheet named ``sheet1`` receives ``str(listnameK[i])``
    in column ``K - 1``. The number of rows is ``len(listname1)``; any of
    the other lists being shorter raises ``IndexError``.

    Args:
        listname1: Values for column 0; its length sets the row count.
        listname2: Values for column 1.
        listname3: Values for column 2.
        listname4: Values for column 3.
        filename: Path passed to ``Workbook.save``.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    columns = (listname1, listname2, listname3, listname4)
    for row in range(len(listname1)):
        for col, values in enumerate(columns):
            # Every cell is stored as text, whatever the element type.
            sheet.write(row, col, str(values[row]))
    workbook.save(filename)
# Example call; list1..list4 and filename are not defined in the visible snippet and must be supplied beforehand.
lists_to_excel(list1,list2,list3,list4,filename) # filename: path and name of the Excel file to save; may be absolute or relative.
二、字典数据结构保存到Excel:
import xlwt
def dict_to_excel(dict, filename):
    """Write a mapping to a new .xls workbook, one key/value pair per row.

    Column 0 receives ``str(key)`` and column 1 receives ``int(value)``, so
    a value that ``int()`` cannot convert raises ``ValueError``/``TypeError``.
    After all rows are written the key list and value list are printed and
    the workbook is saved.

    Args:
        dict: Mapping to export. The name shadows the builtin but is kept
            so existing keyword callers continue to work.
        filename: Path passed to ``Workbook.save``.
    """
    key_list = list(dict.keys())
    value_list = list(dict.values())
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for row, (key, value) in enumerate(zip(key_list, value_list)):
        sheet.write(row, 0, str(key))
        sheet.write(row, 1, int(value))
    print(key_list)
    print(value_list)
    workbook.save(filename)
# Example call
# NOTE(review): `dict` must be rebound to an actual mapping before this line; the builtin type itself would not work - confirm.
dict_to_excel(dict,filename) # filename: path and name of the Excel file to save; may be absolute or relative.
三、计算一个文本或者list中某些字段的频次,返回未排序的结果和按频次排序的结果:
import xlrd
import xlwt
import re
import numpy as np
# Count how many times each text occurs in the list; s is the list.
def count_texts(s):
    """Count how often each item occurs in *s*.

    Args:
        s: Iterable of hashable items (e.g. a list of strings).

    Returns:
        A tuple ``(count_dict, ranked)`` where ``count_dict`` maps each item
        to its number of occurrences (in first-seen order) and ``ranked`` is
        a list of ``(item, count)`` tuples sorted by count, highest first.
        Items with equal counts keep their first-seen order because the sort
        is stable.
    """
    count_dict = {}
    for word in s:
        count_dict[word] = count_dict.get(word, 0) + 1
    # The original sorted an undefined name (CBRC_count_dict), which raised
    # NameError; the dictionary built above is what has to be ranked.
    ranked = sorted(count_dict.items(), key=lambda pair: pair[1], reverse=True)
    return count_dict, ranked
# Example call; list1 is not defined in the visible snippet.
real_count_dict,real_count_dict1 = count_texts(list1)
四、向已有文件追加list中的文本数据:
import xlrd
import xlwt
import re
import numpy as np
def text_save(filename, data):
    """Append every item of *data* to a text file, one item per line.

    Each item is converted with ``str()`` and followed by a newline. The
    file is opened in append mode with the platform default encoding, so it
    is created if missing and existing content is kept.

    Args:
        filename: Path of the file to append to.
        data: Iterable of items to write.
    """
    # The context manager closes the handle even if str() or write() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'a') as file:
        file.writelines(str(item) + '\n' for item in data)
# Example call
filename='./text1.txt'
text_save(filename,data1) # data1 is an existing list (not defined in the visible snippet); its items are appended to the file at filename, separated by newlines.
五、对于此类文本数据的处理:先是括号,括号内是有效文本数据,然后接着是括号内文本的解释。此类仅仅提取括号内的有效文本
import xlrd
import xlwt
import re
import numpy as np
#仅提取括号内的有效内容
def subString(text):
    """Return the text found inside ASCII parentheses, concatenated.

    Characters are collected while inside ``(`` ... ``)``; the Chinese
    numerals 一 to 八 are dropped from the collected text so that list
    markers such as ``(一)`` contribute nothing. Text after an opening
    parenthesis that is never closed is discarded.

    Args:
        text: Input string to scan.

    Returns:
        A single string made of all collected segments joined together
        (empty if nothing was collected).
    """
    skipped = {'一', '二', '三', '四', '五', '六', '七', '八'}
    segments = []
    current = []  # characters of the segment being collected
    inside = False
    for ch in text:
        if ch == '(':
            inside = True
        elif ch == ')':
            inside = False
            # A closing parenthesis ends the segment, even an empty one.
            segments.append("".join(current))
            current = []
        elif inside and ch not in skipped:
            current.append(ch)
    return "".join(segments)
# Example call; replaces text with the content extracted from its parentheses.
text = subString(text)
六、Excel中文本数据的预处理:
import xlrd
import xlwt
import re
import jieba
import numpy as np
_ARABIC_2_TO_15 = [str(n) for n in range(2, 16)]
_CHINESE_2_TO_15 = ['二', '三', '四', '五', '六', '七', '八', '九', '十',
                    '十一', '十二', '十三', '十四', '十五']


def _literal_alternation(literals):
    """Compile a regex that matches any of *literals* verbatim."""
    return re.compile('|'.join(re.escape(item) for item in literals))


# List-item markers, applied in the same order as the original passes.
# Every marker is escaped: an unescaped "." used to delete the digit plus
# whatever character followed it, and unescaped "(二)" was a regex group
# that deleted every bare 二, 三, ... in the text.
# NOTE(review): only ASCII parentheses are matched; full-width markers are
# left untouched - confirm whether they should be stripped too.
_LIST_MARKER_PATTERNS = (
    _literal_alternation(n + '、' for n in _ARABIC_2_TO_15),
    _literal_alternation(n + '.' for n in _ARABIC_2_TO_15),
    _literal_alternation(n + '、' for n in _CHINESE_2_TO_15),
    _literal_alternation('(' + n + ')' for n in _CHINESE_2_TO_15),
    _literal_alternation(['1、', '1.', '一、', '(一)']),
)

# ASCII letters, digits and assorted punctuation to strip. Same pattern text
# as before, now a raw string so Python no longer sees invalid escapes.
_NOISE_CHARS = re.compile(
    r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\《\》\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]"
)


def _clean_text(text):
    """Strip list-item markers, then letters, digits and punctuation."""
    for pattern in _LIST_MARKER_PATTERNS:
        text = pattern.sub("", text)
    return _NOISE_CHARS.sub("", text)


def predict(predict_filename):
    """Load, clean and tokenize the text column of an Excel workbook.

    Reads every row of the first sheet, takes the cell at zero-based column
    index 2, removes list-item markers plus ASCII letters, digits and
    punctuation, segments the remainder with ``jieba.cut`` and joins the
    tokens with single spaces. Row and column counts and the final list are
    printed.

    NOTE(review): the original comment called this "the second column", but
    index 2 is the third column - confirm which is intended.

    Args:
        predict_filename: Path of the workbook opened with
            ``xlrd.open_workbook``.

    Returns:
        The list of cleaned, space-separated token strings, one per row
        (previously the list was only printed and ``None`` was returned).
    """
    workbook = xlrd.open_workbook(predict_filename)
    sheet = workbook.sheet_by_index(0)  # first sheet
    print('文本数据条数', sheet.nrows)
    print('列数:', sheet.ncols)
    all_uncleaned_texts = []
    for row in range(sheet.nrows):
        raw_text = sheet.cell(row, 2).value
        tokens = jieba.cut(_clean_text(raw_text), cut_all=False)
        all_uncleaned_texts.append(' '.join('%s' % token for token in tokens))
    print('所有数据中,去除了数字、字母且分词后文本:', all_uncleaned_texts)
    return all_uncleaned_texts
七、由字典的值去获得字典的键:
#由字典的value得到对应的key
def get_keys(d, value):
    """Return every key of *d* whose value equals *value*.

    Args:
        d: Mapping to search.
        value: Value to look for (compared with ``==``).

    Returns:
        A list of matching keys in the mapping's iteration order; empty if
        no value matches.
    """
    return [key for key, candidate in d.items() if candidate == value]
# For each predicted value, collect the list of label_dict keys mapped to it.
# test_predicted and label_dict are not defined in the visible snippet.
text_label_texts = []
for i in range(len(test_predicted)):
    # get_keys returns a list, so every entry here is itself a list of keys.
    text_label_text = get_keys(label_dict, test_predicted[i])
    text_label_texts.append(text_label_text)
八、中文转相应的Unicode编码
def to_unicode(string):
    """Return *string* with every character written as a Unicode escape.

    Code points up to U+FFFF become ``\\uXXXX`` (four upper-case hex
    digits); higher code points become ``\\UXXXXXXXX`` (eight digits).

    Args:
        string: Text to convert.

    Returns:
        The concatenated escape sequences; an empty string for empty input.
    """
    escapes = []
    for ch in string:
        code = ord(ch)
        if code > 0xFFFF:
            # Does not fit in four hex digits; use the eight-digit form.
            escapes.append('\\U%08X' % code)
        else:
            # Zero-pad so that e.g. 'a' yields \u0061 rather than \u61.
            escapes.append('\\u%04X' % code)
    return ''.join(escapes)
print(to_unicode("作为")) # prints \u4F5C\u4E3A