一、
row为list对象
# Concatenate the elements of the list `row` into one string.
row = "".join(row)
# Redraw a single-line progress indicator for each row taken from csvreader;
# the leading "\r" moves the cursor back to the start of the line.
for row in csvreader:
    progress = "process:{}/{}".format(ids, all_sentences_num)
    print("\r", progress, end="", flush=True)
    ids = ids + 1
打印结果格式:process:186857/186857
二、
from collections import Counter
# Load the captions (a list, per the original note on read_input_csv1).
captions = read_input_csv1(excel_path)
# Count how often each caption occurs; Counter is a dict subclass.
counter = Counter(captions)
# Order the (caption, count) pairs by count and then by caption, both
# descending (reverse=True). sorted() returns a new list and leaves `counter`
# untouched, so the result must be bound to a name to be usable afterwards.
sorted_counts = sorted(
    counter.items(), key=lambda kv: (kv[1], kv[0]), reverse=True
)
三、
from openpyxl import load_workbook
def read_input_xlsx1(filename, skip=10):
    """Read the first column of the active sheet as whitespace-split tokens.

    The first cell of every row is examined. Text cells are split on
    whitespace and the resulting token list is collected; cells that do not
    hold text (empty cells, numbers, ...) are ignored instead of being
    silently swallowed by a bare ``except``.

    Args:
        filename: Path of the .xlsx workbook to open.
        skip: Number of leading collected entries to drop from the result.
            This counts collected text cells, not sheet rows. Defaults to
            10, the value that was previously hard-coded.

    Returns:
        A list of token lists, one per text cell, without the first
        ``skip`` entries.
    """
    workbook = load_workbook(filename)
    # Use the workbook's currently active sheet.
    booksheet = workbook.active
    # To read a different sheet, index the workbook by sheet title, e.g.
    # ``workbook[workbook.sheetnames[0]]``; ``get_sheet_names`` and
    # ``get_sheet_by_name`` are deprecated in openpyxl.
    col_0 = []
    for row in booksheet.rows:
        value = row[0].value
        if not isinstance(value, str):
            # Nothing to split in a non-text cell.
            continue
        col_0.append(value.split())
    return col_0[skip:]
#输入文件为csv时读取方式
def read_input_csv(filename, encoding='gbk'):
    """Read a CSV file into two id-keyed sentence mappings.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, column 0 is the id, column 2 is the Chinese sentence
    (stored with all whitespace removed) and column 4 is the Baidu sentence
    (stored unchanged). When an id occurs more than once, the last row wins.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding of the file. Defaults to ``'gbk'``, the
            value that was previously hard-coded.

    Returns:
        A tuple ``(id_cn_sentences, id_bd_sentences)`` of
        ``defaultdict(str)`` objects, so unknown ids map to ``''``.

    Raises:
        IndexError: If a non-empty data row has fewer than five columns.
    """
    id_cn_sentences = defaultdict(str)
    id_bd_sentences = defaultdict(str)
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, 'rt', encoding=encoding, newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            sentence_id = row[0]
            # split() + join() strips every whitespace character.
            id_cn_sentences[sentence_id] = "".join(row[2].split())
            id_bd_sentences[sentence_id] = row[4]
    return id_cn_sentences, id_bd_sentences
#初始化输出的csv文件
def init_csv_file(csv_file):
    """Create a fresh output CSV and write its header row.

    Any existing file at ``csv_file`` is overwritten: opening in ``'w'``
    mode truncates it, which replaces the earlier exists/remove/append
    sequence.

    Args:
        csv_file: Path of the CSV file to create.

    Returns:
        A tuple ``(csv_writer, file_handle)``. The caller owns the handle
        and must call ``file_handle.close()`` when finished writing.
    """
    # newline='' prevents blank lines between rows; utf-8-sig writes a BOM
    # at the start of the file.
    out_file = open(csv_file, 'w', newline='', encoding='utf-8-sig')
    try:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(["id", "englishText", "chineseText", "sent_score"])
    except Exception:
        # Do not leak the handle if the header cannot be written.
        out_file.close()
        raise
    return csv_writer, out_file
# Example use of init_csv_file: write one scored row, then close the file.
coherent, f1 = init_csv_file(save_coherent)
try:
    coherent.writerow([id_, en_sentences[id_], zh_sentences[id_], score])
finally:
    # close must be *called*; a bare `f1.close` only references the method
    # and leaves the file open.
    f1.close()
#json转csv,输入为JSON Lines格式(每行一个独立的JSON对象),例如:
{"id": 278, "text": "啊...这些是我一生的果实", "score": 0.30681753139473444}
{"id": 379, "text": "我想去这里", "score": 0.3820151642892597}
{"id": 403, "text": "我想知道它说什么", "score": 0.3295139660149289}
{"id": 450, "text": "非常好的联赛徽标", "score": 0.307017321127546}
# coding:utf-8
import json
import csv
# Convert a JSON Lines file (one JSON object per line) to CSV.
with open('111.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
    csv_writer = csv.writer(csv_file)
    # NOTE(review): the header names three columns but each data row below
    # has two; presumably "modification_text" is filled in later - confirm.
    csv_writer.writerow(["caption_id", "chinese_text", "modification_text"])
    with open('111.json', 'r', encoding="utf-8-sig") as jsonfile:
        # Iterate the file lazily instead of loading it all via readlines().
        for line in jsonfile:
            line = line.strip()
            if not line:
                # A blank line (e.g. at end of file) is not valid JSON.
                continue
            line_dict = json.loads(line)
            text = str(line_dict['text'])
            text_id = int(line_dict['id'])
            csv_writer.writerow([text_id, text])
四、
defaultdict接受一个工厂函数作为参数,如下来构造:
d = defaultdict(factory_function)  # 不要用dict作变量名,会覆盖内置的dict类型
这个factory_function可以是list、set、str等等,作用是当key不存在时,返回的是工厂函数的默认值,比如list对应[ ],str对应的是空字符串,set对应set( ),int对应0,如下举例:
from collections import defaultdict
# One defaultdict per factory: a missing key yields int() == 0, set(),
# str() == '' and list() == [] respectively.
dict1, dict2, dict3, dict4 = (
    defaultdict(factory) for factory in (int, set, str, list)
)
# An explicit assignment stores the given value; the factory is only used
# for keys that are missing on lookup.
dict1[2] = 'two'
# Looking up a missing key inserts and returns the factory default; print
# each default on its own line.
print(dict1[3], dict2[1], dict3[1], dict4[1], sep="\n")
结果:
0
set()
(此处为一个空行:dict3[1]返回空字符串'')
[]