文章目录
1. csv
1.1 读取
import pandas as pd
# Read a TAB-delimited file into a DataFrame (sep='\t').
df = pd.read_csv("vgg16.csv",sep='\t')
1.2 导出
import pandas as pd
# Export without the index column; 'utf_8_sig' prepends a UTF-8 BOM
# (helps spreadsheet tools detect the encoding of non-ASCII text).
df.to_csv("vgg16.csv", index=False, sep="\t", encoding='utf_8_sig')
2. npy
2.1 读取
import numpy as np
# allow_pickle=True is needed to load object arrays (e.g. dicts saved with np.save).
# NOTE(review): pickle can execute arbitrary code — only load trusted files.
# encoding="latin1" keeps Python-2-era pickles loadable under Python 3.
df = np.load("vgg16.npy", allow_pickle=True, encoding="latin1")
2.2 导出
import numpy as np
# Save the object to ./vgg16.npy (non-array objects are stored via pickle).
np.save("./vgg16.npy",df)
(json.dump 的参数) 如果 ensure_ascii 为 False,则输出可以包含非 ASCII 字符
3. json
3.1 读取
import json
# Read one whole JSON document from a UTF-8 file.
with open('vgg16.json', 'r', encoding='utf8') as f:
    df = json.load(f)
3.2 导出
import json
with open('./vgg16.json', "w", encoding="utf-8") as f:
json.dump(df, f, ensure_ascii=False)
- 例子
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
def re_tokenize(texts):
    """Fit a character-level Tokenizer on *texts* and dump its vocabulary to ./char2id.json.

    Returns the fitted Tokenizer.
    """
    # num_words=None: keep every token (no frequency cutoff);
    # char_level=True: every character is treated as a token.
    tokenizer = Tokenizer(num_words=None, char_level=True)
    tokenizer.fit_on_texts(texts)
    print("char_list={}".format(len(tokenizer.word_index)))
    # Alternative: np.save("./char2id.npy", tokenizer.word_index)
    import json
    # encoding='utf-8' is required: ensure_ascii=False writes raw non-ASCII
    # characters, which fails/garbles under a non-UTF-8 locale default (Windows).
    with open('./char2id.json', 'w', encoding='utf-8') as f:
        json.dump(tokenizer.word_index, f, ensure_ascii=False)
    return tokenizer

df = pd.read_csv("./pretrain.csv", lineterminator='\n')
re_tokenize(df["content"])
3.3 按行读取json文件
# Read the raw data: one JSON object per line (JSON Lines format).
import json
data_test = "./doc_quality_data_test.json"
data = []
with open(data_test, encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))
# Flatten the list of dicts into a DataFrame.
# NOTE: `from pandas.io.json import json_normalize` is deprecated and removed in
# pandas 2.0 — use pd.json_normalize. It already returns a DataFrame, so the
# former DataFrame.from_dict(...) wrapper was redundant.
import pandas as pd
submission = pd.json_normalize(data)
# Convert a DataFrame to a dict; `orient` selects the output layout.
# NOTE(review): the original first line used orient='columns', which is NOT a
# valid DataFrame.to_dict orient and raises ValueError — 'dict' (the default)
# produces the {column -> {index -> value}} layout.
df.to_dict(orient='dict')     # {column -> {index -> value}} (default)
df.to_dict(orient='index')    # {index -> {column -> value}}
df.to_dict(orient='records')  # [{column -> value}, ...] one dict per row
df.to_dict(orient='split')    # {'index': [...], 'columns': [...], 'data': [...]}
df.to_dict(orient='series')   # {column -> Series(values)}
df.to_dict(orient='list')     # {column -> [values]}
4. txt
w:以写方式打开,
a:以追加模式打开 (从 EOF 开始, 必要时创建新文件)
r+:以读写模式打开
w+:以读写模式打开 (参见 w )
a+:以读写模式打开 (参见 a )
rb:以二进制读模式打开
wb:以二进制写模式打开 (参见 w )
ab:以二进制追加模式打开 (参见 a )
rb+:以二进制读写模式打开 (参见 r+ )
wb+:以二进制读写模式打开 (参见 w+ )
ab+:以二进制读写模式打开 (参见 a+ )
fp.read([size]):从文件读取内容(最多 size 个字节;省略 size 则读取全部)
4.1 读取
# Read a transcript file whose even-numbered lines are "wav_name<TAB>text".
with open('./vgg16.txt', 'r', encoding='utf8') as f:
    for i, line in enumerate(f.readlines()):
        if i % 2 == 0:
            # Split once and unpack instead of splitting the line twice.
            fields = line.strip("\n").split('\t')
            wav_name = fields[0]
            text = fields[1]
            continue
4.2 导出
# Append a progress line; explicit encoding avoids the platform-dependent
# locale default (the read examples in this file all use utf8).
with open("../vgg16.txt", "a", encoding="utf-8") as f:
    f.write("start train: {} train samples, {} test samples\n".format(len(X_train), len(X_test)))
- 例子
file = "000001-010000.txt"
# Delete the prosody marks '#' and the level digits 1-4 in a single C-level
# pass (str.translate) instead of five chained .replace() calls.
_strip_marks = str.maketrans('', '', '#1234')
parts = []
with open(file, 'r', encoding='utf8') as f:
    for i, line in enumerate(f.readlines()):
        if i % 2 == 0:
            fields = line.strip("\n").split('\t')
            wav_name = fields[0]
            text = fields[1].translate(_strip_marks)
            parts.append(wav_name + '\t' + text + '\n')
            continue
# Join once — the original `content +=` in a loop is quadratic.
content = ''.join(parts)
# Explicit encoding so the output matches the utf8 input.
with open("100000.txt", 'a', encoding='utf8') as f:
    f.write(content)
"""
000001 卡尔普#2陪外孙#1玩滑梯#4。---> 000001 卡尔普陪外孙玩滑梯。
"""
5. joblib
Joblib是一组用于在Python中提供轻量级流水线的工具。
特点:
- 透明的磁盘缓存功能和懒惰的重新评估(memoize模式)
- 简单的并行计算
Joblib可以将模型保存到磁盘并可在必要时重新运行:
# Persist the trained model to disk (value = the object to save);
# a bare filename goes to the current working directory.
joblib.dump(filename='LR.model',value=content)
# Load the model back from the local file (the original comment said
# "download", but this is a local-disk load).
model1 = joblib.load(filename="LR.model")
# joblib.dump also accepts an already-open binary file object.
with open("../junior.jbl", "wb") as fw:
    joblib.dump(content, fw)
6. ijson
相对于json而言,ijson支持直接通过键进行数据的读取,避免了大文件爆内存的问题
'item.name' 表示的是读取的字典中的键 name,这样直接精准定位到键所在的位置。缺点就是文件必须是严格的 json 文件
# Stream only the 'name' values out of a large JSON array with ijson,
# avoiding loading the whole document into memory.
with open(raw_query_corpus_seg_filepath, 'r') as f:
    # Characters to delete: latin letters, ASCII/fullwidth punctuation and
    # invisible spaces (\u3000, \u200b, \xa0). Raw string keeps the escapes intact.
    pattern = r"[a-zA-Z():(\)\"“”''\ \·\ _.!\[\]+-=——,$%^,。??>、~@#¥%……&*《》<>「」{}【】\(\)/(\\)(\u3000)(\u200b)(\xa0)]"
    vocab = []
    # 'item.name' addresses the "name" key of each element of the top-level array.
    dict_list = ijson.items(f, 'item.name', use_float=True)
    for dict_text in tqdm(dict_list):
        try:
            sentence = re.sub(pattern, '', dict_text)
        except TypeError:
            # re.sub raises TypeError on non-string values (null, numbers);
            # the former bare `except` hid every other error too.
            print("TypeError: expected string or buffer for sentence:", dict_text)
            continue
方法2:不要求是json文件,可以允许全部都是字典的文件
# One JSON dict per line (JSON Lines); parse each line independently.
with open(raw_query_corpus_seg_filepath, 'r', encoding='utf8') as f:
    # Re-joined: the original pattern literal was broken across two source lines.
    pattern = r"[a-zA-Z():(\)\"“”''\ \·\ _.!\[\]+-=——,$%^,。??、~@#¥%……&*《》<>「」{}【】\(\)/(\\)(\u3000)(\u200b)(\xa0)]"
    vocab = []
    # Iterate the file directly instead of materializing readlines().
    for line in f:
        try:
            dict_text = json.loads(line)  # load the per-line dict
            print(dict_text["name"])
            sentence = re.sub(pattern, '', dict_text["name"])
        except (TypeError, KeyError, ValueError):
            # ValueError covers json.JSONDecodeError; report the raw line —
            # dict_text may be unbound/stale when json.loads itself failed.
            print("TypeError: expected string or buffer for sentence:", line)
            continue