使用 Python 遍历目录下的所有 txt 文件,并提取每个文件中 JSON 的 intent.text 字段的值。
下面是单个 txt 文件的数据格式示例:
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=566
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"apd","sn":1,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1052
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,1],"sn":2,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1200
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,2],"sn":3,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1504
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,3],"sn":4,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1844
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,4],"sn":5,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2011
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,5],"sn":6,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"打"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2164
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,6],"sn":7,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"打"}]},{"bg":0,"cw":[{"sc":0,"w":"网球"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2586
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,7],"sn":8,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"打"}]},{"bg":0,"cw":[{"sc":0,"w":"王者"}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2610
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"apd","sn":9,"ws":[{"bg":0,"cw":[{"sc":0,"w":""}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_iat=3045
eos_iat=60
{"text":{"bg":0,"ed":0,"ls":true,"pgs":"rpl","rg":[9,9],"sn":10,"ws":[{"bg":0,"cw":[{"sc":0,"w":""}]}]}}
sid=cida711ef55@dx000b13eef4bf010001
bos_nlp=3069
eos_nlp=84
{"intent":{"rc":4,"sid":"cida711ef55@dx000b13eef4bf010001","text":"你猜我是爱喝酒还是爱打王者","uuid":"cida711ef55@dx000b13eef4bf010001"}}
下面是提取 text 值的代码:
import os
import pandas as pd
import numpy as np
import json
from pandas import Series,DataFrame
# Directory containing the recognition logs and the CSV file to write.
data_dir = '/Users/abc/Desktop/android/data_test/data/wake4'
files_path = '/Users/abc/Desktop/android/data_test/data/result_4.csv'


def _file_index(name):
    """Return the number at the end of a log file name, e.g. 12 for 'audio-12.txt'.

    Works for any number of digits. Raises ValueError when the name
    (without its extension) does not end in digits.
    """
    stem = os.path.splitext(name)[0]
    digits = stem[len(stem.rstrip('0123456789')):]
    return int(digits)


def _intent_text(path):
    """Return the intent text from the first line of *path* containing "uuid".

    Returns None when the file has no such line.
    """
    # NOTE(review): assumes the logs are UTF-8 encoded -- confirm against the
    # tool that produces them.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if "uuid" in line:
                # Drop the trailing newline and anything after a tab.
                payload = line.strip('\n').split('\t')[0]
                # Take the text value inside the "intent" object.
                return json.loads(payload)['intent']['text']
    return None


# os.listdir returns names in arbitrary order, so keep only the .txt logs
# and sort them by the number embedded in the file name (audio-xx.txt).
files_list = [name for name in os.listdir(data_dir) if name.endswith('.txt')]
files_list.sort(key=_file_index)

key_path = []  # not used in this section; kept in case later code refers to it

# One row per log file, in sorted order. A file without an intent line
# yields an empty cell instead of a leftover placeholder value.
texts = [_intent_text(os.path.join(data_dir, name)) for name in files_list]
df = pd.DataFrame({0: texts})
df.to_csv(files_path, index=False)