python遍历所有文件,并提取json中的text的值

python遍历所有文件,并提取json中的text的值

以下是 txt 文件的数据格式(每个文件包含多段识别结果,最后一段是带有 intent 的 JSON):

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=566
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"apd","sn":1,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1052
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,1],"sn":2,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1200
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,2],"sn":3,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1504
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,3],"sn":4,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=1844
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,4],"sn":5,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2011
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,5],"sn":6,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"打"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2164
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,6],"sn":7,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"打"}]},{"bg":0,"cw":[{"sc":0,"w":"网球"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2586
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"rpl","rg":[1,7],"sn":8,"ws":[{"bg":0,"cw":[{"sc":0,"w":"你"}]},{"bg":0,"cw":[{"sc":0,"w":"猜"}]},{"bg":0,"cw":[{"sc":0,"w":"我"}]},{"bg":0,"cw":[{"sc":0,"w":"是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"喝酒"}]},{"bg":0,"cw":[{"sc":0,"w":"还是"}]},{"bg":0,"cw":[{"sc":0,"w":"爱"}]},{"bg":0,"cw":[{"sc":0,"w":"打"}]},{"bg":0,"cw":[{"sc":0,"w":"王者"}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=2610
eos_iat=-1
{"text":{"bg":0,"ed":0,"ls":false,"pgs":"apd","sn":9,"ws":[{"bg":0,"cw":[{"sc":0,"w":""}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_iat=3045
eos_iat=60
{"text":{"bg":0,"ed":0,"ls":true,"pgs":"rpl","rg":[9,9],"sn":10,"ws":[{"bg":0,"cw":[{"sc":0,"w":""}]}]}}

sid=cida711ef55@dx000b13eef4bf010001
bos_nlp=3069
eos_nlp=84
{"intent":{"rc":4,"sid":"cida711ef55@dx000b13eef4bf010001","text":"你猜我是爱喝酒还是爱打王者","uuid":"cida711ef55@dx000b13eef4bf010001"}}


以下是提取每个文件 intent 中 text 值的代码:

import os
import pandas as pd
import numpy as np
import json
from pandas import Series,DataFrame
# Directory containing the recognition logs, named like audio-xx.txt.
files_dir = '/Users/abc/Desktop/android/data_test/data/wake4'
# Destination CSV: one row per log file, in numeric file order.
files_path = '/Users/abc/Desktop/android/data_test/data/result_4.csv'


def file_index(file_name):
    """Return the trailing number of an ``audio-xx.txt`` style file name.

    The digits immediately before the ``.txt`` suffix are used, however many
    there are, so ``audio-5.txt``, ``audio-05.txt`` and ``audio-105.txt`` all
    sort numerically.

    Raises:
        ValueError: if the name has no digits right before the suffix.
    """
    stem = file_name[:-len('.txt')]
    # Everything rstrip() removed is the trailing run of digits.
    digits = stem[len(stem.rstrip('0123456789')):]
    return int(digits)


def extract_intent_text(lines):
    """Return ``intent.text`` from the first line that contains "uuid".

    Args:
        lines: iterable of text lines (a list or an open text file).

    Returns:
        The recognised sentence, or ``None`` when no line contains "uuid".

    Raises:
        json.JSONDecodeError: if the matching line is not valid JSON.
        KeyError: if the JSON has no ``intent``/``text`` entry.
    """
    for line in lines:
        if "uuid" not in line:
            continue
        # Drop the newline and anything after a tab before parsing.
        payload = line.strip('\n').split('\t')[0]
        return json.loads(payload)['intent']['text']
    return None


def main():
    """Collect the intent text of every .txt log and write them to a CSV."""
    # os.listdir returns names in arbitrary order, so filter and then sort.
    files_list = [
        name for name in os.listdir(files_dir) if name.endswith('.txt')
    ]
    files_list.sort(key=file_index)

    # Build the column from what was actually read: exactly one row per file,
    # with an empty cell for a file that has no intent line.
    texts = []
    for file_name in files_list:
        file_path = os.path.join(files_dir, file_name)
        # The logs contain Chinese text; do not rely on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            texts.append(extract_intent_text(f))

    pd.DataFrame({0: texts}).to_csv(files_path, index=False)


if __name__ == '__main__':
    main()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值