Spark Streaming in Python: Python + Spark Streaming + Kafka, Writing to Local Files (a runnable example)

This is a runnable example that uses Python and Spark Streaming to consume data from Kafka, parse each payload as JSON, and write the results to local files. The program creates a StreamingContext, builds a Kafka input stream over several topics via KafkaUtils.createStream, parses the JSON to extract key fields such as trace_id, session_id, and device info, and finally writes each record type (speech, tts, other) to its own local file.
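The record layout is easiest to see before reading the code. Working backwards through the parsing chain below (extractFromKafka -> lineFromLines -> eval -> messageFromLine -> extractFromMessage), each Kafka value is a string of newline-separated Python dict literals, and each dict carries a JSON document under its "message" key. A hypothetical value, with field contents made up for illustration since the post does not show real trace data:

# Hypothetical shape of one Kafka value, inferred from the parsing chain;
# every field value here is fabricated.
kafka_value = (
    "{'message': '{\"name\": \"speech.service\", \"trace_id\": \"t-1\"}'}\n"
    "{'message': '{\"name\": \"tts.service\", \"trace_id\": \"t-2\"}'}"
)
# Each line eval()s to a dict; its "message" value is itself a JSON string.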

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/4/9 11:49
# @Author  : baoshan
# @Site    :
# @File    : readTraceFromKafkaStreamingToJson.py
# @Software: PyCharm Community Edition

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from collections import defaultdict
import datetime
import json
import time


class KafkaMessageParse:

    def extractFromKafka(self, kafkainfo):
        # KafkaUtils.createStream yields (key, value) tuples; keep the value.
        if type(kafkainfo) is tuple and len(kafkainfo) == 2:
            return kafkainfo[1]

    def lineFromLines(self, lines):
        # One Kafka value may carry several newline-separated record lines.
        if lines is not None and len(lines) > 0:
            return lines.strip().split("\n")

    def messageFromLine(self, line):
        # Each record dict carries its payload under the "message" key.
        if line is not None and "message" in line.keys():
            return line.get("message")

    def extractFromMessage(self, message):
        try:
            jline = json.loads(message)
            result = defaultdict()
            name = jline.get("name")
            if "speech" in name:
                result['trace_id'] = jline.get("trace_id")
                result['parent_id'] = jline.get("parent_id")
                result['span_id'] = jline.get("span_id")
                result['name'] = name
                result['sa'] = jline.get("sa")
                result['sr'] = jline.get("sr")
                result['ss'] = jline.get("ss")
                result['ret'] = jline.get("ret")
                # Fallback value in case the detail parsing below fails.
                strmessage = json.dumps(result, ensure_ascii=False)
                try:
                    for anno in jline.get("annotation"):
                        if anno.get("name") == "nlp":
                            debug_log_anno = anno.get("debug_log")[0]
                            asr = debug_log_anno.get("asr")
                            nlp = debug_log_anno.get("nlp")
                            action = debug_log_anno.get("action")
                            jaction = json.loads(action)
                            response = jaction.get("response")
                            result['tts'] = response.get("action").get("directives")[0].get("item").get("tts")
                            jnlp = json.loads(nlp)
                            result['app_id'] = jnlp.get('appId')
                            result['intent'] = jnlp.get('intent')
                            result['cloud'] = jnlp.get("cloud")
                            result['asr'] = asr
                            result['nlp'] = nlp
                            result['slots'] = jnlp.get("slots")
                    debug_log0 = jline.get("debug_log")[0]
                    session_id = debug_log0.get("session_id")
                    codec = debug_log0.get("codec")
                    if not session_id:
                        session_id = ""  # critically important: never let this stay None
                    wavfile = session_id + ".wav"
                    codecfile = session_id + "." + codec
                    # The last dash-separated segment of session_id is an epoch timestamp.
                    asrtimestr = session_id.split("-")[-1]
                    try:
                        st = time.localtime(float(asrtimestr))
                    except (ValueError, TypeError):
                        st = time.localtime()
                    result['session_id'] = session_id
                    result['device_id'] = debug_log0.get("device_id")
                    result['device_key'] = debug_log0.get("device_key")
                    result['device_type'] = debug_log0.get("device_type")
                    result['thedate'] = time.strftime("%Y-%m-%d %H:%M:%S", st)
                    result['wavfile'] = wavfile
                    result['codecfile'] = codecfile
                    result['asrthedate'] = time.strftime("%Y%m%d", st)
                    return json.dumps(result, ensure_ascii=False)
                except Exception:
                    return strmessage  # partial record, without the nlp/session details
            elif "tts" in name:
                try:
                    result['trace_id'] = jline.get("trace_id")
                    result['parent_id'] = jline.get("parent_id")
                    result['span_id'] = jline.get("span_id")
                    result['name'] = name
                    result['sa'] = jline.get("sa")
                    result['sr'] = jline.get("sr")
                    result['ss'] = jline.get("ss")
                    result['ret'] = jline.get("ret")
                    debug_log_tts = jline.get("debug_log")[0]
                    result['text'] = debug_log_tts.get("text")
                    result['codec'] = debug_log_tts.get("codec")
                    result['declaimer'] = debug_log_tts.get("declaimer")
                    result['logs'] = debug_log_tts.get("logs")
                    result['params'] = debug_log_tts.get("params")
                    return json.dumps(result, ensure_ascii=False)
                except Exception:
                    return None
        except Exception:
            return ''
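Since KafkaMessageParse touches nothing Spark-specific, the extraction logic can be smoke-tested without a cluster. A minimal sketch, run on its own, with a made-up speech message (the real trace schema is not shown in this post, so every field value below is an assumption):

# Standalone smoke test for extractFromMessage; the message body is
# fabricated for illustration only.
import json
sample = json.dumps({
    "name": "speech.service",   # hypothetical; only needs to contain "speech"
    "trace_id": "t-1", "parent_id": "p-1", "span_id": "s-1",
    "sa": None, "sr": None, "ss": None, "ret": 0,
    "annotation": [],
    "debug_log": [{"session_id": "device-1-1523241000", "codec": "opus",
                   "device_id": "d-1", "device_type": "demo",
                   "device_key": "k-1"}],
})
print(KafkaMessageParse().extractFromMessage(sample))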

def tpprint(val, num=10000):
    """Print the first num elements of each RDD generated in this DStream,
    and append each record to a per-day local file by record type.

    @param num: the number of elements from the first will be printed.
    """
    def takeAndPrint(time, rdd):
        taken = rdd.take(num + 1)
        print("########################")
        print("Time: %s" % time)
        print("########################")
        DATEFORMAT = '%Y%m%d'
        today = datetime.datetime.now().strftime(DATEFORMAT)
        speechfile = open("/mnt/data/trace/trace.rt.speech." + today, "a")
        ttsfile = open("/mnt/data/trace/trace.rt.tts." + today, "a")
        otherfile = open("/mnt/data/trace/trace.rt.other." + today, "a")
        for record in taken[:num]:
            if record is not None and len(record) > 2:  # skip None and empty results
                print(record)
                jrecord = json.loads(record)
                name = jrecord.get("name")
                if "speech" in name:
                    speechfile.write(str(record) + "\n")
                elif "tts" in name:
                    ttsfile.write(str(record) + "\n")
                else:
                    otherfile.write(str(record) + "\n")
        speechfile.close()
        ttsfile.close()
        otherfile.close()
        if len(taken) > num:
            print("...")

    val.foreachRDD(takeAndPrint)
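Note that takeAndPrint runs on the driver: rdd.take() pulls the sampled records back from the executors, which is the only reason appending to plain local files works here. One weakness is that a malformed record can raise inside the loop and leave the three files unclosed. A sketch of the same routing with context managers; write_records is a hypothetical helper name, not part of the original script:

import json

def write_records(taken, num, today, outdir="/mnt/data/trace"):
    # Same per-type routing as takeAndPrint, but the files are closed
    # even if json.loads raises on a malformed record.
    with open("%s/trace.rt.speech.%s" % (outdir, today), "a") as speechfile, \
         open("%s/trace.rt.tts.%s" % (outdir, today), "a") as ttsfile, \
         open("%s/trace.rt.other.%s" % (outdir, today), "a") as otherfile:
        for record in taken[:num]:
            if record is not None and len(record) > 2:
                name = json.loads(record).get("name")
                target = (speechfile if "speech" in name
                          else ttsfile if "tts" in name
                          else otherfile)
                target.write(str(record) + "\n")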

if __name__ == '__main__':
    zkQuorum = 'datacollect-1:2181,datacollect-2:2181,datacollect-3:2181'
    # topic name -> number of receiver threads
    topic = {'trace-open-gw-5': 1, 'trace-open-gw-6': 1, 'trace-open-gw-7': 1,
             'trace-open-gw-8': 1, 'trace-open-gw-9': 1}
    groupid = "rokid-trace-rt"
    master = "local[*]"
    appName = "SparkStreamingRokidTrace"
    timecell = 5  # batch interval in seconds

    sc = SparkContext(master=master, appName=appName)
    ssc = StreamingContext(sc, timecell)
    kvs = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)

    kmp = KafkaMessageParse()
    lines = kvs.map(lambda x: kmp.extractFromKafka(x))      # keep the Kafka value
    lines1 = lines.flatMap(lambda x: kmp.lineFromLines(x))  # split multi-record values
    # eval() turns each record line into a dict; ast.literal_eval would be
    # safer if the input cannot be fully trusted.
    valuedict = lines1.map(lambda x: eval(x))
    message = valuedict.map(lambda x: kmp.messageFromLine(x))
    rdd2 = message.map(lambda x: kmp.extractFromMessage(x))  # result is a json str
    tpprint(rdd2)

    ssc.start()
    ssc.awaitTermination()
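KafkaUtils.createStream lives in the external spark-streaming-kafka-0-8 connector, which is not bundled with Spark, so the job has to be submitted with the connector on the classpath. Assuming Spark 2.2 on Scala 2.11 (the versions are an assumption; match them to your cluster), the launch would look something like:

spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.0 readTraceFromKafkaStreamingToJson.py

This receiver-based API (ZooKeeper quorum plus a {topic: thread-count} map) belongs to the old Kafka 0.8 integration, which was deprecated and later removed, so the example is tied to the Spark 1.x/2.x streaming stack.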
