Spark Streaming (Python): read data from Kafka and write the results to a single specified local file

This post walks through a Python Spark Streaming job that reads data from Kafka, parses each message, and enriches it with geographic location information. After processing, the results are written to a specified local file. It shows how to create a DStream with KafkaUtils and how custom functions are used to extract and transform the data.
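Judging from the parsing code below, each Kafka record's value is expected to contain one or more newline-separated lines, each of which eval()s to a dict with a "message" field; that field in turn holds eight "\x01"-separated key\x02value pairs, with the client IP address and the source as the last two. A minimal sketch of what one such record value might look like (the field names and values here are invented for illustration only, not taken from the original data):

fields = ["uid\x02u-001", "device\x02dev-01", "ts\x021488888888",
          "event\x02asr", "text\x02hello", "duration\x02350",
          "ip\x02203.0.113.7", "source\x02android"]
sample_message = "\x01".join(fields)             # eight key\x02value fields
sample_value = str({"message": sample_message})  # one line of the Kafka record value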

#!/bin/env python3
# -*- coding: UTF-8 -*-
# filename: readFromKafkaStreamingGetLocation.py

import datetime

import IP  # IP-to-location lookup exposing IP.find() (e.g. the 17MonIP / ipip.net library)
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


class KafkaMessageParse:

    def extractFromKafka(self, kafkainfo):
        # Each Kafka record arrives as a (key, value) tuple; keep only the value.
        if type(kafkainfo) is tuple and len(kafkainfo) == 2:
            return kafkainfo[1]

    def lineFromLines(self, lines):
        # Split a multi-line record value into individual lines.
        if lines is not None and len(lines) > 0:
            return lines.strip().split("\n")

    def messageFromLine(self, line):
        # Pull the "message" field out of the dict parsed from one line.
        if line is not None and "message" in line.keys():
            return line.get("message")

    def ip2location(self, ip):
        # Resolve an IP address to [ip, country, province, city].
        result = []
        country = 'country'
        province = 'province'
        city = 'city'
        ipinfo = IP.find(ip.strip())
        try:
            location = ipinfo.split("\t")
            if len(location) == 3:
                country = location[0]
                province = location[1]
                city = location[2]
            elif len(location) == 2:
                country = location[0]
                province = location[1]
            else:
                pass
        except Exception:
            pass
        result.append(ip)
        result.append(country)
        result.append(province)
        result.append(city)
        return result

    def vlistfromkv(self, strori, sep1, sep2):
        # Split "k<sep2>v<sep1>k<sep2>v..." and return only the values.
        resultlist = []
        fields = strori.split(sep1)
        for field in fields:
            kv = field.split(sep2)
            resultlist.append(kv[1])
        return resultlist

    def extractFromMessage(self, message):
        # A valid message has 8 "\x01"-separated key\x02value fields, the last two
        # being ip and source; replace ip with its location fields and re-join with "\x01".
        if message is not None and len(message) > 1:
            if len(message.split("\x01")) == 8:
                resultlist = self.vlistfromkv(message, "\x01", "\x02")
                source = resultlist.pop()
                ip = resultlist.pop()
                resultlist.extend(self.ip2location(ip))
                resultlist.append(source)
                result = "\x01".join(resultlist)
                return result


def tpprint(val, num=10000):
    """
    Print the first num elements of each RDD generated in this DStream
    and append them to a date-stamped local file on the driver.

    @param num: the number of elements from the first will be printed.
    """
    def takeAndPrint(time, rdd):
        taken = rdd.take(num + 1)
        print("########################")
        print("Time: %s" % time)
        print("########################")
        DATEFORMAT = '%Y%m%d'
        today = datetime.datetime.now().strftime(DATEFORMAT)
        myfile = open("/data/speech/speech." + today, "a")
        for record in taken[:num]:
            print(record)
            myfile.write(str(record) + "\n")
        myfile.close()
        if len(taken) > num:
            print("...")
        print("")

    val.foreachRDD(takeAndPrint)


if __name__ == '__main__':
    zkQuorum = 'datacollect-1:2181,datacollect-2:2181,datacollect-3:2181'
    topic = {'speech-1': 1, 'speech-2': 1, 'speech-3': 1, 'speech-4': 1, 'speech-5': 1}
    groupid = "rokid-speech-get-location"
    master = "local[*]"
    appName = "SparkStreamingRokid"
    timecell = 5  # batch interval in seconds

    sc = SparkContext(master=master, appName=appName)
    ssc = StreamingContext(sc, timecell)
    # ssc.checkpoint("checkpoint_" + time.strftime("%Y-%m-%d", time.localtime(time.time())))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)
    kmp = KafkaMessageParse()
    lines = kvs.map(lambda x: kmp.extractFromKafka(x))
    lines1 = lines.flatMap(lambda x: kmp.lineFromLines(x))
    valuedict = lines1.map(lambda x: eval(x))
    message = valuedict.map(lambda x: kmp.messageFromLine(x))
    rdd2 = message.map(lambda x: kmp.extractFromMessage(x))
    # rdd2.pprint()

    tpprint(rdd2)
    # rdd2.fileprint(filepath="result.txt")
    # rdd2.foreachRDD().saveAsTextFiles("/home/admin/agent/spark/result.txt")
    # sc.parallelize(rdd2.cache()).saveAsTextFile("/home/admin/agent/spark/result", "txt")
    # rdd2.repartition(1).saveAsTextFiles("/home/admin/agent/spark/result.txt")

    ssc.start()
    ssc.awaitTermination()
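A couple of notes on running this. KafkaUtils.createStream is the receiver-based Kafka 0.8 API, so the job needs the matching spark-streaming-kafka assembly on the classpath when submitted, roughly along the lines of spark-submit --jars spark-streaming-kafka-assembly_<scala>-<spark-version>.jar readFromKafkaStreamingGetLocation.py (the exact artifact name depends on the Spark and Scala versions in use). The single-file output comes from tpprint: it pulls each batch back to the driver with rdd.take() and appends to /data/speech/speech.<yyyymmdd>, so a whole day's results land in one local file. The commented-out saveAsTextFiles variants cannot do that directly, since they write a separate directory of part files for every batch interval.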
