1. 大数据 Mapper 书写范式（HDFS）
import json
import sys
def read_input(input_stream):
    """Yield each line of *input_stream* without its trailing newline(s).

    Args:
        input_stream: Any iterable of strings (for example ``sys.stdin`` or
            an open text file).

    Yields:
        Each line with every trailing ``'\\n'`` character removed. Other
        trailing whitespace (spaces, ``'\\r'``) is left untouched.
    """
    for line in input_stream:
        # Strip only '\n' so the payload is otherwise passed through verbatim.
        yield line.rstrip('\n')
def load_json_data(json_line):
    """Parse one JSON line into an ``(id, combined_text)`` pair.

    Args:
        json_line: A JSON document, expected to encode an object with
            optional ``id``, ``title`` and ``text`` keys.

    Returns:
        A tuple ``(unique_id, combined_content)`` where ``unique_id`` is the
        value of ``id`` (``None`` when absent) and ``combined_content`` is
        the ``title`` and ``text`` values joined by a single space. Missing
        or null fields count as the empty string; other non-string values
        are converted with ``str()``.

        ``(None, None)`` is returned when the line cannot be parsed or the
        parsed value is not a JSON object.
    """
    try:
        data = json.loads(json_line)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (and undecodable bytes);
        # TypeError is raised when the input is not str/bytes at all.
        return None, None

    # Valid JSON that is not an object (list, number, null, ...) has no
    # fields to read; treat it like an unparseable line.
    if not isinstance(data, dict):
        return None, None

    def _as_text(value):
        # Null/missing -> '', so str.join never sees a non-string.
        return '' if value is None else str(value)

    combined_content = ' '.join(
        _as_text(data.get(key)) for key in ('title', 'text')
    )
    return data.get('id'), combined_content
def mapper(input_stream, output_stream=None, record_filter=None):
    """Stream JSON lines from *input_stream* to *output_stream*.

    Each input line is parsed with ``load_json_data``. A line is written
    back out verbatim (followed by ``"\\n"``) when all of the following hold:

    * it parsed successfully (``load_json_data`` did not return a ``None``
      text),
    * its id has not already been emitted by this call (records whose id is
      ``None`` are never treated as duplicates),
    * ``record_filter`` is ``None`` or returns a truthy value for it.

    Args:
        input_stream: Iterable of text lines, e.g. ``sys.stdin``.
        output_stream: Object with a ``write(str)`` method. Defaults to
            ``sys.stdout``, resolved at call time so redirection is honoured.
        record_filter: Optional callable ``(record_id, text) -> bool``
            deciding whether a record is kept. NOTE(review): the original
            code called the builtin ``filter()`` with no arguments, which
            always raises ``TypeError``; it is assumed to have been a
            placeholder for a caller-supplied predicate — confirm.
    """
    if output_stream is None:
        output_stream = sys.stdout

    processed_ids = set()
    for json_line in read_input(input_stream):
        record_id, text = load_json_data(json_line)

        # load_json_data signals an unparseable line with a None text.
        if text is None:
            continue

        # Emit each known id only once per mapper run.
        if record_id is not None and record_id in processed_ids:
            continue

        if record_filter is not None and not record_filter(record_id, text):
            continue

        output_stream.write(json_line + "\n")
        if record_id is not None:
            processed_ids.add(record_id)
def getKeywords():
    """Placeholder that is not implemented yet.

    Takes no arguments and currently does nothing.

    Returns:
        None.
    """
    # TODO: implement; the stub is kept so existing references keep working.
    return None
if __name__ == "__main__":
    # Script entry point: the module name is "__main__" (not "main") when the
    # file is executed directly, so this is the comparison that must be used.
    mapper(sys.stdin)