用 Python 实现 MapReduce（基于 Hadoop Streaming）

最新推荐文章于 2024-05-04 22:53:57 发布

小鸡仔_orz

最新推荐文章于 2024-05-04 22:53:57 发布

阅读量421

点赞数

分类专栏： Python 文章标签： hadoop

本文链接：https://blog.csdn.net/weixin_40650252/article/details/108491167

版权

Python 专栏收录该内容

13 篇文章 0 订阅

订阅专栏

Python 实现 map（mapper：从每行输入中提取 query 和 pv，输出 "query\tpv"）

  1 #!/usr/bin/env python
  2 
  3 #-*- coding:utf-8 -*- 
  4 import sys
  5 import time
  6 reload(sys)
  7 sys.setdefaultencoding('utf8')
  8 
  9 MIN_WORD_CNT = 10
 10 
 11 if __name__ == '__main__':
 12 
 13     cnt = 0
 14     map_dict = {}
 15     for line in sys.stdin:
 16         line = line.strip()
 17         cnt = cnt + 1
 18         info = line.split("\t")
 19         query = info[0]
 20         if query.startswith("http"):
 21             continue
 22         pv = info[3]
 23         wenda_info = info[9]
 24         print("%s\t%s" % (query,pv))

Python 实现 reduce（reducer：对已按 query 排序的输入逐 query 累加 pv）

  1 #!/usr/bin/env python
  2 #-*- coding:utf-8 -*- 
  3 import sys
  4 import time
  5 reload(sys)
  6 
  7 
  8 if __name__=="__main__":
  9     last_key = None
 10     last_pv = 0
 11     for line in sys.stdin:
 12         line_splits = line.rstrip("\n")
 13         line_splits = line_splits.split("\t")
 14         outstr = ""
 15         if len(line_splits) != 2:
 16             continue
 17         query = line_splits[0]
 18         pv = int(line_splits[1])
 19 
 20         if query == last_key:
 21             last_pv += pv
 22         else:
 23             if last_key:
 24                 print("%s\t%d" % (last_key, last_pv))
 25             last_key = query
 26             last_pv = pv
 27     if last_key:
 28         print("%s\t%d" % (last_key,last_pv))

Bash 调用 Hadoop Streaming（提交上面的 mapper 和 reducer）


  1 #!/bin/sh
  2 
  3 export HADOOP_OPTS=-Xmx8000m
  4 hadoop=/usr/bin/hadoop/software/hadoop/bin/hadoop
  5 streaming=/usr/bin/hadoop/software/hadoop/contrib/streaming/hadoop-streaming.jar
  6 #input=/home/eng/lixiang3/sousuo_url
  7 input=/home/hdp-qss-bigdata/dataware/pc_query/202008*/0000/
  8 output=/home/eng/jiguifang/querypv_pc_0831
  9 ${hadoop} fs -rmr ${output}
 10 segmenter_path="/home/hdp-vertical/dependency/sws_data#sws_data"
 11 ${hadoop} jar ${streaming} \
 12 -D mapred.job.priority="VERY_HIGH" \
 13 -D mapred.job.name="lx_3" \
 14 -D mapred.job.max.map.running=1024 \
 15 -D mapred.job.max.reduce.running=1024 \
 16 -D mapred.map.tasks=1024 \
 17 -D mapred.reduce.tasks=1024 \
 18 -D mapred.task.timeout=7200000 \
 19 -D mapred.success.file.status=false \
 20 -D mapred.output.compress="false" \
 21 -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
 22 -inputformat org.apache.hadoop.mapred.lib.CombineTextInputFormat \
 23 -cacheFile ${segmenter_path} \
 24 -mapper "python map_pc_query.py" \
 25 -reducer "python red_pc_query.py" \
 26 -file map_pc_query.py \
 27 -file red_pc_query.py \
 28 -input ${input} \
 29 -output ${output}