Python 实现 map（mapper 脚本 map_pc_query.py）
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Hadoop streaming mapper: turn tab-separated rows into ``query<TAB>pv``.

Rows are read from stdin.  Rows whose first column starts with "http" are
dropped; for every other row the first and fourth columns are printed,
separated by a tab.
"""
import sys
import time

if sys.version_info[0] < 3:
    # Python 2 only: make utf8 the implicit str/unicode codec.  Neither
    # ``reload`` (as a builtin) nor ``sys.setdefaultencoding`` exists on
    # Python 3, so the guard keeps the script usable on both.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

MIN_WORD_CNT = 10  # not referenced anywhere in this script

QUERY_COL = 0  # column holding the query text
PV_COL = 3     # column holding the pv value that is forwarded to the reducer


def map_line(line):
    """Convert one raw input row into a mapper output record.

    Args:
        line: One tab-separated input row, with or without a trailing newline.

    Returns:
        The string ``query<TAB>pv``, or ``None`` when the row is skipped
        (query starts with "http", or the row has no pv column).
    """
    # strip() also removes trailing tabs, so rows ending in empty columns come
    # out shorter than the raw line suggests; hence the length check below
    # instead of indexing blindly.
    info = line.strip().split("\t")
    if len(info) <= PV_COL:
        return None
    query = info[QUERY_COL]
    if query.startswith("http"):
        return None
    return "%s\t%s" % (query, info[PV_COL])


def map_lines(lines):
    """Yield the output record for every row in *lines* that is not skipped."""
    for line in lines:
        record = map_line(line)
        if record is not None:
            yield record


def main():
    """Stream stdin through the mapper and print one record per line."""
    for record in map_lines(sys.stdin):
        print(record)


if __name__ == '__main__':
    main()
Python 实现 reduce（reducer 脚本 red_pc_query.py）
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Hadoop streaming reducer: sum the pv values of consecutive equal keys.

Input rows are ``query<TAB>pv``.  Rows with the same query are expected to be
adjacent (the code only compares each row with the previous one); one
``query<TAB>total`` row is printed per run of equal queries.
"""
import sys
import time

if sys.version_info[0] < 3:
    # Kept from the original script; ``reload`` is not a builtin on Python 3.
    reload(sys)  # noqa: F821


def reduce_lines(lines):
    """Aggregate ``query<TAB>pv`` rows into ``query<TAB>total`` records.

    Args:
        lines: Iterable of input rows, with or without a trailing newline.

    Yields:
        One ``query<TAB>total`` string for each run of adjacent rows sharing
        the same query.  Rows that do not have exactly two columns, or whose
        pv column is not an integer, are ignored.
    """
    last_key = None
    last_pv = 0
    for line in lines:
        fields = line.rstrip("\n").split("\t")
        if len(fields) != 2:
            continue
        query, raw_pv = fields
        try:
            pv = int(raw_pv)
        except ValueError:
            # A non-numeric pv is treated like any other malformed row rather
            # than aborting the whole reduce task.
            continue

        if query == last_key:
            last_pv += pv
            continue

        # Key changed: flush the finished run before starting the new one.
        # NOTE(review): the truthiness test means an empty-string key is never
        # emitted; kept as in the original - confirm that this is intended.
        if last_key:
            yield "%s\t%d" % (last_key, last_pv)
        last_key = query
        last_pv = pv

    # Flush the final run, which no key change follows.
    if last_key:
        yield "%s\t%d" % (last_key, last_pv)


def main():
    """Stream stdin through the reducer and print one total per line."""
    for record in reduce_lines(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()
Bash 调用 Hadoop Streaming（提交上述 mapper 和 reducer）
#!/bin/sh
# Submit a Hadoop streaming job that runs map_pc_query.py as the mapper and
# red_pc_query.py as the reducer over ${input}, writing to ${output}.

export HADOOP_OPTS=-Xmx8000m
hadoop=/usr/bin/hadoop/software/hadoop/bin/hadoop
streaming=/usr/bin/hadoop/software/hadoop/contrib/streaming/hadoop-streaming.jar
#input=/home/eng/lixiang3/sousuo_url
# The glob in this path is meant for Hadoop; every use below is quoted so the
# local shell never tries to expand it against the local filesystem.
input='/home/hdp-qss-bigdata/dataware/pc_query/202008*/0000/'
output=/home/eng/jiguifang/querypv_pc_0831

# Remove any previous output directory first.  There is no `set -e`, so a
# failure here (for example when the path does not exist yet) does not stop
# the script.
# NOTE(review): `-rmr` is deprecated on newer Hadoop releases in favour of
# `-rm -r`; kept as-is because the installed version is not visible here.
"${hadoop}" fs -rmr "${output}"

# NOTE(review): sws_data is shipped to the tasks via -cacheFile, but the
# mapper/reducer named below are not seen referencing it - confirm before
# removing.
segmenter_path="/home/hdp-vertical/dependency/sws_data#sws_data"

"${hadoop}" jar "${streaming}" \
    -D mapred.job.priority="VERY_HIGH" \
    -D mapred.job.name="lx_3" \
    -D mapred.job.max.map.running=1024 \
    -D mapred.job.max.reduce.running=1024 \
    -D mapred.map.tasks=1024 \
    -D mapred.reduce.tasks=1024 \
    -D mapred.task.timeout=7200000 \
    -D mapred.success.file.status=false \
    -D mapred.output.compress="false" \
    -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec \
    -inputformat org.apache.hadoop.mapred.lib.CombineTextInputFormat \
    -cacheFile "${segmenter_path}" \
    -mapper "python map_pc_query.py" \
    -reducer "python red_pc_query.py" \
    -file map_pc_query.py \
    -file red_pc_query.py \
    -input "${input}" \
    -output "${output}"