数据提取python脚本

数据提取python脚本

版权声明:本文为博主原创文章,转载请注明出处:https://blog.csdn.net/sgqhappy/article/details/83988985

我们经常用到数据提取的Hive Sql的编写,每次数据提取都得进行hive的编写,为了将这种重复性强的运行命令简单化自动化人性化,我特地编写了一个python脚本,可以实现数据清洗,数据处理,计数下发,读写文件,保存日志等功能。

1. 导包

#!/usr/bin/python
#coding:utf-8

#Made by sgqhappy
#Date: 20181113
#function: data extract

from subprocess import Popen,PIPE
import os
import sys
import io
import re
import commands
import logging
from logging import handlers
from re import match

2. 定义一个类,用来打印脚本运行的log日志

日志既可以打印在控制台上,也可以输出到log文件。

class Logger(object):
	def __init__(self,log_file_name,log_level,logger_name):
		self.__logger = logging.getLogger(logger_name);
		self.__logger.setLevel(log_level);
		file_handler = logging.FileHandler(log_file_name);
		console_handler = logging.StreamHandler();
		
		#set log format and show log at console and log_file.
		LOG_FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s : %(message)s";
		formatter = logging.Formatter(LOG_FORMAT);
		
		file_handler.setFormatter(formatter);
		console_handler.setFormatter(formatter);
		
		self.__logger.addHandler(file_handler);
		self.__logger.addHandler(console_handler);
		
	def get_log(self):
		return self.__logger;

3. 定义文件名及文件路径

	#This is file name.
	file_name = "%s_%s_%s" % (sys.argv[2],sys.argv[4],sys.argv[11]);
	info_log_path = '/python_test/%s.info.log' % (file_name);
	
	#this is record name and path.
	record_name = "data_extract_record.txt";
	record_path = "/python_test/";
	
	logger = Logger(log_file_name="%s" % (info_log_path),log_level=logging.DEBUG,logger_name="myLogger").get_log();
	
	#this is log path.
	path = '/python_test/%s.desc.log' % (file_name);
	logger.info("\n");
	logger.info("log path: %s" % (path));
	logger.info("\n");

4. 提取字段信息保存

	#function:write all fields to log file.
	hive_cmd_desc = 'beeline -u ip -n username -e "desc %s.%s" >> %s' % (sys.argv[1],sys.argv[2],path);
	logger.info(hive_cmd_desc);
	logger.info("\n");
	status,output = commands.getstatusoutput(hive_cmd_desc);
	logger.info(output);
	logger.info("\n");
	
	#logger.info success or failed information.
	if status ==0:
		logger.info("desc %s to %s successful!" % (sys.argv[2],path));
	else:
		#set color: '\033[;31;40m'+...+'\033[0m'
		logger.error('\033[;31;40m'+"desc %s to %s failed!" % (sys.argv[2],path)+'\033[0m');
		#exit program.
		exit();
	logger.info("\n");

5. 字符串处理

	#this is fields list
	fields_list = [];
	with io.open(path,'r',encoding="utf-8") as f:
		fields = list(f);
		for line in fields:
			#remove start letter "|".
			line_rm_start_letter = line.strip("|");
			logger.info(line_rm_start_letter);
			#remove start and end space.
			pos = line_rm_start_letter.find("|");
			fields_list.append(line_rm_start_letter[0:pos].strip());
	logger.info("\n");
	
	#remove desc.log.
	remove_desc_log = 'rm %s' % (path);
	logger.info(remove_desc_log);
	status,output = commands.getstatusoutput(remove_desc_log);
	
	#logger.info success or failed information.
	if status == 0:
		logger.info("remove %s successful!" % (path));
	else:
		logger.error('\033[;31;40m'+"remove %s failed!" % (path)+'\033[0m');
		exit();
	logger.info("\n");
	
	#remove the first three lines.
	del fields_list[0:3];
	create = "";
	start_or_etl = "";
	if 'etl_load_date' in fields_list:
		start_or_etl = "etl_load_date";
		end_letter_pos = fields_list.index("etl_load_date");
		len = len(fields_list);
		del fields_list[end_letter_pos:len+1];
	if 'start_dt' in fields_list:
		start_or_etl = "start_dt";
		end_letter_pos = fields_list.index("start_dt");
		len = len(fields_list);
		del fields_list[end_letter_pos:len+1];		

6. 添加附加条件

	#add condition_field.
	condition_field = "%s" % (sys.argv[3]);
	if condition_field == "0":
		pass;
	else:
		start_or_etl = condition_field;
		
	for i in fields_list:
		#logger.info(len(i));
		logger.info(i);
	logger.info("\n");

7. 拼接字段

	#splice fields.
	fields_splice = "";
	for i in fields_list:
		fields_splice = fields_splice+"nvl(a.\`"+i+"\`,''),'|',";
	logger.info(fields_splice);
	logger.info("\n");

8. 建表

	#create table command.
	add_conditions = "%s" % (sys.argv[9]);
	if add_conditions == "and 1=1":
		create = "create table if not exists database.%s stored as textfile as select concat (%s from %s.%s a join %s b on trim(a.\`%s\`)=trim(b.\`%s\`) where b.code='%s' and a.\`%s\`>='%s' and a.\`%s\`<='%s' %s;" % (file_name,fields_splice,sys_argv[1],sys.argv[2],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[4],start_or_etl,sys.argv[10],start_or_etl,sys.argv[11],sys.argv[9]);
	else:
		create = "create table if not exists database.%s stored as textfile as select concat(%s from %s.%s a %s;" % (file_name,fields_splice,sys.argv[1],sys.argv[2],sys.argv[9]);
	logger.info(create);
	logger.info("\n");
	
	#execute the command.
	hive_cmd_create = 'beeline -u ip -n username -e "%s"' % (create);
	logger.info(hive_cmd_create);
	logger.info("\n");
	status,output = commands.getstatusoutput(hive_cmd_create);
	logger.info(output);
	logger.info("\n");
	
	#logger.info success or failed information.
	if status ==0:
		logger.info("create database.%s successful!" % (file_name));
	else:
		#set color: '\033[;31;40m'+...+'\033[0m'
		logger.error('\033[;31;40m'+"create database.%s failed!" % (file_name)+'\033[0m');
		#exit program.
		exit();
	logger.info("\n");

9. 计数

	#count table_new command.
	count = "select count(*) from database.%s;" % (file_name);
	logger.info(count);
	logger.info("\n");
	
	#execute the command.
	hive_cmd_count = 'beeline -u ip -n username -e "%s"' % (count);
	logger.info(hive_cmd_count);
	logger.info("\n");
	status,output = commands.getstatusoutput(hive_cmd_count);
	
	#logger.info success or failed information.
	if status ==0:
		logger.info("count database.%s successful!" % (file_name));
	else:
		#set color: '\033[;31;40m'+...+'\033[0m'
		logger.error('\033[;31;40m'+"count database.%s failed!" % (file_name)+'\033[0m');
		#exit program.
		exit();
	logger.info("\n");
	logger.info(output);
	logger.info("\n");

10. 提取数量

	#extract number.
	output_split = output.split("\n");
	number = output_split[7].strip("|").strip();
	result = re.match(r"^\d+$",number);
	if result:
		#logger.info count.
		logger.info("The number matched success!");
		logger.info('\033[1;33;40m'+"The count is : %s" % (number)+'\033[0m');
		logger.info("\n");
	else:
		logger.warning("The number matched failed!");

11. 抽样查看数据的准确性

	#show the first five data.
	first_five_data = "select * from database.%s limit 5;" % (file_name);
	logger.info(first_five_data);
	logger.info("\n");
	
	#execute the command.
	hive_first_five_data = 'beeline -u ip -n username -e "%s"' % (first_five_data);
	logger.info(hive_first_five_data);
	logger.info("\n");
	status,output = commands.getstatusoutput(hive_first_five_data);
	
	#logger.info success or failed information.
	if status == 0:
		logger.info("show the first five data of database.%s successful!" % (file_name));
	else:
		#set color: '\033[;31;40m'+...+'\033[0m'
		logger.error('\033[;31;40m'+"show the first five data of database.%s failed!" % (file_name)+'\033[0m');
		#exit program.
		exit();
	logger.info("\n");
	
	#logger.info the first five data.
	logger.info('\033[1;33;40m'+"the first five data are : \n\n%s" % (output)+'\033[0m');
	logger.info("\n");

12. 记录相关信息到文件

	#append to record.txt.
	output = open("%s%s" % (record_path,record_name),'a');
	if add_conditions == "and 1=1":
		output.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ('database_name','table_name','code','extract_date','count','rel_tb_name','rel_field_name_pre','rel_field_name_after','date_pre','date_after'));
		output.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (sys.argv[1],sys.argv[2],sys.argv[4],sys.argv[5],number,sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[10],sys.argv[11]));
	else:
		output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ('database_name','table_name','code','extract_date','count','add_conditions'));
		output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (sys.argv[1],sys.argv[2],sys.argv[4],sys.argv[5],number,sys.argv[9]));
	output.close();
	
	#logger.info the data extraction success information.
	logger.info('\033[1;35;40m'+"*****Data extract success!*****"+'\033[0m');
	logger.info('\033[1;35;40m'+"*****Made by sgqhappy in %s!*****" % (sys.argv[5])+'\033[0m');
	logger.info("\n");

13. 执行python脚本

python data.py database table 0 1000 20181101 rel_table rel_field rel_field "and 1=1" 20180731 20180801
  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值