python正则解析文件生成XML文件

一、要解析文件:

CREATE EXTERNAL TABLE cdt_complaint_bug_log_list_d (
provincecode int,
day string,
buzi_day_no string,
logid int,
msisdn string,
local_name string,
area_name string,
link_man string,
link_tel string,
ask_address string,
buzi_type int,
phenomena string,
fault_reason_name string,
current_status int,
change_status_date string,
city_id int,
imsi string,
imsi_lte string)
PARTITIONED BY (
p_provincecode int,
p_date string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS orc
LOCATION
'hdfs://VMAXCluster/zxvmax/telecom/union/cdt/cdt_complaint_bug_log_list_d';

CREATE EXTERNAL TABLE cfg_sector_c (
oid string,
insert_time timestamp,
bsc_id string,
bsc_name string,
bts_id int,
related_bts string,
bts_name string,
sector_id int,
sector_name string,
ci int,
station_type string,
province_id int,
city_id int,
district_id int,
net_type string,
inside string,
rru_pull string,
geo_area string,
cover_scene string,
longitude decimal(20,8),
latitude decimal(20,8),
ant_dir_type string,
ant_azimuth int)
PARTITIONED BY (
p_provincecode int,
p_date string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS orc
LOCATION
'hdfs://VMAXCluster/zxvmax/telecom/cfg/cfg_sector_c';

CREATE EXTERNAL TABLE cdt_lte_calltrace_d (
lastservingcellrsrq double,
lasthodstrsrp double,
lasthodstrsrq double,
lasthoresult int,
lasthodelay int,
lasthofailreason int,
lasthoulbler double,
lasthoregionid int,
lasthox_offset int,
lasthoy_offset int,
firsthosourceerfcn double,
firsthotargeterfcn double,
lasthosourceerfcn double,
lasthotargeterfcn double,
lasthodstenb bigint)
PARTITIONED BY (
p_provincecode int,
p_date string,
p_hour int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS Textfile
LOCATION
'hdfs://VMAXCluster/zxvmax/telecom/union/cdt/cdt_lte_calltrace_d'

 

 

二、解析上面文件的python文件 third.py 

#!/usr/bin/python
#-*- coding:utf-8 -*-
#def read_sqlfile():
import re
import sys
from lxml import etree
from xml.etree import ElementTree as ET
import xml.dom.minidom
import time

#获取当前时间
current_time = time.strftime("%Y-%m-%d",time.localtime())

print "传进来的参数为:",sys.argv
Argumentlist = sys.argv
#print "列表",Argumentlist
table_type = Argumentlist[2]
sql_file_name = Argumentlist[1]
print "sql文件名和表类型分别是:",sql_file_name,",",table_type

#exit()
#打开文件
fopen = open(sql_file_name)
#读取文件后,将文件转换为小写
table_all_content=fopen.read().lower()

print "读取到得内容如下:",table_all_content
print '\n'

print "***************按逗号给单个sql分组*********************"
each_table_all_content = table_all_content.split(';')
sql_len = len(each_table_all_content)
print "建表语句的个数为:",sql_len
print "分组后的整个sql:",each_table_all_content
print "\n"
#print each_table_all_content[0]
#print each_table_all_content[1]

#**********汇总表*************
doc = xml.dom.minidom.Document()
#创建一个根节点metadata对象
metadata = doc.createElement("metadata")
#将根节点添加到文档对象中
doc.appendChild(metadata)
#添加一个叶子节点summariestablemetadata
summariestablemetadata = doc.createElement('summariestablemetadata')
#设置叶子节点的属性
summariestablemetadata.setAttribute('name','Summaries_Table')
#将各子节点summariestablemetadata添加到父节点metadata中
metadata.appendChild(summariestablemetadata)

 

#**********字段表***************
#在内存中创建一个空的文档
docu = xml.dom.minidom.Document()
#创建一个根节点metadata对象
metadata = docu.createElement("metadata")
#将根节点添加到文档对象中
docu.appendChild(metadata)

def analyze_sql_file(str):
for sql_file_len in range(sql_len):
print "**************开始解析表信息,按python正则解析出表名信息********************"
table_pattern_name=re.findall(r"create (.*)\s+table\s+(.*?)\s",each_table_all_content[sql_file_len])
if table_pattern_name:
print "按正则解析出的表名信息为:",table_pattern_name[0]
exit()
table_name_pattern = table_pattern_name[0]
if table_name_pattern[0]:
isexternal = '1'
print "按正则解析出的是否为外部表:",table_name_pattern[0]
if table_name_pattern[1]:
print "按正则解析出的表名为:",table_name_pattern[1]
print "\n"

print "**************解析出表存储方式********************"
table_stored=re.findall(r"stored as (.*).",each_table_all_content[sql_file_len])
if table_stored:
print "表的存储方式为:",table_stored[0]
print '\n'


print "**************解析出表存储路径、制式、数据库********************"
table_location=re.findall(r"\'hdfs://vmaxcluster+(/.*)+\'",each_table_all_content[sql_file_len])
if table_location:
print "正则解析出来的表存储路径为:",table_location[0]
print "\n"

table_classfly = re.findall(r"\'hdfs://vmaxcluster/zxvmax/telecom/+(.*?)(/.*)+\'",each_table_all_content[sql_file_len])
if table_classfly:
print "正则解析出来的制式属于:",table_classfly[0][0]
print '\n'

data_base = re.findall(r"\'hdfs://vmaxcluster/+(.*?)(/.*)+\'",each_table_all_content[sql_file_len])
if data_base:
print "正则解析出来的数据库为:",data_base[0][0]
print '\n'



print "**************解析出表分区信息********************"
#按正则解析出表分区信息
table_content = each_table_all_content[sql_file_len].replace('\r\n','')
print "去掉\r\n后的表信息为:",table_content
table_partition =re.findall(r"partitioned by \((.*?)\)",table_content)
print "分区信息为:",table_partition
table_partition_elements = table_partition[0].split(',')
print "分隔后的字符串为:",table_partition_elements
table_partition_len = len(table_partition_elements)
print "字段组成的数组的个数为:",table_partition_len
print '\n'
list1 = []
list2 = []
for m in range(table_partition_len):
print table_partition_elements[m]
table_partitions_elements = table_partition_elements[m].strip()
print "去掉空格后的:",table_partitions_elements
table_partitions = table_partitions_elements.split(' ')
print "字段和字段类型按空格分隔后:",table_partitions
n = len(table_partitions)
print "长度为:",n
list1.append(table_partitions[0])
list2.append(table_partitions[2])
print "表分区:",list1
print "表分区类型:",list2
a=','
partition_name = a.join(list1)
partition_type = a.join(list2)
print "表分区:",partition_name
print "表分区类型:",partition_type
print '\n'



print "********************按正则解析出字段信息********************"
#按正则解析出字段和字段名
table_field_content = each_table_all_content[sql_file_len].replace('\r\n','')
print "去行符后:",table_field_content
print '\n'
table_feild = re.findall(r"create (.*)\s+table\s+(.*?)\((.*)\)+partitioned\s",table_field_content)
print "截取的字段为:",table_feild

table_feild_elements = table_feild[0][2].split(', ')
print "分隔后的字符串为:",table_feild_elements
table_len = len(table_feild_elements)
print "字段组成的数组的个数为:",table_len
#循环输出字段和字段名
list3 = []
list4 = []
for i in range(table_len):
#将每一组字段和字段名按空格分隔
table_field = table_feild_elements[i].strip()
table = table_field.split(' ')
if table:
table_feild_len = len(table)
for j in range(table_feild_len):
if len(table[j]):
table_feild_name = table[j]
table_feild_type = table[j+2]
list3.append(table[j])
list4.append(table[j+2])
print "字段名为:",table_feild_name
print "字段型为:",table_feild_type
break

print "字段名为:",list3
print "字段型为:",list4
field_len = len(list3)
print field_len
print '\n'



print "**********************组装XML*****************************"
print "**********************1、组装汇总XML*****************************"
def create_summaries_xml():
#添加一个叶子节点 field
field = doc.createElement('field')
#设置叶子节点的属性
#for num_sql in range(sql_len):
field.setAttribute('fieldname',table_name_pattern[1])
field.setAttribute('tablename',table_name_pattern[1])
field.setAttribute('tabletype',table_type)
field.setAttribute('checkruletable','')
field.setAttribute('historicaldatatable','')
field.setAttribute('dependenttable','')
field.setAttribute('delaytime','30')
field.setAttribute('virtualpartitionvalue','')
field.setAttribute('storedperiod','SparkDefaultType')
field.setAttribute('isexternal',isexternal)
field.setAttribute('partitionfieldtype',partition_type)
field.setAttribute('partitionfield',partition_name)
field.setAttribute('location',table_location[0])
field.setAttribute('dbname',data_base[0][0])
field.setAttribute('storedtype',table_stored[0])
field.setAttribute('createtime',current_time)
field.setAttribute('description','')
field.setAttribute('version','V1.0')
field.setAttribute('timegranularity','1day')
field.setAttribute('source','vmax')
field.setAttribute('classify',table_classfly[0][0])
#将各叶子节点field添加到子节点summariestablemetadata中
summariestablemetadata.appendChild(field)


print "**********************2、组装字段XML*****************************"
def create_table_xml():
#添加一个叶子节点summariestablemetadata
tablemetadata = docu.createElement('tablemetadata')
#设置tablemetadata节点属性
tablemetadata.setAttribute('tabletype',table_type)
tablemetadata.setAttribute('name',table_name_pattern[1])
print '##########################'
for field_number in range(field_len):
field = docu.createElement('field')
#设置叶子节点的属性
field.setAttribute('source','')
field.setAttribute('algorithm','')
field.setAttribute('description','')
field.setAttribute('isprimarykey','0')
field.setAttribute('length','')
field.setAttribute('notnull','0')
field.setAttribute('fieldtype',list4[field_number])
field.setAttribute('fieldname',list3[field_number])
#将各叶子节点field添加到子节点summariestablemetadata中
tablemetadata.appendChild(field)
#将各子节点summariestablemetadata添加到父节点metadata中
metadata.appendChild(tablemetadata)
#summariestablemetadataxml = etree.tostring(metadata, pretty_print=True, encoding="UTF-8", method="xml", xml_declaration=True, standalone=None)
#print summariestablemetadataxml
#开始写xml文档
#create_table_xml()
if str == "summaries":
create_summaries_xml()
elif str == "table":
create_table_xml()

#汇总表
def write_summries_xml():

analyze_sql_file("summaries")
#print summariestablemetadataxml
#开始写xml文档
sum_file_name='summaries_'+sql_file_name[:-4]+'.xml'
fp2 = open(sum_file_name, 'w')
summariestablemetadataxml = doc.writexml(fp2,addindent='\t',newl='\n',encoding="utf-8")
print summariestablemetadataxml
print '\n'
write_summries_xml()

#字段表
def write_table_xml():
analyze_sql_file("table")
table_file_name='tab_'+sql_file_name[:-4]+'.xml'
fp1 = open(table_file_name, 'w')
tablemetadataxml = docu.writexml(fp1,addindent='\t',newl='\n',encoding="utf-8")
print tablemetadataxml
write_table_xml()

fopen.close()

 

 

 

三、驱动third.py的shell 

#!/bin/bash

task=""
task_name=$1
table_type=$2
echo "虚拟任务名为:" $task_name
echo "表类型为:" $table_type

if [ $# -ne 2 ]
then
echo "please input 2 parameter that include name of task and type of table。\\nplease this form:sh fifth.sh union_dpi_app_task spark"
exit 1
fi

[[ $table_type == "spark" ]]&&{

task=$task_name"_"$table_type".sql"
echo "sql文件名为;"$task
python third.py $task $table_type 
}
[[ $table_type == "gbase" ]]&&{
task=$task_name"_"$table_type".sql"
echo "sql文件名为;"$task 
python third.py $task $table_type
}

 

转载于:https://www.cnblogs.com/hymmiaomiao/p/8991505.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值