使用python监控HDFS文件的增量【优化中】

目录

1、需求和步骤

2、项目结构

3、项目代码

       3.1建表语句 hdfs_Ctreate_table
       3.2删除文件记录 hdfs_delete_file_record.py
       3.3文件路径的小时监控 hdfs_path_Monitor.py
       3.4文件路径的天监控 hdfs_path_Monitor_day.py
       3.5文件大小记录 hdfs_size.py
       3.6mysql连接信息
       3.7mysql工具类 mysqlHelper.py
       3.8工具类utils.py

4、结果展示

————————————————————————————-

1、需求和步骤

需求:
1、获取HDFS每个文件夹每小时和每天的增量
2、定时删除HDFS任务历史记录

需求1步骤:
1、获取到HDFS所有文件名称,参考:hdfs_size.py
2、采用递归获取文件大小,参考:hdfs_path_Monitor.py
3、每天计算文件大小,参考:hdfs_path_Monitor_day.py

需求2步骤:
1、查一个月前的所有文件存储到hdfs_delete_file_record中,标记状态为0
2、通过mysql查询标记为0的数据,获取对应路径
3、通过hadoop fs -rm -r -skipTrash + path 进行删除
4、删除之后标记为1

2、项目结构

(此处原为项目结构截图,图片已缺失)

3、项目代码

3.1建表语句 hdfs_Ctreate_table

 -- Files scheduled for deletion. status: '0' = pending delete, '1' = deleted.
 CREATE TABLE hdfs_delete_file_record(
     date VARCHAR(25) not null,
     p_name VARCHAR(255) not null,
     status VARCHAR(5) not null,
     primary key (date,p_name)
 )ENGINE=InnoDB DEFAULT CHARSET=utf8;

 -- One row per directory node per hourly run; the tree is encoded by id/p_id.
 -- dir_kb: size in KB; hour_increase/day_increase: deltas vs. previous runs.
 CREATE TABLE hdfs_path_Monitor(
     id INT not NULL,
     p_id INT not NULL,
     p_name VARCHAR(255) not null,
     date VARCHAR(25) not null,
     dir_kb float not null,
     hour_increase float not null,
     day_increase float not null,
     primary key (id,p_id,p_name,date)
 )ENGINE=InnoDB DEFAULT CHARSET=utf8;

 -- Staging table with the same shape as hdfs_path_Monitor; the hourly/daily
 -- jobs compute deltas here before swapping rows back into the main table.
 CREATE TABLE hdfs_path_Monitor_temp(
     id INT not NULL,
     p_id INT not NULL,
     p_name VARCHAR(255) not null,
     date VARCHAR(25) not null,
     dir_kb float not null,
     hour_increase float not null,
     day_increase float not null,
     primary key (id,p_id,p_name,date)
 )ENGINE=InnoDB DEFAULT CHARSET=utf8;

 -- Snapshot of every plain HDFS file and its size in bytes (see hdfs_size.py).
 CREATE TABLE hdfs_size(
     id INT not NULL,
     p_name VARCHAR(255) not null,
     date VARCHAR(25) not null,
     dir_byte float not null,
     primary key (id,p_name,date)
 )ENGINE=InnoDB DEFAULT CHARSET=utf8;
3.2删除文件记录 hdfs_delete_file_record.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-

'''
Purge old HDFS application-history files:
1. Record every file older than one month into hdfs_delete_file_record with status 0.
2. Query MySQL for the status-0 rows to get the matching paths.
3. Delete each path via `hadoop fs -rm -r -skipTrash <path>`.
4. Mark the row status 1 after deletion.
'''

import os
import time
import utils
import mysqlHelper

'''30天前的时间'''
# Timestamp 30 days ago, truncated to 'YYYY-MM-DD HH:MM' (minute precision).
monthAgo = str(utils.getBeforeDay(30))[0:16]

'''历史路径的列表'''
# Accumulates every path scheduled for deletion during this run.
L = []

# hadoop fs -rm -r -skipTrash   不进入回收站
def cleanOldLog():
    values=mysqlHelper.mysqlSearch('select * from hdfs_delete_file_record where STATUS=0')
    for line in values:
        os.popen("hadoop fs -rm -r -skipTrash " + line[1])
        print line[1]
        mysqlHelper.mysqlInsert("update hdfs_delete_file_record set STATUS=1 where p_name = '%s'  " % (line[1]))

def cleanApplicationHistory():
    """Scan /user/spark/applicationHistory and record every entry older than
    30 days into hdfs_delete_file_record with status 0 (pending delete)."""
    logs = os.popen('hdfs dfs -ls /user/spark/applicationHistory')
    # Hoisted out of the loop: the original assigned `sql` inside the loop and
    # raised NameError at the batch call when no file was old enough.
    sql = "insert into hdfs_delete_file_record (`date`, p_name , status) values (%s, %s, %s)"
    args = []

    # Drop stale pending rows so the table reflects only the current scan.
    mysqlHelper.mysqlInsert("DELETE FROM hdfs_delete_file_record where status = '0'")

    for line in logs:
        lines = line.replace('\n', "").split(" ")
        length = len(lines)

        # Modification timestamp: the two whitespace tokens before the path.
        date = str(lines[length - 3]) + " " + str(lines[length - 2])

        # Lexicographic compare is valid because both sides share the
        # 'YYYY-MM-DD HH:MM' format; the 'Found N items' header line also
        # compares greater and is skipped here.
        if date > monthAgo:
            continue

        # File path: the last whitespace token on the line.
        filePath = lines[length - 1]
        L.append(filePath)
        args.append((date, filePath, "0"))

    # executemany with an empty args list is a harmless no-op.
    mysqlHelper.mysqlBatchInset(sql, args)

def main():
    """Run the full purge cycle and report the elapsed wall time."""
    started = time.time()
    cleanApplicationHistory()
    cleanOldLog()
    mysqlHelper.mysqlClose()
    finished = time.time()
    print("所需要的时间为  : " + str(finished - started))


if __name__ == "__main__":
    main()
3.3文件路径的小时监控 hdfs_path_Monitor.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-


'''
Note: a tree structure records parent/child directory nodes.
1. Recursively list each directory level.
2. A dict maps each path to its node id so children can find their parent.
   2.1 getHDFSSize() pre-loads plain-file paths so files are skipped, avoiding extra queries.
   2.2 A fixed depth limit avoids descending too far.
   2.3 Qualifying paths are batch-inserted into MySQL.
   2.4 Qualifying paths are fed back into the recursion to walk deeper.
3. After each hourly load, hourComputer() derives the increments.
'''

import os
import time
import mysqlHelper
import utils

'''全局变量系统时间 LocalTime 2017-07-28 14:37:47'''
# Wall-clock time of this run, truncated to 'YYYY-MM-DD HH:MM'.
LocalTime = str(utils.getnowTime())[0:16]

# Same format, one hour earlier; used to join against the previous hourly load.
BeforeOneHour = str(utils.getBeforeHour(1))[0:16]

# Every plain-file path known to the hdfs_size table (filled by getHDFSSize).
FileNames = set()

'''获取目录的曾层级'''
# Maximum directory depth to scan ("/a/b/c" splits into 4 slash-parts).
hierarchy = 4

'''p_id的字典'''
# Maps path -> node id so a child can resolve its parent's id.
data_dict = {}

'''堆方法'''


def digui(paths):
    """Recurse into the next level of HDFS paths.

    Returns 1 when the traversal bottoms out (no more paths); otherwise hands
    the set to gethdfs() to walk one level deeper (returns None in that case,
    matching the original behaviour).

    Fixes: parameter renamed from `list` (shadowed the builtin) and the
    `(len(x) == 0) == True` comparison replaced with idiomatic truthiness.
    """
    if not paths:
        return 1
    gethdfs(paths)


'''
有一个细节,当path下面没有文件或者path是一个文件,
则只会返回的参数为本身路径,所以过滤掉这一层
'''


def gethdfs(pathSet):
    """Walk one level of HDFS directories and batch-insert their sizes.

    For each path, parses `hdfs dfs -du -h` output, inserts every qualifying
    sub-directory into hdfs_path_Monitor, then recurses via digui().

    NOTE(review): `sql` is a module-level global assigned inside the inner
    loop; the batch call after the loop relies on it having been set by some
    earlier iteration — confirm a directory level always yields at least one
    qualifying child before this runs.
    """
    print pathSet
    for path in pathSet:
        L = set()
        lines = os.popen('hdfs dfs -du -h %s ' % path)
        args = []
        for line in lines:
            # When the path has no children (or is itself a file), -du echoes
            # only the path; that case hits the else branch and stops early.
            if (line.replace('\n', "").split(' ').pop() == path) == False:
                num = line.split(' ')[0]
                tags = line.split(' ')[1]
                # Size converted to KB; the unit letter comes from -h output.
                fileSize = convertSize(num, tags)
                # File path: the last whitespace token on the line.
                filePath = line.replace('\n', "").split(' ').pop()

                # Skip anything deeper than the configured hierarchy depth.
                if len(filePath.split("/")) > hierarchy:
                    continue

                A = set()
                A.add(filePath)
                # Skip plain files: paths already present in FileNames.
                if len((A.difference(FileNames))) == 0:
                    continue

                # print A
                # Auto-incrementing node id.
                Node_ID = add()
                mysqlTup = (Node_ID, filePath)

                # Remember this node's id so its children can find their parent.
                data_dict[mysqlTup[1]] = Node_ID
                p_list = mysqlTup[1].split('/')
                # print "p_list ----->", type(p_list), p_list
                length = len(p_list)
                p_str = '/'.join(p_list[:length - 1]) if isinstance(p_list, list) and len(p_list) > 0 else None
                # print "p_str ----->", p_str
                # Parent id defaults to 1 (the "/" root row) when unknown.
                p_id = data_dict.get(p_str, 1)

                global sql
                # Batch insert statement (shared global; see docstring note).
                sql = "insert into hdfs_path_Monitor (id, p_id, p_name, `date`, dir_kb, hour_increase, day_increase) values (%s,%s, %s, %s, %s, %s, %s)"
                args.append((mysqlTup[0], p_id, mysqlTup[1], LocalTime, fileSize, '0', '0'))
                L.add(line.replace('\n', "").split(' ').pop())
            else:
                return 1

        # Execute the batched inserts collected for this directory level.
        mysqlHelper.mysqlBatchInset(sql, args)

        if digui(L) == 1:
            continue


'''自增函数'''


def add(x=1):
    """Monotonic counter stored on the function object itself.

    Adds *x* (default 1) to the running total and returns it; the first call
    initialises the total to *x*.
    """
    add.sum = getattr(add, 'sum', 0) + x
    return add.sum


def convertSize(num, tags):
    """Convert one `hdfs dfs -du -h` size figure to kilobytes.

    num  -- numeric string, e.g. '1.5'
    tags -- unit letter emitted by -h ('K', 'M', 'G', 'T')

    Any other unit (plain-byte lines have no letter) falls through to 0,
    matching the original behaviour. Fixes: terabyte entries were previously
    unhandled and silently recorded as 0; redundant str() wrapper removed.
    """
    totals = 0
    if tags == "T":
        totals = float(num) * 1024 * 1024 * 1024
    elif tags == "G":
        totals = float(num) * 1024 * 1024
    elif tags == "M":
        totals = float(num) * 1024
    elif tags == "K":
        totals = float(num)
    return totals


'''获取所有文件名称列表'''


def getHDFSSize():
    """Fill the module-level FileNames set with every file path recorded in
    the hdfs_size table (column 1 of each row)."""
    for record in mysqlHelper.mysqlSearch("select * from hdfs_size"):
        FileNames.add(record[1])


'''每小时计算'''


def hourComputer():
    ''' Hourly growth: join this hour's rows with the previous hour's rows on
    p_name, write |dir_kb delta| as hour_increase into the temp table, then
    swap the computed rows back into hdfs_path_Monitor. '''
    sql = '''
               INSERT INTO hdfs_path_Monitor_temp(id,p_id,p_name,date,dir_kb,hour_increase,day_increase)
               select h3.id as id, h3.p_id as p_id, h3.p_name as p_name,h3.date as date, h3.dir_kb as dir_kb,ABS((h3.dir_kb - h2.dir_kb)) as
               hour_increase,h3.day_increase as day_increase
               from hdfs_path_Monitor h2
               RIGHT JOIN hdfs_path_Monitor h3 on h3.p_name = h2.p_name
               where h2.date=\'%s\' and h3.date=\'%s\'
          ''' % (BeforeOneHour, LocalTime)

    mysqlHelper.mysqlInsert(sql)

    '''删除hdfs_path_Monitor中为当前时间的数据'''
    # Remove this hour's raw rows; they are replaced by the delta rows below.
    delete_Monitor_table = "DELETE from hdfs_path_Monitor where date=\'%s\'" % (LocalTime)

    mysqlHelper.mysqlInsert(delete_Monitor_table)

    '''将hdfs_path_Monitor_temp表中的数据插入到hdfs_path_Monitor'''
    # Copy the computed rows back into the main table.
    insert_Monitor_table = '''
               INSERT INTO hdfs_path_Monitor(id,p_id,p_name,date,dir_kb,hour_increase,day_increase)
               SELECT *
               FROM hdfs_path_Monitor_temp
    '''
    mysqlHelper.mysqlInsert(insert_Monitor_table)

    '''删除临时表的数据'''
    # Clear the temp table for the next run (variable name kept from original).
    insert_Temp_table = "DELETE from hdfs_path_Monitor_temp where date=\'%s\'" % (LocalTime)

    mysqlHelper.mysqlInsert(insert_Temp_table)


def main():
    """Entry point: seed the root row, scan HDFS, then compute hourly deltas."""
    # Skip the midnight run; the daily job covers it.
    if LocalTime[11:16] == "00:00":
        exit()

    started = time.time()
    root = "/"
    roots = set()
    roots.add(root)

    mysqlHelper.mysqlInsert("delete from hdfs_path_Monitor where DATE =\'%s\';" % (LocalTime))
    mysqlHelper.mysqlInsert("insert into hdfs_path_Monitor values(%s, %s, \'%s\', \'%s\', %s, %s, %s)" % (
        add(), 0, root, LocalTime, 0.00, 0.00, 0.00))
    getHDFSSize()
    gethdfs(roots)
    hourComputer()
    mysqlHelper.mysqlClose()
    finished = time.time()

    print("所需要的时间为  : " + str(finished - started) + "插入的数量为 :" + str(add() - 1))


if __name__ == "__main__":
    main()
3.4文件路径的天监控 hdfs_path_Monitor_day.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-


import os
import time
import mysqlHelper
import utils

'''全局变量系统时间 LocalTime 2017-07-28 14:37:47'''
# Wall-clock time of this run, truncated to 'YYYY-MM-DD HH:MM'.
LocalTime = str(utils.getnowTime())[0:16]

# Same format, 24 hours earlier; used to join against yesterday's load.
BeforeOneDay = str(utils.getBeforeHour(24))[0:16]

def hourComputer():
    """Daily pass: join today's rows with those from 24 hours ago on p_name,
    write the delta into day_increase via the temp table, then swap the
    computed rows back into hdfs_path_Monitor.

    NOTE(review): day_increase is derived from the difference of the two
    hour_increase values, not of dir_kb — confirm this is intentional.
    """

    sql = '''
               INSERT INTO hdfs_path_Monitor_temp(id,p_id,p_name,date,dir_kb,hour_increase,day_increase)
               select h3.id as id, h3.p_id as p_id, h3.p_name as p_name,h3.date as date, h3.dir_kb as dir_kb,h3.hour_increase as
               hour_increase,ABS((h3.hour_increase - h2.hour_increase)) as day_increase
               from hdfs_path_Monitor h2
               RIGHT JOIN hdfs_path_Monitor h3 on h3.p_name = h2.p_name
               where h2.date=\'%s\' and h3.date=\'%s\'
          ''' %( BeforeOneDay , LocalTime )

    mysqlHelper.mysqlInsert(sql)

    # Remove today's raw rows; they are replaced by the delta rows below.
    delete_Monitor_table = "DELETE from hdfs_path_Monitor where date=\'%s\'" %(LocalTime)

    mysqlHelper.mysqlInsert(delete_Monitor_table)

    # Copy the computed rows back into the main table.
    insert_Monitor_table = '''
               INSERT INTO hdfs_path_Monitor(id,p_id,p_name,date,dir_kb,hour_increase,day_increase)
               SELECT *
               FROM hdfs_path_Monitor_temp
    '''
    mysqlHelper.mysqlInsert(insert_Monitor_table)

    # Clear the temp table for the next run (variable name kept from original).
    insert_Temp_table = "DELETE from hdfs_path_Monitor_temp where date=\'%s\'" %(LocalTime)

    mysqlHelper.mysqlInsert(insert_Temp_table)

def main():
    """Compute the daily increments and report the elapsed wall time."""
    started = time.time()
    hourComputer()
    mysqlHelper.mysqlClose()
    finished = time.time()
    print("所需要的时间为  : " + str(finished - started))

if __name__ == "__main__":
    main()
3.5文件大小记录 hdfs_size.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import os
import time
import mysqlHelper
import utils

'''
1、通过命令(hdfs dfs -ls -R /)获取到所有文件的全部内容
2、将文件按条处理,空格切分,第一个字段的第一个字符为横杆‘-’,则为文件
3、通过mysql批量插入
'''
# Steps: (1) list every entry via `hdfs dfs -ls -R /`; (2) split each line on
# spaces — a leading '-' in the first field marks a regular file; (3) batch
# insert the file rows into MySQL.

# Timestamp of this snapshot run ('YYYY-MM-DD HH:MM:SS').
LocalTime = utils.getnowTime()

def gethdfsSize():
    """Record the size of every regular HDFS file into the hdfs_size table.

    Parses `hdfs dfs -ls -R /` output: a permission string starting with '-'
    marks a regular file; the last whitespace token is the path and the third
    token from the end is the size in bytes.

    Fixes: local renamed from `str` (shadowed the builtin); `sql` hoisted
    before the loop so an empty listing no longer raises NameError at the
    batch call.
    """
    lines = os.popen('hdfs dfs -ls -R / ')
    sql = "insert into hdfs_size (id, p_name, `date`, dir_byte) values (%s, %s, %s, %s)"
    args = []

    # Replace the previous snapshot wholesale.
    mysqlHelper.mysqlInsert("delete from hdfs_size")
    for line in lines:
        parts = line.replace('\n', "").split(' ')
        filePath = parts.pop()
        dir_byte = parts[len(parts) - 3]

        # Directories start with 'd'; only record regular files.
        if parts[0][0] == "-":
            args.append((add(), filePath, LocalTime, dir_byte))

    # executemany with an empty args list is a harmless no-op.
    mysqlHelper.mysqlBatchInset(sql, args)


def add(x=1):
    """Monotonic counter stored on the function object itself.

    Adds *x* (default 1) to the running total and returns it; the first call
    initialises the total to *x*.
    """
    add.sum = getattr(add, 'sum', 0) + x
    return add.sum


def main():
    """Snapshot all HDFS file sizes and report timing plus inserted-row count."""
    started = time.time()
    gethdfsSize()
    mysqlHelper.mysqlClose()
    finished = time.time()
    print("所需要的时间为  : " + str(finished - started) + "插入的数量为 :" + str(add() - 1))


if __name__ == "__main__":
    main()
3.6mysql连接信息
# database source
[downdb]
host = xxx.xxx.xxx.xxx
port = 3306
user = funnel
pass = 1qaz@<2wsx
dbName = user_privileges


[ondb]
host = xxx.xxx.xxx.xxx
port = 3306
user = funnel
pass = 1qaz@<2wsx
dbName = bi_data
3.7mysql工具类 mysqlHelper.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import ConfigParser
import codecs
import pymysql

# Which section of myapp.conf supplies the connection parameters.
DBSource = "ondb"

cp = ConfigParser.SafeConfigParser()
with codecs.open('myapp.conf', 'r', encoding='utf-8') as f:
    cp.readfp(f)
'''mysql的连接'''
# Single module-level connection shared by every helper below.
conn = pymysql.connect(host=cp.get(DBSource, 'host'), user=cp.get(DBSource, 'user'), password=cp.get(DBSource, 'pass'),
                       database=cp.get(DBSource, 'dbName'), use_unicode=True)
'''mysql的游标'''
# Shared cursor. NOTE(review): not thread-safe; acceptable for these cron scripts.
cursor = conn.cursor()


def mysqlInsert(sql):
    """Execute a single write statement (INSERT/UPDATE/DELETE) and commit.

    Fix: removed the leftover debug prints that dumped the cursor object and
    the full SQL text to stdout on every call, flooding cron logs.
    """
    cursor.execute(sql)
    conn.commit()


def mysqlSearch(sql):
    """Run a query on the shared cursor and return every row."""
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def mysqlBatchInset(sql, args):
    """Execute the parameterized statement once per tuple in args, then commit."""
    cursor.executemany(sql, args)
    conn.commit()


def mysqlClose():
    """Close the shared module-level connection; no helper works afterwards."""
    conn.close()


if __name__ == "__main__":
    # Smoke test: fetch a handful of monitor rows and print them.
    sample_sql = "select * from hdfs_path_Monitor limit 10 "

    for row in mysqlSearch(sample_sql):
        print(row)

    mysqlClose()
3.8 工具类utils.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import time
from datetime import datetime, timedelta


def getBeforeDay(num):
    """Return 'YYYY-MM-DD HH:MM:SS' for *num* days before now."""
    moment = datetime.now() - timedelta(days=num)
    return moment.strftime('%Y-%m-%d %H:%M:%S')


def getLastDay(num):
    """Return 'YYYY-MM-DD HH:MM:SS' for *num* days after now."""
    target = datetime.now() + timedelta(days=num)
    return target.strftime('%Y-%m-%d %H:%M:%S')


def getBeforeHour(num):
    """Return 'YYYY-MM-DD HH:MM:SS' for *num* hours before now."""
    moment = datetime.now() - timedelta(hours=num)
    return moment.strftime('%Y-%m-%d %H:%M:%S')


def getLastHour(num):
    """Return 'YYYY-MM-DD HH:MM:SS' for *num* hours after now."""
    target = datetime.now() + timedelta(hours=num)
    return target.strftime('%Y-%m-%d %H:%M:%S')


def getnowTime():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


if __name__ == "__main__":
    # Quick manual check: one hour ago at minute precision.
    print(str(getBeforeHour(1))[0:16])

4、结果展示

(此处原为结果展示截图,图片已缺失)

(此处原为结果展示截图,图片已缺失)

(此处原为结果展示截图,图片已缺失)

(此处原为结果展示截图,图片已缺失)

       如果您喜欢我写的博文,读后觉得收获很大,不妨小额赞助我一下,让我有动力继续写出高质量的博文,感谢您的赞赏!!!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

北京小辉

你的鼓舞将是我最大的动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值