Python MongoDB 抽样对比

本文介绍了一种用于比较两个MongoDB集群数据一致性的Python脚本。该脚本能够检查数据库和集合是否存在差异,对比集合中的记录数量,索引信息,并通过随机抽样进行数据对比,确保源和目标集群的数据一致性。
摘要由CSDN通过智能技术生成
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys

import pymongo

# Global runtime options shared by the whole script. The __main__ block fills
# this in: defaults for 'count' and 'skip', plus every --key=value argument
# from the command line (e.g. 'src', 'dest', 'logPath').
config = {}
# File object that comparison errors are written to. It stays None until the
# __main__ block opens config['logPath'] in append mode.
errLogFile = None


def tips():
    """Print a three-line usage banner showing how to launch the script.

    The example uses full ``mongodb://host:port/db`` URIs because the
    comparison calls ``get_database()`` without a name, which relies on the
    database given in the connection URI. Optional arguments are shown in
    square brackets with the defaults set by the ``__main__`` block.
    """
    border = '|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|'
    print(border)
    print('| 启动demo ./mongodbCompare.py --src=mongodb://localhost:27017/db --dest=mongodb://localhost:27018/db '
          '--logPath=/opt/xx [--count=100] [--skip=system.profile,system.js] |')
    print(border)


class MongoCluster:
    """Holder for one pymongo client and the connection string it was built from.

    The client is not created by the constructor: call ``connect()`` before
    using ``conn`` and ``close()`` when finished.
    """

    # pymongo connection; None until connect() has been called
    conn = None

    # connection string
    url = ""

    def __init__(self, url):
        """Store the connection string; no connection is opened yet."""
        self.url = url
        self.conn = None

    def connect(self):
        """Create a ``pymongo.MongoClient`` for ``self.url`` and keep it in ``self.conn``."""
        self.conn = pymongo.MongoClient(self.url)

    def close(self):
        """Close the client if there is one.

        Calling this before ``connect()`` (or after a failed connect) is a
        no-op instead of an ``AttributeError`` on ``None``.
        """
        if self.conn is not None:
            self.conn.close()


def _collection_size(collection):
    """Return the exact number of documents in *collection*.

    Uses ``count_documents({})`` when the collection class provides it and
    falls back to the legacy ``count()`` otherwise. The lookup is done on the
    class because attribute access on a pymongo collection instance yields a
    sub-collection for unknown names.
    """
    if hasattr(type(collection), "count_documents"):
        return collection.count_documents({})
    return collection.count()


def check(src, dest):
    """Compare the default databases of two connected MongoDB clusters.

    Collections listed in ``config['skip']`` (comma-separated) are ignored.
    Every remaining source collection must exist in the destination, hold the
    same number of documents, have the same number of indexes and pass the
    random sampling done by ``data_comparison()``. Each mismatch is written to
    ``errLogFile`` and the scan moves on to the next collection. Collections
    that exist only in the destination are not reported.

    Args:
        src: object whose ``conn`` attribute is the source pymongo client.
        dest: object whose ``conn`` attribute is the destination pymongo client.

    Returns:
        True when no difference was found, False otherwise.
    """
    srcDatabase = src.conn.get_database()
    destDatabase = dest.conn.get_database()

    # Split the skip list once; a set keeps the membership tests cheap.
    skipped = set(config['skip'].split(","))
    srcColls = [coll for coll in srcDatabase.list_collection_names() if coll not in skipped]
    dstColls = [coll for coll in destDatabase.list_collection_names() if coll not in skipped]
    print("src Collection", srcColls)
    print("dst Collection", dstColls)

    passed = True

    # Everything present in src must also be present in dest.
    dstCollSet = set(dstColls)
    differenceCollectionNames = [item for item in srcColls if item not in dstCollSet]
    if differenceCollectionNames:
        passed = False
        # Comma-separated so that adjacent names stay distinguishable in the log.
        errLogFile.write("src 与 dst Collection 差集:" + ",".join(differenceCollectionNames) + "\n")
    missingInDest = set(differenceCollectionNames)

    for coll in srcColls:
        if coll in missingInDest:
            print("src 与 dst Collection 差集:%s 跳过" % (coll))
            continue
        srcCollection = srcDatabase[coll]
        destCollection = destDatabase[coll]

        srcCollectionCount = _collection_size(srcCollection)
        destCollectionCount = _collection_size(destCollection)
        if srcCollectionCount != destCollectionCount:
            passed = False
            errLogFile.write(
                "Collection: " + coll + " 条数不同,src count:" + str(srcCollectionCount) + "条 dest count:" + str(
                    destCollectionCount) + "\n")
            continue

        # Only the number of indexes is compared, not their definitions.
        src_index_length = len(srcCollection.index_information())
        dest_index_length = len(destCollection.index_information())
        if src_index_length != dest_index_length:
            passed = False
            errLogFile.write("Collection: " + coll + " 索引信息不同" + "\n")
            continue

        if data_comparison(srcCollection, destCollection):
            print("比对 Collection : %s 通过 " % (coll))
        else:
            passed = False
            errLogFile.write("Collection: " + coll + " 数据抽样校验不通过" + "\n")
    return passed


def _source_doc_count(collection):
    """Return the exact number of documents in *collection*.

    Uses ``count_documents({})`` when the collection class provides it and
    falls back to the legacy ``count()`` otherwise. The lookup is done on the
    class because attribute access on a pymongo collection instance yields a
    sub-collection for unknown names.
    """
    if hasattr(type(collection), "count_documents"):
        return collection.count_documents({})
    return collection.count()


def data_comparison(srcColl, dstColl):
    """Compare randomly sampled source documents with the destination.

    Up to ``config['count']`` documents (capped at the source collection
    size) are drawn from *srcColl* with ``$sample`` in batches of 16. Each one
    is looked up in *dstColl* by ``_id`` and compared with ``==``. Separate
    batches are sampled independently, so a document may be drawn more than
    once.

    Args:
        srcColl: source collection to sample from.
        dstColl: destination collection to look the samples up in.

    Returns:
        False as soon as one sampled document differs from (or is missing in)
        the destination; True otherwise, including when there is nothing to
        sample.
    """
    rec_count = min(config['count'], _source_doc_count(srcColl))
    if rec_count <= 0:
        return True

    batch = 16
    show_progress = (batch * 64)
    total = 0
    while total < rec_count:
        # Never sample more than what is left of the requested amount.
        size = min(batch, rec_count - total)

        # Iterate the cursor directly: its `alive` flag can still be true when
        # no further document will be returned.
        for doc in srcColl.aggregate([{"$sample": {"size": size}}]):
            # Explicit filter: a bare dict-valued _id would be interpreted as
            # a query document instead of an _id value.
            migrated = dstColl.find_one({"_id": doc["_id"]})
            # Both the original and the migrated document are mappings, so == works.
            if doc != migrated:
                print("DIFF => src_record[%s], dst_record[%s]" % (doc, migrated))
                return False

        total += size

        if total % show_progress == 0:
            print("  ... process %d docs, %.2f %% !" % (total, total * 100.0 / rec_count))

    return True


if __name__ == "__main__":
    # Script entry point: parse --key=value arguments into `config`, open the
    # error log, connect to both clusters, run the comparison and exit with
    # status 0 on success or -1 on any failure.
    tips()
    config['count'] = 100
    # Collections excluded from the comparison (comma-separated).
    config['skip'] = 'system.profile,system.js'
    for arg in sys.argv[1:]:
        # Drop the two-character "--" prefix, then split on the first "=" only
        # so that values (e.g. connection URIs) may contain "=" themselves.
        key, sep, value = arg[2:].partition("=")
        if not sep:
            print('启动参数格式错误(应为 --key=value):', arg)
            sys.exit(-1)
        if key == 'count':
            try:
                config[key] = int(value)
            except ValueError:
                print('启动参数 count 必须是整数:', value)
                sys.exit(-1)
        else:
            config[key] = value
    print('启动参数:', config)

    missingOptions = [name for name in ('src', 'dest', 'logPath') if name not in config]
    if missingOptions:
        print('缺少启动参数:', ",".join(missingOptions))
        sys.exit(-1)

    src = dest = None
    exitCode = -1
    # The log receives non-ASCII text, so do not depend on the locale encoding.
    errLogFile = open(config['logPath'], "a", encoding="utf-8")
    try:
        src, dest = MongoCluster(config['src']), MongoCluster(config['dest'])
        src.connect()
        dest.connect()
        if check(src, dest):
            print('SUCCESS')
            exitCode = 0
        else:
            print('FAIL')
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print('mongo对比程序出错:', e)
    finally:
        # Runs on every path, so the clients and the log file are always released.
        for cluster in (src, dest):
            if cluster is not None and cluster.conn is not None:
                cluster.close()
        errLogFile.close()

    sys.exit(exitCode)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值