使用pymongo保存数据到MongoDB的工具类

pymongo 的增删改查请参考:Python连接MongoDB,使用pymongo进行增删改查

在此基础上,引入多线程(pymongo多线程安全),打包成一个工具类,可以直接把一个dataframe保存/更新/覆盖的工具类:

文章目录

工具类

from pymongo import MongoClient
import pandas as pd
import numpy as np


class MongoHelper(object):
    def __init__(self, database, collection=None, host="127.0.0.1", port=27017):
        mongo_conn = MongoClient(host=host, port=port)
        self.mongo_db = mongo_conn.get_database(database)  # 数据库连接(pymongo自带连接池)
        self.coll = None
        if collection:  # 集合
            self.coll = self.mongo_db.get_collection(collection)

    def set_collection(self, collection):
        """指定链接的集合"""
        self.coll = self.mongo_db.get_collection(collection)

    def get_coll(self):
        return self.coll

    def _save_or_update_mongodb(self, record, dict_value, drop=False):
        """根据检查_id,如果存在就覆盖,如果不存在就新增
        :param record: 数据库记录
        :param dict_value: 待保存的数据
        :param drop: 如果有记录,是否先删除后保存
        """
        if not record:  # 不存在,直接插入
            self.coll.insert_one(dict_value)
        elif drop is True:  # 先删除,然后插入,相当于覆盖原始数据
            self.coll.delete_one(record)
            self.coll.insert_one(dict_value)
        else:  # 更新
            self.coll.update_one(record, {
                "$set": dict_value,
            })

    def get_all_data(self, record_list=["_id"]):
        """获取全部数据,record_list中为需要的数据列名称"""
        for _record in self.coll.find({}, record_list):
            yield _record

    def save_new_dataframe_to_mongo(self, dataframe: pd.DataFrame):
        """保存dataframe格式的新数据到MongoDB,会自动分配_id,也可以指定_id"""
        coll = self.get_coll()
        coll.insert_many(dataframe.to_dict(orient="record"))

    def update_dataframe(self, dataframe: pd.DataFrame, record_by: list = ["_id"], drop=False):
        """多线程插入dataframe"""
        import threading
        coll = self.get_coll()
        # 剔除dataframe中不存在的列名
        record_by = [_r for _r in record_by if _r in dataframe.columns]
        thread_list = []
        for df_record in dataframe.to_dict(orient="record"):
            find_dict = {}
            for _r_by in record_by:
                find_dict[_r_by] = df_record[_r_by]
            record = coll.find_one(find_dict, {})
            thread = threading.Thread(target=self._save_or_update_mongodb, args=(record, df_record, drop))
            thread.start()
            thread_list.append(thread)
        for _thr in thread_list:  # 等待线程全部执行完毕
            _thr.join()

    def update_dict(self, new_dict_value: dict, record_by: list = ["_id"], drop=False):
        """更新一个dict的数据"""
        coll = self.get_coll()
        record_by = [_r for _r in record_by if _r in new_dict_value.keys()]
        find_dict = {}
        for _r_by in record_by:
            find_dict[_r_by] = new_dict_value[_r_by]
        record = coll.find_one(find_dict, {})
        self._save_or_update_mongodb(record, new_dict_value, drop=drop)

示例代码

from pymongo import MongoClient
import pandas as pd
import numpy as np


class MongoHelper(object):
    def __init__(self, database, collection=None, host="127.0.0.1", port=27017):
        mongo_conn = MongoClient(host=host, port=port)
        self.mongo_db = mongo_conn.get_database(database)  # 数据库连接(pymongo自带连接池)
        self.coll = None
        if collection:  # 集合
            self.coll = self.mongo_db.get_collection(collection)

    def set_collection(self, collection):
        """指定链接的集合"""
        self.coll = self.mongo_db.get_collection(collection)

    def get_coll(self):
        return self.coll

    def _save_or_update_mongodb(self, record, dict_value, drop=False):
        """根据检查_id,如果存在就覆盖,如果不存在就新增
        :param record: 数据库记录
        :param dict_value: 待保存的数据
        :param drop: 如果有记录,是否先删除后保存
        """
        if not record:  # 不存在,直接插入
            self.coll.insert_one(dict_value)
        elif drop is True:  # 先删除,然后插入,相当于覆盖原始数据
            self.coll.delete_one(record)
            self.coll.insert_one(dict_value)
        else:  # 更新
            self.coll.update_one(record, {
                "$set": dict_value,
            })

    def get_all_data(self, record_list=["_id"]):
        """获取全部数据,record_list中为需要的数据列名称"""
        for _record in self.coll.find({}, record_list):
            yield _record

    def save_new_dataframe_to_mongo(self, dataframe: pd.DataFrame):
        """保存dataframe格式的新数据到MongoDB,会自动分配_id,也可以指定_id"""
        coll = self.get_coll()
        coll.insert_many(dataframe.to_dict(orient="record"))

    def update_dataframe(self, dataframe: pd.DataFrame, record_by: list = ["_id"], drop=False):
        """多线程插入dataframe"""
        import threading
        coll = self.get_coll()
        # 剔除dataframe中不存在的列名
        record_by = [_r for _r in record_by if _r in dataframe.columns]
        thread_list = []
        for df_record in dataframe.to_dict(orient="records"):
            find_dict = {}
            for _r_by in record_by:
                find_dict[_r_by] = df_record[_r_by]
            record = coll.find_one(find_dict, {})
            thread = threading.Thread(target=self._save_or_update_mongodb, args=(record, df_record, drop))
            thread.start()
            thread_list.append(thread)
        for _thr in thread_list:  # 等待线程全部执行完毕
            _thr.join()

    def update_dict(self, new_dict_value: dict, record_by: list = ["_id"], drop=False):
        """更新一个dict的数据"""
        coll = self.get_coll()
        record_by = [_r for _r in record_by if _r in new_dict_value.keys()]
        find_dict = {}
        for _r_by in record_by:
            find_dict[_r_by] = new_dict_value[_r_by]
        record = coll.find_one(find_dict, {})
        self._save_or_update_mongodb(record, new_dict_value, drop=drop)


def main():
    mongo_helper = MongoHelper(database='test_db', collection="test_coll")
    df1 = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b'])
    df1["_id"] = [str(i) for i in range(df1.shape[0])]
    mongo_helper.save_new_dataframe_to_mongo(df1)
    # 改变原始数据
    df2 = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'c'])
    df2["_id"] = [str(i) for i in range(df2.shape[0])]
    mongo_helper.update_dataframe(df2)


if __name__ == '__main__':
    main()

如果是

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

呆萌的代Ma

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值