---------------------------------------------------我是分割线 2021-04-13---------------------------------------------------
现在django已不存在该问题。迭代all()时django会自动分块加载数据,并不会一次性加载所有数据到内存,可以放心遍历all(),但是仍然要注意大数据内存的使用,比如list(objects.all())此类操作
Source code:
# django.db.models.sql.constants.GET_ITERATOR_CHUNK_SIZE
GET_ITERATOR_CHUNK_SIZE = 100
# django.db.models.sql.compiler
def results_iter(self, results=None, tuple_expected=False, chunked_fetch=False,
chunk_size=GET_ITERATOR_CHUNK_SIZE):
if results is None:
results = self.execute_sql(MULTI, chunked_fetch=chunked_fetch, chunk_size=chunk_size)
...
rows = chain.from_iterable(results)
...
def execute_sql(self, result_type=MULTI, chunked_fetch=False, chunk_size=GET_ITERATOR_CHUNK_SIZE):
...
result = cursor_iter(
cursor, self.connection.features.empty_fetchmany_value,
self.col_count if self.has_extra_select else None,
chunk_size,
)
...
def cursor_iter(cursor, sentinel, col_count, itersize):
    """
    Yield blocks of rows from a cursor and ensure the cursor is closed when
    done.

    ``sentinel`` is the value ``fetchmany`` returns once the result set is
    exhausted (e.g. an empty list); ``col_count``, when not None, trims each
    row to its first ``col_count`` columns.
    """
    try:
        # iter(callable, sentinel) keeps calling fetchmany(itersize) and
        # stops as soon as the returned value equals the sentinel.
        for rows in iter((lambda: cursor.fetchmany(itersize)), sentinel):
            yield rows if col_count is None else [r[:col_count] for r in rows]
    finally:
        # Close the cursor even if the consumer abandons the generator early.
        cursor.close()
---------------------------------------------------我是分割线---------------------------------------------------
问题
在django的使用中,经常会出现大量数据的遍历操作,或者是对大量数据进行遍历迁移更新,比如
# Example: migrate field B into field A for every user.
for user in User.objects.all():
    user.A = user.B
    user.B = None  # the original wrote ``User.B`` (the class), a typo for the instance
等种种情况。
在本地开发环境中QuerySet对象最初具有非常小的内存占用,随着业务量的增长,QuerySet对象在我遍历它们时会缓存每个model_instance,all()返回的QuerySet占用的内存会越来越大,可能最终耗尽内存,被托管服务提供商杀死进程。
### 解决方法:
import copy
from decimal import Decimal
class MemorySavingQuerysetIterator(object):
    """Iterate over a large queryset while keeping at most ``max_obj_num``
    model instances in memory at a time.

    A plain ``for obj in qs`` lets the QuerySet cache every instance it has
    yielded; this wrapper instead walks the rows in fixed-size slices taken
    from fresh copies of the queryset, so the cache never outgrows one slice.
    """

    def __init__(self, queryset, max_obj_num=1000):
        self._base_queryset = queryset
        self.max_obj_num = max_obj_num  # slice (page) size
        # The generator body does not execute until the first next() call.
        self._generator = self._setup()

    def _setup(self):
        for start in range(0, self._base_queryset.count(), self.max_obj_num):
            # By making a copy of the queryset and using that to actually
            # access the objects we ensure that there are only `max_obj_num`
            # objects in memory at any given time.
            smaller_queryset = copy.deepcopy(
                self._base_queryset)[start:start + self.max_obj_num]
            for obj in smaller_queryset.iterator():
                yield obj

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._generator)

    # Python 2 compatibility alias (the original only defined ``next``).
    next = __next__
### 调用:
# Example usage: iterate over all users 100 at a time.
users = User.objects.all()  # original bound ``Users`` but iterated ``users`` (NameError)
for user in MemorySavingQuerysetIterator(users, 100):
    pass  # process ``user`` here (original had ``Pass``, an undefined name)
### Python MySQL 原生操作
import MySQLdb
class QuerySetIterator(object):
    """Iterate over all rows of a raw SQL query in ``max_num``-row pages.

    Each page is fetched with ``LIMIT ... OFFSET ...`` so only one page of
    rows is held in memory at a time.  Iteration ends at the first empty
    page (the original code also had an arbitrary 90000000-row hard cap,
    which this version removes).
    """

    def __init__(self, cursor, query, max_num):
        self.query = query
        self.max_num = max_num  # page size
        self._cursor = cursor
        self._generator = self._setup()

    def _setup(self):
        offset = 0
        while True:
            # NOTE(review): self.query is interpolated directly into SQL --
            # only pass trusted query strings here.
            paged_query = "{query} limit {limit} offset {offset}".format(
                query=self.query, limit=self.max_num, offset=offset
            )
            self._cursor.execute(paged_query)
            rows = self._cursor.fetchall()
            if not rows:
                break  # empty page: the result set is exhausted
            for row in rows:
                yield row
            offset += self.max_num

    def __iter__(self):
        return self

    def __next__(self):
        return next(self._generator)

    # Python 2 compatibility alias (the original only defined ``next``).
    next = __next__
class TestModel(object):
    """Minimal helper that pages through rows of a MySQL table.

    NOTE(review): the connection is opened at class-definition time, so
    merely importing this module requires a reachable MySQL server.
    """
    db = MySQLdb.connect("localhost", "root", "123456", "test")
    cursor = db.cursor()

    def __init__(self, tb_name, max_num=100):
        self.tb_name = tb_name
        self.max_num = max_num  # page size handed to the paging iterator
        self._query_sql_tpl = "select * from {tb_name}".format(tb_name=tb_name)

    def query_all(self, query_sql=None):
        """Return a paged iterator over ``query_sql``; defaults to
        ``SELECT * FROM <tb_name>``."""
        if not query_sql:
            query_sql = self._query_sql_tpl
        # Bug fix: the original returned ``QuerySet(...)``, an undefined
        # name -- the paging iterator defined above is ``QuerySetIterator``.
        return QuerySetIterator(self.cursor, query_sql, self.max_num)
# Example usage: stream every row of the ``test`` table.
test = TestModel('test')
result = test.query_all()
for obj in result:
    print(obj)  # Python 3 print function (original used the py2 statement)