/*
		IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
		=============================================

The tablespace cache is responsible for providing fast read/write access to
tablespaces and logs of the database. File creation and deletion are done
in other modules which know more of the logic of the operation, however.

A tablespace consists of a chain of files. The size of the files does not
have to be divisible by the database block size, because we may just leave
the last incomplete block unused. When a new file is appended to the
tablespace, the maximum size of the file is also specified. At the moment,
we think that it is best to extend the file to its maximum size already at
the creation of the file, because then we can avoid dynamically extending
the file when more space is needed for the tablespace.

A block's position in the tablespace is specified with a 32-bit unsigned
integer. The files in the chain are thought to be catenated, and the block
corresponding to an address n is the nth block in the catenated file (where
the first block is named the 0th block, and the incomplete block fragments
at the end of files are not taken into account). A tablespace can be extended
by appending a new file at the end of the chain.

Our tablespace concept is similar to that of Oracle.

To acquire more speed in disk transfers, a technique called disk striping is
sometimes used. This means that logical block addresses are divided in a
round-robin fashion across several disks. Windows NT supports disk striping,
so there we do not need to support it in the database. Disk striping is
implemented in hardware in RAID disks. We conclude that it is not necessary
to implement it in the database. Oracle 7 does not support disk striping,
either.

Another trick used at some database sites is replacing tablespace files by
raw disks, that is, the whole physical disk drive, or a partition of it, is
opened as a single file, and it is accessed through byte offsets calculated
from the start of the disk or the partition. This is recommended in some
books on database tuning to achieve more speed in i/o. Using a raw disk
certainly prevents the OS from fragmenting disk space, but it is not clear
if it really adds speed. We measured on a Pentium 100 MHz + NT + NTFS file
system + EIDE Conner disk only a negligible difference in speed when reading
from a file, versus reading from a raw disk.

To have fast access to a tablespace or a log file, we put the data structures
in a hash table. Each tablespace and log file is given a unique 32-bit
identifier.

Some operating systems do not support many open files at the same time,
though NT seems to tolerate at least 900 open files. Therefore, we put the
open files in an LRU-list. If we need to open another file, we may close the
file at the end of the LRU-list. When an i/o-operation is pending on a file,
the file cannot be closed. We take the file nodes with pending i/o-operations
out of the LRU-list and keep a count of pending operations. When an operation
completes, we decrement the count and return the file node to the LRU-list if
the count drops to zero. */

/* When mysqld is run, the default directory "."
is the mysqld datadir, but in the MySQL Embedded Server Library and ibbackup it is not the default directory, and we must set the base file path explicitly */

const char*	fil_path_to_mysql_datadir	= ".";

/* The number of fsyncs done to the log */
ulint	fil_n_log_flushes			= 0;

/* NOTE(review): presumably the numbers of log flushes and tablespace
flushes that are currently in progress; they are only initialized to 0 in
this part of the file -- confirm against the code that updates them */
ulint	fil_n_pending_log_flushes		= 0;
ulint	fil_n_pending_tablespace_flushes	= 0;

/* Null file address */
fil_addr_t	fil_addr_null = {FIL_NULL, 0};

/* File node of a tablespace or the log data space: describes one file in
the file chain of a space */
struct fil_node_struct {
	fil_space_t*	space;	/* backpointer to the space where this node
				belongs */
	char*		name;	/* path to the file */
	ibool		open;	/* TRUE if file open */
	os_file_t	handle;	/* OS handle to the file, if file open */
	ibool		is_raw_disk;/* TRUE if the 'file' is actually a raw
				device or a raw disk partition */
	ulint		size;	/* size of the file in database pages, 0 if
				not known yet; the possible last incomplete
				megabyte may be ignored if space == 0 */
	ulint		n_pending;
				/* count of pending i/o's on this file;
				closing of the file is not allowed if
				this is > 0 */
	ulint		n_pending_flushes;
				/* count of pending flushes on this file;
				closing of the file is not allowed if
				this is > 0 */
	ib_longlong	modification_counter;/* when we write to the file we
				increment this by one */
	ib_longlong	flush_counter;/* up to what modification_counter value
				we have flushed the modifications to disk */
	UT_LIST_NODE_T(fil_node_t) chain;
				/* link field for the file chain */
	UT_LIST_NODE_T(fil_node_t) LRU;
				/* link field for the LRU list */
	ulint		magic_n;/* NOTE(review): presumably holds
				FIL_NODE_MAGIC_N so that a valid node can be
				recognized -- confirm against the users */
};

#define	FIL_NODE_MAGIC_N	89389

/* Tablespace or log data space: let us call them by a common name space */
struct fil_space_struct {
	char*		name;	/* space name = the path to the first file in
				it */
	ulint		id;	/* space id */
	ib_longlong	tablespace_version;
				/* in DISCARD/IMPORT this timestamp is used to
				check if we should ignore an insert buffer
				merge request for a page because it actually
				was for the previous incarnation of the
				space */
	ibool		mark;	/* this is set to TRUE at database startup if
				the space corresponds to a table in the InnoDB
				data dictionary; so we can print a warning of
				orphaned tablespaces */
	ibool		stop_ios;/* TRUE if we want to rename the .ibd file of
				the tablespace and want to temporarily stop
				the posting of new i/o requests on the file */
	ibool		stop_ibuf_merges;
				/* we set this TRUE when we start deleting a
				single-table tablespace */
	ibool		is_being_deleted;
				/* this is set to TRUE when we start
				deleting a single-table tablespace and its
				file; when this flag is set no further i/o
				or flush requests can be placed on this space,
				though there may be such requests still being
				processed on this space */
	ulint		purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */
	UT_LIST_BASE_NODE_T(fil_node_t) chain;
				/* base node for the file chain */
	ulint		size;	/* space size in pages; 0 if a single-table
				tablespace whose size we do not know yet;
				last incomplete megabytes in data files may be
				ignored if space == 0 */
	ulint		n_reserved_extents;
				/* number of reserved free extents for
				ongoing operations like B-tree page split */
	ulint		n_pending_flushes; /* this is > 0 when flushing
				the tablespace to disk; dropping of the
				tablespace is forbidden if this is > 0 */
	ulint		n_pending_ibuf_merges;/* this is > 0 when merging
				insert buffer entries to a page so that we
				may need to access the ibuf bitmap page in the
				tablespace: dropping of the tablespace is
				forbidden if this is > 0 */
	hash_node_t	hash;	/* hash chain node */
	hash_node_t	name_hash;/* hash chain node for the name_hash
				table */
	rw_lock_t	latch;	/* latch protecting the file space storage
				allocation */
	UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
				/* list of spaces with at least one unflushed
				file we have written to */
	ibool		is_in_unflushed_spaces; /* TRUE if this space is
				currently in the list above */
	UT_LIST_NODE_T(fil_space_t) space_list;
				/* list of all spaces */
	ibuf_data_t*	ibuf_data;
				/* insert buffer data */
	ulint		magic_n;/* NOTE(review): presumably holds
				FIL_SPACE_MAGIC_N so that a valid space can be
				recognized -- confirm against the users */
};

#define	FIL_SPACE_MAGIC_N	89472

/* The tablespace memory cache; also the
totality of logs = the log data space, is stored here; below we talk about tablespaces, but also the ib_logfiles form a 'space' and it is handled here */

typedef	struct fil_system_struct	fil_system_t;

/* The in-memory cache object: holds the lookup tables and lists through
which spaces and their open files are found, plus the system-wide counters
documented on the fields below */
struct fil_system_struct {
	mutex_t		mutex;		/* The mutex protecting the cache */
	hash_table_t*	spaces;		/* The hash table of spaces in the
					system; they are hashed on the space
					id */
	hash_table_t*	name_hash;	/* hash table based on the space
					name */
	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
					/* base node for the LRU list of the
					most recently used open files with no
					pending i/o's; if we start an i/o on
					the file, we first remove it from this
					list, and return it to the start of
					the list when the i/o ends;
					log files and the system tablespace are
					not put to this list: they are opened
					after the startup, and kept open until
					shutdown */
	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
					/* base node for the list of those
					tablespaces whose files contain
					unflushed writes; those spaces have
					at least one file node where
					modification_counter > flush_counter */
	ulint		n_open;		/* number of files currently open */
	ulint		max_n_open;	/* n_open is not allowed to exceed
					this */
	ib_longlong	modification_counter;/* when we write to a file we
					increment this by one */
	ulint		max_assigned_id;/* maximum space id in the existing
					tables, or assigned during the time
					mysqld has been up; at an InnoDB
					startup we scan the data dictionary
					and set here the maximum of the
					space id's of the tables there */
	ib_longlong	tablespace_version;
					/* a counter which is incremented for
					every space object memory creation;
					every space mem object gets a
					'timestamp' from this; in DISCARD/
					IMPORT this is used to check if we
					should ignore an insert buffer merge
					request */
	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
					/* list of all file spaces */
};