/*-------------------------------------------------------------------------
*
* fd.c
* Virtual file descriptor code.
*
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/storage/file/fd.c
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a routine is
* opened using these interfaces, all subsequent operations must also
* be through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* This file used to contain a bunch of stuff to support RAID levels 0
* (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
* because the parallel query processing code that called it is all
* gone. If you really need it you could get it from the original
* POSTGRES source.
*-------------------------------------------------------------------------
*/
Fd.c 虚拟文件描述符代码
这段代码管理着一个“虚拟”文件描述符(VFD)缓存。服务器会因为各种原因打开大量文件描述符,包括基本表、临时文件(例如排序和哈希的溢出文件),以及对 system(3) 这类 C 库函数的零星调用,因此单个进程打开的文件数很容易超过系统限制。(在许多现代操作系统上这个限制大约是256,有些系统则低至32。)
VFD 以 LRU 池的方式进行管理,实际的操作系统文件描述符会根据需要被打开和关闭。显然,如果一个文件是通过这些接口打开的,那么之后对它的所有操作也必须通过这些接口进行(File 类型并不是真正的文件描述符)。
基于这种工作机制,服务器中大多数程序应该使用这些接口而不是调用C语言库中的程序。否则我们也许会发现缺少实际描述符。
这个文件过去包含一堆的东西来支持RAID级别 0,1,5。现在并行查询处理代码已经没有了,因此相关的东西也就没有了。如果你确实需要它,就去postgresql原始的代码中获取。
* Private Routines
*
* Delete - delete a file from the Lru ring
* LruDelete - remove a file from the Lru ring and close its FD
* Insert - put a file at the front of the Lru ring
* LruInsert - put a file at the front of the Lru ring and open it
* ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
* AllocateVfd - grab a free (or new) file record (from VfdArray)
* FreeVfd - free a file record
*
* The Least Recently Used ring is a doubly linked list that begins and
* ends on element zero. Element zero is special -- it doesn't represent
* a file and its "fd" field always == VFD_CLOSED. Element zero is just an
* anchor that shows us the beginning/end of the ring.
* Only VFD elements that are currently really open (have an FD assigned) are
* in the Lru ring. Elements that are "virtually" open can be recognized
* by having a non-null fileName field.
1、VFD插入到LRU 中:
Vfd数据结构
/*
 * One entry of the VfdCache array: the bookkeeping for a single virtual
 * file descriptor.  Entry 0 of the array is never handed out for a file; it
 * serves as the anchor of the LRU ring and as the head of the free list.
 */
typedef struct vfd
{
int fd; /* current OS-level FD, or VFD_CLOSED if none is assigned */
unsigned short fdstate; /* bitflags for VFD's state */
ResourceOwner resowner; /* owner, for automatic cleanup */
File nextFree; /* link to next free VFD, if in freelist; 0 ends the list */
File lruMoreRecently; /* doubly linked recency-of-use list, anchored at entry 0 */
File lruLessRecently;
off_t seekPos; /* current logical file position; restored with lseek() on re-open */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
int fileFlags; /* open(2) flags for (re)opening the file (O_CREAT, O_TRUNC and O_EXCL are stripped when saved) */
int fileMode; /* mode to pass to open(2) */
} Vfd;
所有的系统文件描述符都封装在Vfd中进行管理,Vfd的第一个成员fd保存的就是实际的文件描述符。InitFileAccess先为VfdCache分配仅含一个元素的空间(VfdCache[0],只作为头结点,不对应任何文件);第一次调用AllocateVfd时发现空闲链表为空,便用realloc把VfdCache扩大到32个元素,把新增的1~31号元素清零、将其fd字段置为VFD_CLOSED,并通过成员nextFree把它们链接成FreeList,链表头保存在VfdCache[0].nextFree中。
当需要打开一个文件的时候,就取出FreeList链表头元素,然后将该文件的文件描述符,文件名以及相关的标志信息填充到Vfd中。Postgresql 将所有的打开的文件的Vfd通过lruMoreRecently,lruLessRecently链接成一个双向链表。
根据文件名打开一个文件,分配一个Vfd并初始化该Vfd。
这里用到了strdup:它用malloc分配一块内存,并把参数字符串的内容复制进去。这块内存同样需要使用free进行释放,否则会造成内存泄漏。
/*
 * PathNameOpenFile - open a file in an arbitrary directory
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 *
 * fileFlags and fileMode are handed to BasicOpenFile() as given.
 *
 * Returns the index of the newly allocated VFD on success.  If the
 * underlying open fails, the VFD and the name copy are released and -1 is
 * returned with errno preserved from the failed open.  Running out of
 * memory for the file name copy is reported via ereport(ERROR).
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
	char	   *fnamecopy;
	File		file;
	Vfd		   *vfdP;

	DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
			   fileName, fileFlags, fileMode));

	/*
	 * We need a malloc'd copy of the file name; fail cleanly if no room.
	 */
	fnamecopy = strdup(fileName);
	if (fnamecopy == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* Grab a VFD slot and point at its entry in the cache array. */
	file = AllocateVfd();
	vfdP = &VfdCache[file];

	/*
	 * While the number of descriptors in use is at or above max_safe_fds,
	 * release entries from the LRU ring; stop if nothing can be released.
	 */
	while (nfile + numAllocatedDescs >= max_safe_fds)
	{
		if (!ReleaseLruFile())
			break;
	}

	/* Obtain the real OS-level descriptor for the file. */
	vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

	if (vfdP->fd < 0)
	{
		/*
		 * The cleanup calls below may overwrite errno, but the caller needs
		 * the value from the failed open to report the error.
		 */
		int			save_errno = errno;

		FreeVfd(file);
		free(fnamecopy);
		errno = save_errno;
		return -1;
	}
	++nfile;
	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
			   vfdP->fd));

	/* The file now holds a real FD, so it belongs in the LRU ring. */
	Insert(file);

	/* The VFD takes ownership of the malloc'd name copy. */
	vfdP->fileName = fnamecopy;
	/* Saved flags are adjusted to be OK for re-opening file */
	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
	vfdP->fileMode = fileMode;
	vfdP->seekPos = 0;
	vfdP->fdstate = 0x0;
	vfdP->resowner = NULL;

	return file;
}
void InitFileAccess(void) { Assert(SizeVfdCache == 0); /* call me only once */ /* initialize cache header entry */ VfdCache = (Vfd *) malloc(sizeof(Vfd)); if (VfdCache == NULL) ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));//初始化内存空间为0 VfdCache->fd = VFD_CLOSED;//初始化fd为VFD_CLOSED SizeVfdCache = 1;//VfdCache的大小为1 /* register proc-exit hook to ensure temp files are dropped at exit */ on_proc_exit(AtProcExit_Files, 0); }
InitFileAccess 主要功能是初始化VfdCache,分配一个Vfd的内存空间,并将其中所有的内存内容设置为0,VfdCache[0].fd设置为VFD_CLOSED。该Vfd不会分配给任何文件,主要是用做LRU池的访问头部。
VfdCache[0]只作头结点使用,因此真正分配给文件的虚拟文件描述符从1开始编号:1,2,3,4……。空闲链表用尽时才扩容:第一次扩到32个元素,此后每次把数组大小扩大为原来的两倍。
/* A virtual file descriptor: an index into VfdCache, not an OS-level descriptor. */
typedef int File;
static File AllocateVfd(void) { Index i; File file; DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache)); Assert(SizeVfdCache > 0); /* InitFileAccess not called? */ if (VfdCache[0].nextFree == 0) { /* * The free list is empty so it is time to increase the size of the * array. We choose to double it each time this happens. However, * there's not much point in starting *real* small. */ Size newCacheSize = SizeVfdCache * 2; Vfd *newVfdCache; if (newCacheSize < 32) newCacheSize = 32; /* * Be careful not to clobber VfdCache ptr if realloc fails. */ newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize); if (newVfdCache == NULL) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); VfdCache = newVfdCache; /* * Initialize the new entries and link them into the free list. */ for (i = SizeVfdCache; i < newCacheSize; i++) { MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd)); VfdCache[i].nextFree = i + 1; VfdCache[i].fd = VFD_CLOSED; } VfdCache[newCacheSize - 1].nextFree = 0; VfdCache[0].nextFree = SizeVfdCache; /* * Record the new size */ SizeVfdCache = newCacheSize; } file = VfdCache[0].nextFree; VfdCache[0].nextFree = VfdCache[file].nextFree; return file; }
/*
 * Insert - put a file at the front of the Lru ring
 *
 * Links the VFD in right next to the ring anchor (element zero) on the
 * most-recently-used side.  Only the ring links are touched; nothing is
 * opened.  "file" must not be 0, since element zero is the anchor itself.
 */
static void
Insert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "Insert %d (%s)",
			   file, VfdCache[file].fileName));
	DO_DB(_dump_lru());

	vfdP = &VfdCache[file];

	/* splice the entry in between the anchor and the previous front entry */
	vfdP->lruMoreRecently = 0;
	vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
	VfdCache[0].lruLessRecently = file;
	VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;

	DO_DB(_dump_lru());
}

/*
 * LruInsert - put a file at the front of the Lru ring and open it
 *
 * If the VFD currently has no OS-level descriptor, the file is re-opened
 * with its saved name, flags and mode, and its saved seek position is
 * restored.  The VFD is then linked in at the front of the ring.
 *
 * returns 0 on success, -1 on re-open failure (with errno set)
 *
 * A failure to restore the seek position is treated like a failure to
 * re-open: the descriptor is closed again and the VFD stays marked closed
 * and out of the ring.
 */
static int
LruInsert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruInsert %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (FileIsNotOpen(file))
	{
		/* make room for one more descriptor, as far as that is possible */
		while (nfile + numAllocatedDescs >= max_safe_fds)
		{
			if (!ReleaseLruFile())
				break;
		}

		/*
		 * The open could still fail for lack of file descriptors, eg due to
		 * overall system file table being full.  So, be prepared to release
		 * another FD if necessary...
		 */
		vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
								 vfdP->fileMode);
		if (vfdP->fd < 0)
		{
			DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
			return vfdP->fd;
		}
		else
		{
			DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
			++nfile;
		}

		/* seek to the right position */
		if (vfdP->seekPos != (off_t) 0)
		{
			if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) == (off_t) -1)
			{
				/*
				 * Continuing at the wrong offset would silently misplace
				 * later I/O, so undo the open and report failure.  close()
				 * may overwrite errno, hence the save/restore.
				 */
				int			save_errno = errno;

				(void) close(vfdP->fd);
				vfdP->fd = VFD_CLOSED;
				--nfile;
				errno = save_errno;
				return -1;
			}
		}
	}

	/*
	 * put it at the head of the Lru ring
	 */
	Insert(file);

	return 0;
}