pgpool-II的SQL解析
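从源码中,我们看到相关函数调用顺序如下:do_child → pool_process_query → read_packets_and_process → ProcessFrontendResponse → SimpleQuery → pool_where_to_send → send_to_where。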
其中,do_child函数如下:
/*
* child main loop
*/
void do_child(int unix_fd, int inet_fd)
{
…
for (;;)
{
…
/* perform accept() */
frontend = do_accept(unix_fd, inet_fd, &timeout);
if (frontend == NULL)/* connection request from frontend timed out */
{
/* check select() timeout */
if (connected && pool_config->child_life_time > 0 &&
timeout.tv_sec == 0 && timeout.tv_usec == 0)
{
pool_debug("child life %d seconds expired", pool_config->child_life_time);
/*
* Doesn't need to call this. child_exit() calls it.
* send_frontend_exits();
*/
child_exit(2);
}
continue;
}
…
/*
* Ok, negotiaton with frontend has been done. Let's go to the
* next step. Connect to backend if there's no existing
* connection which can be reused by this frontend.
* Authentication is also done in this step.
*/
…
/*
* if there's no connection associated with user and database,
* we need to connect to the backend and send the startup packet.
*/
/* look for existing connection */
found = 0;
backend = pool_get_cp(sp->user, sp->database, sp->major, 1);
…
/* Mark this connection pool is conncted from frontend */
pool_coninfo_set_frontend_connected(pool_get_process_context()->proc_id, pool_pool_index());
/* query process loop */
for (;;)
{
POOL_STATUS status;
status = pool_process_query(frontend, backend, 0);
sp = MASTER_CONNECTION(backend)->sp;
switch (status)
{
…
}
if (status != POOL_CONTINUE)
break;
}
…
}
child_exit(0);
}
在查询过程循环中,调用了函数pool_process_query,该函数是主要查询处理模块,代码如下:
/*
* Main module for query processing
* reset_request: if non 0, call reset_backend to execute reset queries
*/
POOL_STATUS pool_process_query(POOL_CONNECTION *frontend,
POOL_CONNECTION_POOL *backend,
int reset_request)
{
…
for (;;)
{
…
/*
* If we are prcessing query, process it.
*/
if (pool_is_query_in_progress())
{
status = ProcessBackendResponse(frontend, backend, &state, &num_fields);
if (status != POOL_CONTINUE)
return status;
}
/*
* If frontend and all backends do not have any pending data in
* the receiving data cache, then issue select(2) to wait for new
* data arrival
*/
else if (is_cache_empty(frontend, backend))
{
bool cont = true;
status = read_packets_and_process(frontend, backend, reset_request,
&state, &num_fields, &cont);
if (status != POOL_CONTINUE)
return status;
else if (!cont)/* Detected admin shutdown */
return status;
}
else
{
…
}
…
}
return POOL_CONTINUE;
}
检查是否有等待的数据,如果没有则调用read_packets_and_process函数并等待数据到达,其中read_packets_and_process函数定义如下:
/*
* Read packet from either frontend or backend and process it.
*/
static POOL_STATUS read_packets_and_process(POOL_CONNECTION *frontend,
POOL_CONNECTION_POOL *backend, int reset_request, int *state, short *num_fields, bool *cont)
{
…
if (!reset_request)
{
if (FD_ISSET(frontend->fd, &exceptmask))
return POOL_END;
else if (FD_ISSET(frontend->fd, &readmask))
{
status = ProcessFrontendResponse(frontend, backend);
if (status != POOL_CONTINUE)
return status;
}
}
…
return POOL_CONTINUE;
}
其中调用了ProcessFrontendResponse函数,其定义如下:
POOL_STATUS ProcessFrontendResponse(POOL_CONNECTION *frontend,
POOL_CONNECTION_POOL *backend)
{
…
switch (fkind)
{
…
case 'X': /* Terminate */
free(contents);
return POOL_END;
case 'Q': /* Query */
allow_close_transaction = 1;
status = SimpleQuery(frontend, backend, len, contents);
break;
…
default:
pool_error("ProcessFrontendResponse: unknown message type %c(%02x)", fkind, fkind);
status = POOL_ERROR;
}
free(contents);
if (status != POOL_CONTINUE)
status = POOL_ERROR;
return status;
}
在switch语句中,当消息类型为'Q'(Query)时,调用SimpleQuery函数,代码如下:
/*
* Process Query('Q') message
* Query messages include an SQL string.
*/
POOL_STATUS SimpleQuery(POOL_CONNECTION *frontend,
POOL_CONNECTION_POOL *backend, int len, char *contents)
{
…
/* log query to log file if necessary */
if (pool_config->log_statement)
{
pool_log("statement: %s", contents);
}
else
{
pool_debug("statement2: %s", contents);
}
…
if (parse_tree_list != NIL)
{
…
/*
* Decide where to send query
*/
pool_where_to_send(query_context, query_context->original_query,
query_context->parse_tree);
…
}
…
/* switch memory context */
pool_memory_context_switch_to(old_context);
return POOL_CONTINUE;
}
其中调用了决定发送查询的位置的函数pool_where_to_send,其定义如下:
/*
* Decide where to send queries(thus expecting response)
*/
void pool_where_to_send(POOL_QUERY_CONTEXT *query_context, char *query, Node *node)
{
…
/*
* In raw mode, we send only to master node. Simple enough.
*/
if (RAW_MODE)
{
pool_set_node_to_be_sent(query_context, REAL_MASTER_NODE_ID);
}
else if (MASTER_SLAVE && query_context->is_multi_statement)
{
…
}
else if (MASTER_SLAVE)
{
POOL_DEST dest;
POOL_MEMORY_POOL *old_context;
old_context = pool_memory_context_switch_to(query_context->memory_context);
dest = send_to_where(node, query);
pool_memory_context_switch_to(old_context);
pool_debug("send_to_where: %d query: %s", dest, query);
/* Should be sent to primary only? */
if (dest == POOL_PRIMARY)
{
pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
}
/* Should be sent to both primary and standby? */
else if (dest == POOL_BOTH)
{
pool_setall_node_to_be_sent(query_context);
}
/*
* Ok, we might be able to load balance the SELECT query.
*/
else
{
…
}
}
else if (REPLICATION || PARALLEL_MODE)
{
…
}
else
{
pool_error("pool_where_to_send: unknown mode");
return;
}
…
return;
}
当处在Master/Slave模式的时候,调用了函数send_to_where,代码如下:
/*
* From syntactically analysis decide the statement to be sent to the
* primary, the standby or either or both in master/slave+HR/SR mode.
*/
static POOL_DEST send_to_where(Node *node, char *query)
{
if (bsearch(&nodeTag(node), nodemap, sizeof(nodemap)/sizeof(nodemap[0]),
sizeof(NodeTag), compare) != NULL)
{
/*
* SELECT INTO
* SELECT FOR SHARE or UPDATE
*/
if (IsA(node, SelectStmt))
{
/* SELECT INTO or SELECT FOR SHARE or UPDATE ? */
if (pool_has_insertinto_or_locking_clause(node))
return POOL_PRIMARY;
return POOL_EITHER;
}
…
/*
* Transaction commands
*/
else if (IsA(node, TransactionStmt))
{
/*
* Check "BEGIN READ WRITE" "START TRANSACTION READ WRITE"
*/
if (is_start_transaction_query(node))
{
/* But actually, we send BEGIN to standby if it's
BEGIN READ WRITE or START TRANSACTION READ WRITE */
if (is_read_write((TransactionStmt *)node))
return POOL_BOTH;
/* Other TRANSACTION start commands are sent to both primary
and standby */
else
return POOL_BOTH;
}
/* SAVEPOINT related commands are sent to both primary and standby */
else if (is_savepoint_query(node))
return POOL_BOTH;
/*
* 2PC commands
*/
else if (is_2pc_transaction_query(node))
return POOL_PRIMARY;
else
/* COMMIT etc. */
return POOL_BOTH;
}
…
/*
* EXECUTE
*/
else if (IsA(node, ExecuteStmt))
{
/* This is temporary decision. where_to_send will inherit
* same destination AS PREPARE.
*/
return POOL_PRIMARY;
}
…
/*
* Other statements are sent to primary
*/
return POOL_PRIMARY;
}
/*
* All unknown statements are sent to primary
*/
return POOL_PRIMARY;
}
send_to_where函数中,处在Master/Slave模式的时候,数据的增、删、改指令只向Primary(主库)发送。begin/commit这样与事务有关的指令,则既向Primary(主库)发送,也向Standby(备库)发送。
pgpool-II进程池
配置文件pgpool.conf中有配置选项num_init_children,其为预先生成的 pgpool-II 服务进程数。默认为 32。num_init_children 也是 pgpool-II 支持的从客户端发起的最大并发连接数。如果超过 num_init_children 数的客户端尝试连接到 pgpool-II,它们将被阻塞(而不是拒绝连接),直到与任何一个 pgpool-II 进程的连接被关闭为止。最多有 2*num_init_children 个连接请求可以被放入等待队列。
main函数中的部分代码如下:
/*
* pgpool main program
*/
int main(int argc, char **argv)
{
……
/* create unix domain socket */
unix_fd = create_unix_domain_socket(un_addr);
/* create inet domain socket if any */
if (pool_config->listen_addresses[0])
{
inet_fd = create_inet_domain_socket
(pool_config->listen_addresses, pool_config->port);
}
……
/*
* We need to block signal here. Otherwise child might send some
* signals, for example SIGUSR1(fail over). Children will inherit
* signal blocking but they do unblock signals at the very beginning
* of process. So this is harmless.
*/
POOL_SETMASK(&BlockSig);
/* fork the children */
for (i=0;i<pool_config->num_init_children;i++){
process_info[i].pid = fork_a_child(unix_fd, inet_fd, i);
process_info[i].start_time = time(NULL);
}
/* set up signal handlers */
pool_signal(SIGTERM, exit_handler);
pool_signal(SIGINT, exit_handler);
pool_signal(SIGQUIT, exit_handler);
pool_signal(SIGCHLD, reap_handler);
pool_signal(SIGUSR1, failover_handler);
pool_signal(SIGUSR2, wakeup_handler);
pool_signal(SIGHUP, reload_config_handler);
/* create pipe for delivering event */
if (pipe(pipe_fds) < 0){
pool_error("failed to create pipe");
myexit(1);
}
pool_log("%s successfully started. version %s (%s)",
PACKAGE, VERSION, PGPOOLVERSION);
…… main loop is here
pool_shmem_exit(0);
}
在fork the children的for循环中,循环共执行num_init_children次,每次fork一个子进程,所以num_init_children的值是多少,就fork多少个子进程。fork_a_child定义如下:
/*
* fork a child
*/
pid_t fork_a_child(int unix_fd, int inet_fd, int id)
{
pid_t pid;
pid = fork();
if (pid == 0)
{
……
/* call child main */
POOL_SETMASK(&UnBlockSig);
reload_config_request = 0;
my_proc_id = id;
run_as_pcp_child = false;
do_child(unix_fd, inet_fd);
}
else if (pid == -1)
{
pool_error("fork() failed. reason: %s", strerror(errno));
myexit(1);
}
return pid;
}
其中调用了do_child函数,而且每fork一个进程就会执行一次do_child函数,各个子进程就开始工作了,do_child函数定义如下:
/*
* child main loop
*/
void do_child(int unix_fd, int inet_fd)
{
……
for (;;)
{
……
/* perform accept() */
frontend = do_accept(unix_fd, inet_fd, &timeout);
if (frontend == NULL)/* connection request from frontend timed out */
{
……
}
……
/* query process loop */
for (;;)
{
……
}
……
}
child_exit(0);
}
do_child函数里面调用 do_accept函数,如果客户端有请求,就会开始响应客户端,开始工作。