pgpool-II分析篇

最新推荐文章于 2022-04-21 10:20:00 发布

choulaogua2082

最新推荐文章于 2022-04-21 10:20:00 发布

阅读量789

点赞数

文章标签：数据库

原文链接：https://my.oschina.net/u/3308173/blog/906579

版权

pgpool-II的SQL解析

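从源码中，我们看到相关函数调用顺序如下：

do_child → pool_process_query → read_packets_and_process → ProcessFrontendResponse → SimpleQuery → pool_where_to_send → send_to_where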

其中，do_child函数如下：

/*                                    
* child main loop                                    
*/                                    
void do_child(int unix_fd, int inet_fd)                                    
{                                    
…                                    
    for (;;)                                
    {                                
        …                            
        /* perform accept() */                            
        frontend = do_accept(unix_fd, inet_fd, &timeout);                            
        if (frontend =/=* N cUonLLn)ection request from frontend timed out */                            
        {                            
            /* check select() timeout */                        
            if (connected && pool_config->child_life_time > 0 &&                        
                timeout.tv_sec == 0 && timeout.tv_usec == 0)                    
            {                        
                pool_debug("child life %d seconds expired", pool_config->child_life_time);                    
                /*                    
                * Doesn't need to call this. child_exit() calls it.                    
                * send_frontend_exits();                    
                */                    
                child_exit(2);                    
            }                        
            continue;                        
        }                            
        …                            
        /*                            
        * Ok, negotiaton with frontend has been done. Let's go to the                            
        * next step. Connect to backend if there's no existing                            
        * connection which can be reused by this frontend.                            
        * Authentication is also done in this step.                            
        */                            
        …                            
        /*                            
        * if there's no connection associated with user and database,                            
        * we need to connect to the backend and send the startup packet.                            
        */                            
        /* look for existing connection */                            
        found = 0;                            
        backend = pool_get_cp(sp->user, sp->database, sp->major, 1);                            
        …                            
        /* Mark this connection pool is conncted from frontend */                            
        pool_coninfo_set_frontend_connected(pool_get_process_context()->proc_id, pool_pool_index());                            
        /* query process loop */                            
        for (;;)                            
        {                            
            POOL_STATUS status;                        
            status = pool_process_query(frontend, backend, 0);                        
            sp = MASTER_CONNECTION(backend)->sp;                        
            switch (status)                        
            {                        
                …                    
            }                        
            if (status != POOL_CONTINUE)                        
                break;                    
        }                            
        …                            
    }                                
    child_exit(0);                                
}

在查询过程循环中，调用了函数pool_process_query，该函数是主要查询处理模块，代码如下：

/*
 * Main module for query processing
 *
 * Excerpt; "…" marks code that the article elided.
 *
 * frontend:      connection to the client
 * backend:       connection pool entry for the backend(s)
 * reset_request: if non 0, call reset_backend to execute reset queries
 *
 * Returns the first status other than POOL_CONTINUE reported by
 * ProcessBackendResponse() or read_packets_and_process(), or the
 * current status when read_packets_and_process() clears "cont".
 */
POOL_STATUS pool_process_query(POOL_CONNECTION *frontend,
                        POOL_CONNECTION_POOL *backend,
                        int reset_request)
{
    …
    for (;;)
    {
        …
        /*
        * If we are processing query, process it.
        */
        if (pool_is_query_in_progress())
        {
            status = ProcessBackendResponse(frontend, backend, &state, &num_fields);
            if (status != POOL_CONTINUE)
                return status;
        }
        /*
        * If frontend and all backends do not have any pending data in
        * the receiving data cache, then issue select(2) to wait for new
        * data arrival
        */
        else if (is_cache_empty(frontend, backend))
        {
            bool cont = true;
            status = read_packets_and_process(frontend, backend, reset_request,
                                    &state, &num_fields, &cont);
            if (status != POOL_CONTINUE)
                return status;
            else if (!cont) /* Detected admin shutdown */
                return status;
        }
        else
        {
            …
        }
        …
    }
    return POOL_CONTINUE;
}

检查是否有等待的数据，如果没有则调用read_packets_and_process函数并等待数据到达，其中read_packets_and_process函数定义如下：

/*
 * Read packet from either frontend or backend and process it.
 *
 * Excerpt; "…" marks code that the article elided.  Only the frontend
 * branch is shown here: state, num_fields and cont are not touched by
 * the visible lines.
 *
 * Returns POOL_END when the frontend descriptor is set in exceptmask,
 * the status of ProcessFrontendResponse() when that is not
 * POOL_CONTINUE, and POOL_CONTINUE otherwise.
 */
static POOL_STATUS read_packets_and_process(POOL_CONNECTION *frontend,
POOL_CONNECTION_POOL *backend, int reset_request, int *state, short *num_fields, bool *cont)
{
    …
    /* the frontend descriptor is only examined when this is not a reset request */
    if (!reset_request)
    {
        if (FD_ISSET(frontend->fd, &exceptmask))
            return POOL_END;
        else if (FD_ISSET(frontend->fd, &readmask))
        {
            /* frontend has data to read: handle one frontend message */
            status = ProcessFrontendResponse(frontend, backend);
            if (status != POOL_CONTINUE)
                return status;
        }
    }
    …
    return POOL_CONTINUE;
}

其中调用了ProcessFrontendResponse函数，其定义如下：

/*
 * Dispatch one frontend message according to its kind byte (fkind).
 *
 * Excerpt; "…" marks code that the article elided, including the code
 * that reads fkind, len and contents.
 *
 * Returns POOL_END for Terminate ('X').  For the other kinds shown, any
 * handler status other than POOL_CONTINUE is reported as POOL_ERROR.
 */
POOL_STATUS ProcessFrontendResponse(POOL_CONNECTION *frontend,
                            POOL_CONNECTION_POOL *backend)
{
    …
    switch (fkind)
    {
        …
        case 'X': /* Terminate */
            /* returns early, so contents is released here rather than below */
            free(contents);
            return POOL_END;
        case 'Q': /* Query */
            allow_close_transaction = 1;
            status = SimpleQuery(frontend, backend, len, contents);
            break;
        …
        default:
            pool_error("ProcessFrontendResponse: unknown message type %c(%02x)", fkind, fkind);
            status = POOL_ERROR;
    }
    free(contents);
    /* collapse every non-continue status into POOL_ERROR */
    if (status != POOL_CONTINUE)
        status = POOL_ERROR;
    return status;
}

在switch语句中，当case为query时，调用SimpleQuery函数，代码如下：

/*
 * Process Query('Q') message
 * Query messages include an SQL string.
 *
 * Excerpt; "…" marks code that the article elided, including the code
 * that builds parse_tree_list and query_context.
 *
 * contents: the SQL string carried by the message; it is written to the
 *           log (or debug log) and later routed via pool_where_to_send().
 *
 * The visible path returns POOL_CONTINUE.
 */
POOL_STATUS SimpleQuery(POOL_CONNECTION *frontend,
                    POOL_CONNECTION_POOL *backend, int len, char *contents)
{
    …
    /* log query to log file if necessary */
    if (pool_config->log_statement)
    {
        pool_log("statement: %s", contents);
    }
    else
    {
        pool_debug("statement2: %s", contents);
    }
    …
    /* routing is only decided when the parser produced a parse tree list */
    if (parse_tree_list != NIL)
    {
        …
        /*
        * Decide where to send query
        */
        pool_where_to_send(query_context, query_context->original_query,
                        query_context->parse_tree);
        …
    }
    …
    /* switch memory context */
    pool_memory_context_switch_to(old_context);
    return POOL_CONTINUE;
}

其中调用了决定发送查询的位置的函数pool_where_to_send，其定义如下：

/*
 * Decide where to send queries(thus expecting response)
 *
 * Excerpt; "…" marks code that the article elided.  The destination is
 * chosen by operating mode: raw mode, master/slave with a
 * multi-statement query, master/slave, or replication/parallel mode.
 * Any other mode is reported with pool_error().
 *
 * query_context: receives the chosen destination node(s)
 * query:         the query string, passed to send_to_where()
 * node:          the parse tree node, passed to send_to_where()
 */
void pool_where_to_send(POOL_QUERY_CONTEXT *query_context, char *query, Node *node)
{
    …
    /*
    * In raw mode, we send only to master node. Simple enough.
    */
    if (RAW_MODE)
    {
        pool_set_node_to_be_sent(query_context, REAL_MASTER_NODE_ID);
    }
    else if (MASTER_SLAVE && query_context->is_multi_statement)
    {
    …
    }
    else if (MASTER_SLAVE)
    {
        POOL_DEST dest;
        POOL_MEMORY_POOL *old_context;
        /* run send_to_where() in the query context's memory context, then switch back */
        old_context = pool_memory_context_switch_to(query_context->memory_context);
        dest = send_to_where(node, query);
        pool_memory_context_switch_to(old_context);
        pool_debug("send_to_where: %d query: %s", dest, query);
        /* Should be sent to primary only? */
        if (dest == POOL_PRIMARY)
        {
            pool_set_node_to_be_sent(query_context, PRIMARY_NODE_ID);
        }
        /* Should be sent to both primary and standby? */
        else if (dest == POOL_BOTH)
        {
            pool_setall_node_to_be_sent(query_context);
        }
        /*
        * Ok, we might be able to load balance the SELECT query.
        */
        else
        {
            …
        }
    }
    else if (REPLICATION || PARALLEL_MODE)
    {
        …
    }
    else
    {
        pool_error("pool_where_to_send: unknown mode");
        return;
    }
    …
    return;
}

当处在Master/Slave模式的时候，调用了函数send_to_where，代码如下：

/*
 * From syntactic analysis, decide whether the statement is to be sent
 * to the primary, the standby, either, or both in master/slave+HR/SR mode.
 *
 * Excerpt; "…" marks code that the article elided.
 *
 * node:  parse tree node of the statement
 * query: the query string (not used by the visible lines)
 *
 * Returns POOL_PRIMARY, POOL_EITHER or POOL_BOTH.  Statements whose
 * node tag is not found in nodemap fall through to POOL_PRIMARY.
 */
static POOL_DEST send_to_where(Node *node, char *query)
{
    /* only node tags listed in nodemap get the per-statement analysis below */
    if (bsearch(&nodeTag(node), nodemap, sizeof(nodemap)/sizeof(nodemap[0]),
            sizeof(NodeTag), compare) != NULL)
    {
        /*
        * SELECT INTO
        * SELECT FOR SHARE or UPDATE
        */
        if (IsA(node, SelectStmt))
        {
            /* SELECT INTO or SELECT FOR SHARE or UPDATE ? */
            if (pool_has_insertinto_or_locking_clause(node))
                return POOL_PRIMARY;
            return POOL_EITHER;
        }
        …
        /*
        * Transaction commands
        */
        else if (IsA(node, TransactionStmt))
        {
            /*
            * Check "BEGIN READ WRITE" "START TRANSACTION READ WRITE"
            */
            if (is_start_transaction_query(node))
            {
                /* But actually, we send BEGIN to standby if it's
                BEGIN READ WRITE or START TRANSACTION READ WRITE */
                /* NOTE: both branches of this test return POOL_BOTH */
                if (is_read_write((TransactionStmt *)node))
                    return POOL_BOTH;
                /* Other TRANSACTION start commands are sent to both primary
                    and standby */
                else
                    return POOL_BOTH;
            }
            /* SAVEPOINT related commands are sent to both primary and standby */
            else if (is_savepoint_query(node))
                return POOL_BOTH;
            /*
            * 2PC commands
            */
            else if (is_2pc_transaction_query(node))
                return POOL_PRIMARY;
            else
            /* COMMIT etc. */
                return POOL_BOTH;
        }
        …
        /*
        * EXECUTE
        */
        else if (IsA(node, ExecuteStmt))
        {
            /* This is temporary decision. where_to_send will inherit
            * same destination AS PREPARE.
            */
            return POOL_PRIMARY;
        }
        …
        /*
        * Other statements are sent to primary
        */
        return POOL_PRIMARY;
    }

    /*
    * All unknown statements are sent to primary
    */
    return POOL_PRIMARY;
}

从send_to_where函数可以看出，处在Master/Slave模式的时候，数据的增、删、改指令只向Primary节点发送；而begin/commit这样与事务有关的指令，则同时向Primary节点和Standby节点发送。

pgpool-II进程池

配置文件pgpool.conf中有配置选项num_init_children，其为预先生成的 pgpool-II 服务进程数。默认为 32。num_init_children 也是 pgpool-II 支持的从客户端发起的最大并发连接数。如果超过 num_init_children 数的客户端尝试连接到 pgpool-II，它们将被阻塞（而不是拒绝连接），直到某个已连接到 pgpool-II 进程的连接被关闭为止。最多有 2*num_init_children 个连接请求可以被放入等待队列。

main函数中的部分代码如下：

/*
 * pgpool main program
 *
 * Excerpt; "……" marks code that the article elided.  The visible part
 * creates the listening sockets, blocks signals, forks
 * pool_config->num_init_children child processes, installs the signal
 * handlers and creates the event pipe before entering the main loop.
 */
int main(int argc, char **argv)
{
    ……
    /* create unix domain socket */
    unix_fd = create_unix_domain_socket(un_addr);

    /* create inet domain socket if any */
    if (pool_config->listen_addresses[0])
    {
        inet_fd = create_inet_domain_socket
          (pool_config->listen_addresses, pool_config->port);
    }

    ……
    /*
     * We need to block signal here. Otherwise child might send some
     * signals, for example SIGUSR1(fail over).  Children will inherit
     * signal blocking but they do unblock signals at the very beginning
     * of process.  So this is harmless.
     */
    POOL_SETMASK(&BlockSig);

    /* fork the children, recording each child's pid and start time */
    for (i=0;i<pool_config->num_init_children;i++){
        process_info[i].pid = fork_a_child(unix_fd, inet_fd, i);
        process_info[i].start_time = time(NULL);
    }

    /* set up signal handlers */
    pool_signal(SIGTERM, exit_handler);
    pool_signal(SIGINT, exit_handler);
    pool_signal(SIGQUIT, exit_handler);
    pool_signal(SIGCHLD, reap_handler);
    pool_signal(SIGUSR1, failover_handler);
    pool_signal(SIGUSR2, wakeup_handler);
    pool_signal(SIGHUP, reload_config_handler);

    /* create pipe for delivering event */
    if (pipe(pipe_fds) < 0){
        pool_error("failed to create pipe");
        myexit(1);
    }

    pool_log("%s successfully started. version %s (%s)", 
               PACKAGE, VERSION, PGPOOLVERSION);

    …… main loop is here

    pool_shmem_exit(0);
}

在fork the children的for循环中，循环共执行num_init_children次，每次调用fork_a_child创建一个子进程，所以num_init_children的值是多少，就会fork出多少个子进程。fork_a_child定义如下：

/*
 * fork a child
 *
 * Excerpt; "……" marks code that the article elided.
 *
 * unix_fd: UNIX domain socket descriptor handed on to do_child()
 * inet_fd: INET domain socket descriptor handed on to do_child()
 * id:      stored in my_proc_id in the child
 *
 * Returns the value of fork() to the caller.  If fork() fails the error
 * is logged and myexit(1) is called.
 */
pid_t fork_a_child(int unix_fd, int inet_fd, int id)
{
    pid_t pid;

    pid = fork();

    if (pid == 0)
    {
        /* child process */
        ……
        /* call child main */
        POOL_SETMASK(&UnBlockSig);
        reload_config_request = 0;
        my_proc_id = id;
        run_as_pcp_child = false;
        do_child(unix_fd, inet_fd);
    }
    else if (pid == -1)
    {
        pool_error("fork() failed. reason: %s", strerror(errno));
        myexit(1);
    }
    return pid;
}

其中调用了do_child函数，而且每fork一个进程就会执行一次do_child函数，各个子进程就开始工作了，do_child函数定义如下：

/*
 * child main loop
 *
 * Heavily abridged excerpt; "……" marks code that the article elided.
 * Shows only the skeleton: an outer loop that accepts a frontend with
 * do_accept(), and an inner query process loop per accepted frontend.
 *
 * unix_fd: UNIX domain socket descriptor handed to do_accept()
 * inet_fd: INET domain socket descriptor handed to do_accept()
 */
void do_child(int unix_fd, int inet_fd)
{
    ……

    for (;;)
    {
        ……
        /* perform accept() */
        frontend = do_accept(unix_fd, inet_fd, &timeout);

        if (frontend == NULL)/* connection request from frontend timed out */
        {
            ……
        }
        ……
        /* query process loop */
        for (;;)
        {
            ……
        }
        ……
    }
    child_exit(0);
}

do_child函数里面调用 do_accept函数，如果客户端有请求，就会开始响应客户端，开始工作。

转载于:https://my.oschina.net/u/3308173/blog/906579