msplit

最新推荐文章于 2024-09-27 17:31:02 发布
weixin_34186128
最新推荐文章于 2024-09-27 17:31:02 发布
阅读量203
点赞数
文章标签： python c/c++
原文链接：https://my.oschina.net/u/572632/blog/289242
版权
2019独角兽企业重金招聘Python工程师标准>>>
简介

阅读时发现常常见到msplit的使用，因此阅读了该部分代码做一记录.
代码

/****************************************************************
 *
 * Function: mSplit()
 *
 * Purpose: Splits a string into tokens non-destructively.
 *
 * Parameters:
 *  char *
 *      The string to be split
 *  char *
 *      A string of token seperaters
 *  int
 *      The maximum number of tokens to be returned. A value
 *      of 0 means to get them all.
 *  int *
 *      Place to store the number of tokens returned
 *  char
 *      The "escape metacharacter", treat the character after
 *      this character as a literal and "escape" a seperator.
 *
 *  Note if max_toks is reached, the last tok in the returned
 *  token array will possibly have separator characters in it.
 *
 *  Returns:
 *      2D char array with one token per "row" of the returned
 *      array.
 *
 ****************************************************************/
char ** mSplit(const char *str, const char *sep_chars, const int max_toks,
               int *num_toks, const char meta_char)
{
    size_t cur_tok = 0;  /* current token index into array of strings */
    size_t tok_start;    /* index to start of token */
    size_t i, j;
    int escaped = 0;
    /* It's rare we'll need more than this even if max_toks is set really
     * high.  Store toks here until finished, then allocate.  If more than
     * this is necessary, then allocate max toks */
    char *toks_buf[TOKS_BUF_SIZE];
    size_t toks_buf_size = TOKS_BUF_SIZE;
    int toks_buf_size_increment = 10;
    char **toks_alloc = NULL;   /* Used if the static buf isn't enough */
    char **toks = toks_buf;     /* Pointer to one of the two above */
    char **retstr;
    char *whitespace = " \t";

    if (num_toks == NULL)
        return NULL;

    *num_toks = 0;
     /** 被分割字串为空或者切割标志串不为空且长度为0这停止处理*/
    if ((str == NULL) || (strlen(str) == 0) ||
        ((sep_chars != NULL) && (strlen(sep_chars) == 0)))
    {
        return NULL;
    }

    /** 如果切割标志串为空则替换为" \t"*/
    if (sep_chars == NULL)
        sep_chars = whitespace;
    /** 切割标志中不能有结尾标志*/
    /* Meta char cannot also be a separator char */
    for (i = 0; i < strlen(sep_chars); i++)
    {
        if (sep_chars[i] == meta_char)
            return NULL;
    }

    /** 跳过被分割串头部所有的切割标志以及空格字符*/
    /* Move past initial separator characters and whitespace */
    for (i = 0; i < strlen(str); i++)
    {
        for (j = 0; j < strlen(sep_chars); j++)
        {
            if ((str[i] == sep_chars[j]) ||
                isspace((int)str[i]))
            {
                break;
            }
        }

        /* Not a separator character or whitespace */
        if (j == strlen(sep_chars))
            break;
    }

    /*整个串中都是空格字符或者分割标志*/
    if (i == strlen(str))
    {
        /* Nothing but separator characters or whitespace in string */
        return NULL;
    }

    /* User only wanted one tok so return the rest of the string in
     * one tok */
     /**用户只需要一个字串时直接返回处理后的串*/
    if ((cur_tok + 1) == (size_t)max_toks)
    {
        retstr = (char **)SnortAlloc(sizeof(char *));
        retstr[cur_tok] = SnortStrndup(&str[i], strlen(str) - i);
        if (retstr[cur_tok] == NULL)
        {
            mSplitFree(&retstr, cur_tok + 1);
            return NULL;
        }

        *num_toks = cur_tok + 1;
        return retstr;
    }

    /* Mark the beginning of the next tok */
    tok_start = i;
    for (; i < strlen(str); i++)
    {
        if (!escaped)                           /** 是否扫描到结尾标志*/
        {
            /* Got an escape character.  Don't include it now, but
             * must be a character after it. */
            if (str[i] == meta_char)         /**遇到结尾标志打上标记继续下个循环*/
            {
                escaped = 1;
                continue;
            }

            /* See if the current character is a separator */
            /**检查是否是分割标志*/
            for (j = 0; j < strlen(sep_chars); j++) 
            {
             /**找到直接进入下个*/
                if (str[i] == sep_chars[j])        
                    break;
            }

            /* It's a normal character */
            /**不是分割标志或结尾标志继续检查下个字符*/
            if (j == strlen(sep_chars))             
                continue;

            /* Current character matched a separator character.  Trim off
             * whitespace previous to the separator.  If we get here, there
             * is at least one savable character */
             /**让j 指向分割标志左端的第一个非空字符*/
            for (j = i; j > tok_start; j--)         
            {
                if (!isspace((int)str[j - 1]))
                    break;
            }
             /** 获取分割出的字串*/                                       
            /* Allocate a buffer.  The length will not have included the
             * meta char of escaped separators */
            toks[cur_tok] = mSplitAddTok(&str[tok_start], j - tok_start, sep_chars, meta_char);

            /* Increment current token index */
            cur_tok++;

            /** 跳过被分割串头部所有的切割标志以及空格字符*/
            /* Move past any more separator characters or whitespace */
            for (; i < strlen(str); i++)
            {
                for (j = 0; j < strlen(sep_chars); j++)
                {
                    if ((str[i] == sep_chars[j]) ||
                        isspace((int)str[i]))
                    {
                        break;
                    }
                }

                /* Not a separator character or whitespace */
                if (j == strlen(sep_chars))
                    break;
            }

            /**若果剩下的全是分割字符以及空格字符可以返回结构*/
            /* Nothing but separator characters or whitespace left in the string */
            if (i == strlen(str))
            {
                *num_toks = cur_tok;

                if (toks != toks_alloc)
                {
                    retstr = (char **)SnortAlloc(sizeof(char *) * cur_tok);
                    memcpy(retstr, toks, (sizeof(char *) * cur_tok));
                }
                else
                {
                    retstr = toks;
                }

                return retstr;
            }

            /** 下面的是存储空间的处理*/
            /* Reached the size of our current string buffer and need to
             * allocate something bigger.  Only get here once if max toks
             * set to something other than 0 because we'll just allocate
             * max toks in that case. */
            if (cur_tok == toks_buf_size)
            {
                char **tmp;

                if (toks_alloc != NULL)
                    tmp = toks_alloc;
                else
                    tmp = toks_buf;

                if (max_toks != 0)
                    toks_buf_size = max_toks;
                else
                    toks_buf_size = cur_tok + toks_buf_size_increment;

                toks_alloc = (char **)SnortAlloc(sizeof(char *) * toks_buf_size);
                memcpy(toks_alloc, tmp, (sizeof(char *) * cur_tok));
                toks = toks_alloc;

                if (tmp != toks_buf)
                    free(tmp);
            }

            if ((max_toks != 0) && ((cur_tok + 1) == (size_t)max_toks))
            {
                /* Return rest of string as last tok */
                *num_toks = cur_tok + 1;

                /* Already got a ret string */
                if (toks != toks_alloc)
                {
                    retstr = (char **)SnortAlloc(sizeof(char *) * (cur_tok + 1));
                    memcpy(retstr, toks, (sizeof(char *) * (cur_tok + 1)));
                }
                else
                {
                    retstr = toks;
                }

                /* Trim whitespace at end of last tok */
                for (j = strlen(str); j > tok_start; j--)
                {
                    if (!isspace((int)str[j - 1]))
                        break;
                }

                retstr[cur_tok] = SnortStrndup(&str[i], j - i);
                if (retstr[cur_tok] == NULL)
                {
                    mSplitFree(&retstr, cur_tok + 1);
                    return NULL;
                }

                return retstr;
            }

            tok_start = i;
        }
        else
        {
            /* This character is escaped with the meta char */
            escaped = 0;
        }
    }

    /* Last character was an escape character */
    if (escaped)
    {
        for (i = 0; i < cur_tok; i++)
            free(toks[i]);

        if (toks == toks_alloc)
            free(toks_alloc);

        return NULL;
    }

    /* Trim whitespace at end of last tok */
    for (j = i; j > tok_start; j--)
    {
        if (!isspace((int)str[j - 1]))
            break;
    }

    /* Last character was not a separator character so we've got
     * one more tok.  Unescape escaped sepatator charactors */
    if (toks != toks_alloc)
    {
        retstr = (char **)SnortAlloc(sizeof(char *) * (cur_tok + 1));
        memcpy(retstr, toks, (sizeof(char *) * (cur_tok + 1)));
    }
    else
    {
        retstr = toks;
    }

    retstr[cur_tok] = mSplitAddTok(&str[tok_start], j - tok_start, sep_chars, meta_char);

    /* Just add one to cur_tok index instead of incrementing
     * since we're done */
    *num_toks = cur_tok + 1;
    return retstr;
}

/* Will not return NULL.  SnortAlloc will fatal if it fails */
static char * mSplitAddTok(const char *str, const int len, const char *sep_chars, const char meta_char)
{
    size_t i, j, k;
    char *tok;
    int tok_len = 0;
    int got_meta = 0;

    /* Get the length of the returned tok
     * Could have a maximum token length and use a fixed sized array and
     * fill it in as we go but don't want to put on that constraint */
    for (i = 0; (int)i < len; i++)
    {
        if (!got_meta)
        {
            if (str[i] == meta_char)
            {
                got_meta = 1;
                continue;
            }
        }
        else
        {
            /* See if the current character is a separator */
            for (j = 0; j < strlen(sep_chars); j++)
            {
                if (str[i] == sep_chars[j])
                    break;
            }

            /* It's a non-separator character, so include
             * the meta character in the return tok */
            if (j == strlen(sep_chars))
                tok_len++;

            got_meta = 0;
        }

        tok_len++;
    }

    /* Allocate it and fill it in */
    tok = (char *)SnortAlloc(tok_len + 1);
    for (i = 0, k = 0; (int)i < len; i++)
    {
        if (!got_meta)
        {
            if (str[i] == meta_char)
            {
                got_meta = 1;
                continue;
            }
        }
        else
        {
            /* See if the current character is a separator */
            for (j = 0; j < strlen(sep_chars); j++)
            {
                if (str[i] == sep_chars[j])
                    break;
            }

            /* It's a non-separator character, so include
             * the meta character in the return tok */
            if (j == strlen(sep_chars))
                tok[k++] = meta_char;

            got_meta = 0;
        }

        tok[k++] = str[i];
    }

    return tok;
}