初步实现带有数字的字符串的排序（数字区别大小）

最新推荐文章于 2025-03-14 16:05:02 发布

ManLikeTheWind

最新推荐文章于 2025-03-14 16:05:02 发布

阅读量3k

点赞数

分类专栏：算法 java 基础

java 基础同时被 2 个专栏收录

6 篇文章

订阅专栏

算法

2 篇文章

订阅专栏

介绍一种用于字符串中数字部分按数值大小进行排序的算法，解决了传统字典排序无法正确处理带前导零数字的问题。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

 版权声明：本文为博主原创文章，未经博主允许不得转载。

目录(?)[-]
描述
返回值
遵循于

问题

对于fss2，fss10，fss1，fss11，fss02排序，如果想对数字识别大小排序，传统排序是字典排序，不能达到目标，如下表：

散列	字典排序	目标排序
fss2	fss1	fss1
fss10	fss02	fss2
fss1	fss10	fss02
fss11	fss11	fss10
fss02	fss2	fss11

我们对字符串比较是对数字进行识别，数字就按数值大小排列。

1、假设比较单元，非数字字符就是一个比较单元；连续数字是一个比较单元。

2、数字前字符串连续相同。

3、数值相等但有前缀0若干个的，设定多的为大，升序排列靠下。

4、数字主要针对整数，可无限大，小数其实就是3个比较单元：数字、点、数字。

C源码

[cpp]view plaincopy 
    
 #include <stdio.h>  
 #include <tchar.h>  
 #include <stdlib.h>  
 #include <locale.h>  
   
 #ifdef _UNICODE  
 #define LogicCompare LogicCompareW  
 #define findDigitEnd findDigitEndW  
 #else  
 #define LogicCompare LogicCompareA  
 #define findDigitEnd findDigitEndA  
 #endif  
   
 /*
  * Forward declarations.  The helpers are declared under their real,
  * suffixed names: going through the findDigitEnd macro would declare one
  * identifier twice with two different signatures and leave the other
  * variant undeclared.
  */
 int LogicCompareW(wchar_t *psza, wchar_t *pszb);
 int LogicCompareA(char *psza, char *pszb);
 wchar_t * findDigitEndW(wchar_t **pszBuffer);
 char * findDigitEndA(char **pszBuffer);
   
 /*
  * Scan the run of decimal digits that starts at *pszBuffer (wide variant).
  *
  * pszBuffer: in/out cursor.  On return it has been advanced past the
  *            run's leading '0' characters, so it points at the first
  *            significant digit, or at the end of the run when the run is
  *            empty or consists only of zeros.
  *
  * Returns a pointer to the first character after the digit run (the first
  * non-digit or the terminating NUL).  When *pszBuffer does not start with
  * a digit, the result equals the incoming *pszBuffer and the cursor is
  * left untouched.
  *
  * Only LEADING zeros are skipped: a zero that follows a non-zero digit is
  * significant, so for "0102" the cursor ends on "102", not on "02".
  */
 wchar_t * findDigitEndW(wchar_t **pszBuffer)
 {
     wchar_t *pszEnd = *pszBuffer;

     /* Leading zeros carry no numeric weight: step the caller's cursor over them. */
     while (L'0' == *pszEnd)
     {
         pszEnd++;
     }
     *pszBuffer = pszEnd;

     /* Rest of the run; the NUL terminator is below '0', so it ends the scan too. */
     while (*pszEnd >= L'0' && *pszEnd <= L'9')
     {
         pszEnd++;
     }
     return pszEnd;
 }
   
 /*
  * Scan the run of decimal digits that starts at *pszBuffer (narrow variant).
  *
  * pszBuffer: in/out cursor.  On return it has been advanced past the
  *            run's leading '0' characters, so it points at the first
  *            significant digit, or at the end of the run when the run is
  *            empty or consists only of zeros.
  *
  * Returns a pointer to the first character after the digit run (the first
  * non-digit or the terminating NUL).  When *pszBuffer does not start with
  * a digit, the result equals the incoming *pszBuffer and the cursor is
  * left untouched.
  *
  * Only LEADING zeros are skipped: a zero that follows a non-zero digit is
  * significant, so for "0102" the cursor ends on "102", not on "02".
  * Plain character literals are used because this variant always works on
  * char, whatever _UNICODE says.
  */
 char * findDigitEndA(char **pszBuffer)
 {
     char *pszEnd = *pszBuffer;

     /* Leading zeros carry no numeric weight: step the caller's cursor over them. */
     while ('0' == *pszEnd)
     {
         pszEnd++;
     }
     *pszBuffer = pszEnd;

     /* Rest of the run; the NUL terminator is below '0', so it ends the scan too. */
     while (*pszEnd >= '0' && *pszEnd <= '9')
     {
         pszEnd++;
     }
     return pszEnd;
 }
   
 /*
  * Natural-order comparison of two wide strings.
  *
  * Each non-digit character is one comparison unit; each maximal run of
  * decimal digits is one unit as well.  In "fang0001san022san" the letters
  * are compared one by one while "0001" and "022" are each compared as a
  * whole number.  Numbers are compared by their significant digits (first
  * by digit count, then digit by digit), so they may be arbitrarily long.
  *
  * psza, pszb: NUL-terminated strings, or NULL.
  *
  * Returns <0, 0 or >0 when psza sorts before, equal to or after pszb.
  * A NULL argument sorts before any string and two NULLs compare equal.
  */
 int LogicCompareW(wchar_t *psza, wchar_t *pszb){
     wchar_t *paCur = psza, *pbCur = pszb;

     if (NULL == psza || NULL == pszb)
         return psza == pszb ? 0 : (NULL == psza ? -1 : 1);

     /* Stop as soon as EITHER string is exhausted. */
     while (*paCur && *pbCur){
         /* layout of a digit run:   0  ...  0   1   2 ...
          *                        paCur      paNonZero      paDigitEnd */
         wchar_t *paNonZero = paCur, *pbNonZero = pbCur;
         wchar_t *paDigitEnd = findDigitEndW(&paNonZero);
         wchar_t *pbDigitEnd = findDigitEndW(&pbNonZero);

         if (paDigitEnd > paCur && pbDigitEnd > pbCur)
         {
             /* Both cursors sit on a number: more significant digits means larger. */
             int aDigitLength = (int)(paDigitEnd - paNonZero);
             int bDigitLength = (int)(pbDigitEnd - pbNonZero);
             wchar_t *paDigit, *pbDigit;

             if (aDigitLength != bDigitLength)
                 return aDigitLength - bDigitLength;

             /* Same digit count: the first differing digit decides. */
             for (paDigit = paNonZero, pbDigit = pbNonZero; paDigit < paDigitEnd; paDigit++, pbDigit++){
                 if (*paDigit != *pbDigit)
                     return *paDigit - *pbDigit;
             }

             /* Equal values: break the tie on the number of skipped zeros.
              * NOTE(review): b - a ranks the operand with MORE zeros first;
              * rule 3 of the accompanying article and the Java ChsLogicCmp
              * rank it last.  Behavior kept as is; confirm the intended order. */
             aDigitLength = (int)(paNonZero - paCur);
             bDigitLength = (int)(pbNonZero - pbCur);
             if (aDigitLength != bDigitLength)
                 return bDigitLength - aDigitLength;

             paCur = paDigitEnd;
             pbCur = pbDigitEnd;
         }
         else{
             /* At least one side is not on a number: plain character comparison. */
             if (*paCur != *pbCur)
                 return *paCur - *pbCur;
             paCur++;
             pbCur++;
         }
     }

     /* One string ended: the shorter one sorts first, equal length gives 0. */
     return *paCur - *pbCur;
 }
   
 /*
  * Natural-order comparison of two narrow (char) strings.
  *
  * Each non-digit character is one comparison unit; each maximal run of
  * decimal digits is one unit as well.  In "fang0001san022san" the letters
  * are compared one by one while "0001" and "022" are each compared as a
  * whole number.  Numbers are compared by their significant digits (first
  * by digit count, then digit by digit), so they may be arbitrarily long.
  *
  * psza, pszb: NUL-terminated strings, or NULL.
  *
  * Returns <0, 0 or >0 when psza sorts before, equal to or after pszb.
  * A NULL argument sorts before any string and two NULLs compare equal.
  * Non-digit characters are compared as unsigned char, so the result does
  * not depend on whether plain char is signed.
  */
 int LogicCompareA(char *psza, char *pszb){
     char *paCur = psza, *pbCur = pszb;

     if (NULL == psza || NULL == pszb)
         return psza == pszb ? 0 : (NULL == psza ? -1 : 1);

     /* Stop as soon as EITHER string is exhausted. */
     while (*paCur && *pbCur){
         /* layout of a digit run:   0  ...  0   1   2 ...
          *                        paCur      paNonZero      paDigitEnd */
         char *paNonZero = paCur, *pbNonZero = pbCur;
         char *paDigitEnd = findDigitEndA(&paNonZero);
         char *pbDigitEnd = findDigitEndA(&pbNonZero);

         if (paDigitEnd > paCur && pbDigitEnd > pbCur)
         {
             /* Both cursors sit on a number: more significant digits means larger. */
             int aDigitLength = (int)(paDigitEnd - paNonZero);
             int bDigitLength = (int)(pbDigitEnd - pbNonZero);
             char *paDigit, *pbDigit;

             if (aDigitLength != bDigitLength)
                 return aDigitLength - bDigitLength;

             /* Same digit count: the first differing digit decides. */
             for (paDigit = paNonZero, pbDigit = pbNonZero; paDigit < paDigitEnd; paDigit++, pbDigit++){
                 if (*paDigit != *pbDigit)
                     return *paDigit - *pbDigit;
             }

             /* Equal values: break the tie on the number of skipped zeros.
              * NOTE(review): b - a ranks the operand with MORE zeros first;
              * rule 3 of the accompanying article and the Java ChsLogicCmp
              * rank it last.  Behavior kept as is; confirm the intended order. */
             aDigitLength = (int)(paNonZero - paCur);
             bDigitLength = (int)(pbNonZero - pbCur);
             if (aDigitLength != bDigitLength)
                 return bDigitLength - aDigitLength;

             paCur = paDigitEnd;
             pbCur = pbDigitEnd;
         }
         else{
             /* At least one side is not on a number: plain byte comparison. */
             if (*paCur != *pbCur)
                 return (unsigned char)*paCur - (unsigned char)*pbCur;
             paCur++;
             pbCur++;
         }
     }

     /* One string ended: the shorter one sorts first, equal length gives 0. */
     return (unsigned char)*paCur - (unsigned char)*pbCur;
 }
   
 /*
  * Print both operands in aligned columns (no trailing newline), then
  * return LogicCompare(psza, pszb) so the caller can print the result on
  * the same line.
  *
  * psza, pszb: NUL-terminated strings; they are passed to _tcslen, so they
  *             must not be NULL.
  */
 int LogicCompareWithPrint(TCHAR *psza, TCHAR *pszb){
     /* "%*s" consumes an int field width, while _tcslen yields a size_t:
      * convert explicitly.  A negative width (operand longer than its
      * column) is taken by printf as left-justification. */
     int aPadWidth = 12 - (int)_tcslen(psza);
     int bPadWidth = 10 - (int)_tcslen(pszb);

     /* The separators go through _T() as well, so that "%s" receives the
      * character type that matches the selected _tprintf variant. */
     _tprintf(_T("LogicCompare : %s%*s%s%*s"), psza, aPadWidth, _T(","), pszb, bPadWidth, _T(" "));
     return LogicCompare(psza, pszb);
 }
   
 /*
  * Demo driver: prints the result of a few pairwise comparisons, sorts a
  * sample list with LogicCompare, prints the sorted list and waits for a
  * key press.  argc and argv are not used.
  */
 int _tmain(int argc, _TCHAR* argv[])
 {
     /* Pairs whose comparison result is printed, in output order. */
     TCHAR *pairs[][2] = {
         { _T("a00011b01"), _T("a011b01") },
         { _T("a0011"),     _T("a02") },
         { _T("a011b011"),  _T("a011b11") },
         { _T("a011b11"),   _T("a011b2") },
         { _T("a11"),       _T("a2") },
         { _T("0"),         _T("_") }
     };
     const int pairCount = (int)(sizeof pairs / sizeof pairs[0]);

     for (int k = 0; k < pairCount; k++){
         /* The operands are printed by the call, the result right after them. */
         int result = LogicCompareWithPrint(pairs[k][0], pairs[k][1]);
         _tprintf(_T("\t: %d\n"), result);
     }
     _tprintf(_T("'0'(48) - '_'(95) = %d\n\n"), _T('0') - _T('_'));

     TCHAR *values[] = {_T("ss1"), _T("f00000111111111111111111111111111111111111111111111111111111111"), _T("ss01"), _T("房-"), _T("f0001"),
         _T("ss_1"), _T("ss002"), _T("房-01s2"), _T("f111111111111111111111111111111111111111111111111111111111"), _T("房-01s10") };
     const int valueCount = (int)(sizeof values / sizeof values[0]);

     /* Exchange sort: after pass i, slot i holds the smallest remaining entry. */
     for (int i = 0; i < valueCount - 1; i++){
         for (int j = i + 1; j < valueCount; j++){
             if (LogicCompare(values[i], values[j]) > 0)
             {
                 TCHAR *swapped = values[i];
                 values[i] = values[j];
                 values[j] = swapped;
             }
         }
     }

     /* Select the "chs" locale before the non-ASCII entries are printed. */
     _tsetlocale(LC_ALL, _T("chs"));

     for (int i = 0; i < valueCount; i++){
         _tprintf(_T("%s\n"), values[i]);
     }

     getchar();
     return 0;
 }

输出截图

案例输出（右边是windows-shell-对文件名逻辑排序（xp以上还是vista）的比较图）：

可以发现不同：下划线_和数字0

Java实现

[java]view plaincopy 
    
 package san;  
   
 import java.util.Arrays;  
 import java.util.Comparator;  
 import java.util.regex.Matcher;  
 import java.util.regex.Pattern;  
   
 /**
  * Demonstrates "natural" ordering of strings that contain numbers: digit
  * runs are compared by numeric value instead of character by character.
  */
 public class Demo {

     /**
      * Sorts a fixed list of sample names with {@link #ChsLogicCmp} and
      * prints a three-column table (tab separated).
      *
      * @param args unused
      */
     public static void main(String[] args) {
         String[] fileNames = { "fss01", "fss2", "fss01_22", "fss3", "fss1", "fss10", "fss20", "fss4", "fss30", "fss21", "fss12", "fss01_3" };
         char[][] chFileNames = new char[fileNames.length][];
         // NOTE(review): despite its name this copy is never sorted, so the
         // "_Tradion_" column repeats the "_Random_" column. Kept as is
         // because the published output shows exactly that; sort it with
         // Arrays.sort(oldSortedNames) if a dictionary-order column is wanted.
         String[] oldSortedNames = new String[fileNames.length];
         for (int i = 0; i < fileNames.length; i++) {
             chFileNames[i] = fileNames[i].toCharArray();
             oldSortedNames[i] = fileNames[i];
         }

         Arrays.sort(chFileNames, ChsLogicCmp);

         System.out.println("_Random_" + "\t" + "_Tradion_" + "\t" + "_Target_");
         for (int i = 0; i < fileNames.length; i++) {
             // Names shorter than 8 characters need a second tab to stay aligned.
             String line = fileNames[i] + (fileNames[i].length() >= 8 ? "\t" : "\t\t");
             line += oldSortedNames[i] + (oldSortedNames[i].length() >= 8 ? "\t" : "\t\t");
             line += new String(chFileNames[i]);
             System.out.println(line);
         }
     }

     /**
      * Natural-order comparator for strings; delegates to
      * {@link #ChsLogicCmp} on the strings' characters. {@code null} sorts
      * before any string.
      */
     static Comparator<String> StrLogicCmp = new Comparator<String>() {

         @Override
         public int compare(String o1, String o2) {
             return Demo.ChsLogicCmp.compare(
                     o1 == null ? null : o1.toCharArray(),
                     o2 == null ? null : o2.toCharArray());
         }

     };

     /**
      * Natural-order comparator for character arrays, e.g. "f01s2s22" vs
      * "f1s02s2". Every non-digit character is one comparison unit and every
      * maximal run of digits '0'..'9' is one unit. Two digit runs are ordered
      * by their significant digits (digit count first, then digit by digit);
      * when the values are equal, the run with more leading zeros is the
      * larger one. {@code null} sorts before any array and two {@code null}s
      * are equal.
      */
     static Comparator<char[]> ChsLogicCmp = new Comparator<char[]>() {

         /** Returns the index just past the digit run starting at {@code from} ({@code from} itself if there is none). */
         private int digitRunEnd(char[] chars, int from) {
             int k = from;
             while (k < chars.length && chars[k] >= '0' && chars[k] <= '9') {
                 k++;
             }
             return k;
         }

         /**
          * Returns the index of the first significant digit in
          * {@code [from, end)}, or {@code end} when the run is all zeros.
          * Only leading zeros are skipped; a zero after a non-zero digit is
          * significant ("0102" yields the index of '1').
          */
         private int skipLeadingZeros(char[] chars, int from, int end) {
             int k = from;
             while (k < end && chars[k] == '0') {
                 k++;
             }
             return k;
         }

         @Override
         public int compare(char[] a, char[] b) {
             if (a == null || b == null) {
                 return a == b ? 0 : (a == null ? -1 : 1);
             }

             int aIndex = 0, bIndex = 0;
             while (aIndex < a.length && bIndex < b.length) {
                 int aRunEnd = digitRunEnd(a, aIndex);
                 int bRunEnd = digitRunEnd(b, bIndex);

                 if (aRunEnd > aIndex && bRunEnd > bIndex) {
                     // Both sides are on a number: compare the significant digits.
                     int aFirstDigit = skipLeadingZeros(a, aIndex, aRunEnd);
                     int bFirstDigit = skipLeadingZeros(b, bIndex, bRunEnd);

                     int digitCountDiff = (aRunEnd - aFirstDigit) - (bRunEnd - bFirstDigit);
                     if (digitCountDiff != 0) {
                         return digitCountDiff;
                     }
                     for (int i = aFirstDigit, j = bFirstDigit; i < aRunEnd; i++, j++) {
                         if (a[i] != b[j]) {
                             return a[i] - b[j];
                         }
                     }

                     // Equal values: more leading zeros sorts later.
                     int zeroCountDiff = (aFirstDigit - aIndex) - (bFirstDigit - bIndex);
                     if (zeroCountDiff != 0) {
                         return zeroCountDiff;
                     }
                     aIndex = aRunEnd;
                     bIndex = bRunEnd;
                 } else {
                     // At least one side is not on a number: plain character comparison.
                     if (a[aIndex] != b[bIndex]) {
                         return a[aIndex] - b[bIndex];
                     }
                     aIndex++;
                     bIndex++;
                 }
             }

             // Everything compared equal up to the end of the shorter array.
             return a.length - b.length;
         }

     };
 }

输出结果

_Random_ _Tradion_ _Target_
fss01 fss01 fss1
fss2 fss2 fss01
fss01_22 fss01_22 fss01_3
fss3 fss3 fss01_22
fss1 fss1 fss2
fss10 fss10 fss3
fss20 fss20 fss4
fss4 fss4 fss10
fss30 fss30 fss12
fss21 fss21 fss20
fss12 fss12 fss21
fss01_3 fss01_3 fss30

引用

这里2001年就有了

#define _GNU_SOURCE /* 见 feature_test_macros(7) */

#include <string.h>

int strverscmp(const char *s1, const char *s2);

描述

经常有这样的文件 jan1 、 jan2 、……、 jan9 、 jan10 ，……并且感觉 ls (1) 排序为 jan1 、 jan10 、……、 jan2 、……、 jan9 是不正确的。为了改变这个情况，GNU 允许使用 -v 选项的 ls (1)，它的实现使用 versionsort (3)，这个函数使用了 strverscmp ()。

因此，strverscmp() 是比较两个字符串并发现“正确”的顺序，而 strcmp(3) 只是发现字典顺序。这个函数不使用区域选项 LC_COLLATE，也就是说它期待字符串是 ASCII 字符串。

这个函数做下面的事。如果两个字符串相同，返回 0。否则找出第一个不同的位置：两个字符串在该位置之前的字节完全相同，在该位置上不同。然后在两个字符串中分别找出包含该位置(或开始于、或结束于该位置)的最长连续数字串。如果其中一个或两个数字串为空，则像 strcmp(3) 一样返回(按字节值的数值顺序)。否则按数值比较这两个数字串，其中带有一个或多个前导零的数字串被视为前面有一个小数点(因此前导零较多的数字串排在前导零较少的数字串之前)。所以，这个顺序是 000、00、01、010、09、0、1、9、10。

返回值

strverscmp () 函数返回一个小于、等于或大于零的整数，分别对应 s1 小于、等于或大于 s2 。

遵循于

这个函数是 GNU 扩展。

源码

[cpp]view plaincopy 
    
 /** Compare strings while treating digits characters numerically. 
    Copyright (C) 1997, 2002, 2005 Free Software Foundation, Inc. 
    This file is part of the libiberty library. 
    Contributed by Jean-François Bignolles <bignolle@ecoledoc.ibp.fr>, 1997. 
  
    Libiberty is free software; you can redistribute it and/or 
    modify it under the terms of the GNU Lesser General Public 
    License as published by the Free Software Foundation; either 
    version 2.1 of the License, or (at your option) any later version. 
  
    Libiberty is distributed in the hope that it will be useful, 
    but WITHOUT ANY WARRANTY; without even the implied warranty of 
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
    Lesser General Public License for more details. 
  
    You should have received a copy of the GNU Lesser General Public 
    License along with the GNU C Library; if not, write to the Free 
    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
    02110-1301 USA.  */  
   
 #include "libiberty.h"  
 #include "safe-ctype.h"  
   
 /**  
 @deftypefun int strverscmp (const char *@var{s1}, const char *@var{s2}) 
 The @code{strverscmp} function compares the string @var{s1} against 
 @var{s2}, considering them as holding indices/version numbers.  Return 
 value follows the same conventions as found in the @code{strcmp} 
 function.  In fact, if @var{s1} and @var{s2} contain no digits, 
 @code{strverscmp} behaves like @code{strcmp}. 
  
 Basically, we compare strings normally (character by character), until 
 we find a digit in each string - then we enter a special comparison 
 mode, where each sequence of digits is taken as a whole.  If we reach the 
 end of these two parts without noticing a difference, we return to the 
 standard comparison mode.  There are two types of numeric parts: 
 "integral" and "fractional" (those  begin with a '0'). The types 
 of the numeric parts affect the way we sort them: 
  
 @itemize @bullet 
 @item 
 integral/integral: we compare values as you would expect. 
  
 @item 
 fractional/integral: the fractional part is less than the integral one. 
 Again, no surprise. 
  
 @item 
 fractional/fractional: the things become a bit more complex. 
 If the common prefix contains only leading zeroes, the longest part is less 
 than the other one; else the comparison behaves normally. 
 @end itemize 
  
 @smallexample 
 strverscmp ("no digit", "no digit") 
     @result{} 0    // @r{same behavior as strcmp.} 
 strverscmp ("item#99", "item#100") 
     @result{} <0   // @r{same prefix, but 99 < 100.} 
 strverscmp ("alpha1", "alpha001") 
     @result{} >0   // @r{fractional part inferior to integral one.} 
 strverscmp ("part1_f012", "part1_f01") 
     @result{} >0   // @r{two fractional parts.} 
 strverscmp ("foo.009", "foo.0") 
     @result{} <0   // @r{idem, but with leading zeroes only.} 
 @end smallexample 
  
 This function is especially useful when dealing with filename sorting, 
 because filenames frequently hold indices/version numbers. 
 @end deftypefun 
  
 */  
   
 /** states: S_N: normal, S_I: comparing integral part, S_F: comparing 
            fractional parts, S_Z: idem but with leading Zeroes only */  
 #define  S_N    0x0  
 #define  S_I    0x4  
 #define  S_F    0x8  
 #define  S_Z    0xC  
   
 /** result_type: CMP: return diff; LEN: compare using len_diff/diff */  
 #define  CMP    2  
 #define  LEN    3  
   
   
 /** Compare S1 and S2 as strings holding indices/version numbers,
    returning less than, equal to or greater than zero if S1 is less than,
    equal to or greater than S2 (for more info, see the Glibc texinfo doc).

    Implementation outline: both strings are walked in lock step while
    their bytes are equal.  A small state machine records what kind of
    text the common prefix ends in (S_N, S_I, S_F or S_Z); once the first
    differing bytes are found, that state together with the classes of the
    two differing bytes selects how the result is computed.

    A byte's class is (c == '0') + (ISDIGIT (c) != 0).  ISDIGIT comes from
    safe-ctype.h, which is not shown here; assuming it is true exactly for
    '0'..'9', the class is 0 for a non-digit, 1 for '1'..'9' and 2 for
    '0'.  */

 int
 strverscmp (const char *s1, const char *s2)
 {
   /* The strings are read through unsigned char pointers, so the byte
      difference computed below is a difference of unsigned values.  */
   const unsigned char *p1 = (const unsigned char *) s1;
   const unsigned char *p2 = (const unsigned char *) s2;
   unsigned char c1, c2;
   int state;
   int diff;

   /** Symbol(s)    0       [1-9]   others  (padding)
      Transition   (10) 0  (01) d  (00) x  (11) -   */
   /* Indexed by (state | class of the current byte of s1).  The state
      constants are multiples of 4, which leaves the two low bits free
      for the class.  */
   static const unsigned int next_state[] =
     {
       /** state    x    d    0    - */
       /** S_N */  S_N, S_I, S_Z, S_N,
       /** S_I */  S_N, S_I, S_I, S_I,
       /** S_F */  S_N, S_F, S_F, S_F,
       /** S_Z */  S_N, S_F, S_Z, S_Z
     };

   /* Indexed by ((state | class of c1) << 2) | class of c2.  An entry is
      CMP, LEN, or a literal -1 / +1 that is returned as is (see the
      switch at the end of the function).  */
   static const int result_type[] =
     {
       /** state   x/x  x/d  x/0  x/-  d/x  d/d  d/0  d/-
                  0/x  0/d  0/0  0/-  -/x  -/d  -/0  -/- */

       /** S_N */  CMP, CMP, CMP, CMP, CMP, LEN, CMP, CMP,
                  CMP, CMP, CMP, CMP, CMP, CMP, CMP, CMP,
       /** S_I */  CMP, -1,  -1,  CMP, +1,  LEN, LEN, CMP,
                  +1,  LEN, LEN, CMP, CMP, CMP, CMP, CMP,
       /** S_F */  CMP, CMP, CMP, CMP, CMP, LEN, CMP, CMP,
                  CMP, CMP, CMP, CMP, CMP, CMP, CMP, CMP,
       /** S_Z */  CMP, +1,  +1,  CMP, -1,  CMP, CMP, CMP,
                  -1,  CMP, CMP, CMP
     };

   /* Same pointer: equal without reading the strings.  */
   if (p1 == p2)
     return 0;

   c1 = *p1++;
   c2 = *p2++;
   /** Hint: '0' is a digit too.  */
   state = S_N | ((c1 == '0') + (ISDIGIT (c1) != 0));

   /* Advance over the common prefix.  The loop ends at the first
      differing byte pair, or at the terminator when the strings are
      equal (diff is 0 in that case).  */
   while ((diff = c1 - c2) == 0 && c1 != '\0')
     {
       state = next_state[state];
       c1 = *p1++;
       c2 = *p2++;
       state |= (c1 == '0') + (ISDIGIT (c1) != 0);
     }

   /* From here on `state' holds the selected result type, not a state.  */
   state = result_type[state << 2 | (((c2 == '0') + (ISDIGIT (c2) != 0)))];

   switch (state)
     {
     case CMP:
       /* Plain byte comparison.  */
       return diff;
         
     case LEN:
       /* p1 and p2 already point past the differing bytes.  Walk the
          digits that follow in both strings: if s2's digits end while
          s1 still has one, s1 is greater.  */
       while (ISDIGIT (*p1++))
     if (!ISDIGIT (*p2++))
       return 1;
         
       /* s1's digits ended.  If s2 still has a digit, s1 is smaller;
          otherwise both runs have the same length and the first
          differing byte decides.  */
       return ISDIGIT (*p2) ? -1 : diff;
         
     default:
       /* The table entry is itself the result (-1 or +1).  */
       return state;
     }
 }