C硬核：字符串操作总结（带Unicode和ASCII比较）

最新推荐文章于 2022-04-16 21:19:20 发布

Cerman

最新推荐文章于 2022-04-16 21:19:20 发布

阅读量1.2k

点赞数

分类专栏： C语言硬件编程文章标签： c语言字符串函数式编程

本文链接：https://blog.csdn.net/Cerman/article/details/120118765

版权

C语言硬件编程专栏收录该内容

7 篇文章 1 订阅

订阅专栏

一. 字符串长度与大小

因为ASCII字符串以NULL或者说以'\0'结尾，UNICODE以L'\0'结尾，那么字符串的大小和长度显示是不同的：字符串大小 = 字符串长度 + 1；

获取一个字符串的长度采用的库函数是

size_t strlen( char const *string );

获取字符串大小的函数可以根据和长度的关系自行构造。下面列举UEFI中的两个函数的实现，

ASCII码

UINTN
EFIAPI
AsciiStrLen (
  IN      CONST CHAR8               *String
  )
{
  UINTN                             Length;

  ASSERT (String != NULL);

  for (Length = 0; *String != '\0'; String++, Length++) {
    //
    // If PcdMaximumUnicodeStringLength is not zero,
    // length should not more than PcdMaximumUnicodeStringLength
    //
    if (PcdGet32 (PcdMaximumAsciiStringLength) != 0) {
      ASSERT (Length < PcdGet32 (PcdMaximumAsciiStringLength));
    }
  }
  return Length;
}

UINTN
EFIAPI
AsciiStrSize (
  IN      CONST CHAR8               *String
  )
{
  return (AsciiStrLen (String) + 1) * sizeof (*String);
}

UNICODE码

UINTN
EFIAPI
StrLen (
  IN      CONST CHAR16              *String
  )
{
  UINTN                             Length;

  ASSERT (String != NULL);
  ASSERT (((UINTN) String & BIT0) == 0);

  for (Length = 0; *String != L'\0'; String++, Length++) {
    //
    // If PcdMaximumUnicodeStringLength is not zero,
    // length should not more than PcdMaximumUnicodeStringLength
    //
    if (PcdGet32 (PcdMaximumUnicodeStringLength) != 0) {
      ASSERT (Length < PcdGet32 (PcdMaximumUnicodeStringLength));
    }
  }
  return Length;
}

UINTN
EFIAPI
StrSize (
  IN      CONST CHAR16              *String
  )
{
  return (StrLen (String) + 1) * sizeof (*String);
}

二. 字符串比较

所谓字符串比较，就是看两个字符串是否相互包含，或者对比最先不匹配的字符，哪个在字符集中的序数较小。函数原型是：

int strcmp( char const *s1, char const *s2 )

可以看到该函数的返回值类型为INT，那么如果s1小于s2，返回值小于0；如果s1大于s2，返回值大于0；如果s1等于s2，返回值等于0。

UEFI中该库函数的实现如下：

ASCII码

INTN
EFIAPI
AsciiStrCmp (
  IN      CONST CHAR8               *FirstString,
  IN      CONST CHAR8               *SecondString
  )
{
  //
  // ASSERT both strings are less long than PcdMaximumAsciiStringLength
  //
  ASSERT (AsciiStrSize (FirstString));
  ASSERT (AsciiStrSize (SecondString));

  while ((*FirstString != '\0') && (*FirstString == *SecondString)) {
    FirstString++;
    SecondString++;
  }

  return *FirstString - *SecondString;
}

UNICODE码

INTN
EFIAPI
StrCmp (
  IN      CONST CHAR16              *FirstString,
  IN      CONST CHAR16              *SecondString
  )
{
  //
  // ASSERT both strings are less long than PcdMaximumUnicodeStringLength
  //
  ASSERT (StrSize (FirstString) != 0);
  ASSERT (StrSize (SecondString) != 0);

  while ((*FirstString != L'\0') && (*FirstString == *SecondString)) {
    FirstString++;
    SecondString++;
  }
  return *FirstString - *SecondString;
}

三. 字符串变换

这里的字符串变换包括不设置长度的strcpy/strcat，以及设置长度的strncpy/strncat。

strcpy和strncpy的函数原型如下：

char *strcpy( char *dst, char const *src );
char *strncpy( char *dst, char const *src, size_t len );

strcpy一定会把字符串最后的NULL也拷贝到dst中，而strncpy则会根据len的大小决定是否拷贝到dst；两者在UEFI中的实现如下：

ASCII码

CHAR8 *
EFIAPI
AsciiStrCpy (
  OUT     CHAR8                     *Destination,
  IN      CONST CHAR8               *Source
  )
{
  CHAR8                             *ReturnValue;

  //
  // Destination cannot be NULL
  //
  ASSERT (Destination != NULL);

  //
  // Destination and source cannot overlap
  //
  ASSERT ((UINTN)(Destination - Source) > AsciiStrLen (Source));
  ASSERT ((UINTN)(Source - Destination) > AsciiStrLen (Source));

  ReturnValue = Destination;
  while (*Source != 0) {
    *(Destination++) = *(Source++);
  }
  *Destination = 0;
  return ReturnValue;
}

CHAR8 *
EFIAPI
AsciiStrnCpy (
  OUT     CHAR8                     *Destination,
  IN      CONST CHAR8               *Source,
  IN      UINTN                     Length
  )
{
  CHAR8                             *ReturnValue;

  if (Length == 0) {
    return Destination;
  }

  //
  // Destination cannot be NULL
  //
  ASSERT (Destination != NULL);

  //
  // Destination and source cannot overlap
  //
  ASSERT ((UINTN)(Destination - Source) > AsciiStrLen (Source));
  ASSERT ((UINTN)(Source - Destination) >= Length);

  if (PcdGet32 (PcdMaximumAsciiStringLength) != 0) {
    ASSERT (Length <= PcdGet32 (PcdMaximumAsciiStringLength));
  }

  ReturnValue = Destination;

  while (*Source != 0 && Length > 0) {
    *(Destination++) = *(Source++);
    Length--;
  }

  ZeroMem (Destination, Length * sizeof (*Destination));
  return ReturnValue;
}

UNICODE码

CHAR16 *
EFIAPI
StrCpy (
  OUT     CHAR16                    *Destination,
  IN      CONST CHAR16              *Source
  )
{
  CHAR16                            *ReturnValue;

  //
  // Destination cannot be NULL
  //
  ASSERT (Destination != NULL);
  ASSERT (((UINTN) Destination & BIT0) == 0);

  //
  // Destination and source cannot overlap
  //
  ASSERT ((UINTN)(Destination - Source) > StrLen (Source));
  ASSERT ((UINTN)(Source - Destination) > StrLen (Source));

  ReturnValue = Destination;
  while (*Source != 0) {
    *(Destination++) = *(Source++);
  }
  *Destination = 0;
  return ReturnValue;
}

CHAR16 *
EFIAPI
StrnCpy (
  OUT     CHAR16                    *Destination,
  IN      CONST CHAR16              *Source,
  IN      UINTN                     Length
  )
{
  CHAR16                            *ReturnValue;

  if (Length == 0) {
    return Destination;
  }

  //
  // Destination cannot be NULL if Length is not zero
  //
  ASSERT (Destination != NULL);
  ASSERT (((UINTN) Destination & BIT0) == 0);

  //
  // Destination and source cannot overlap
  //
  ASSERT ((UINTN)(Destination - Source) > StrLen (Source));
  ASSERT ((UINTN)(Source - Destination) >= Length);

  if (PcdGet32 (PcdMaximumUnicodeStringLength) != 0) {
    ASSERT (Length <= PcdGet32 (PcdMaximumUnicodeStringLength));
  }

  ReturnValue = Destination;

  while ((*Source != L'\0') && (Length > 0)) {
    *(Destination++) = *(Source++);
    Length--;
  }

  ZeroMem (Destination, Length * sizeof (*Destination));
  return ReturnValue;
}

strcat表示将str2粘贴到str1的后面排放，strncat表示只操作str2的n个字符，其函数原型是

char *strcat(char *dest, const char *src)
char *strncat(char *dest, const char *src, size_t n)

在UEFI中的具体实现如下

ASCII码

CHAR8 *
EFIAPI
AsciiStrCat (
  IN OUT CHAR8    *Destination,
  IN CONST CHAR8  *Source
  )
{
  AsciiStrCpy (Destination + AsciiStrLen (Destination), Source);

  //
  // Size of the resulting string should never be zero.
  // PcdMaximumUnicodeStringLength is tested inside StrLen().
  //
  ASSERT (AsciiStrSize (Destination) != 0);
  return Destination;
}

CHAR8 *
EFIAPI
AsciiStrnCat (
  IN OUT  CHAR8                     *Destination,
  IN      CONST CHAR8               *Source,
  IN      UINTN                     Length
  )
{
  UINTN   DestinationLen;

  DestinationLen = AsciiStrLen (Destination);
  AsciiStrnCpy (Destination + DestinationLen, Source, Length);
  Destination[DestinationLen + Length] = '\0';

  //
  // Size of the resulting string should never be zero.
  // PcdMaximumUnicodeStringLength is tested inside StrLen().
  //
  ASSERT (AsciiStrSize (Destination) != 0);
  return Destination;
}

UNICODE码

CHAR16 *
EFIAPI
StrCat (
  IN OUT  CHAR16                    *Destination,
  IN      CONST CHAR16              *Source
  )
{
  StrCpy (Destination + StrLen (Destination), Source);

  //
  // Size of the resulting string should never be zero.
  // PcdMaximumUnicodeStringLength is tested inside StrLen().
  //
  ASSERT (StrSize (Destination) != 0);
  return Destination;
}

CHAR16 *
EFIAPI
StrnCat (
  IN OUT  CHAR16                    *Destination,
  IN      CONST CHAR16              *Source,
  IN      UINTN                     Length
  )
{
  UINTN   DestinationLen;

  DestinationLen = StrLen (Destination);
  StrnCpy (Destination + DestinationLen, Source, Length);
  Destination[DestinationLen + Length] = L'\0';

  //
  // Size of the resulting string should never be zero.
  // PcdMaximumUnicodeStringLength is tested inside StrLen().
  //
  ASSERT (StrSize (Destination) != 0);
  return Destination;
}

四. 字符串查找

字符串查找相关的函数比较多，有strstr/strchr/strrchr/strpbrk/strspn/strcspn等等，其中strstr代表在某一字符串中查找另一个字符串，strchr表示在某一字符串中查找某一个字符，strrchr表示从右往左查找第一个匹配的字符串，strpbrk表示在字符串中查找某一个字符串组任意元素第一次出现的位置，strspn用于检索字符串 group中第一个不在字符串 str中出现的字符下标，strcspn用于检索字符串 group中第一个在字符串 str中出现的字符下标。

这些函数的声明如下：

char *strchr(const char *str, int c);
char *strrchr(const char *str, int c);
char *strstr(const char *haystack, const char *needle);
size_t strspn(const char *str1, const char *str2);
size_t strcspn(const char *str1, const char *str2);
char *strpbrk(const char *str1, const char *str2);

看起来关于查找的字符串很多，但最常用的还是strstr，其在UEFI中的具体实现如下：

ASCII码

CHAR8 *
EFIAPI
AsciiStrStr (
  IN      CONST CHAR8               *String,
  IN      CONST CHAR8               *SearchString
  )
{
  CONST CHAR8 *FirstMatch;
  CONST CHAR8 *SearchStringTmp;

  //
  // ASSERT both strings are less long than PcdMaximumAsciiStringLength
  //
  ASSERT (AsciiStrSize (String) != 0);
  ASSERT (AsciiStrSize (SearchString) != 0);

  if (*SearchString == '\0') {
    return (CHAR8 *) String;
  }

  while (*String != '\0') {
    SearchStringTmp = SearchString;
    FirstMatch = String;

    while ((*String == *SearchStringTmp)
            && (*String != '\0')) {
      String++;
      SearchStringTmp++;
    }

    if (*SearchStringTmp == '\0') {
      return (CHAR8 *) FirstMatch;
    }

    if (*String == '\0') {
      return NULL;
    }

    String = FirstMatch + 1;
  }

  return NULL;
}

UNICODE码

CHAR16 *
EFIAPI
StrStr (
  IN      CONST CHAR16              *String,
  IN      CONST CHAR16              *SearchString
  )
{
  CONST CHAR16 *FirstMatch;
  CONST CHAR16 *SearchStringTmp;

  //
  // ASSERT both strings are less long than PcdMaximumUnicodeStringLength.
  // Length tests are performed inside StrLen().
  //
  ASSERT (StrSize (String) != 0);
  ASSERT (StrSize (SearchString) != 0);

  if (*SearchString == L'\0') {
    return (CHAR16 *) String;
  }

  while (*String != L'\0') {
    SearchStringTmp = SearchString;
    FirstMatch = String;

    while ((*String == *SearchStringTmp)
            && (*String != L'\0')) {
      String++;
      SearchStringTmp++;
    }

    if (*SearchStringTmp == L'\0') {
      return (CHAR16 *) FirstMatch;
    }

    if (*String == L'\0') {
      return NULL;
    }

    String = FirstMatch + 1;
  }

  return NULL;
}

五. 好用的strtok

经常会遇到某个字符串中出现某些字符或字串规律排放的情况，比如实际工程中遇到的：

char str[] = {0, 1, 8, 9};

有时需要统计出现的数字字符个数，或者需要将每个数字字符取出来，那么这时strtok就再合适不过了：

char *strtok(char *str, const char *delim)
//分解字符串 str 为一组字符串，delim 为分隔符。

具体将0/1/8/9打印出来的方法如下：

 char str[] = "0, 1, 8, 9";
   const char s[2] = ",";
   char *token;
   
   /* 获取第一个子字符串 */
   token = strtok(str, s);
   
   /* 继续获取其他的子字符串 */
   while( token != NULL ) {
      printf( "%s\n", token );
    
      token = strtok(NULL, s);
   }

总结

用好C语言字符串相关库函数，可以解决相当一部分字符输入输出等用于字符处理的场景，UEFI中库函数的具体实现，也为我们自行定义库函数中没有但实际需要的功能function提供了一种参考。祝大家在C的应用上越来越得心应手！

Cerman

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
打赏
0
评论
C硬核：字符串操作总结（带Unicode和ASCII比较）

字符串操作在C编程项目和工程中是不可或缺的存在，有些情况下，我们可以很方便的使用string.h中提供的库函数，有些时候需要自行去编写。这里谈谈那些可以被使用的字符串操作库函数，并且针对常用的几种，采用UNICODE和ASCII两种源码实现展现出来。
复制链接

扫一扫