题目描述:
UTF-8 中的一个字符可能的长度为 1 到 4 字节,遵循以下的规则:
对于 1 字节的字符,字节的第一位设为0,后面7位为这个符号的unicode码。
对于 n 字节的字符 (n > 1),第一个字节的前 n 位都设为1,第 n+1 位设为0,后面字节的前两位一律设为10。剩下的没有提及的二进制位,全部为这个符号的unicode码。
这是 UTF-8 编码的工作方式:
Char. number range | UTF-8 octet sequence
(hexadecimal) | (binary)
--------------------+---------------------------------------------
0000 0000-0000 007F | 0xxxxxxx
0000 0080-0000 07FF | 110xxxxx 10xxxxxx
0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
给定一个表示数据的整数数组,返回它是否为有效的 utf-8 编码。
注意:
输入是整数数组。只有每个整数的最低 8 个有效位用来存储数据。这意味着每个整数只表示 1 字节的数据。
解题思路
很简单的思路,将数字化为8位二进制,然后按照题目中的规则进行逻辑判断
WA 了两次:一次是因为没注意到一个字符最多只有 4 个字节;一次是没考虑到多字节字符没写完(数据在字符中途结束,序列不完整)的情况
时间、空间复杂度均为 O(n)
- 其他
练习了进制转换
#include <iostream>
#include <cstdio>
#include <cstring>
#include <cmath>
#include <cstdlib>
#include <algorithm>
#include <map>
#include <set>
#include <vector>
#include <stack>
#include <cstring>
#include <queue>
#define LL long long
using namespace std;
/// Renders the lowest 8 bits of `value` as an 8-character binary string,
/// most significant bit first (e.g. 197 -> "11000101").
static std::string toBinary8(int value) {
    // Only the low byte carries data. Converting to unsigned before masking
    // keeps the result well-defined for negative inputs as well.
    const unsigned int byte = static_cast<unsigned int>(value) & 0xFFu;
    std::string bits(8, '0');
    for (int pos = 0; pos < 8; ++pos) {
        if (byte & (0x80u >> pos)) {
            bits[pos] = '1';
        }
    }
    return bits;
}

/// Checks whether `data` is a valid UTF-8 byte sequence.
///
/// Each element contributes exactly one byte: only its lowest 8 bits are
/// examined, higher bits are ignored.
///
/// @param data  Sequence of integers, one byte of payload per element.
/// @return true if every character is a well-formed 1- to 4-byte sequence
///         (lead byte 0xxxxxxx / 110xxxxx / 1110xxxx / 11110xxx followed by
///         the right number of 10xxxxxx continuation bytes) and the data does
///         not end in the middle of a character; true for empty input.
bool validUtf8(std::vector<int>& data) {
    // Step 1: convert every element to its 8-bit binary representation.
    std::vector<std::string> bin;
    bin.reserve(data.size());
    for (std::size_t i = 0; i < data.size(); ++i) {
        bin.push_back(toBinary8(data[i]));
    }

    // Step 2: walk the bytes, tracking how many continuation bytes the
    // current multi-byte character still needs.
    int pending = 0;
    for (std::size_t i = 0; i < bin.size(); ++i) {
        const std::string& bits = bin[i];

        // Count the run of leading '1' bits; it identifies the byte's role.
        int ones = 0;
        while (ones < 8 && bits[ones] == '1') {
            ++ones;
        }

        if (pending > 0) {
            // Inside a character: only a 10xxxxxx continuation byte is legal.
            if (ones != 1) {
                return false;
            }
            --pending;
        } else if (ones == 0) {
            // 0xxxxxxx: a complete single-byte character.
        } else if (ones == 1 || ones > 4) {
            // A stray continuation byte, or a lead byte announcing more than
            // 4 bytes, cannot start a character.
            return false;
        } else {
            // Lead byte of a 2- to 4-byte character.
            pending = ones - 1;
        }
    }

    // A non-zero count here means the data stopped in the middle of a character.
    return pending == 0;
}
// Demo driver: feeds a fixed five-element sample to validUtf8 and prints the
// returned bool (streamed as 1 or 0) followed by a newline.
int main()
{
    // The first value, 250, is 11111010 in binary (five leading 1 bits).
    const int sample[] = {250, 145, 145, 145, 145};
    std::vector<int> a(sample, sample + sizeof(sample) / sizeof(sample[0]));

    std::cout << validUtf8(a) << std::endl;
    return 0;
}
进阶方法:
- 位运算,可以将空间复杂度降为 O(1)
class Solution:
    def validUtf8(self, data):
        """
        Check whether a list of integers forms a valid UTF-8 byte sequence.

        Only bits 7..0 of each integer are inspected by the masks below.

        :type data: List[int]
        :rtype: bool
        """
        # Number of bytes still expected for the current UTF-8 character
        n_bytes = 0

        # Mask to check if the most significant bit (bit 7) of the byte is set
        mask1 = 1 << 7

        # Mask to check if the second most significant bit (bit 6) is set
        mask2 = 1 << 6

        for num in data:
            # Get the number of set most significant bits in the byte if
            # this is the starting byte of a UTF-8 character.
            mask = 1 << 7
            if n_bytes == 0:
                while mask & num:
                    n_bytes += 1
                    mask = mask >> 1

                # 1 byte characters
                if n_bytes == 0:
                    continue

                # Invalid scenarios according to the rules of the problem:
                # a lone continuation byte, or a character longer than 4 bytes.
                if n_bytes == 1 or n_bytes > 4:
                    return False
            else:
                # If this byte is a part of an existing UTF-8 character, then we
                # simply have to look at the two most significant bits and we make
                # use of the masks we defined before: it must look like 10xxxxxx.
                if not (num & mask1 and not (num & mask2)):
                    return False

            # One byte of the current character (lead or continuation) consumed.
            n_bytes -= 1

        # A non-zero count means the data ended in the middle of a character.
        return n_bytes == 0
作者:LeetCode
链接:https://leetcode-cn.com/problems/utf-8-validation/solution/utf-8-bian-ma-yan-zheng-by-leetcode/
来源:力扣(LeetCode)
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
- 子串用substr
// Excerpt from inside a loop (it uses `continue`): classifies an element by
// its leading bit pattern when no continuation bytes are outstanding.
// NOTE(review): `x` is presumably the 8-character binary string of the current
// byte and `remain` the number of continuation bytes still expected; both are
// defined outside this excerpt -- confirm against the full solution.
if(remain==0)
{
// First character '0': nothing further is expected, move to the next element.
if(x[0]=='0') continue;
// Prefix "110": one more byte is expected.
else if(x.substr(0,3)=="110") remain+=1;
// Prefix "1110": two more bytes are expected.
else if(x.substr(0,4)=="1110") remain+=2;
// Prefix "11110": three more bytes are expected.
else if(x.substr(0,5)=="11110") remain+=3;
// Any other prefix cannot start a character here.
else return false;
}