https://leetcode.com/problems/repeated-dna-sequences/
DNA序列由A、C、G和T构成,从一串序列中找出长度为10的重复出现的子串
若直接对长度为10的子串使用Hash Table,则会Memory Limit Exceeded
考虑到DNA序列的特殊性,只有四个字母出现,能否把Hash Table的键缩短?
A – 0x41 – 0b0100 0001 – 65
C – 0x43 – 0b0100 0011 – 67
G – 0x47 – 0b0100 0111 – 71
T – 0x54 – 0b0101 0100 – 84
只考虑每个字母的低三位(A=001、C=011、G=111、T=100,互不相同),10个字母共需3*10=30位:t = t << 3 & 0x3fffffff | s[i] & 0x7
由于只有四个字母,是否可以使用两位来代表这四个字母呢?很神奇的减64模5,然而,效率并没有提高……
(65 - 64) % 5 = 1
(67 - 64) % 5 = 3
(71 - 64) % 5 = 2
(84 - 64) % 5 = 0
// 112 ms
// Finds every 10-letter substring that occurs more than once in a DNA string.
//
// Each letter is encoded by the low three bits of its ASCII code
// (A=001, C=011, G=111, T=100), so a 10-letter window fits in a 30-bit key.
// The input is expected to contain only 'A', 'C', 'G' and 'T'.
class Solution {
public:
    // Returns each repeated 10-letter substring exactly once, in the order in
    // which its second occurrence ends. Strings shorter than 11 letters yield
    // an empty result.
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> ret;
        // key -> true while the window has been seen exactly once,
        //        false once it has already been reported.
        unordered_map<unsigned, bool> m;
        // Unsigned on purpose: shifting a 30-bit value left by 3 overflows a
        // signed int, which is undefined behaviour before C++20.
        unsigned t = 0;
        for (size_t i = 0; i < s.size(); i++) {
            t = (t << 3 & 0x3fffffffu) | (s[i] & 0x7);
            // 2-bit alternative (also measured at 112 ms):
            // t = t << 2 & 0xfffff | (s[i] - 64) % 5;
            if (i < 9) {
                // The window does not hold 10 letters yet; recording it could
                // only create false matches and a negative substr() index.
                continue;
            }
            auto it = m.find(t);
            if (it == m.end()) {
                m.emplace(t, true);
            } else if (it->second) {
                it->second = false;
                ret.push_back(s.substr(i - 9, 10));
            }
        }
        return ret;
    }
};
上面的代码如果从第一个字符起就查表(而不是等窗口凑满10个字符),在t左移两位的情况下是有问题的,无法通过TTTTTTTTTT这样的测试:因为T对应于0,而0经过移位和按位或之后结果依旧为0,尚未凑满10个字符的前缀窗口会与完整窗口得到相同的键而被误判为重复。
// Repeated-DNA-sequence finder using a 2-bit code per letter:
// (c - 64) % 5 maps A->1, C->3, G->2, T->0, so ten letters form a 20-bit key.
class Solution {
public:
    // Returns every 10-letter substring of s that appears at least twice,
    // each listed once, ordered by where its second occurrence ends.
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> repeated;
        const size_t len = s.size();
        if (len < 11) {
            // Fewer than two windows: nothing can repeat.
            return repeated;
        }

        // key -> true after the first sighting, false once already reported.
        unordered_map<int, bool> pending;

        // Prime the rolling key with the first nine letters.
        int key = 0;
        int pos = 0;
        while (pos < 9) {
            key = (key << 2) | ((s[pos] - 64) % 5);
            ++pos;
        }

        // Each further letter completes one 10-letter window.
        for (; static_cast<size_t>(pos) < len; ++pos) {
            key = ((key << 2) & 0xfffff) | ((s[pos] - 64) % 5);
            auto hit = pending.find(key);
            if (hit != pending.end()) {
                if (hit->second) {
                    hit->second = false;
                    repeated.push_back(s.substr(pos - 9, 10));
                }
            } else {
                pending[key] = true;
            }
        }
        return repeated;
    }
};
如何提高效率呢?
注意到2 ^ 20 = 1048576,考虑通过数组来计数,实测发现char类型的数组要比int类型的数组快。但在LeetCode的讨论部分aileengw指出,char计数器若不做饱和处理,同一子串出现次数超过256次时计数会溢出回绕,导致该子串被重复输出。
// Runtime: 8 ms
// Repeated-DNA-sequence finder backed by a direct-indexed table.
//
// Letters are packed two bits each ((c - 64) % 5: A->1, C->3, G->2, T->0), so
// a 10-letter window is a 20-bit key indexing a table of 2^20 byte counters.
// The input is expected to contain only 'A', 'C', 'G' and 'T'.
class Solution {
public:
    // Returns every 10-letter substring of s that appears at least twice,
    // each listed once, ordered by where its second occurrence ends.
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> ret;
        if (s.size() < 11) {
            return ret;
        }
        // One byte per possible key, on the heap so the 1 MiB table does not
        // sit on the stack. Counters saturate at 2 (see below).
        vector<unsigned char> hashmap(1 << 20, 0);
        int t = 0;
        for (int i = 0; i < 9; i++) {
            // "& 3" leaves the codes of A/C/G/T untouched and keeps the key
            // inside the table if an unexpected byte slips in.
            t = t << 2 | ((s[i] - 64) % 5 & 3);
        }
        for (size_t i = 9; i < s.size(); i++) {
            t = (t << 2 & 0xfffff) | ((s[i] - 64) % 5 & 3);
            unsigned char& count = hashmap[t];
            if (count == 1) {
                // Second sighting: report the window exactly once.
                ret.push_back(s.substr(i - 9, 10));
            }
            if (count < 2) {
                // Saturate instead of counting on: a plain byte counter wraps
                // after 256 increments and would report the window again.
                ++count;
            }
        }
        return ret;
    }
};
更进一步,也可以考虑直接建立A、C、G、T和0-3的映射关系,可以使用unordered_map,也可以使用char
// Returns every 10-letter substring of s that appears at least twice, each
// listed once, ordered by where its second occurrence ends.
//
// Letters are mapped to 2-bit codes through a lookup table (A=0, C=1, G=2,
// T=3), so a 10-letter window is a 20-bit key indexing a table of 2^20 byte
// counters. Any byte other than A/C/G/T is encoded as 0, the same as 'A'.
vector<string> findRepeatedDnaSequences(string s) {
    vector<string> ret;
    const size_t size = s.size();
    if (size < 11) {
        return ret;
    }
    // One byte per possible key, on the heap so the 1 MiB table does not sit
    // on the stack. Counters saturate at 2 (see below).
    vector<unsigned char> hashmap(1 << 20, 0);
    // Full 256-entry table indexed by unsigned char, so no input byte can
    // read outside it (an 85-entry table is overrun by anything above 'T').
    unsigned char m[256] = {0};
    m['A'] = 0;
    m['C'] = 1;
    m['G'] = 2;
    m['T'] = 3;
    int t = 0;
    for (size_t i = 0; i < 9; i++) {
        t = t << 2 | m[static_cast<unsigned char>(s[i])];
    }
    for (size_t i = 9; i < size; i++) {
        t = (t << 2 & 0xfffff) | m[static_cast<unsigned char>(s[i])];
        unsigned char& count = hashmap[t];
        if (count == 1) {
            // Second sighting: report the window exactly once.
            ret.push_back(s.substr(i - 9, 10));
        }
        if (count < 2) {
            // Saturate instead of counting on: a plain byte counter wraps
            // after 256 increments and would report the window again.
            ++count;
        }
    }
    return ret;
}