题目:
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: “ACGAATTCCG”. When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = “AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT”,
Return:
[“AAAAACCCCC”, “CCCCCAAAAA”].
先吐槽一下,在leetcode上做题真的是很烦,在内存上各种做文章,感觉真的很没意思。
吐槽完毕。
这个题因为一共只有4种字母,所以每个字母可以用2位二进制位表示,10个字母总共20个二进制位,可以直接用大小为 2^20 的数组存储并判重。
本来一个int型vis数组就可以搞定,可是leetcode显示MLE。所以就加了一个ck数组辅助vis进行记录。
// Finds every 10-letter DNA substring that occurs more than once.
//
// Each nucleotide is packed into 2 bits (A=0, T=1, C=2, G=3), so a 10-letter
// window becomes a 20-bit integer that indexes two flag tables directly:
//   vis[code] - the window has been seen at least once
//   ck[code]  - the window has already been appended to the result
class Solution {
// An enum keeps these constants usable as array bounds under any C++ standard.
enum {
kSeqLen = 10,                               // letters per window
kBitsPerBase = 2,                           // bits used to encode one letter
kCodeCount = 1 << (kSeqLen * kBitsPerBase), // number of distinct window codes
kMask = kCodeCount - 1                      // keeps the low 20 bits of a code
};

public:
// One slot per possible code, i.e. indices 0 .. kCodeCount - 1 inclusive.
// (Sizing these kCodeCount - 1 would put the all-'G' window out of bounds.)
bool vis[kCodeCount];
bool ck[kCodeCount];

// Returns each 10-letter substring of `s` that appears at least twice, in the
// order in which its second occurrence is encountered. Every repeated
// substring is reported exactly once. Strings shorter than 10 letters yield
// an empty result. Characters other than A/T/C/G are encoded like 'A'.
std::vector<std::string> findRepeatedDnaSequences(std::string s) {
std::vector<std::string> ans;
const std::string::size_type window = kSeqLen;
if (s.length() < window) return ans;

// Both tables are per-call scratch space; clear whatever a previous call left.
memset(vis, 0, sizeof vis);
memset(ck, 0, sizeof ck);

int code = 0;
for (std::string::size_type i = 0; i < s.length(); ++i) {
// Slide the window: drop the oldest letter (mask) and append the new one.
code = ((code << kBitsPerBase) & kMask) | encode(s[i]);
if (i + 1 < window) continue;  // the first full window ends at index 9

if (!vis[code]) {
vis[code] = true;
} else if (!ck[code]) {
// Second sighting: report it once and remember that it was reported.
ck[code] = true;
tostr(code, ans);
}
}
return ans;
}

// Decodes the low 20 bits of `x` back into its 10-letter sequence and
// appends that string to `ans`.
void tostr(int x, std::vector<std::string>& ans) {
static const char kBases[] = {'A', 'T', 'C', 'G'};
std::string str(static_cast<std::string::size_type>(kSeqLen), 'A');
// The lowest 2 bits hold the last letter, so fill from the back.
for (int i = kSeqLen - 1; i >= 0; --i) {
str[i] = kBases[x & 3];
x >>= kBitsPerBase;
}
ans.push_back(str);
}

private:
// Maps a nucleotide letter to its 2-bit code; anything unrecognised maps to 0.
static int encode(char c) {
switch (c) {
case 'T': return 1;
case 'C': return 2;
case 'G': return 3;
default:  return 0;  // 'A' and any unexpected character
}
}
};