All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", the function returns ["AAAAACCCCC", "CCCCCAAAAA"].
注意用long来储存编码,防止溢出。还可以用bitwise来存储,一个int就可以,在此不提
/**
 * Finds every 10-character substring that occurs more than once in a DNA string.
 *
 * <p>Windows made up purely of the nucleotides A, C, G and T are tracked by a rolling
 * base-10 code (A=0, C=1, G=2, T=3, one decimal digit per character). A full window
 * code can be as large as 3,333,333,333, which exceeds {@code Integer.MAX_VALUE}, so the
 * code is kept in a {@code long}.
 *
 * <p>Windows that contain any other character cannot be encoded without colliding with a
 * real nucleotide, so they are compared as plain substrings instead.
 */
public class Solution {
    /** Length of the sequences being searched for. */
    private static final int SEQUENCE_LENGTH = 10;

    /** Place value of the oldest digit in a full window code: 10^(SEQUENCE_LENGTH - 1). */
    private static final long LEADING_DIGIT_WEIGHT = 1_000_000_000L;

    /**
     * Maps a nucleotide to its decimal digit.
     *
     * @param nucleotide the character to encode
     * @return 0, 1, 2 or 3 for 'A', 'C', 'G' or 'T'; -1 for any other character
     */
    private static int digitOf(char nucleotide) {
        switch (nucleotide) {
            case 'A':
                return 0;
            case 'C':
                return 1;
            case 'G':
                return 2;
            case 'T':
                return 3;
            default:
                // Signal "not encodable" instead of silently aliasing to 'A'.
                return -1;
        }
    }

    /**
     * Advances the rolling code by one character: drops the oldest digit (if the code
     * already holds a full window) and appends {@code digit} as the newest one.
     *
     * @param code  the current window code
     * @param digit the digit of the character entering the window (0-3)
     * @return the code of the window after the shift
     */
    private static long slide(long code, int digit) {
        // Pure integer arithmetic; the modulo is a no-op while fewer than 10 digits are held.
        return (code % LEADING_DIGIT_WEIGHT) * 10 + digit;
    }

    /**
     * Records one sighting of {@code key} and reports whether it is exactly the second one.
     * The map value is {@code FALSE} after the first sighting and {@code TRUE} once the key
     * has been reported, so every repeated key is reported only once.
     *
     * @param reported sighting state per key; updated by this call
     * @param key      the window identity that was just seen
     * @return {@code true} only on the second sighting of {@code key}
     */
    private static <K> boolean isSecondSighting(Map<K, Boolean> reported, K key) {
        Boolean alreadyReported = reported.get(key);
        if (alreadyReported == null) {
            reported.put(key, Boolean.FALSE);
            return false;
        }
        if (alreadyReported) {
            return false;
        }
        reported.put(key, Boolean.TRUE);
        return true;
    }

    /**
     * Returns all 10-character substrings of {@code s} that occur more than once.
     *
     * @param s the DNA string; may be {@code null}
     * @return each repeated substring exactly once, ordered by the position at which its
     *         second occurrence ends; an empty list when {@code s} is {@code null} or has
     *         at most 10 characters
     */
    public List<String> findRepeatedDnaSequences(String s) {
        List<String> result = new ArrayList<>();
        if (s == null || s.length() <= SEQUENCE_LENGTH) {
            return result;
        }

        Map<Long, Boolean> encodedWindows = new HashMap<>();
        Map<String, Boolean> rawWindows = new HashMap<>();
        long code = 0;
        // Number of consecutive A/C/G/T characters ending at the current position.
        int validRun = 0;

        for (int end = 0; end < s.length(); end++) {
            int digit = digitOf(s.charAt(end));
            if (digit < 0) {
                // Restart the rolling code after a character that cannot be encoded.
                validRun = 0;
                code = 0;
            } else {
                validRun++;
                code = slide(code, digit);
            }

            int start = end - SEQUENCE_LENGTH + 1;
            if (start < 0) {
                continue; // the first full window is not complete yet
            }

            boolean repeated;
            if (validRun >= SEQUENCE_LENGTH) {
                // The whole window is valid DNA, so its code identifies it uniquely.
                repeated = isSecondSighting(encodedWindows, Long.valueOf(code));
            } else {
                // The window contains a non-nucleotide: compare the text itself.
                repeated = isSecondSighting(rawWindows, s.substring(start, end + 1));
            }
            if (repeated) {
                result.add(s.substring(start, end + 1));
            }
        }
        return result;
    }
}