All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", Return: ["AAAAACCCCC", "CCCCCAAAAA"].-----------------------------------------------------
Solution #1, unordered_map
class Solution {
public:
vector<string> findRepeatedDnaSequences(string s) {
vector<string> rt;
hash<string> hash_fn;
if (s.length() < 11) return rt;
unordered_map<int,int> map;
for (int i = 0; i < s.length() - 9; i++) {
string sub = s.substr(i,10);
int hashN = hash_fn(sub);
if (map.find(hashN) == map.end()) {
map[hashN] = 1;
}else if (map[hashN] == 1){
rt.push_back(sub);
map[hashN]++;
}
}
return rt;
}
};
Solution #2, convert ACGT to 4-bit representation, so the space cost of one key is reduced from 10 * 8 bits to 2 bits public:
vector<string> findRepeatedDnaSequences(string s) {
vector<string> rt;
if (s.length() < 11) return rt;
unordered_map<char,int> map;
map['A'] = 0;
map['C'] = 1;
map['G'] = 2;
map['T'] = 3;
unordered_map<int,int> count;
unsigned hash = 0;
for (int i = 0; i < s.length(); i++) {
hash <<= 2;
hash += map[s[i]];
if (i > 8) {
hash &= (1 << 20) - 1;
if (count.find(hash) == count.end()) {
count[hash] = 1;
}else if (count[hash] == 1) {
count[hash]++;
rt.push_back(s.substr(i - 9,10));
}
}
}
return rt;
}
};
another solution https://leetcode.com/discuss/24478/i-did-it-in-10-lines-of-c
No comments:
Post a Comment