Finish Week 4 Tasks
This commit is contained in:
parent
38c2044964
commit
fc9c624c67
737
Notebook.org
737
Notebook.org
@ -196,7 +196,7 @@ It's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(
|
||||
In /Escherichia Coli/ we plotted the result of our program:
|
||||
|
||||
#+CAPTION: Symbol array for Cytosine in E. Coli Genome]
|
||||
[[./Assets/e-coli.png]]
|
||||
[[./assets/e-coli.png]]
|
||||
|
||||
We can conclude that ori is located around position 4000000,
|
||||
because that's where the Cytosine concentration is the lowest,
|
||||
@ -229,7 +229,7 @@ def SkewArray(Genome):
|
||||
We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/:
|
||||
|
||||
#+CAPTION: Symbol array for Cytosine in E. Coli Genome]
|
||||
[[./Assets/skew_diagram.png]]
|
||||
[[./assets/skew_diagram.png]]
|
||||
|
||||
Ori should be located where the skew is at its minimum value.
|
||||
|
||||
@ -705,7 +705,740 @@ print(Score(Motifs))
|
||||
|
||||
Our algorithm is pretty fast, but it's not optimal, and that's just a
|
||||
characteristic of Greedy Algorithms, they trade optimality for speed.
|
||||
** Week 4
|
||||
|
||||
*** The circadian clock (II)
|
||||
|
||||
The French mathematician Laplace estimated the probability that the sun will not raise tomorrow, this approach plays an important role in statistics, we cannot simplify the probability of a low-probability event to zero.
|
||||
|
||||
**** Motif finding with pseudocounts
|
||||
|
||||
In order to account for this problem, bioinformaticians often substitute zeroes with small numbers called pseudocounts.
|
||||
|
||||
***** Exercise: Create a count matrix with pseudocounts
|
||||
|
||||
We are going to generate the count matrix, while adding 1 to each value as a pseudocount. As a starting point we'll use our Count(Motifs) function, and we'll tweak it to achieve our objective.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def CountWithPseudocounts(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
#+END_SRC
|
||||
|
||||
We only modified the third line of the function, by starting the count at 1 instead of 0. With this simple adjustement, we have solved the problem.
|
||||
|
||||
***** Exercise: Create a profile matrix with pseudocounts
|
||||
|
||||
Now that we have our count matrix, we can generate a profile matrix. We'll adjust our Profile(Motifs) function for this purpose.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def ProfileWithPseudocounts(Motifs):
|
||||
t = len(Motifs) + 4
|
||||
profile = CountWithPseudocounts(Motifs)
|
||||
for key, v in profile.items():
|
||||
v[:] = [x / t for x in v]
|
||||
return profile
|
||||
|
||||
|
||||
def CountWithPseudocounts(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
#+END_SRC
|
||||
|
||||
We have only modified the value of t, adding 4 because the total sum of each column is different. Now that we have pseudocounts, we initialize each cell to 1, and because we have 4 rows, the total sum is now t+4;
|
||||
|
||||
***** Exercise: An improved Greedy Motif Search algorithm
|
||||
|
||||
We have all the required functions to construct a Greedy Motif Search algorithm with pseudocounts. As with all the previous exercises, we'll start with our GreedyMotifSearch(Motifs) function.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def GreedyMotifSearchWithPseudocounts(Dna, k, t):
|
||||
BestMotifs = []
|
||||
for i in range(0, t):
|
||||
BestMotifs.append(Dna[i][0:k])
|
||||
n = len(Dna[0])
|
||||
for i in range(n - k + 1):
|
||||
Motifs = []
|
||||
Motifs.append(Dna[0][i : i + k])
|
||||
for j in range(1, t):
|
||||
P = ProfileWithPseudocounts(Motifs[0:j])
|
||||
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
|
||||
if Score(Motifs) < Score(BestMotifs):
|
||||
BestMotifs = Motifs
|
||||
return BestMotifs
|
||||
|
||||
|
||||
def ProfileWithPseudocounts(Motifs):
|
||||
t = len(Motifs) + 4
|
||||
profile = CountWithPseudocounts(Motifs)
|
||||
for key, v in profile.items():
|
||||
v[:] = [x / t for x in v]
|
||||
return profile
|
||||
|
||||
|
||||
def CountWithPseudocounts(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
def Score(Motifs):
|
||||
score = 0
|
||||
for i in range(len(Motifs)):
|
||||
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||
return score
|
||||
|
||||
|
||||
def Consensus(Motifs):
|
||||
consensus = ""
|
||||
count = Count(Motifs)
|
||||
k = len(Motifs[0])
|
||||
for j in range(k):
|
||||
m = 0
|
||||
frequentSymbol = ""
|
||||
for symbol in "ACGT":
|
||||
if count[symbol][j] > m:
|
||||
m = count[symbol][j]
|
||||
frequentSymbol = symbol
|
||||
consensus += frequentSymbol
|
||||
return consensus
|
||||
|
||||
|
||||
def Count(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {"A": [0] * k, "C": [0] * k, "G": [0] * k, "T": [0] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
def HammingDistance(p, q):
|
||||
count = 0
|
||||
for i in range(0, len(p)):
|
||||
if p[i] != q[i]:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def Profile(Motifs):
|
||||
t = len(Motifs)
|
||||
profile = Count(Motifs)
|
||||
for key, v in profile.items():
|
||||
v[:] = [x / t for x in v]
|
||||
return profile
|
||||
|
||||
|
||||
def ProfileMostProbableKmer(text, k, profile):
|
||||
kmer = ""
|
||||
keys = ["A", "C", "G", "T"]
|
||||
d = dict(zip(keys, profile))
|
||||
prob = -1
|
||||
for i in range(len(text) - k + 1):
|
||||
if Pr((text[i : i + k]), profile) > prob:
|
||||
prob = Pr(text[i : i + k], profile)
|
||||
kmer = text[i : i + k]
|
||||
return kmer
|
||||
|
||||
|
||||
def Pr(Text, Profile):
|
||||
probability = 1
|
||||
k = len(Text)
|
||||
for i in range(k):
|
||||
probability *= Profile[Text[i]][i]
|
||||
return probability
|
||||
|
||||
|
||||
Dna = [
|
||||
"GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
|
||||
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
|
||||
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
|
||||
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
|
||||
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
|
||||
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
|
||||
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
|
||||
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
|
||||
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
|
||||
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC",
|
||||
]
|
||||
|
||||
t = 10
|
||||
k = 15
|
||||
|
||||
Motifs = GreedyMotifSearchWithPseudocounts(Dna, k, t)
|
||||
|
||||
print(Motifs)
|
||||
print(Score(Motifs))
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
: ['GCAGACCTTTCGCCC', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTAGGTCAAACCGGG', 'CCGCTGGCGACGCTG', 'GGCCAACTGCACCGC', 'GTACATGTCCAGAGC', 'GGCAGCTGTCGGCAA', 'TCAGCACCATGACCG']
|
||||
: 86
|
||||
|
||||
All we had to do was replace the function call Profile(Motifs) with ProfileWithPseudocounts(Motif).
|
||||
|
||||
The performance of the new algorithm is better, but we are still not satisfied. We are going to try a different motif finding algorithm.
|
||||
|
||||
**** Randomized Motif Search
|
||||
|
||||
A randomized algorithm sounds like a very bad idea, luck doesn't seem to be a good scientific asset. Well, as nonintuitive as it might sound they are used in many cases.
|
||||
|
||||
We will consider *Monte Carlo algorithms* for our use case.
|
||||
|
||||
***** Exercise: Generate the most probable /k-mers/
|
||||
|
||||
We'll start off our pipeline by crafting a function that outputs the most probable /k-mers/ using a profile matrix, and a DNA sequence.
|
||||
|
||||
We'll reuse our ProfileMostProbableKmer and Pr functions:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def Motifs(Profile, Dna):
|
||||
kmer_list = []
|
||||
for sequence in Dna:
|
||||
kmer = ProfileMostProbableKmer(text=sequence, k=len(Profile), profile=Profile)
|
||||
kmer_list.append(kmer)
|
||||
return kmer_list
|
||||
|
||||
|
||||
|
||||
def ProfileMostProbableKmer(text, k, profile):
|
||||
kmer = ""
|
||||
keys = ["A", "C", "G", "T"]
|
||||
d = dict(zip(keys,profile))
|
||||
prob = -1
|
||||
for i in range(len(text)-k+1):
|
||||
if (Pr((text[i:i+k]), profile) > prob):
|
||||
prob = Pr(text[i:i+k], profile)
|
||||
kmer = text[i:i+k]
|
||||
return kmer
|
||||
|
||||
|
||||
def Pr(Text, Profile):
|
||||
probability = 1
|
||||
k = len(Text)
|
||||
for i in range(k):
|
||||
probability *= Profile[Text[i]][i]
|
||||
return probability
|
||||
#+END_SRC
|
||||
|
||||
We obtain a list of the most probable /k-mer/ for each DNA sequence.
|
||||
|
||||
A common approach to is starting from a collection of randomly chosen /k-mers/ Motifs, construct a profile matrix and use this matrix to generate a new collection of /k-mers/.
|
||||
|
||||
We'll iterate many times, hoping to get a better result and seeing if the score keeps improving. This is basically what a Randomized Motif Search does, now that we understand the underlying mechanism, we'll implement a random number generator.
|
||||
|
||||
***** Exercise: Implement a random /k-mer/ generator
|
||||
|
||||
In order to implement our random /k-mer/ generator, we need a way to generate random integers. Each integer will be used as the index for a character in our sequences.
|
||||
|
||||
Python's *random* module is our choice of tool for this exercise, specifically the *randint* function.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
import random
|
||||
|
||||
|
||||
def RandomMotifs(Dna, k, t):
|
||||
random_motifs = []
|
||||
t = len(Dna)
|
||||
l = len(Dna[0])
|
||||
for i in range(t):
|
||||
random_index = random.randint(1, l-k)
|
||||
random_motifs.append(Dna[i][random_index:random_index+k])
|
||||
return random_motifs
|
||||
#+END_SRC
|
||||
|
||||
We are ready to develop our Randomized Search algorithm, we just need to iterate over the generation of random motifs until we stop getting good results.
|
||||
|
||||
***** Exercise: Random Motif Search algorithm
|
||||
|
||||
Finally, we only have to assemble our functions to iterate through the random motifs while the score keeps improving.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
import random
|
||||
|
||||
|
||||
def RandomizedMotifSearch(Dna, k, t):
|
||||
M = RandomMotifs(Dna, k, t)
|
||||
BestMotifs = M
|
||||
while True:
|
||||
Profile = ProfileWithPseudocounts(M)
|
||||
M = Motifs(Profile, Dna)
|
||||
if Score(M) < Score(BestMotifs):
|
||||
BestMotifs = M
|
||||
else:
|
||||
return BestMotifs
|
||||
|
||||
|
||||
def RandomMotifs(Dna, k, t):
|
||||
random_motifs = []
|
||||
t = len(Dna)
|
||||
l = len(Dna[0])
|
||||
for i in range(t):
|
||||
random_index = random.randint(1, l-k)
|
||||
random_motifs.append(Dna[i][random_index:random_index+k])
|
||||
return random_motifs
|
||||
|
||||
|
||||
def Motifs(Profile, Dna):
|
||||
kmer_list = []
|
||||
for sequence in Dna:
|
||||
kmer = ProfileMostProbableKmer(text=sequence, k=len(Profile["A"]), profile=Profile)
|
||||
kmer_list.append(kmer)
|
||||
return kmer_list
|
||||
|
||||
|
||||
|
||||
def ProfileMostProbableKmer(text, k, profile):
|
||||
kmer = ""
|
||||
keys = ["A", "C", "G", "T"]
|
||||
d = dict(zip(keys,profile))
|
||||
prob = -1
|
||||
for i in range(len(text)-k+1):
|
||||
if (Pr((text[i:i+k]), profile) > prob):
|
||||
prob = Pr(text[i:i+k], profile)
|
||||
kmer = text[i:i+k]
|
||||
return kmer
|
||||
|
||||
|
||||
def Pr(Text, Profile):
|
||||
probability = 1
|
||||
k = len(Text)
|
||||
for i in range(k):
|
||||
probability *= Profile[Text[i]][i]
|
||||
return probability
|
||||
|
||||
|
||||
def ProfileWithPseudocounts(Motifs):
|
||||
t = len(Motifs) + 4
|
||||
profile = CountWithPseudocounts(Motifs)
|
||||
for key, v in profile.items():
|
||||
v[:] = [x / t for x in v]
|
||||
return profile
|
||||
|
||||
|
||||
def CountWithPseudocounts(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
|
||||
def Score(Motifs):
|
||||
score = 0
|
||||
for i in range(len(Motifs)):
|
||||
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||
return score
|
||||
|
||||
|
||||
def Consensus(Motifs):
|
||||
consensus = ""
|
||||
count = Count(Motifs)
|
||||
k = len(Motifs[0])
|
||||
for j in range(k):
|
||||
m = 0
|
||||
frequentSymbol = ""
|
||||
for symbol in "ACGT":
|
||||
if count[symbol][j] > m:
|
||||
m = count[symbol][j]
|
||||
frequentSymbol = symbol
|
||||
consensus += frequentSymbol
|
||||
return consensus
|
||||
|
||||
|
||||
def Count(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
def HammingDistance(p, q):
|
||||
count = 0
|
||||
for i in range(0, len(p)):
|
||||
if p[i] != q[i]:
|
||||
count += 1
|
||||
return count
|
||||
#+END_SRC
|
||||
|
||||
Each time that we execute our algorithm, we'll get a different solution. We're going to do an experiment by running our algorithm multiple times and checking the final score.
|
||||
***** Exercise: Evaluate performance of the algorithm
|
||||
|
||||
In order to evaluate the performance of this algorithm, we are going to run 100 iterations while keeping the motif with the greatest motif.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
import random
|
||||
Dna = ["GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
|
||||
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
|
||||
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
|
||||
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
|
||||
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
|
||||
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
|
||||
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
|
||||
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
|
||||
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
|
||||
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC"]
|
||||
t = 10
|
||||
k = 15
|
||||
N = 100
|
||||
|
||||
|
||||
def RandomizedMotifSearch(Dna, k, t):
|
||||
M = RandomMotifs(Dna, k, t)
|
||||
BestMotifs = M
|
||||
while True:
|
||||
Profile = ProfileWithPseudocounts(M)
|
||||
M = Motifs(Profile, Dna)
|
||||
if Score(M) < Score(BestMotifs):
|
||||
BestMotifs = M
|
||||
else:
|
||||
return BestMotifs
|
||||
|
||||
|
||||
def RandomMotifs(Dna, k, t):
|
||||
random_motifs = []
|
||||
t = len(Dna)
|
||||
l = len(Dna[0])
|
||||
for i in range(t):
|
||||
random_index = random.randint(1, l-k)
|
||||
random_motifs.append(Dna[i][random_index:random_index+k])
|
||||
return random_motifs
|
||||
|
||||
|
||||
def Motifs(Profile, Dna):
|
||||
kmer_list = []
|
||||
for sequence in Dna:
|
||||
kmer = ProfileMostProbableKmer(text=sequence, k=len(Profile["A"]), profile=Profile)
|
||||
kmer_list.append(kmer)
|
||||
return kmer_list
|
||||
|
||||
|
||||
|
||||
def ProfileMostProbableKmer(text, k, profile):
|
||||
kmer = ""
|
||||
keys = ["A", "C", "G", "T"]
|
||||
d = dict(zip(keys,profile))
|
||||
prob = -1
|
||||
for i in range(len(text)-k+1):
|
||||
if (Pr((text[i:i+k]), profile) > prob):
|
||||
prob = Pr(text[i:i+k], profile)
|
||||
kmer = text[i:i+k]
|
||||
return kmer
|
||||
|
||||
|
||||
def Pr(Text, Profile):
|
||||
probability = 1
|
||||
k = len(Text)
|
||||
for i in range(k):
|
||||
probability *= Profile[Text[i]][i]
|
||||
return probability
|
||||
|
||||
|
||||
def ProfileWithPseudocounts(Motifs):
|
||||
t = len(Motifs) + 4
|
||||
profile = CountWithPseudocounts(Motifs)
|
||||
for key, v in profile.items():
|
||||
v[:] = [x / t for x in v]
|
||||
return profile
|
||||
|
||||
|
||||
def CountWithPseudocounts(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
|
||||
def Score(Motifs):
|
||||
score = 0
|
||||
for i in range(len(Motifs)):
|
||||
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||
return score
|
||||
|
||||
|
||||
def Consensus(Motifs):
|
||||
consensus = ""
|
||||
count = Count(Motifs)
|
||||
k = len(Motifs[0])
|
||||
for j in range(k):
|
||||
m = 0
|
||||
frequentSymbol = ""
|
||||
for symbol in "ACGT":
|
||||
if count[symbol][j] > m:
|
||||
m = count[symbol][j]
|
||||
frequentSymbol = symbol
|
||||
consensus += frequentSymbol
|
||||
return consensus
|
||||
|
||||
|
||||
def Count(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
def HammingDistance(p, q):
|
||||
count = 0
|
||||
for i in range(0, len(p)):
|
||||
if p[i] != q[i]:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
BestMotifs = RandomizedMotifSearch(Dna, k, t)
|
||||
for i in range(N):
|
||||
m = RandomizedMotifSearch(Dna, k, t)
|
||||
if Score(m) < Score(BestMotifs):
|
||||
BestMotifs = m
|
||||
|
||||
print(BestMotifs)
|
||||
print(Score(BestMotifs))
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
: ['GCGGTGGTCGGATTT', 'CGTGGTCGTACCTTC', 'AACGGGCCGCCGGTC', 'GCCAGGGCGCCGGGC', 'CGGGAAGGGGCGAAA', 'GTTACGGCGATTACC', 'GCTCTTGATGACATC', 'TTCCGGGTGGTGACA', 'GTTGCGTCTGTCATC', 'ACCACTGATCCAAGT']
|
||||
: 75
|
||||
|
||||
As we can see, this algorithm finds a pretty good solution.
|
||||
|
||||
**** Gibbs Sampling
|
||||
|
||||
Thanks to our previous experiments, we now consider random search algorithms as adequate tools for our problem. The caveat of our algorithm is that it discards all previous motifs in each iteration.
|
||||
|
||||
A more cautious alternative is Gibbs Sampling, which discards a single /k-mer/ from the current set of motifs at each iteration.
|
||||
|
||||
***** Exercise: Normalize the probabilities
|
||||
|
||||
The algorithm chooses randomly at each iteration which /k-mer/ will be dropped, and it chooses the replacement of that /k-mer/.
|
||||
|
||||
We use pseudocounts to generate the next profile matrix, which gives us some extravagant probabilities. We need to normalize them, so their sum equals 1.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def Normalize(Probabilities):
|
||||
d = {}
|
||||
for k,v in Probabilities.items():
|
||||
d[k] = Probabilities[k]/sum(Probabilities.values())
|
||||
return d
|
||||
#+END_SRC
|
||||
|
||||
***** Exercise: Simulate rolling a die
|
||||
|
||||
Now that our probabilities are normalized, we can focus on simulating a weighted die.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
import random
|
||||
|
||||
def WeightedDie(Probabilities):
|
||||
count = 0
|
||||
p = random.uniform(0,1)
|
||||
for k,v in Probabilities.items():
|
||||
count = count+v
|
||||
if p < count:
|
||||
return k
|
||||
#+END_SRC
|
||||
|
||||
Our function returns a single element, in the next step we'll make it a subroutine of a larger function.
|
||||
|
||||
***** Exercise: Generate a k-mer
|
||||
|
||||
We have all the necessary tools to generate a /k-mer/ based on the profile matrix.
|
||||
|
||||
#+BEGIN_SRC python
|
||||
import random
|
||||
|
||||
|
||||
def Pr(Text, Profile):
|
||||
probability = 1
|
||||
k = len(Text)
|
||||
for i in range(k):
|
||||
probability *= Profile[Text[i]][i]
|
||||
return probability
|
||||
|
||||
|
||||
def Normalize(Probabilities):
|
||||
d = {}
|
||||
for k,v in Probabilities.items():
|
||||
d[k] = Probabilities[k]/sum(Probabilities.values())
|
||||
return d
|
||||
|
||||
|
||||
def WeightedDie(Probabilities):
|
||||
count = 0
|
||||
p = random.uniform(0,1)
|
||||
for k,v in Probabilities.items():
|
||||
count = count+v
|
||||
if p < count:
|
||||
return k
|
||||
|
||||
|
||||
def ProfileGeneratedString(Text, profile, k):
|
||||
n = len(Text)
|
||||
probabilities = {}
|
||||
for i in range(0,n-k+1):
|
||||
probabilities[Text[i:i+k]] = Pr(Text[i:i+k], profile)
|
||||
probabilities = Normalize(probabilities)
|
||||
return WeightedDie(probabilities)
|
||||
#+END_SRC
|
||||
***** Exercise: Implement the Gibbs Sampling algorithm
|
||||
|
||||
Finally, we can implement the Gibbs Sampling algorithm:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
import random
|
||||
|
||||
def GibbsSampler(Dna, k, t, N):
|
||||
Motifs = RandomMotifs(Dna, k ,t)
|
||||
BestMotifs = Motifs[:]
|
||||
for i in range(N):
|
||||
p = random.randint(0,t-1)
|
||||
del Motifs[p]
|
||||
profile = ProfileWithPseudocounts(Motifs)
|
||||
Motifs.insert(p, ProfileGeneratedString(Dna[p], profile, k))
|
||||
if Score(Motifs) < Score(BestMotifs):
|
||||
BestMotifs = Motifs
|
||||
return BestMotifs
|
||||
|
||||
|
||||
def RandomMotifs(Dna, k, t):
|
||||
random_motifs = []
|
||||
t = len(Dna)
|
||||
l = len(Dna[0])
|
||||
for i in range(t):
|
||||
random_index = random.randint(1, l-k)
|
||||
random_motifs.append(Dna[i][random_index:random_index+k])
|
||||
return random_motifs
|
||||
|
||||
|
||||
def ProfileWithPseudocounts(Motifs):
|
||||
t = len(Motifs) + 4
|
||||
profile = CountWithPseudocounts(Motifs)
|
||||
for key, v in profile.items():
|
||||
v[:] = [x / t for x in v]
|
||||
return profile
|
||||
|
||||
|
||||
def CountWithPseudocounts(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
def Pr(Text, Profile):
|
||||
probability = 1
|
||||
k = len(Text)
|
||||
for i in range(k):
|
||||
probability *= Profile[Text[i]][i]
|
||||
return probability
|
||||
|
||||
|
||||
def Normalize(Probabilities):
|
||||
d = {}
|
||||
for k,v in Probabilities.items():
|
||||
d[k] = Probabilities[k]/sum(Probabilities.values())
|
||||
return d
|
||||
|
||||
|
||||
def WeightedDie(Probabilities):
|
||||
count = 0
|
||||
p = random.uniform(0,1)
|
||||
for k,v in Probabilities.items():
|
||||
count = count+v
|
||||
if p < count:
|
||||
return k
|
||||
|
||||
|
||||
def ProfileGeneratedString(Text, profile, k):
|
||||
n = len(Text)
|
||||
probabilities = {}
|
||||
for i in range(0,n-k+1):
|
||||
probabilities[Text[i:i+k]] = Pr(Text[i:i+k], profile)
|
||||
probabilities = Normalize(probabilities)
|
||||
return WeightedDie(probabilities)
|
||||
|
||||
|
||||
def Score(Motifs):
|
||||
score = 0
|
||||
for i in range(len(Motifs)):
|
||||
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||
return score
|
||||
|
||||
|
||||
def Consensus(Motifs):
|
||||
consensus = ""
|
||||
count = Count(Motifs)
|
||||
k = len(Motifs[0])
|
||||
for j in range(k):
|
||||
m = 0
|
||||
frequentSymbol = ""
|
||||
for symbol in "ACGT":
|
||||
if count[symbol][j] > m:
|
||||
m = count[symbol][j]
|
||||
frequentSymbol = symbol
|
||||
consensus += frequentSymbol
|
||||
return consensus
|
||||
|
||||
|
||||
def Count(Motifs):
|
||||
k = len(Motifs[0])
|
||||
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||
t = len(Motifs)
|
||||
for i in range(t):
|
||||
for j in range(k):
|
||||
symbol = Motifs[i][j]
|
||||
count[symbol][j] += 1
|
||||
return count
|
||||
|
||||
|
||||
def HammingDistance(p, q):
|
||||
count = 0
|
||||
for i in range(0, len(p)):
|
||||
if p[i] != q[i]:
|
||||
count += 1
|
||||
return count
|
||||
#+END_SRC
|
||||
|
||||
** Vocabulary
|
||||
|
||||
- k-mer: subsquences of length /k/ in a biological sequence
|
||||
- Frequency map: sequence --> frequency of the sequence
|
||||
|
Loading…
Reference in New Issue
Block a user