Finish Week 4 Tasks

coolneng 2020-10-10 23:28:20 +02:00
parent 38c2044964
commit fc9c624c67
Signed by: coolneng
GPG Key ID: 9893DA236405AF57


@@ -196,7 +196,7 @@ It's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(
In /Escherichia Coli/ we plotted the result of our program:
#+CAPTION: Symbol array for Cytosine in E. Coli Genome
-[[./Assets/e-coli.png]]
+[[./assets/e-coli.png]]
We can conclude that ori is located around position 4000000,
because that's where the Cytosine concentration is the lowest,
@@ -229,7 +229,7 @@ def SkewArray(Genome):
We can see the utility of a Skew Diagram by looking at the one from /Escherichia Coli/:
#+CAPTION: Skew diagram for E. Coli Genome
-[[./Assets/skew_diagram.png]]
+[[./assets/skew_diagram.png]]
Ori should be located where the skew is at its minimum value.
@@ -705,7 +705,740 @@ print(Score(Motifs))
Our algorithm is pretty fast, but it's not optimal; that's just a
characteristic of Greedy Algorithms: they trade optimality for speed.
** Week 4
*** The circadian clock (II)
The French mathematician Laplace estimated the probability that the sun will not rise tomorrow. This approach plays an important role in statistics: we cannot simply round the probability of an unlikely event down to zero.
**** Motif finding with pseudocounts
In order to account for this problem, bioinformaticians often substitute zeroes with small numbers called pseudocounts.
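To see why a zero is so harmful, remember that Pr multiplies the per-position probabilities: a single zero drags the probability of the entire /k-mer/ down to zero, no matter how well the other positions match. Here is a minimal sketch with a made-up 2-column profile in which T never occurs in the first column:
#+BEGIN_SRC python
# Hypothetical profile for 2-mers; each column sums to 1, but T has probability 0 in column 0
profile = {
    "A": [0.5, 0.25],
    "C": [0.25, 0.25],
    "G": [0.25, 0.25],
    "T": [0.0, 0.25],
}

def Pr(Text, Profile):
    # Multiply the per-position probabilities of each symbol
    probability = 1
    for i in range(len(Text)):
        probability *= Profile[Text[i]][i]
    return probability

print(Pr("AA", profile))  # 0.125
print(Pr("TA", profile))  # 0.0: the single zero wipes out the whole product
#+END_SRC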
***** Exercise: Create a count matrix with pseudocounts
We are going to generate the count matrix, while adding 1 to each value as a pseudocount. As a starting point we'll use our Count(Motifs) function, and we'll tweak it to achieve our objective.
#+BEGIN_SRC python
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
#+END_SRC
We only modified the third line of the function, starting the count at 1 instead of 0. With this simple adjustment, we have solved the problem.
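As a quick sanity check, we can run the function on a small, made-up motif set; since every entry starts at 1, no cell of the matrix can be 0:
#+BEGIN_SRC python
def CountWithPseudocounts(Motifs):
    k = len(Motifs[0])
    count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count

Motifs = ["ACGT", "ACGA", "ACGT"]  # toy motifs, not from the dataset
print(CountWithPseudocounts(Motifs))
# {'A': [4, 1, 1, 2], 'C': [1, 4, 1, 1], 'G': [1, 1, 4, 1], 'T': [1, 1, 1, 3]}
#+END_SRC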
***** Exercise: Create a profile matrix with pseudocounts
Now that we have our count matrix, we can generate a profile matrix. We'll adjust our Profile(Motifs) function for this purpose.
#+BEGIN_SRC python
def ProfileWithPseudocounts(Motifs):
t = len(Motifs) + 4
profile = CountWithPseudocounts(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
#+END_SRC
We have only modified the value of t, adding 4 because the total sum of each column has changed: every cell now starts at 1 and there are 4 rows, so each column sums to t + 4 instead of t.
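We can verify the normalization on the same toy motif set: with t = 3 motifs, every count column sums to 3 + 4 = 7, so dividing by t + 4 brings every profile column back to 1.
#+BEGIN_SRC python
def CountWithPseudocounts(Motifs):
    k = len(Motifs[0])
    count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count

def ProfileWithPseudocounts(Motifs):
    t = len(Motifs) + 4
    profile = CountWithPseudocounts(Motifs)
    for key, v in profile.items():
        v[:] = [x / t for x in v]
    return profile

Motifs = ["ACGT", "ACGA", "ACGT"]  # toy motifs
profile = ProfileWithPseudocounts(Motifs)
for j in range(len(Motifs[0])):
    # Every column sums to 1, up to floating-point rounding
    print(sum(profile[symbol][j] for symbol in "ACGT"))
#+END_SRC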
***** Exercise: An improved Greedy Motif Search algorithm
We have all the required functions to construct a Greedy Motif Search algorithm with pseudocounts. As with all the previous exercises, we'll start with our GreedyMotifSearch(Motifs) function.
#+BEGIN_SRC python
def GreedyMotifSearchWithPseudocounts(Dna, k, t):
    # Start with the first k-mer of each sequence as the best motifs so far
    BestMotifs = []
    for i in range(0, t):
        BestMotifs.append(Dna[i][0:k])
    n = len(Dna[0])
    for i in range(n - k + 1):
        # Seed the candidate motifs with a k-mer from the first sequence
        Motifs = []
        Motifs.append(Dna[0][i : i + k])
        for j in range(1, t):
            # Build a profile with pseudocounts from the motifs collected so far
            P = ProfileWithPseudocounts(Motifs[0:j])
            Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
        if Score(Motifs) < Score(BestMotifs):
            BestMotifs = Motifs
    return BestMotifs
def ProfileWithPseudocounts(Motifs):
t = len(Motifs) + 4
profile = CountWithPseudocounts(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {"A": [0] * k, "C": [0] * k, "G": [0] * k, "T": [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
def Profile(Motifs):
t = len(Motifs)
profile = Count(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys, profile))
prob = -1
for i in range(len(text) - k + 1):
if Pr((text[i : i + k]), profile) > prob:
prob = Pr(text[i : i + k], profile)
kmer = text[i : i + k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
Dna = [
"GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC",
]
t = 10
k = 15
Motifs = GreedyMotifSearchWithPseudocounts(Dna, k, t)
print(Motifs)
print(Score(Motifs))
#+END_SRC
#+RESULTS:
: ['GCAGACCTTTCGCCC', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTAGGTCAAACCGGG', 'CCGCTGGCGACGCTG', 'GGCCAACTGCACCGC', 'GTACATGTCCAGAGC', 'GGCAGCTGTCGGCAA', 'TCAGCACCATGACCG']
: 86
All we had to do was replace the call to Profile(Motifs) with ProfileWithPseudocounts(Motifs).
The performance of the new algorithm is better, but we are still not satisfied. We are going to try a different motif finding algorithm.
**** Randomized Motif Search
A randomized algorithm sounds like a very bad idea; luck doesn't seem to be a good scientific asset. Yet, as counterintuitive as it might sound, randomized algorithms are used in many cases.
We will consider *Monte Carlo algorithms* for our use case.
***** Exercise: Generate the most probable /k-mers/
We'll start off our pipeline by crafting a function that outputs the most probable /k-mers/ using a profile matrix, and a DNA sequence.
We'll reuse our ProfileMostProbableKmer and Pr functions:
#+BEGIN_SRC python
def Motifs(Profile, Dna):
kmer_list = []
for sequence in Dna:
        kmer = ProfileMostProbableKmer(text=sequence, k=len(Profile["A"]), profile=Profile)  # k is the length of a profile row, not the number of rows
kmer_list.append(kmer)
return kmer_list
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys,profile))
prob = -1
for i in range(len(text)-k+1):
if (Pr((text[i:i+k]), profile) > prob):
prob = Pr(text[i:i+k], profile)
kmer = text[i:i+k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
#+END_SRC
We obtain a list containing the most probable /k-mer/ for each DNA sequence.
A common approach is to start from a collection of randomly chosen /k-mers/ Motifs, construct a profile matrix, and use this matrix to generate a new collection of /k-mers/.
We'll iterate many times, hoping to get a better result, and check whether the score keeps improving. This is basically what a Randomized Motif Search does. Now that we understand the underlying mechanism, we'll implement a random number generator.
***** Exercise: Implement a random /k-mer/ generator
In order to implement our random /k-mer/ generator, we need a way to generate random integers. Each integer will be used as the index for a character in our sequences.
Python's *random* module is our choice of tool for this exercise, specifically the *randint* function.
#+BEGIN_SRC python
import random
def RandomMotifs(Dna, k, t):
random_motifs = []
t = len(Dna)
l = len(Dna[0])
for i in range(t):
        # randint is inclusive on both ends, so 0 .. l-k covers every valid starting index
        random_index = random.randint(0, l - k)
random_motifs.append(Dna[i][random_index:random_index+k])
return random_motifs
#+END_SRC
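A quick run on a toy input shows the shape of the output: one random /k-mer/ per sequence (the strings below are made up, and the exact result changes on every run):
#+BEGIN_SRC python
import random

def RandomMotifs(Dna, k, t):
    random_motifs = []
    t = len(Dna)
    l = len(Dna[0])
    for i in range(t):
        random_index = random.randint(0, l - k)
        random_motifs.append(Dna[i][random_index:random_index+k])
    return random_motifs

Dna = ["AACCGGTT", "ACACACAC", "GGGGTTTT"]  # toy sequences
print(RandomMotifs(Dna, k=3, t=3))  # e.g. ['CGG', 'ACA', 'GTT']
#+END_SRC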
We are ready to develop our Randomized Motif Search algorithm: we just need to keep generating new motifs from the latest profile until the score stops improving.
***** Exercise: Random Motif Search algorithm
Finally, we only have to assemble our functions to iterate through the random motifs while the score keeps improving.
#+BEGIN_SRC python
import random
def RandomizedMotifSearch(Dna, k, t):
    # Start from a random collection of k-mers
    M = RandomMotifs(Dna, k, t)
    BestMotifs = M
    while True:
        # Build a profile from the current motifs and pick the most probable k-mers from it
        Profile = ProfileWithPseudocounts(M)
        M = Motifs(Profile, Dna)
        if Score(M) < Score(BestMotifs):
            BestMotifs = M
        else:
            # Stop as soon as the score no longer improves
            return BestMotifs
def RandomMotifs(Dna, k, t):
random_motifs = []
t = len(Dna)
l = len(Dna[0])
for i in range(t):
        random_index = random.randint(0, l - k)
random_motifs.append(Dna[i][random_index:random_index+k])
return random_motifs
def Motifs(Profile, Dna):
kmer_list = []
for sequence in Dna:
kmer = ProfileMostProbableKmer(text=sequence, k=len(Profile["A"]), profile=Profile)
kmer_list.append(kmer)
return kmer_list
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys,profile))
prob = -1
for i in range(len(text)-k+1):
if (Pr((text[i:i+k]), profile) > prob):
prob = Pr(text[i:i+k], profile)
kmer = text[i:i+k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
def ProfileWithPseudocounts(Motifs):
t = len(Motifs) + 4
profile = CountWithPseudocounts(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
#+END_SRC
Each time that we execute our algorithm, we'll get a different solution. We're going to do an experiment by running our algorithm multiple times and checking the final score.
***** Exercise: Evaluate performance of the algorithm
In order to evaluate the performance of this algorithm, we are going to run it 100 times and keep the motifs with the best (lowest) score.
#+BEGIN_SRC python
import random
Dna = ["GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC"]
t = 10
k = 15
N = 100
def RandomizedMotifSearch(Dna, k, t):
M = RandomMotifs(Dna, k, t)
BestMotifs = M
while True:
Profile = ProfileWithPseudocounts(M)
M = Motifs(Profile, Dna)
if Score(M) < Score(BestMotifs):
BestMotifs = M
else:
return BestMotifs
def RandomMotifs(Dna, k, t):
random_motifs = []
t = len(Dna)
l = len(Dna[0])
for i in range(t):
        random_index = random.randint(0, l - k)
random_motifs.append(Dna[i][random_index:random_index+k])
return random_motifs
def Motifs(Profile, Dna):
kmer_list = []
for sequence in Dna:
kmer = ProfileMostProbableKmer(text=sequence, k=len(Profile["A"]), profile=Profile)
kmer_list.append(kmer)
return kmer_list
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys,profile))
prob = -1
for i in range(len(text)-k+1):
if (Pr((text[i:i+k]), profile) > prob):
prob = Pr(text[i:i+k], profile)
kmer = text[i:i+k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
def ProfileWithPseudocounts(Motifs):
t = len(Motifs) + 4
profile = CountWithPseudocounts(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
BestMotifs = RandomizedMotifSearch(Dna, k, t)
for i in range(N):
m = RandomizedMotifSearch(Dna, k, t)
if Score(m) < Score(BestMotifs):
BestMotifs = m
print(BestMotifs)
print(Score(BestMotifs))
#+END_SRC
#+RESULTS:
: ['GCGGTGGTCGGATTT', 'CGTGGTCGTACCTTC', 'AACGGGCCGCCGGTC', 'GCCAGGGCGCCGGGC', 'CGGGAAGGGGCGAAA', 'GTTACGGCGATTACC', 'GCTCTTGATGACATC', 'TTCCGGGTGGTGACA', 'GTTGCGTCTGTCATC', 'ACCACTGATCCAAGT']
: 75
As we can see, this algorithm finds a pretty good solution.
**** Gibbs Sampling
Thanks to our previous experiments, we now consider random search algorithms as adequate tools for our problem. The caveat of our algorithm is that it discards all previous motifs in each iteration.
A more cautious alternative is Gibbs Sampling, which discards a single /k-mer/ from the current set of motifs at each iteration.
***** Exercise: Normalize the probabilities
At each iteration, the algorithm randomly chooses which /k-mer/ will be dropped, and then randomly chooses a replacement for that /k-mer/.
We use pseudocounts to generate the next profile matrix, so the probabilities of the candidate /k-mers/ no longer add up to 1. We need to normalize them so that their sum equals 1.
#+BEGIN_SRC python
def Normalize(Probabilities):
d = {}
for k,v in Probabilities.items():
d[k] = Probabilities[k]/sum(Probabilities.values())
return d
#+END_SRC
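For instance, two hypothetical /k-mer/ probabilities that sum to 0.5 are rescaled so that they sum to 1:
#+BEGIN_SRC python
def Normalize(Probabilities):
    d = {}
    for k, v in Probabilities.items():
        d[k] = Probabilities[k] / sum(Probabilities.values())
    return d

# Made-up, unnormalized probabilities for two candidate k-mers
print(Normalize({"ACG": 0.125, "CGT": 0.375}))  # {'ACG': 0.25, 'CGT': 0.75}
#+END_SRC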
***** Exercise: Simulate rolling a die
Now that our probabilities are normalized, we can focus on simulating a weighted die.
#+BEGIN_SRC python
import random
def WeightedDie(Probabilities):
count = 0
p = random.uniform(0,1)
for k,v in Probabilities.items():
count = count+v
if p < count:
return k
#+END_SRC
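Rolling the die many times on a hypothetical, already normalized distribution should return each key with roughly its assigned frequency; a quick check:
#+BEGIN_SRC python
import random

def WeightedDie(Probabilities):
    count = 0
    p = random.uniform(0, 1)
    for k, v in Probabilities.items():
        count = count + v
        if p < count:
            return k

# Made-up normalized probabilities
probabilities = {"ACG": 0.25, "CGT": 0.75}
rolls = [WeightedDie(probabilities) for _ in range(10000)]
print(rolls.count("ACG") / 10000)  # roughly 0.25
#+END_SRC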
Our function returns a single element; in the next step we'll make it a subroutine of a larger function.
***** Exercise: Generate a k-mer
We have all the necessary tools to generate a /k-mer/ based on the profile matrix.
#+BEGIN_SRC python
import random
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
def Normalize(Probabilities):
d = {}
for k,v in Probabilities.items():
d[k] = Probabilities[k]/sum(Probabilities.values())
return d
def WeightedDie(Probabilities):
count = 0
p = random.uniform(0,1)
for k,v in Probabilities.items():
count = count+v
if p < count:
return k
def ProfileGeneratedString(Text, profile, k):
n = len(Text)
probabilities = {}
for i in range(0,n-k+1):
probabilities[Text[i:i+k]] = Pr(Text[i:i+k], profile)
probabilities = Normalize(probabilities)
return WeightedDie(probabilities)
#+END_SRC
***** Exercise: Implement the Gibbs Sampling algorithm
Finally, we can implement the Gibbs Sampling algorithm:
#+BEGIN_SRC python
import random
def GibbsSampler(Dna, k, t, N):
    # Start from a random collection of k-mers
    Motifs = RandomMotifs(Dna, k, t)
    BestMotifs = Motifs[:]
    for i in range(N):
        # Discard one randomly chosen k-mer and resample it from the profile of the rest
        p = random.randint(0, t - 1)
        del Motifs[p]
        profile = ProfileWithPseudocounts(Motifs)
        Motifs.insert(p, ProfileGeneratedString(Dna[p], profile, k))
        if Score(Motifs) < Score(BestMotifs):
            BestMotifs = Motifs[:]  # copy the list, since Motifs keeps being mutated in later iterations
    return BestMotifs
def RandomMotifs(Dna, k, t):
random_motifs = []
t = len(Dna)
l = len(Dna[0])
for i in range(t):
        random_index = random.randint(0, l - k)
random_motifs.append(Dna[i][random_index:random_index+k])
return random_motifs
def ProfileWithPseudocounts(Motifs):
t = len(Motifs) + 4
profile = CountWithPseudocounts(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def CountWithPseudocounts(Motifs):
k = len(Motifs[0])
count = {'A': [1]*k, 'C': [1]*k, 'G': [1]*k, 'T': [1] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
def Normalize(Probabilities):
d = {}
for k,v in Probabilities.items():
d[k] = Probabilities[k]/sum(Probabilities.values())
return d
def WeightedDie(Probabilities):
count = 0
p = random.uniform(0,1)
for k,v in Probabilities.items():
count = count+v
if p < count:
return k
def ProfileGeneratedString(Text, profile, k):
n = len(Text)
probabilities = {}
for i in range(0,n-k+1):
probabilities[Text[i:i+k]] = Pr(Text[i:i+k], profile)
probabilities = Normalize(probabilities)
return WeightedDie(probabilities)
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
#+END_SRC
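To try the sampler on the same dataset we used for RandomizedMotifSearch, we can run it with a few random restarts and keep the best-scoring motifs. This is only a sketch: it assumes the Dna list, k = 15 and t = 10 from the previous experiment, together with the definitions above, are loaded in the same session, and the resulting motifs and score vary between runs.
#+BEGIN_SRC python
# Several restarts of the Gibbs sampler, keeping the best result,
# since a single run can get stuck in a poor local optimum
N = 100        # resampling steps per run
restarts = 20  # independent random restarts
BestMotifs = GibbsSampler(Dna, k, t, N)
for _ in range(restarts):
    m = GibbsSampler(Dna, k, t, N)
    if Score(m) < Score(BestMotifs):
        BestMotifs = m
print(BestMotifs)
print(Score(BestMotifs))
#+END_SRC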
** Vocabulary
- k-mer: a subsequence of length /k/ in a biological sequence
- Frequency map: sequence --> frequency of the sequence