Finish Week 3 Tasks using Literal Programming
This commit is contained in:
parent
f75408cb6b
commit
fe23029144
375
Notebook.org
375
Notebook.org
@ -116,11 +116,382 @@
|
|||||||
** Week 3
|
** Week 3
|
||||||
|
|
||||||
*** The circadian clock
|
*** The circadian clock
|
||||||
|
|
||||||
Variation in gene expression permits the cell to keep track of time.
|
Variation in gene expression permits the cell to keep track of time.
|
||||||
|
|
||||||
**** Computational approaches to find regulatory motifs
|
**** Computational approaches to find regulatory motifs
|
||||||
|
|
||||||
|
***** Exercise: Find the most common nucleotides in each position
|
||||||
|
|
||||||
|
|
||||||
|
We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case,
|
||||||
|
and the nucleotide in lower case (if there's no popular one).
|
||||||
|
Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
|
||||||
|
We'll use a *4 x k* Count Matrix, one row for each base. We'll first generate
|
||||||
|
the Matrix:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def Count(Motifs):
|
||||||
|
k = len(Motifs[0])
|
||||||
|
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||||
|
t = len(Motifs)
|
||||||
|
for i in range(t):
|
||||||
|
for j in range(k):
|
||||||
|
symbol = Motifs[i][j]
|
||||||
|
count[symbol][j] += 1
|
||||||
|
return count
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
Now that we have a Count Matrix, we will generate a Profile Matrix, which has
|
||||||
|
the frequency of the nucleotide instead of the count:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def Profile(Motifs):
|
||||||
|
t = len(Motifs)
|
||||||
|
profile = Count(Motifs)
|
||||||
|
for key, v in profile.items():
|
||||||
|
v[:] = [x / t for x in v]
|
||||||
|
return profile
|
||||||
|
|
||||||
|
|
||||||
|
def Count(Motifs):
|
||||||
|
k = len(Motifs[0])
|
||||||
|
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||||
|
t = len(Motifs)
|
||||||
|
for i in range(t):
|
||||||
|
for j in range(k):
|
||||||
|
symbol = Motifs[i][j]
|
||||||
|
count[symbol][j] += 1
|
||||||
|
return count
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
***** Exercise: Form the most frequent sequence of nucleotides
|
||||||
|
|
||||||
|
Finally, we can form a Consensus string, to get a candidate regulatory motif:
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def Consensus(Motifs):
|
||||||
|
consensus = ""
|
||||||
|
count = Count(Motifs)
|
||||||
|
k = len(Motifs[0])
|
||||||
|
for j in range(k):
|
||||||
|
m = 0
|
||||||
|
frequentSymbol = ""
|
||||||
|
for symbol in "ACGT":
|
||||||
|
if count[symbol][j] > m:
|
||||||
|
m = count[symbol][j]
|
||||||
|
frequentSymbol = symbol
|
||||||
|
consensus += frequentSymbol
|
||||||
|
return consensus
|
||||||
|
|
||||||
|
|
||||||
|
def Count(Motifs):
|
||||||
|
k = len(Motifs[0])
|
||||||
|
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||||
|
t = len(Motifs)
|
||||||
|
for i in range(t):
|
||||||
|
for j in range(k):
|
||||||
|
symbol = Motifs[i][j]
|
||||||
|
count[symbol][j] += 1
|
||||||
|
return count
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
After obtaining the Consensus string, all we need to do is obtains the total
|
||||||
|
score of our selected /k-mers/:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def Score(Motifs):
|
||||||
|
score = 0
|
||||||
|
for i in range(len(Motifs)):
|
||||||
|
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||||
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
def Consensus(Motifs):
|
||||||
|
consensus = ""
|
||||||
|
count = Count(Motifs)
|
||||||
|
k = len(Motifs[0])
|
||||||
|
for j in range(k):
|
||||||
|
m = 0
|
||||||
|
frequentSymbol = ""
|
||||||
|
for symbol in "ACGT":
|
||||||
|
if count[symbol][j] > m:
|
||||||
|
m = count[symbol][j]
|
||||||
|
frequentSymbol = symbol
|
||||||
|
consensus += frequentSymbol
|
||||||
|
return consensus
|
||||||
|
|
||||||
|
|
||||||
|
def Count(Motifs):
|
||||||
|
k = len(Motifs[0])
|
||||||
|
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||||
|
t = len(Motifs)
|
||||||
|
for i in range(t):
|
||||||
|
for j in range(k):
|
||||||
|
symbol = Motifs[i][j]
|
||||||
|
count[symbol][j] += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def HammingDistance(p, q):
|
||||||
|
count = 0
|
||||||
|
for i in range(0, len(p)):
|
||||||
|
if p[i] != q[i]:
|
||||||
|
count += 1
|
||||||
|
return count
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
***** Exercise: Find a set of /k-mers/ that minimize the score
|
||||||
|
|
||||||
|
|
||||||
|
Applying a brute force approach for this task is not viable, we'll use a Greedy Algorithm. For that, we'll first determine the probability
|
||||||
|
of a sequence, we'll use:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def Pr(Text, Profile):
|
||||||
|
probability = 1
|
||||||
|
k = len(Text)
|
||||||
|
for i in range(k):
|
||||||
|
probability *= Profile[Text[i]][i]
|
||||||
|
return probability
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
We'll use this function to find the most probable /k-mer/ in a sequence:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def ProfileMostProbableKmer(text, k, profile):
|
||||||
|
kmer = ""
|
||||||
|
keys = ["A", "C", "G", "T"]
|
||||||
|
d = dict(zip(keys,profile))
|
||||||
|
prob = -1
|
||||||
|
for i in range(len(text)-k+1):
|
||||||
|
if (Pr((text[i:i+k]), profile) > prob):
|
||||||
|
prob = Pr(text[i:i+k], profile)
|
||||||
|
kmer = text[i:i+k]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def Pr(Text, Profile):
|
||||||
|
probability = 1
|
||||||
|
k = len(Text)
|
||||||
|
for i in range(k):
|
||||||
|
probability *= Profile[Text[i]][i]
|
||||||
|
return probability
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
Now we're finally ready to assemble all the pieces and implement a Greedy Motif
|
||||||
|
Search Algorithm:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python
|
||||||
|
def GreedyMotifSearch(Dna, k, t):
|
||||||
|
BestMotifs = []
|
||||||
|
for i in range(0, t):
|
||||||
|
BestMotifs.append(Dna[i][0:k])
|
||||||
|
n = len(Dna[0])
|
||||||
|
for i in range(n-k+1):
|
||||||
|
Motifs = []
|
||||||
|
Motifs.append(Dna[0][i:i+k])
|
||||||
|
for j in range(1, t):
|
||||||
|
P = Profile(Motifs[0:j])
|
||||||
|
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
|
||||||
|
if Score(Motifs) < Score(BestMotifs):
|
||||||
|
BestMotifs = Motifs
|
||||||
|
return BestMotifs
|
||||||
|
|
||||||
|
def Score(Motifs):
|
||||||
|
score = 0
|
||||||
|
for i in range(len(Motifs)):
|
||||||
|
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||||
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
def Consensus(Motifs):
|
||||||
|
consensus = ""
|
||||||
|
count = Count(Motifs)
|
||||||
|
k = len(Motifs[0])
|
||||||
|
for j in range(k):
|
||||||
|
m = 0
|
||||||
|
frequentSymbol = ""
|
||||||
|
for symbol in "ACGT":
|
||||||
|
if count[symbol][j] > m:
|
||||||
|
m = count[symbol][j]
|
||||||
|
frequentSymbol = symbol
|
||||||
|
consensus += frequentSymbol
|
||||||
|
return consensus
|
||||||
|
|
||||||
|
|
||||||
|
def Count(Motifs):
|
||||||
|
k = len(Motifs[0])
|
||||||
|
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
|
||||||
|
t = len(Motifs)
|
||||||
|
for i in range(t):
|
||||||
|
for j in range(k):
|
||||||
|
symbol = Motifs[i][j]
|
||||||
|
count[symbol][j] += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def HammingDistance(p, q):
|
||||||
|
count = 0
|
||||||
|
for i in range(0, len(p)):
|
||||||
|
if p[i] != q[i]:
|
||||||
|
count += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def Profile(Motifs):
|
||||||
|
t = len(Motifs)
|
||||||
|
profile = Count(Motifs)
|
||||||
|
for key, v in profile.items():
|
||||||
|
v[:] = [x / t for x in v]
|
||||||
|
return profile
|
||||||
|
|
||||||
|
|
||||||
|
def ProfileMostProbableKmer(text, k, profile):
|
||||||
|
kmer = ""
|
||||||
|
keys = ["A", "C", "G", "T"]
|
||||||
|
d = dict(zip(keys,profile))
|
||||||
|
prob = -1
|
||||||
|
for i in range(len(text)-k+1):
|
||||||
|
if (Pr((text[i:i+k]), profile) > prob):
|
||||||
|
prob = Pr(text[i:i+k], profile)
|
||||||
|
kmer = text[i:i+k]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def Pr(Text, Profile):
|
||||||
|
probability = 1
|
||||||
|
k = len(Text)
|
||||||
|
for i in range(k):
|
||||||
|
probability *= Profile[Text[i]][i]
|
||||||
|
return probability
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
***** Motifs in tuberculosis
|
||||||
|
|
||||||
|
Tuberculosis is an infectious disease, cause by a bacteria called /Mycobacterium
|
||||||
|
tuberculosis/. The bacteria can stay latent in the host for decades, in hypoxic
|
||||||
|
environments.
|
||||||
|
Our Greedy Algorithm can help us identify a motif that might be involved in the process.
|
||||||
|
|
||||||
|
The transcription factor behind this behaviour is *DosR*, we'll identify the
|
||||||
|
binding sites:
|
||||||
|
|
||||||
|
#+BEGIN_SRC python :results output
|
||||||
|
def GreedyMotifSearch(Dna, k, t):
|
||||||
|
BestMotifs = []
|
||||||
|
for i in range(0, t):
|
||||||
|
BestMotifs.append(Dna[i][0:k])
|
||||||
|
n = len(Dna[0])
|
||||||
|
for i in range(n - k + 1):
|
||||||
|
Motifs = []
|
||||||
|
Motifs.append(Dna[0][i : i + k])
|
||||||
|
for j in range(1, t):
|
||||||
|
P = Profile(Motifs[0:j])
|
||||||
|
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
|
||||||
|
if Score(Motifs) < Score(BestMotifs):
|
||||||
|
BestMotifs = Motifs
|
||||||
|
return BestMotifs
|
||||||
|
|
||||||
|
|
||||||
|
def Score(Motifs):
|
||||||
|
score = 0
|
||||||
|
for i in range(len(Motifs)):
|
||||||
|
score += HammingDistance(Motifs[i], Consensus(Motifs))
|
||||||
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
def Consensus(Motifs):
|
||||||
|
consensus = ""
|
||||||
|
count = Count(Motifs)
|
||||||
|
k = len(Motifs[0])
|
||||||
|
for j in range(k):
|
||||||
|
m = 0
|
||||||
|
frequentSymbol = ""
|
||||||
|
for symbol in "ACGT":
|
||||||
|
if count[symbol][j] > m:
|
||||||
|
m = count[symbol][j]
|
||||||
|
frequentSymbol = symbol
|
||||||
|
consensus += frequentSymbol
|
||||||
|
return consensus
|
||||||
|
|
||||||
|
|
||||||
|
def Count(Motifs):
|
||||||
|
k = len(Motifs[0])
|
||||||
|
count = {"A": [0] * k, "C": [0] * k, "G": [0] * k, "T": [0] * k}
|
||||||
|
t = len(Motifs)
|
||||||
|
for i in range(t):
|
||||||
|
for j in range(k):
|
||||||
|
symbol = Motifs[i][j]
|
||||||
|
count[symbol][j] += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def HammingDistance(p, q):
|
||||||
|
count = 0
|
||||||
|
for i in range(0, len(p)):
|
||||||
|
if p[i] != q[i]:
|
||||||
|
count += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def Profile(Motifs):
|
||||||
|
t = len(Motifs)
|
||||||
|
profile = Count(Motifs)
|
||||||
|
for key, v in profile.items():
|
||||||
|
v[:] = [x / t for x in v]
|
||||||
|
return profile
|
||||||
|
|
||||||
|
|
||||||
|
def ProfileMostProbableKmer(text, k, profile):
|
||||||
|
kmer = ""
|
||||||
|
keys = ["A", "C", "G", "T"]
|
||||||
|
d = dict(zip(keys, profile))
|
||||||
|
prob = -1
|
||||||
|
for i in range(len(text) - k + 1):
|
||||||
|
if Pr((text[i : i + k]), profile) > prob:
|
||||||
|
prob = Pr(text[i : i + k], profile)
|
||||||
|
kmer = text[i : i + k]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def Pr(Text, Profile):
|
||||||
|
probability = 1
|
||||||
|
k = len(Text)
|
||||||
|
for i in range(k):
|
||||||
|
probability *= Profile[Text[i]][i]
|
||||||
|
return probability
|
||||||
|
|
||||||
|
|
||||||
|
Dna = [
|
||||||
|
"GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
|
||||||
|
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
|
||||||
|
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
|
||||||
|
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
|
||||||
|
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
|
||||||
|
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
|
||||||
|
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
|
||||||
|
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
|
||||||
|
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
|
||||||
|
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC",
|
||||||
|
]
|
||||||
|
|
||||||
|
t = len(Dna)
|
||||||
|
k = 15
|
||||||
|
|
||||||
|
Motifs = GreedyMotifSearch(Dna, k, t)
|
||||||
|
|
||||||
|
print(Motifs)
|
||||||
|
print(Score(Motifs))
|
||||||
|
#+END_SRC
|
||||||
|
|
||||||
|
#+RESULTS:
|
||||||
|
: ['GTTAGGGCCGGAAGT', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTGACCGACGTCCCC', 'CTGTTCGCCGGCAGC', 'CTGTTCGATATCACC', 'GTACATGTCCAGAGC', 'GCGATAGGTGAGATT', 'CTCATCGCTGTCATC']
|
||||||
|
: 64
|
||||||
|
|
||||||
|
Our algorithm is pretty fast, but it's not optimal, and that's just a
|
||||||
|
characteristic of Greedy Algorithms, they trade optimality for speed.
|
||||||
|
|
||||||
|
|
||||||
** Vocabulary
|
** Vocabulary
|
||||||
- k-mer: subsquences of length /k/ in a biological sequence
|
- k-mer: subsquences of length /k/ in a biological sequence
|
||||||
- Frequency map: sequence --> frequency of the sequence
|
- Frequency map: sequence --> frequency of the sequence
|
||||||
|
Loading…
Reference in New Issue
Block a user