Finish Week 3 Tasks using Literal Programming

This commit is contained in:
coolneng 2019-12-05 02:19:19 +01:00
parent f75408cb6b
commit fe23029144
Signed by: coolneng
GPG Key ID: 9893DA236405AF57

View File

@ -121,6 +121,377 @@
**** Computational approaches to find regulatory motifs **** Computational approaches to find regulatory motifs
***** Exercise: Find the most common nucleotides in each position
We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case,
and the nucleotide in lower case (if there's no popular one).
Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
We'll use a *4 x k* Count Matrix, one row for each base. We'll first generate
the Matrix:
#+BEGIN_SRC python
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
#+END_SRC
Now that we have a Count Matrix, we will generate a Profile Matrix, which has
the frequency of the nucleotide instead of the count:
#+BEGIN_SRC python
def Profile(Motifs):
t = len(Motifs)
profile = Count(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
#+END_SRC
***** Exercise: Form the most frequent sequence of nucleotides
Finally, we can form a Consensus string, to get a candidate regulatory motif:
#+BEGIN_SRC python
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
#+END_SRC
After obtaining the Consensus string, all we need to do is obtains the total
score of our selected /k-mers/:
#+BEGIN_SRC python
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
#+END_SRC
***** Exercise: Find a set of /k-mers/ that minimize the score
Applying a brute force approach for this task is not viable, we'll use a Greedy Algorithm. For that, we'll first determine the probability
of a sequence, we'll use:
#+BEGIN_SRC python
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
#+END_SRC
We'll use this function to find the most probable /k-mer/ in a sequence:
#+BEGIN_SRC python
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys,profile))
prob = -1
for i in range(len(text)-k+1):
if (Pr((text[i:i+k]), profile) > prob):
prob = Pr(text[i:i+k], profile)
kmer = text[i:i+k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
#+END_SRC
Now we're finally ready to assemble all the pieces and implement a Greedy Motif
Search Algorithm:
#+BEGIN_SRC python
def GreedyMotifSearch(Dna, k, t):
BestMotifs = []
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
n = len(Dna[0])
for i in range(n-k+1):
Motifs = []
Motifs.append(Dna[0][i:i+k])
for j in range(1, t):
P = Profile(Motifs[0:j])
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
def Profile(Motifs):
t = len(Motifs)
profile = Count(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys,profile))
prob = -1
for i in range(len(text)-k+1):
if (Pr((text[i:i+k]), profile) > prob):
prob = Pr(text[i:i+k], profile)
kmer = text[i:i+k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
#+END_SRC
***** Motifs in tuberculosis
Tuberculosis is an infectious disease, cause by a bacteria called /Mycobacterium
tuberculosis/. The bacteria can stay latent in the host for decades, in hypoxic
environments.
Our Greedy Algorithm can help us identify a motif that might be involved in the process.
The transcription factor behind this behaviour is *DosR*, we'll identify the
binding sites:
#+BEGIN_SRC python :results output
def GreedyMotifSearch(Dna, k, t):
BestMotifs = []
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
n = len(Dna[0])
for i in range(n - k + 1):
Motifs = []
Motifs.append(Dna[0][i : i + k])
for j in range(1, t):
P = Profile(Motifs[0:j])
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
def Score(Motifs):
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def Consensus(Motifs):
consensus = ""
count = Count(Motifs)
k = len(Motifs[0])
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def Count(Motifs):
k = len(Motifs[0])
count = {"A": [0] * k, "C": [0] * k, "G": [0] * k, "T": [0] * k}
t = len(Motifs)
for i in range(t):
for j in range(k):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
def Profile(Motifs):
t = len(Motifs)
profile = Count(Motifs)
for key, v in profile.items():
v[:] = [x / t for x in v]
return profile
def ProfileMostProbableKmer(text, k, profile):
kmer = ""
keys = ["A", "C", "G", "T"]
d = dict(zip(keys, profile))
prob = -1
for i in range(len(text) - k + 1):
if Pr((text[i : i + k]), profile) > prob:
prob = Pr(text[i : i + k], profile)
kmer = text[i : i + k]
return kmer
def Pr(Text, Profile):
probability = 1
k = len(Text)
for i in range(k):
probability *= Profile[Text[i]][i]
return probability
Dna = [
"GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC",
]
t = len(Dna)
k = 15
Motifs = GreedyMotifSearch(Dna, k, t)
print(Motifs)
print(Score(Motifs))
#+END_SRC
#+RESULTS:
: ['GTTAGGGCCGGAAGT', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTGACCGACGTCCCC', 'CTGTTCGCCGGCAGC', 'CTGTTCGATATCACC', 'GTACATGTCCAGAGC', 'GCGATAGGTGAGATT', 'CTCATCGCTGTCATC']
: 64
Our algorithm is pretty fast, but it's not optimal, and that's just a
characteristic of Greedy Algorithms, they trade optimality for speed.
** Vocabulary ** Vocabulary
- k-mer: subsquences of length /k/ in a biological sequence - k-mer: subsquences of length /k/ in a biological sequence
- Frequency map: sequence --> frequency of the sequence - Frequency map: sequence --> frequency of the sequence