Convert all the content to Literal Programming

This commit is contained in:
coolneng 2019-12-09 13:35:28 +01:00
parent fe23029144
commit 6b04676fbb
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
12 changed files with 253 additions and 207 deletions

View File

@ -1,14 +0,0 @@
def ApproximatePatternCount(Pattern, Text, d):
    """Count the windows of Text that match Pattern with at most d mismatches.

    Every window of len(Pattern) characters in Text is inspected. A window is
    counted when it equals Pattern exactly, or when its Hamming distance to
    Pattern does not exceed d.
    """
    width = len(Pattern)
    total = 0
    for start in range(len(Text) - width + 1):
        window = Text[start:start + width]
        if window == Pattern:
            total += 1
        elif HammingDistance(window, Pattern) <= d:
            total += 1
    return total


def HammingDistance(p, q):
    """Return how many indices i in range(len(p)) satisfy p[i] != q[i]."""
    return sum(1 for i in range(len(p)) if p[i] != q[i])

View File

@ -1,16 +0,0 @@
def ApproximatePatternMatching(Text, Pattern, d):
    """List the start indices where Pattern occurs in Text with <= d mismatches.

    A start index is reported when the window of len(Pattern) characters
    beginning there equals Pattern, or differs from it in at most d positions.
    Indices are returned in increasing order.
    """
    width = len(Pattern)
    hits = []
    for start in range(len(Text) - width + 1):
        window = Text[start:start + width]
        if window == Pattern or HammingDistance(window, Pattern) <= d:
            hits.append(start)
    return hits


def HammingDistance(p, q):
    """Return the number of indices of p at which p and q hold different items."""
    mismatches = [i for i in range(len(p)) if p[i] != q[i]]
    return len(mismatches)

View File

@ -1,20 +0,0 @@
def FasterSymbolArray(Genome, symbol):
    """Build the symbol array of a circular Genome in a single linear pass.

    Returns a dict mapping every start position i of Genome to the number of
    times ``symbol`` appears in the window of length ``len(Genome) // 2`` that
    starts at i, wrapping around the end of Genome. An empty Genome yields an
    empty dict, matching what SymbolArray produces for the same input.

    The sliding update compares single genome characters against ``symbol``,
    so ``symbol`` is expected to be one character.
    """
    n = len(Genome)
    if n == 0:
        return {}
    half = n // 2
    # Append the first half so windows that wrap around can be sliced directly.
    ExtendedGenome = Genome + Genome[0:half]
    # Seed with the count in the first window. PatternCount takes
    # (Text, Pattern): the window is the text, the symbol is the pattern.
    array = {0: PatternCount(Genome[0:half], symbol)}
    for i in range(1, n):
        array[i] = array[i - 1]
        # The character that just left the window on the left.
        if ExtendedGenome[i - 1] == symbol:
            array[i] -= 1
        # The character that just entered the window on the right.
        if ExtendedGenome[i + half - 1] == symbol:
            array[i] += 1
    return array


def PatternCount(Text, Pattern):
    """Return the number of (possibly overlapping) occurrences of Pattern in Text."""
    width = len(Pattern)
    count = 0
    for i in range(len(Text) - width + 1):
        if Text[i:i + width] == Pattern:
            count += 1
    return count

View File

@ -1,20 +0,0 @@
def FrequentWords(Text, k):
    """Return the most frequent k-mers of Text.

    The result lists every k-mer whose count equals the maximum count, in
    order of first appearance in Text. When Text contains no k-mer at all
    (for example it is empty or shorter than k) an empty list is returned
    instead of letting ``max`` fail on an empty sequence.
    """
    freq = FrequencyMap(Text, k)
    if not freq:
        return []
    m = max(freq.values())
    return [kmer for kmer, occurrences in freq.items() if occurrences == m]


def FrequencyMap(Text, k):
    """Map each k-mer of Text to the number of windows in which it appears."""
    freq = {}
    for i in range(len(Text) - k + 1):
        Pattern = Text[i:i + k]
        # A single pass is enough: a k-mer not seen before starts from 0.
        freq[Pattern] = freq.get(Pattern, 0) + 1
    return freq

View File

@ -1,6 +0,0 @@
def HammingDistance(p, q):
    """Count the positions at which p and q differ.

    Every index of p is compared against the same index of q, so q must be at
    least as long as p.
    """
    differing = [index for index in range(len(p)) if p[index] != q[index]]
    return len(differing)

View File

@ -1,18 +0,0 @@
def MinimumSkew(Genome):
    """Return every position at which the G-C skew of Genome is minimal.

    Positions index the skew array produced by SkewArray, which has
    ``len(Genome) + 1`` entries (entry 0 is the skew before any character is
    read). All of those entries are scanned, so a minimum reached after the
    final character is reported as position ``len(Genome)``.
    """
    skew = SkewArray(Genome)
    minimum = min(skew)
    # Enumerate the skew array itself, not range(len(Genome)), so the last
    # entry is not skipped.
    return [position for position, value in enumerate(skew) if value == minimum]


def SkewArray(Genome):
    """Return the running G-minus-C count of Genome, starting with 0.

    Entry i + 1 is entry i plus 1 for a "G", minus 1 for a "C", and unchanged
    for any other character.
    """
    Skew = [0]
    for base in Genome:
        if base == "G":
            Skew.append(Skew[-1] + 1)
        elif base == "C":
            Skew.append(Skew[-1] - 1)
        else:
            Skew.append(Skew[-1])
    return Skew

View File

@ -1,6 +0,0 @@
def PatternCount(Text, Pattern):
    """Return how many windows of Text are equal to Pattern.

    Overlapping occurrences are counted separately.
    """
    width = len(Pattern)
    last_start = len(Text) - width
    return sum(
        1
        for start in range(last_start + 1)
        if Text[start:start + width] == Pattern
    )

View File

@ -1,6 +0,0 @@
def PatternMatching(Pattern, Genome):
    """Return the start indices of every occurrence of Pattern in Genome.

    Overlapping occurrences are all reported, in increasing order.
    """
    width = len(Pattern)
    return [
        start
        for start in range(len(Genome) - width + 1)
        if Genome[start:start + width] == Pattern
    ]

View File

@ -1,17 +0,0 @@
def ReverseComplement(Pattern):
    """Return the reverse complement of Pattern (reverse it, then complement it)."""
    return Complement(Reverse(Pattern))


def Reverse(Pattern):
    """Return Pattern read from its last element to its first."""
    return Pattern[::-1]


def Complement(Pattern):
    """Return Pattern with each nucleotide replaced by its pairing base.

    Only the upper-case letters A, C, G and T are known; any other character
    raises KeyError.
    """
    pairing = {"A": "T", "T": "A", "C": "G", "G": "C"}
    return "".join(pairing[base] for base in Pattern)

View File

@ -1,11 +0,0 @@
def SkewArray(Genome):
    """Return the running G-minus-C count of Genome.

    The list starts with 0 and gains one entry per character: the previous
    value plus 1 for "G", minus 1 for "C", unchanged otherwise.
    """
    running = 0
    Skew = [running]
    for base in Genome:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        Skew.append(running)
    return Skew

View File

@ -1,15 +0,0 @@
def SymbolArray(Genome, symbol):
    """Count symbol in every half-genome window of a circular Genome.

    Returns a dict mapping each start position i to the number of occurrences
    of symbol in the window of length len(Genome) // 2 starting at i. Windows
    running past the end wrap around to the beginning. Each window is counted
    from scratch.
    """
    size = len(Genome)
    half = size // 2
    circular = Genome + Genome[:half]
    return {
        start: PatternCount(circular[start:start + half], symbol)
        for start in range(size)
    }


def PatternCount(Text, Pattern):
    """Return the number of (possibly overlapping) occurrences of Pattern in Text."""
    width = len(Pattern)
    matches = [
        start
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    ]
    return len(matches)

View File

@ -12,21 +12,85 @@
***** Exercise: find Pattern ***** Exercise: find Pattern
We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use the function [[./Code/PatternCount.py][PatternCount]] to find out how many times We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use this function to find out how many times
does a sequence appear in the genome. does a sequence appear in the genome:
For the second part, we're going to calculate the frequency map of the sequences of length /k/, for that purpose we'll use [[./Code/FrequentWords.py][FrequentWords]] #+BEGIN_SRC python
def PatternCount(Text, Pattern):
    """Slide a window of len(Pattern) over Text and count the exact matches.

    Overlapping matches each contribute to the total.
    """
    size = len(Pattern)
    hits = 0
    start = 0
    while start < len(Text) - size + 1:
        if Text[start:start + size] == Pattern:
            hits += 1
        start += 1
    return hits
#+END_SRC
For the second part, we're going to calculate the frequency map of the sequences
of length /k/, for that purpose we'll use:
#+BEGIN_SRC python
def FrequentWords(Text, k):
    """Return the k-mers that occur most often in Text.

    Every k-mer whose count ties for the maximum is returned, ordered by its
    first appearance in Text. If Text holds no k-mer (it is empty or shorter
    than k), the result is an empty list rather than a ValueError from
    ``max`` on an empty sequence.
    """
    freq = FrequencyMap(Text, k)
    if not freq:
        return []
    m = max(freq.values())
    return [key for key in freq if freq[key] == m]


def FrequencyMap(Text, k):
    """Return a dict from each k-mer of Text to its number of occurrences."""
    freq = {}
    n = len(Text)
    for i in range(n - k + 1):
        Pattern = Text[i:i + k]
        # Initialise and increment in one pass; unseen k-mers start at 0.
        freq[Pattern] = freq.get(Pattern, 0) + 1
    return freq
#+END_SRC
***** Exercise: Find the reverse complement of a sequence ***** Exercise: Find the reverse complement of a sequence
We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3'). We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3').
In this case, we're going to use [[./Code/ReverseComplement.py][ReverseComplement]] In this case, we're going to use:
#+BEGIN_SRC python
def ReverseComplement(Pattern):
    """Reverse Pattern and then complement each of its nucleotides."""
    backwards = Reverse(Pattern)
    return Complement(backwards)


def Reverse(Pattern):
    """Return the elements of Pattern in opposite order."""
    return Pattern[::-1]


def Complement(Pattern):
    """Swap A with T and C with G in Pattern.

    Characters other than upper-case A, C, G, T raise KeyError.
    """
    complement_letters = {"A": "T", "T": "A", "C": "G", "G": "C"}
    pieces = [complement_letters[char] for char in Pattern]
    return "".join(pieces)
#+END_SRC
After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones. After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones.
***** Exercise: Find a subsequence within a sequence ***** Exercise: Find a subsequence within a sequence
We're going to find the occurrences of a subsequence inside a sequence, and save the index of the first letter in the sequence. We're going to find the occurrences of a subsequence inside a sequence, and save the index of the first letter in the sequence.
This time, we'll use [[./Code/PatternMatching.py][PatternMatching]] This time, we'll use:
#+BEGIN_SRC python
def PatternMatching(Pattern, Genome):
    """Collect the index of the first letter of each occurrence of Pattern in Genome.

    Occurrences may overlap; indices come back in ascending order.
    """
    size = len(Pattern)
    candidates = range(len(Genome) - size + 1)
    return [at for at in candidates if Genome[at:at + size] == Pattern]
#+END_SRC
After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in cluster. After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in cluster.
This is strong statistical evidence that our subsequences are /DnaA boxes/. This is strong statistical evidence that our subsequences are /DnaA boxes/.
@ -34,14 +98,23 @@
**** Computational approaches to find ori in any bacteria **** Computational approaches to find ori in any bacteria
Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in other bacteria. Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in other bacteria.
We're going to find the ocurrences of the sequences in /Thermotoga petrophila/ using [[./Code/PatternCount.py][PatternCount]] We're going to find the ocurrences of the sequences in /Thermotoga petrophila/
with:
#+BEGIN_SRC python
def PatternCount(Text, Pattern):
    """Return the number of positions in Text at which Pattern starts.

    Overlapping occurrences are counted individually.
    """
    span = len(Pattern)
    starts = [
        offset
        for offset in range(len(Text) - span + 1)
        if Text[offset:offset + span] == Pattern
    ]
    return len(starts)
#+END_SRC
After the execution, we observe that there are *no* occurrences of the sequences found in /Vibrio Cholerae/. After the execution, we observe that there are *no* occurrences of the sequences found in /Vibrio Cholerae/.
We can conclude that different bacteria have different /DnaA boxes/. We can conclude that different bacteria have different /DnaA boxes/.
We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval. We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval.
** Week 2 ** Week 2
*** DNA replication (II) *** DNA replication (II)
@ -60,12 +133,57 @@
***** Exercise: count the ocurrences of cytosine ***** Exercise: count the ocurrences of cytosine
We're going to count the ocurrences of the bases in a genome and include them in a symbol array, for that purpose we'll use [[./Code/SymbolArray.py][SymbolArray]] We're going to count the ocurrences of the bases in a genome and include them in
a symbol array, for that purpose we'll use:
#+BEGIN_SRC python
def SymbolArray(Genome, symbol):
    """Return, for each start position, the count of symbol in the half-genome window.

    The genome is treated as circular: its first len(Genome) // 2 characters
    are appended so that windows near the end wrap around. Every window is
    recounted independently with PatternCount.
    """
    n = len(Genome)
    window = n // 2
    wrapped = Genome + Genome[:window]
    counts = {}
    for position in range(n):
        counts[position] = PatternCount(wrapped[position:position + window], symbol)
    return counts


def PatternCount(Text, Pattern):
    """Count the windows of Text that equal Pattern (overlaps included)."""
    span = len(Pattern)
    return sum(
        1
        for offset in range(len(Text) - span + 1)
        if Text[offset:offset + span] == Pattern
    )
#+END_SRC
After executing the program, we realize that the algorithm is too inefficient. After executing the program, we realize that the algorithm is too inefficient.
***** Exercise: find a better algorithm for the previous exercise ***** Exercise: find a better algorithm for the previous exercise
This time, we are going to evaluate an element /i+1/, using the element /i/. We'll use [[./Code/FasterSymbolArray.py][FasterSymbolArray]] to achieve this This time, we are going to evaluate an element /i+1/, using the element /i/.
We'll use the following algorithm:
#+BEGIN_SRC python
def FasterSymbolArray(Genome, symbol):
    """Compute the symbol array of a circular Genome by sliding one window.

    Returns a dict mapping each start position i to the number of occurrences
    of ``symbol`` in the window of length ``len(Genome) // 2`` beginning at i
    (wrapping around the end). Only the first window is counted directly;
    each later entry is derived from the previous one. An empty Genome gives
    an empty dict, as SymbolArray does.

    ``symbol`` is compared with single genome characters during the slide, so
    it is expected to be one character long.
    """
    n = len(Genome)
    if n == 0:
        return {}
    half = n // 2
    ExtendedGenome = Genome + Genome[0:half]
    array = {}
    # PatternCount expects (Text, Pattern): search for the symbol inside the
    # first window, not the other way round.
    array[0] = PatternCount(Genome[0:half], symbol)
    for i in range(1, n):
        # Start from the previous window's count ...
        array[i] = array[i - 1]
        # ... drop the character that slid out on the left ...
        if ExtendedGenome[i - 1] == symbol:
            array[i] = array[i] - 1
        # ... and add the character that slid in on the right.
        if ExtendedGenome[i + half - 1] == symbol:
            array[i] = array[i] + 1
    return array


def PatternCount(Text, Pattern):
    """Return the number of (possibly overlapping) occurrences of Pattern in Text."""
    count = 0
    for i in range(len(Text) - len(Pattern) + 1):
        if Text[i:i + len(Pattern)] == Pattern:
            count += 1
    return count
#+END_SRC
After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/. After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/.
In /Escherichia Coli/ we plotted the result of our program: In /Escherichia Coli/ we plotted the result of our program:
@ -81,7 +199,23 @@
***** Exercise: Synthetize a Skew Array ***** Exercise: Synthetize a Skew Array
We're going to make a Skew Diagram, for that we'll first need a Skew Array. For that purpose we wrote [[./Code/SkewArray.py][SkewArray]] We're going to make a Skew Diagram, for that we'll first need a Skew Array. For
that purpose we wrote:
#+BEGIN_SRC python
def SkewArray(Genome):
    """Build the skew array of Genome.

    The first entry is 0. Each following entry is the previous one raised by
    1 when the corresponding character is "G", lowered by 1 when it is "C",
    and copied unchanged for anything else.
    """
    Skew = [0]
    for base in Genome:
        previous = Skew[-1]
        if base == "G":
            Skew.append(previous + 1)
        elif base == "C":
            Skew.append(previous - 1)
        else:
            Skew.append(previous)
    return Skew
#+END_SRC
We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/: We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/:
#+CAPTION: Symbol array for Cytosine in E. Coli Genome] #+CAPTION: Symbol array for Cytosine in E. Coli Genome]
@ -91,7 +225,30 @@
***** Exercise: Efficient algorithm for locating ori ***** Exercise: Efficient algorithm for locating ori
Now that we know more about ori's skew value, we're going to construct a better algorithm to find it. We'll do that in [[./Code/MinimumSkew.py][MinimumSkew]] Now that we know more about ori's skew value, we're going to construct a better
algorithm to find it:
#+BEGIN_SRC python
def MinimumSkew(Genome):
    """Return all positions where the skew of Genome attains its minimum.

    Positions refer to the array returned by SkewArray, which holds
    ``len(Genome) + 1`` values. The whole array is searched, including the
    final value at position ``len(Genome)``.
    """
    skew = SkewArray(Genome)
    minimum = min(skew)
    # Iterate over len(skew), not len(Genome): the skew array is one entry
    # longer than the genome and its last entry may be the minimum.
    return [i for i in range(len(skew)) if skew[i] == minimum]


def SkewArray(Genome):
    """Return the running G-minus-C count of Genome, beginning with 0."""
    running = 0
    Skew = [running]
    for base in Genome:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        Skew.append(running)
    return Skew
#+END_SRC
**** Finding /DnaA boxes/ **** Finding /DnaA boxes/
@ -104,11 +261,50 @@
***** Exercise: Find approximate patterns ***** Exercise: Find approximate patterns
Now that we have our Hamming distance, we have to find the approximate sequences. We'll do this in [[./Code/ApproximatePatternMatching.py][ApproximatePatternMatching.py]] Now that we have our Hamming distance, we have to find the approximate
sequences:
#+BEGIN_SRC python
def ApproximatePatternMatching(Text, Pattern, d):
    """Return the start positions of approximate occurrences of Pattern in Text.

    A position qualifies when the window of len(Pattern) characters starting
    there is identical to Pattern or has a Hamming distance to it of at most d.
    """
    size = len(Pattern)

    def close_enough(window):
        # Exact matches are accepted before any distance is computed.
        return window == Pattern or HammingDistance(window, Pattern) <= d

    return [
        at
        for at in range(len(Text) - size + 1)
        if close_enough(Text[at:at + size])
    ]


def HammingDistance(p, q):
    """Return the number of indices of p whose item differs from q's item."""
    distance = 0
    for index in range(len(p)):
        distance += 1 if p[index] != q[index] else 0
    return distance
#+END_SRC
***** Exercise: Count the approximate patterns ***** Exercise: Count the approximate patterns
The final part is counting the approximate sequences, for that we'll use [[./Code/ApproximatePatternCount.py][ApproximatePatternCount.py]] The final part is counting the approximate sequences:
#+BEGIN_SRC python
def ApproximatePatternCount(Pattern, Text, d):
    """Return how many windows of Text approximately match Pattern.

    A window of len(Pattern) characters is counted when it equals Pattern or
    its Hamming distance to Pattern is at most d.
    """
    size = len(Pattern)
    return sum(
        1
        for at in range(len(Text) - size + 1)
        if Text[at:at + size] == Pattern
        or HammingDistance(Text[at:at + size], Pattern) <= d
    )


def HammingDistance(p, q):
    """Return the number of indices of p at which p and q differ."""
    differing = 0
    index = 0
    while index < len(p):
        if p[index] != q[index]:
            differing += 1
        index += 1
    return differing
#+END_SRC
After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/. After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/.
We've finally found a computational method to find ori that seems correct. We've finally found a computational method to find ori that seems correct.
@ -123,7 +319,6 @@ Variation in gene expression permits the cell to keep track of time.
***** Exercise: Find the most common nucleotides in each position ***** Exercise: Find the most common nucleotides in each position
We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case, We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case,
and the nucleotide in lower case (if there's no popular one). and the nucleotide in lower case (if there's no popular one).
Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters. Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.