Convert all the content to Literal Programming
This commit is contained in:
parent
fe23029144
commit
6b04676fbb
@ -1,14 +0,0 @@
|
||||
def ApproximatePatternCount(Pattern, Text, d):
    """Count the windows of Text that match Pattern with at most d mismatches.

    Every window of len(Pattern) characters in Text is examined. A window is
    counted when it equals Pattern exactly, or when its Hamming distance to
    Pattern is no greater than d.
    """
    width = len(Pattern)
    matches = 0
    for start in range(len(Text) - width + 1):
        window = Text[start:start + width]
        # An exact match is counted without calling HammingDistance.
        if window == Pattern:
            matches += 1
        elif HammingDistance(window, Pattern) <= d:
            matches += 1
    return matches
|
||||
|
||||
|
||||
def HammingDistance(p, q):
    """Return the number of positions at which p and q differ.

    Positions are taken from p, so q is indexed up to len(p) - 1: a q that
    is shorter than p raises IndexError, and any extra tail of q is ignored.
    """
    return sum(1 for pos in range(len(p)) if p[pos] != q[pos])
|
@ -1,16 +0,0 @@
|
||||
def ApproximatePatternMatching(Text, Pattern, d):
    """List the start indices where Pattern occurs in Text with at most d mismatches.

    Every window of len(Pattern) characters in Text is examined. Its start
    index is kept when the window equals Pattern exactly, or when its
    Hamming distance to Pattern is no greater than d.
    """
    width = len(Pattern)
    return [
        start
        for start in range(len(Text) - width + 1)
        # The exact comparison short-circuits the HammingDistance call.
        if Text[start:start + width] == Pattern
        or HammingDistance(Text[start:start + width], Pattern) <= d
    ]
|
||||
|
||||
|
||||
def HammingDistance(p, q):
    """Return the number of positions at which p and q differ.

    Positions are taken from p, so q is indexed up to len(p) - 1: a q that
    is shorter than p raises IndexError, and any extra tail of q is ignored.
    """
    return sum(1 for pos in range(len(p)) if p[pos] != q[pos])
|
@ -1,20 +0,0 @@
|
||||
def FasterSymbolArray(Genome, symbol):
    """Count symbol in every half-genome window of a circular Genome.

    Entry i of the result is the number of times the single-character
    symbol appears in the window of length len(Genome) // 2 that starts at
    position i, wrapping around the end of Genome. The first window is
    counted directly; each following window is derived from its predecessor
    by dropping the character that left and adding the one that entered.

    Returns a dict mapping each start position to its count. An empty
    Genome yields an empty dict.
    """
    n = len(Genome)
    if n == 0:
        return {}
    half = n // 2
    # Append the first half so windows that wrap around can be read linearly.
    ExtendedGenome = Genome + Genome[0:half]
    array = {}
    # Count the first window character by character, the same comparison the
    # sliding update below relies on.
    array[0] = sum(1 for base in Genome[0:half] if base == symbol)
    for i in range(1, n):
        array[i] = array[i - 1]
        # Character that slid out of the window on the left.
        if ExtendedGenome[i - 1] == symbol:
            array[i] -= 1
        # Character that slid into the window on the right.
        if ExtendedGenome[i + half - 1] == symbol:
            array[i] += 1
    return array
|
||||
|
||||
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
@ -1,20 +0,0 @@
|
||||
def FrequentWords(Text, k):
    """Return every k-mer that occurs most often in Text.

    The counts come from FrequencyMap(Text, k); all k-mers whose count
    equals the highest count are returned, in the map's iteration order.
    When the map is empty (no k-mer fits in Text) the result is an empty
    list.
    """
    freq = FrequencyMap(Text, k)
    if not freq:
        # max() of an empty collection raises ValueError; nothing is frequent.
        return []
    m = max(freq.values())
    return [kmer for kmer, occurrences in freq.items() if occurrences == m]
|
||||
|
||||
|
||||
def FrequencyMap(Text, k):
    """Map every k-mer of Text to the number of times it occurs.

    Overlapping occurrences are counted. Keys are inserted in the order in
    which each k-mer is first seen in Text.
    """
    counts = {}
    for start in range(len(Text) - k + 1):
        kmer = Text[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts
|
@ -1,6 +0,0 @@
|
||||
def HammingDistance(p, q):
    """Return the number of positions at which p and q differ.

    Positions are taken from p, so q is indexed up to len(p) - 1: a q that
    is shorter than p raises IndexError, and any extra tail of q is ignored.
    """
    return sum(1 for pos in range(len(p)) if p[pos] != q[pos])
|
@ -1,18 +0,0 @@
|
||||
def MinimumSkew(Genome):
    """Return every position at which the skew of Genome is minimal.

    SkewArray(Genome) holds len(Genome) + 1 values (the initial 0 plus one
    per base). All of them are candidates, including the final one, so a
    minimum reached after the last base is reported too.
    """
    skew = SkewArray(Genome)
    minimum = min(skew)
    # Walk the skew array itself so its last entry is not skipped.
    return [i for i, value in enumerate(skew) if value == minimum]
|
||||
|
||||
|
||||
def SkewArray(Genome):
    """Build the running G-minus-C skew of Genome.

    The result starts with 0 and gains one entry per base: one higher than
    the previous entry after a "G", one lower after a "C", and unchanged
    after any other character. It has len(Genome) + 1 entries.
    """
    running = 0
    values = [running]
    for base in Genome:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        values.append(running)
    return values
|
@ -1,6 +0,0 @@
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
@ -1,6 +0,0 @@
|
||||
def PatternMatching(Pattern, Genome):
    """List every index of Genome at which an occurrence of Pattern starts.

    Overlapping occurrences are all reported, in increasing order.
    """
    width = len(Pattern)
    return [
        start
        for start in range(len(Genome) - width + 1)
        if Genome[start:start + width] == Pattern
    ]
|
@ -1,17 +0,0 @@
|
||||
def ReverseComplement(Pattern):
    """Return the reverse complement of Pattern.

    Pattern is first reversed with Reverse and the result is then passed
    through Complement.
    """
    return Complement(Reverse(Pattern))
|
||||
|
||||
|
||||
def Reverse(Pattern):
    """Return Pattern read from its last element to its first."""
    # A slice with step -1 walks the sequence backwards.
    return Pattern[::-1]
|
||||
|
||||
|
||||
def Complement(Pattern):
    """Return Pattern with every base replaced by its complement.

    "A" pairs with "T" and "C" pairs with "G"; the order of the bases is
    kept. Any character other than these four upper-case letters raises
    KeyError.
    """
    pairing = {"A": "T", "T": "A", "C": "G", "G": "C"}
    return "".join(pairing[base] for base in Pattern)
|
@ -1,11 +0,0 @@
|
||||
def SkewArray(Genome):
    """Build the running G-minus-C skew of Genome.

    The result starts with 0 and gains one entry per base: one higher than
    the previous entry after a "G", one lower after a "C", and unchanged
    after any other character. It has len(Genome) + 1 entries.
    """
    running = 0
    values = [running]
    for base in Genome:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        values.append(running)
    return values
|
@ -1,15 +0,0 @@
|
||||
def SymbolArray(Genome, symbol):
    """Count symbol in every half-genome window of a circular Genome.

    For each start position i in Genome, the window of len(Genome) // 2
    characters beginning at i (wrapping around the end) is handed to
    PatternCount. The counts are returned as a dict keyed by i.
    """
    n = len(Genome)
    half = n // 2
    # Append the first half so windows that wrap around can be read linearly.
    circular = Genome + Genome[0:half]
    return {
        start: PatternCount(circular[start:start + half], symbol)
        for start in range(n)
    }
|
||||
|
||||
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
223
Notebook.org
223
Notebook.org
@ -12,21 +12,85 @@
|
||||
|
||||
***** Exercise: find Pattern
|
||||
|
||||
We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use the function [[./Code/PatternCount.py][PatternCount]] to find out how many times
|
||||
does a sequence appear in the genome.
|
||||
We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use this function to find out how many times
|
||||
does a sequence appear in the genome:
|
||||
|
||||
For the second part, we're going to calculate the frequency map of the sequences of length /k/, for that purpose we'll use [[./Code/FrequentWords.py][FrequentWords]]
|
||||
#+BEGIN_SRC python
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
||||
#+END_SRC
|
||||
|
||||
For the second part, we're going to calculate the frequency map of the sequences
|
||||
of length /k/, for that purpose we'll use:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def FrequentWords(Text, k):
    """Return every k-mer that occurs most often in Text.

    The counts come from FrequencyMap(Text, k); all k-mers whose count
    equals the highest count are returned, in the map's iteration order.
    When the map is empty (no k-mer fits in Text) the result is an empty
    list.
    """
    freq = FrequencyMap(Text, k)
    if not freq:
        # max() of an empty collection raises ValueError; nothing is frequent.
        return []
    m = max(freq.values())
    return [kmer for kmer, occurrences in freq.items() if occurrences == m]
|
||||
|
||||
|
||||
def FrequencyMap(Text, k):
    """Map every k-mer of Text to the number of times it occurs.

    Overlapping occurrences are counted. Keys are inserted in the order in
    which each k-mer is first seen in Text.
    """
    counts = {}
    for start in range(len(Text) - k + 1):
        kmer = Text[start:start + k]
        counts[kmer] = counts.get(kmer, 0) + 1
    return counts
|
||||
#+END_SRC
|
||||
|
||||
***** Exercise: Find the reverse complement of a sequence
|
||||
|
||||
We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3').
|
||||
In this case, we're going to use [[./Code/ReverseComplement.py][ReverseComplement]]
|
||||
In this case, we're going to use:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def ReverseComplement(Pattern):
    """Return the reverse complement of Pattern.

    Pattern is first reversed with Reverse and the result is then passed
    through Complement.
    """
    return Complement(Reverse(Pattern))
|
||||
|
||||
|
||||
def Reverse(Pattern):
    """Return Pattern read from its last element to its first."""
    # A slice with step -1 walks the sequence backwards.
    return Pattern[::-1]
|
||||
|
||||
|
||||
def Complement(Pattern):
    """Return Pattern with every base replaced by its complement.

    "A" pairs with "T" and "C" pairs with "G"; the order of the bases is
    kept. Any character other than these four upper-case letters raises
    KeyError.
    """
    pairing = {"A": "T", "T": "A", "C": "G", "G": "C"}
    return "".join(pairing[base] for base in Pattern)
|
||||
#+END_SRC
|
||||
|
||||
After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones.
|
||||
|
||||
***** Exercise: Find a subsequence within a sequence
|
||||
|
||||
We're going to find the occurrences of a subsequence inside a sequence, and save the index in the sequence at which each occurrence starts.
|
||||
This time, we'll use [[./Code/PatternMatching.py][PatternMatching]]
|
||||
This time, we'll use:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def PatternMatching(Pattern, Genome):
    """List every index of Genome at which an occurrence of Pattern starts.

    Overlapping occurrences are all reported, in increasing order.
    """
    width = len(Pattern)
    return [
        start
        for start in range(len(Genome) - width + 1)
        if Genome[start:start + width] == Pattern
    ]
|
||||
#+END_SRC
|
||||
|
||||
After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in clusters.
|
||||
This is strong statistical evidence that our subsequences are /DnaA boxes/.
|
||||
|
||||
@ -34,14 +98,23 @@
|
||||
**** Computational approaches to find ori in any bacteria
|
||||
|
||||
Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in other bacteria.
|
||||
We're going to find the occurrences of the sequences in /Thermotoga petrophila/ using [[./Code/PatternCount.py][PatternCount]]
|
||||
We're going to find the occurrences of the sequences in /Thermotoga petrophila/
|
||||
with:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
||||
#+END_SRC
|
||||
|
||||
After the execution, we observe that there are *no* occurrences of the sequences found in /Vibrio Cholerae/.
|
||||
We can conclude that different bacteria have different /DnaA boxes/.
|
||||
|
||||
We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval.
|
||||
|
||||
|
||||
** Week 2
|
||||
|
||||
*** DNA replication (II)
|
||||
@ -60,12 +133,57 @@
|
||||
|
||||
***** Exercise: count the occurrences of cytosine
|
||||
|
||||
We're going to count the occurrences of the bases in a genome and include them in a symbol array, for that purpose we'll use [[./Code/SymbolArray.py][SymbolArray]]
|
||||
We're going to count the occurrences of the bases in a genome and include them in
|
||||
a symbol array, for that purpose we'll use:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def SymbolArray(Genome, symbol):
    """Count symbol in every half-genome window of a circular Genome.

    For each start position i in Genome, the window of len(Genome) // 2
    characters beginning at i (wrapping around the end) is handed to
    PatternCount. The counts are returned as a dict keyed by i.
    """
    n = len(Genome)
    half = n // 2
    # Append the first half so windows that wrap around can be read linearly.
    circular = Genome + Genome[0:half]
    return {
        start: PatternCount(circular[start:start + half], symbol)
        for start in range(n)
    }
|
||||
|
||||
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
||||
#+END_SRC
|
||||
|
||||
After executing the program, we realize that the algorithm is too inefficient.
|
||||
|
||||
***** Exercise: find a better algorithm for the previous exercise
|
||||
|
||||
This time, we are going to evaluate an element /i+1/, using the element /i/. We'll use [[./Code/FasterSymbolArray.py][FasterSymbolArray]] to achieve this
|
||||
This time, we are going to evaluate an element /i+1/, using the element /i/.
|
||||
We'll use the following algorithm:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def FasterSymbolArray(Genome, symbol):
    """Count symbol in every half-genome window of a circular Genome.

    Entry i of the result is the number of times the single-character
    symbol appears in the window of length len(Genome) // 2 that starts at
    position i, wrapping around the end of Genome. The first window is
    counted directly; each following window is derived from its predecessor
    by dropping the character that left and adding the one that entered.

    Returns a dict mapping each start position to its count. An empty
    Genome yields an empty dict.
    """
    n = len(Genome)
    if n == 0:
        return {}
    half = n // 2
    # Append the first half so windows that wrap around can be read linearly.
    ExtendedGenome = Genome + Genome[0:half]
    array = {}
    # Count the first window character by character, the same comparison the
    # sliding update below relies on.
    array[0] = sum(1 for base in Genome[0:half] if base == symbol)
    for i in range(1, n):
        array[i] = array[i - 1]
        # Character that slid out of the window on the left.
        if ExtendedGenome[i - 1] == symbol:
            array[i] -= 1
        # Character that slid into the window on the right.
        if ExtendedGenome[i + half - 1] == symbol:
            array[i] += 1
    return array
|
||||
|
||||
|
||||
def PatternCount(Text, Pattern):
    """Count the occurrences of Pattern in Text, overlapping ones included.

    A window of len(Pattern) characters slides over Text one position at a
    time; every window equal to Pattern adds one to the total.
    """
    width = len(Pattern)
    return sum(
        1
        for start in range(len(Text) - width + 1)
        if Text[start:start + width] == Pattern
    )
|
||||
#+END_SRC
|
||||
|
||||
After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/.
|
||||
In /Escherichia Coli/ we plotted the result of our program:
|
||||
|
||||
@ -81,7 +199,23 @@
|
||||
|
||||
***** Exercise: Synthetize a Skew Array
|
||||
|
||||
We're going to make a Skew Diagram, for that we'll first need a Skew Array. For that purpose we wrote [[./Code/SkewArray.py][SkewArray]]
|
||||
We're going to make a Skew Diagram, for that we'll first need a Skew Array. For
|
||||
that purpose we wrote:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def SkewArray(Genome):
    """Build the running G-minus-C skew of Genome.

    The result starts with 0 and gains one entry per base: one higher than
    the previous entry after a "G", one lower after a "C", and unchanged
    after any other character. It has len(Genome) + 1 entries.
    """
    running = 0
    values = [running]
    for base in Genome:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        values.append(running)
    return values
|
||||
#+END_SRC
|
||||
|
||||
We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/:
|
||||
|
||||
#+CAPTION: Symbol array for Cytosine in E. Coli Genome
|
||||
@ -91,7 +225,30 @@
|
||||
|
||||
***** Exercise: Efficient algorithm for locating ori
|
||||
|
||||
Now that we know more about ori's skew value, we're going to construct a better algorithm to find it. We'll do that in [[./Code/MinimumSkew.py][MinimumSkew]]
|
||||
Now that we know more about ori's skew value, we're going to construct a better
|
||||
algorithm to find it:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def MinimumSkew(Genome):
    """Return every position at which the skew of Genome is minimal.

    SkewArray(Genome) holds len(Genome) + 1 values (the initial 0 plus one
    per base). All of them are candidates, including the final one, so a
    minimum reached after the last base is reported too.
    """
    skew = SkewArray(Genome)
    minimum = min(skew)
    # Walk the skew array itself so its last entry is not skipped.
    return [i for i, value in enumerate(skew) if value == minimum]
|
||||
|
||||
|
||||
def SkewArray(Genome):
    """Build the running G-minus-C skew of Genome.

    The result starts with 0 and gains one entry per base: one higher than
    the previous entry after a "G", one lower after a "C", and unchanged
    after any other character. It has len(Genome) + 1 entries.
    """
    running = 0
    values = [running]
    for base in Genome:
        if base == "G":
            running += 1
        elif base == "C":
            running -= 1
        values.append(running)
    return values
|
||||
#+END_SRC
|
||||
|
||||
|
||||
**** Finding /DnaA boxes/
|
||||
|
||||
@ -104,11 +261,50 @@
|
||||
|
||||
***** Exercise: Find approximate patterns
|
||||
|
||||
Now that we have our Hamming distance, we have to find the approximate sequences. We'll do this in [[./Code/ApproximatePatternMatching.py][ApproximatePatternMatching.py]]
|
||||
Now that we have our Hamming distance, we have to find the approximate
|
||||
sequences:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def ApproximatePatternMatching(Text, Pattern, d):
    """List the start indices where Pattern occurs in Text with at most d mismatches.

    Every window of len(Pattern) characters in Text is examined. Its start
    index is kept when the window equals Pattern exactly, or when its
    Hamming distance to Pattern is no greater than d.
    """
    width = len(Pattern)
    return [
        start
        for start in range(len(Text) - width + 1)
        # The exact comparison short-circuits the HammingDistance call.
        if Text[start:start + width] == Pattern
        or HammingDistance(Text[start:start + width], Pattern) <= d
    ]
|
||||
|
||||
|
||||
def HammingDistance(p, q):
    """Return the number of positions at which p and q differ.

    Positions are taken from p, so q is indexed up to len(p) - 1: a q that
    is shorter than p raises IndexError, and any extra tail of q is ignored.
    """
    return sum(1 for pos in range(len(p)) if p[pos] != q[pos])
|
||||
#+END_SRC
|
||||
|
||||
|
||||
***** Exercise: Count the approximate patterns
|
||||
|
||||
The final part is counting the approximate sequences, for that we'll use [[./Code/ApproximatePatternCount.py][ApproximatePatternCount.py]]
|
||||
The final part is counting the approximate sequences:
|
||||
|
||||
#+BEGIN_SRC python
|
||||
def ApproximatePatternCount(Pattern, Text, d):
    """Count the windows of Text that match Pattern with at most d mismatches.

    Every window of len(Pattern) characters in Text is examined. A window is
    counted when it equals Pattern exactly, or when its Hamming distance to
    Pattern is no greater than d.
    """
    width = len(Pattern)
    matches = 0
    for start in range(len(Text) - width + 1):
        window = Text[start:start + width]
        # An exact match is counted without calling HammingDistance.
        if window == Pattern:
            matches += 1
        elif HammingDistance(window, Pattern) <= d:
            matches += 1
    return matches
|
||||
|
||||
|
||||
def HammingDistance(p, q):
    """Return the number of positions at which p and q differ.

    Positions are taken from p, so q is indexed up to len(p) - 1: a q that
    is shorter than p raises IndexError, and any extra tail of q is ignored.
    """
    return sum(1 for pos in range(len(p)) if p[pos] != q[pos])
|
||||
#+END_SRC
|
||||
|
||||
|
||||
After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/.
|
||||
We've finally found a computational method to find ori that seems correct.
|
||||
@ -123,7 +319,6 @@ Variation in gene expression permits the cell to keep track of time.
|
||||
|
||||
***** Exercise: Find the most common nucleotides in each position
|
||||
|
||||
|
||||
We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case,
|
||||
and the nucleotide in lower case (if there's no popular one).
|
||||
Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
|
||||
|
Loading…
x
Reference in New Issue
Block a user