From 81a57657fecfa3adc55bdcf183f8914225a1614e Mon Sep 17 00:00:00 2001
From: coolneng <akasroua@gmail.com>
Date: Mon, 3 May 2021 21:51:32 +0200
Subject: [PATCH] Fix HVR end position computation

---
 src/alignment.r | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/alignment.r b/src/alignment.r
index 83d8eeb..5d9adcc 100644
--- a/src/alignment.r
+++ b/src/alignment.r
@@ -70,31 +70,32 @@ get_cys_coordinates <- function(alignment) {
   return(list("start" = cys_start, "end" = cys_end))
 }
 
-# TODO Refactor this mess
-get_hvr_sequences <- function(sequences, vdj_segments) {
+get_hvr_sequences <- function(sequences, vdj_segments, cores = detectCores()) {
   df <- fetch_vj_sequences(sequences, vdj_segments)
-  v_alignment <- parallel::mcmapply(sequences, df$v_seq, FUN = align_sequence)
-  j_alignment <- parallel::mcmapply(sequences, df$j_seq, FUN = align_sequence)
+  v_alignment <- parallel::mcmapply(sequences,
+    df$v_seq,
+    FUN = align_sequence,
+    mc.cores = cores
+  )
   cys_coordinates <- parallel::mclapply(v_alignment, FUN = get_cys_coordinates)
   cys_df <- as.data.frame(do.call(rbind, cys_coordinates))
+  remaining <- Biostrings::subseq(sequences, start = unlist(cys_df$end))
+  j_alignment <- parallel::mcmapply(remaining,
+    df$j_seq,
+    FUN = align_sequence,
+    mc.cores = cores
+  )
   j_start <- parallel::mclapply(
     j_alignment,
-    function(x) start(Biostrings::Views(x))
-  )
-  hvr <- Biostrings::subseq(sequences,
-    start = unlist(cys_df$start),
-    end = unlist(j_start) + 2
+    function(x) start(Biostrings::Views(x)),
+    mc.cores = cores
   )
+  hvr_start <- unlist(cys_df$start)
+  hvr_end <- unlist(cys_df$start) + unlist(j_start) + 2
+  hvr <- Biostrings::subseq(sequences, start = hvr_start, end = hvr_end)
   return(hvr)
 }
 
-save_data <- function(data) {
-  Biostrings::writeXStringSet(data, "data/CuReSim-HVR.fastq", format = "fastq")
-}
-
 data <- parse_data(file = "data/curesim_sequence.fastq")
-hvr <- get_hvr_sequences(
-  sequences = data[[1]],
-  vdj_segments = data[[2]]
-)
+hvr <- get_hvr_sequences(sequences = data[[1]], vdj_segments = data[[2]])
 Biostrings::writeXStringSet(hvr, "data/CuReSim-HVR.fastq", format = "fastq")
\ No newline at end of file