From b035c496f7018e9cfec57a2f9ee275f9bee7fd2f Mon Sep 17 00:00:00 2001
From: coolneng <akasroua@gmail.com>
Date: Fri, 26 Feb 2021 02:19:40 +0100
Subject: [PATCH] Filter the relevant columns from immuneSIM output

---
 shell.nix        |  1 +
 src/repertoire.r | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/shell.nix b/shell.nix
index 82b696a..94c795b 100644
--- a/shell.nix
+++ b/shell.nix
@@ -6,6 +6,7 @@ mkShell {
   buildInputs = [
     R
     rPackages.immuneSIM
+    rPackages.Biostrings
     jdk
     # Develoment tools
     rPackages.languageserver
diff --git a/src/repertoire.r b/src/repertoire.r
index 1a5cefc..4ed9b55 100644
--- a/src/repertoire.r
+++ b/src/repertoire.r
@@ -1,4 +1,5 @@
 library(immuneSIM)
+library(Biostrings)
 
 generate_repertoires <- function(number_of_sequences) {
   a_chain <- immuneSIM(
@@ -18,6 +19,21 @@ generate_repertoires <- function(number_of_sequences) {
   return(list("a_chain" = a_chain, "b_chain" = b_chain))
 }
 
+# Repeat each sequence `counts` times, then reverse-complement the reads.
+process_chain <- function(repertoire) {
+  chain_seqs <- as.character(repertoire$sequence)
+  copies <- as.integer(repertoire$counts)
+  reads <- Biostrings::DNAStringSet(rep(chain_seqs, times = copies))
+  names(reads) <- seq_along(reads) # running index doubles as the read name
+  Biostrings::reverseComplement(reads)
+}
+
+# Run process_chain on every element, labelled with the input's names.
+preprocess_data <- function(repertoires) {
+  processed <- lapply(repertoires, process_chain)
+  stats::setNames(processed, names(repertoires))
+}
+
 
 parse_cli_arguments <- function(args) {
   if (length(args) != 1) {
@@ -29,3 +45,4 @@ parse_cli_arguments <- function(args) {
 args <- commandArgs(trailingOnly = TRUE)
 number_of_sequences <- parse_cli_arguments(args)
 sim_repertoire <- generate_repertoires(number_of_sequences)
+processed_data <- preprocess_data(sim_repertoire)