1 Commit

6561aa2caf  Convert org mode README to markdown  (2021-05-05 12:19:40 +02:00)
7 changed files with 257 additions and 108 deletions

README.md

@@ -1,68 +1,75 @@
# locigenesis

locigenesis is a tool that generates a human T-cell receptor (TCR), runs it through a sequence read simulation tool and extracts CDR3.

The goal of this project is to generate HVR sequences both with and without sequencing errors, in order to create datasets for a Machine Learning algorithm.

## Technologies

- [immuneSIM](https://github.com/GreiffLab/immuneSIM/): in silico generation of human and mouse BCR and TCR repertoires
- [CuReSim](http://www.pegase-biosciences.com/curesim-a-customized-read-simulator/): read simulator that mimics Ion Torrent sequencing

## Installation

This project uses [Nix](https://nixos.org/) to ensure reproducible builds.

1. Install Nix (compatible with MacOS, Linux and [WSL](https://docs.microsoft.com/en-us/windows/wsl/about)):

   ```bash
   curl -L https://nixos.org/nix/install | sh
   ```

2. Clone the repository:

   ```bash
   git clone https://git.coolneng.duckdns.org/coolneng/locigenesis
   ```

3. Change the working directory to the project:

   ```bash
   cd locigenesis
   ```

4. Enter the nix-shell:

   ```bash
   nix-shell
   ```

After running these commands, you will find yourself in a shell that contains all the needed dependencies.

## Usage

An execution script that accepts 2 parameters is provided; the following command invokes it:

```bash
./generation.sh <number of sequences> <number of reads>
```

- `<number of sequences>`: an integer that specifies the number of different sequences to generate
- `<number of reads>`: an integer that specifies the number of reads to perform on each sequence

The script will generate 2 files under the data directory:

| HVR.fastq                           | CuReSim-HVR.fastq                                                |
|:-----------------------------------:|:----------------------------------------------------------------:|
| Contains the original CDR3 sequence | Contains CDR3 after the read simulation, with sequencing errors  |
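
The converted Usage section boils down to a short workflow. As an illustrative sketch only (the counts below are arbitrary example values, and `nix-shell --run` is just one way to execute the script inside the pinned environment):

```bash
# Generate 100 TCR sequences with 10 simulated reads each, inside the
# nix-shell environment described in the Installation section.
nix-shell --run './generation.sh 100 10'

# The outputs land under data/: HVR.fastq (error-free CDR3) and the
# CuReSim FASTQ with simulated sequencing errors.
ls data/
```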

flake.lock (generated): 41 deletions

@@ -1,41 +0,0 @@
{
  "nodes": {
    "flake-utils": {
      "locked": {
        "lastModified": 1634851050,
        "narHash": "sha256-N83GlSGPJJdcqhUxSCS/WwW5pksYf3VP1M13cDRTSVA=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "c91f3de5adaf1de973b797ef7485e441a65b8935",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1635865339,
        "narHash": "sha256-fmI8PxMmL7WXV/O8m6vT9/yW42buxvAYeRNpcABvnKs=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "26a56abd090ec5c8f4c6c9e1189fbfa4bcb8db3f",
        "type": "github"
      },
      "original": {
        "id": "nixpkgs",
        "type": "indirect"
      }
    },
    "root": {
      "inputs": {
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

flake.nix: 13 deletions

@@ -1,13 +0,0 @@
{
  description = ''
    locigenesis is a tool that generates a human T-cell receptor (TCR), runs
    it through a sequence reader simulation tool and extracts CDR3.
  '';
  inputs.flake-utils.url = "github:numtide/flake-utils";
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let pkgs = nixpkgs.legacyPackages.${system};
      in { devShell = import ./shell.nix { inherit pkgs; }; });
}
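
The removed flake simply re-exported shell.nix as its devShell, so the practical difference is only in how the environment is entered. A hedged sketch of the two invocations (assuming flakes were enabled for the old setup):

```bash
# Old, flake-based entry point: builds the devShell that imported ./shell.nix.
nix develop

# New entry point after this commit: plain nix-shell, with nixpkgs pinned
# through nix/sources.json instead of flake.lock.
nix-shell
```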

nix/sources.json (new file): 26 additions

@@ -0,0 +1,26 @@
{
  "niv": {
    "branch": "master",
    "description": "Easy dependency management for Nix projects",
    "homepage": "https://github.com/nmattia/niv",
    "owner": "nmattia",
    "repo": "niv",
    "rev": "af958e8057f345ee1aca714c1247ef3ba1c15f5e",
    "sha256": "1qjavxabbrsh73yck5dcq8jggvh3r2jkbr6b5nlz5d9yrqm9255n",
    "type": "tarball",
    "url": "https://github.com/nmattia/niv/archive/af958e8057f345ee1aca714c1247ef3ba1c15f5e.tar.gz",
    "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
  },
  "nixpkgs": {
    "branch": "release-20.09",
    "description": "Nix Packages collection",
    "homepage": "",
    "owner": "NixOS",
    "repo": "nixpkgs",
    "rev": "a565a2165ab6e195d7c105a8416b8f4b4d0349a4",
    "sha256": "1x90qm533lh8xh172rqfcj3pwg8imyx650xgr41rqppmm6fli4w1",
    "type": "tarball",
    "url": "https://github.com/NixOS/nixpkgs/archive/a565a2165ab6e195d7c105a8416b8f4b4d0349a4.tar.gz",
    "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
  }
}
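
nix/sources.json is the pin file managed by the niv CLI rather than edited by hand. A minimal sketch of the commands that would typically produce and refresh pins like the ones above (assuming niv is available in your environment):

```bash
# Bootstrap nix/sources.json and nix/sources.nix in a project.
niv init

# Track nixpkgs on the release-20.09 branch, matching the pin above.
niv update nixpkgs -b release-20.09

# Later on, bump every pinned source to its newest revision.
niv update
```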

nix/sources.nix (new file): 174 additions

@@ -0,0 +1,174 @@
# This file has been generated by Niv.
let
  #
  # The fetchers. fetch_<type> fetches specs of type <type>.
  #
  fetch_file = pkgs: name: spec:
    let
      name' = sanitizeName name + "-src";
    in
      if spec.builtin or true then
        builtins_fetchurl { inherit (spec) url sha256; name = name'; }
      else
        pkgs.fetchurl { inherit (spec) url sha256; name = name'; };
  fetch_tarball = pkgs: name: spec:
    let
      name' = sanitizeName name + "-src";
    in
      if spec.builtin or true then
        builtins_fetchTarball { name = name'; inherit (spec) url sha256; }
      else
        pkgs.fetchzip { name = name'; inherit (spec) url sha256; };
  fetch_git = name: spec:
    let
      ref =
        if spec ? ref then spec.ref else
          if spec ? branch then "refs/heads/${spec.branch}" else
            if spec ? tag then "refs/tags/${spec.tag}" else
              abort "In git source '${name}': Please specify `ref`, `tag` or `branch`!";
    in
      builtins.fetchGit { url = spec.repo; inherit (spec) rev; inherit ref; };
  fetch_local = spec: spec.path;
  fetch_builtin-tarball = name: throw
    ''[${name}] The niv type "builtin-tarball" is deprecated. You should instead use `builtin = true`.
        $ niv modify ${name} -a type=tarball -a builtin=true'';
  fetch_builtin-url = name: throw
    ''[${name}] The niv type "builtin-url" will soon be deprecated. You should instead use `builtin = true`.
        $ niv modify ${name} -a type=file -a builtin=true'';
  #
  # Various helpers
  #
  # https://github.com/NixOS/nixpkgs/pull/83241/files#diff-c6f540a4f3bfa4b0e8b6bafd4cd54e8bR695
  sanitizeName = name:
    (
      concatMapStrings (s: if builtins.isList s then "-" else s)
        (
          builtins.split "[^[:alnum:]+._?=-]+"
            ((x: builtins.elemAt (builtins.match "\\.*(.*)" x) 0) name)
        )
    );
  # The set of packages used when specs are fetched using non-builtins.
  mkPkgs = sources: system:
    let
      sourcesNixpkgs =
        import (builtins_fetchTarball { inherit (sources.nixpkgs) url sha256; }) { inherit system; };
      hasNixpkgsPath = builtins.any (x: x.prefix == "nixpkgs") builtins.nixPath;
      hasThisAsNixpkgsPath = <nixpkgs> == ./.;
    in
      if builtins.hasAttr "nixpkgs" sources
      then sourcesNixpkgs
      else if hasNixpkgsPath && ! hasThisAsNixpkgsPath then
        import <nixpkgs> {}
      else
        abort
          ''
            Please specify either <nixpkgs> (through -I or NIX_PATH=nixpkgs=...) or
            add a package called "nixpkgs" to your sources.json.
          '';
  # The actual fetching function.
  fetch = pkgs: name: spec:
    if ! builtins.hasAttr "type" spec then
      abort "ERROR: niv spec ${name} does not have a 'type' attribute"
    else if spec.type == "file" then fetch_file pkgs name spec
    else if spec.type == "tarball" then fetch_tarball pkgs name spec
    else if spec.type == "git" then fetch_git name spec
    else if spec.type == "local" then fetch_local spec
    else if spec.type == "builtin-tarball" then fetch_builtin-tarball name
    else if spec.type == "builtin-url" then fetch_builtin-url name
    else
      abort "ERROR: niv spec ${name} has unknown type ${builtins.toJSON spec.type}";
  # If the environment variable NIV_OVERRIDE_${name} is set, then use
  # the path directly as opposed to the fetched source.
  replace = name: drv:
    let
      saneName = stringAsChars (c: if isNull (builtins.match "[a-zA-Z0-9]" c) then "_" else c) name;
      ersatz = builtins.getEnv "NIV_OVERRIDE_${saneName}";
    in
      if ersatz == "" then drv else
        # this turns the string into an actual Nix path (for both absolute and
        # relative paths)
        if builtins.substring 0 1 ersatz == "/" then /. + ersatz else /. + builtins.getEnv "PWD" + "/${ersatz}";
  # Ports of functions for older nix versions
  # a Nix version of mapAttrs if the built-in doesn't exist
  mapAttrs = builtins.mapAttrs or (
    f: set: with builtins;
    listToAttrs (map (attr: { name = attr; value = f attr set.${attr}; }) (attrNames set))
  );
  # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/lists.nix#L295
  range = first: last: if first > last then [] else builtins.genList (n: first + n) (last - first + 1);
  # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L257
  stringToCharacters = s: map (p: builtins.substring p 1 s) (range 0 (builtins.stringLength s - 1));
  # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L269
  stringAsChars = f: s: concatStrings (map f (stringToCharacters s));
  concatMapStrings = f: list: concatStrings (map f list);
  concatStrings = builtins.concatStringsSep "";
  # https://github.com/NixOS/nixpkgs/blob/8a9f58a375c401b96da862d969f66429def1d118/lib/attrsets.nix#L331
  optionalAttrs = cond: as: if cond then as else {};
  # fetchTarball version that is compatible between all the versions of Nix
  builtins_fetchTarball = { url, name ? null, sha256 }@attrs:
    let
      inherit (builtins) lessThan nixVersion fetchTarball;
    in
      if lessThan nixVersion "1.12" then
        fetchTarball ({ inherit url; } // (optionalAttrs (!isNull name) { inherit name; }))
      else
        fetchTarball attrs;
  # fetchurl version that is compatible between all the versions of Nix
  builtins_fetchurl = { url, name ? null, sha256 }@attrs:
    let
      inherit (builtins) lessThan nixVersion fetchurl;
    in
      if lessThan nixVersion "1.12" then
        fetchurl ({ inherit url; } // (optionalAttrs (!isNull name) { inherit name; }))
      else
        fetchurl attrs;
  # Create the final "sources" from the config
  mkSources = config:
    mapAttrs (
      name: spec:
        if builtins.hasAttr "outPath" spec
        then abort
          "The values in sources.json should not have an 'outPath' attribute"
        else
          spec // { outPath = replace name (fetch config.pkgs name spec); }
    ) config.sources;
  # The "config" used by the fetchers
  mkConfig =
    { sourcesFile ? if builtins.pathExists ./sources.json then ./sources.json else null
    , sources ? if isNull sourcesFile then {} else builtins.fromJSON (builtins.readFile sourcesFile)
    , system ? builtins.currentSystem
    , pkgs ? mkPkgs sources system
    }: rec {
      # The sources, i.e. the attribute set of spec name to spec
      inherit sources;
      # The "pkgs" (evaluated nixpkgs) to use for e.g. non-builtin fetchers
      inherit pkgs;
    };
in
mkSources (mkConfig {}) // { __functor = _: settings: mkSources (mkConfig settings); }
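
One practical detail of the generated file above: the `replace` helper consults a `NIV_OVERRIDE_<name>` environment variable, so a pinned source can be swapped for a local checkout without editing sources.json. A minimal sketch (the checkout path is only an example):

```bash
# Build the dev shell against a local nixpkgs checkout instead of the
# pinned tarball; the override applies to this invocation only.
NIV_OVERRIDE_nixpkgs=$HOME/src/nixpkgs nix-shell
```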

shell.nix

@@ -1,4 +1,4 @@
-{ pkgs ? import <nixpkgs> { } }:
+{ sources ? import ./nix/sources.nix, pkgs ? import sources.nixpkgs { } }:
 with pkgs;
@@ -6,7 +6,8 @@ let
   CuReSim = stdenv.mkDerivation rec {
     name = "CuReSim";
     version = "1.3";
-    src = fetchzip { url =
+    src = fetchzip {
+      url =
       "http://www.pegase-biosciences.com/wp-content/uploads/2015/08/${name}${version}.zip";
     sha256 = "1hvlpgy4haqgqq52mkxhcl9i1fx67kgwi6f1mijvqzk0xff77hkp";
     stripRoot = true;
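
With the new default arguments, pkgs comes from the niv pin instead of the `<nixpkgs>` channel, so the shell should evaluate even on a machine with no channels configured. A hedged way to check that assumption:

```bash
# Evaluate shell.nix without any NIX_PATH at all; if the pin is wired up
# correctly, the environment still builds from nix/sources.json.
env -u NIX_PATH nix-shell --run 'echo "pinned dev shell OK"'
```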

R script (HVR extraction)

@@ -34,11 +34,7 @@ parse_metadata <- function(metadata) {
 #' @return A \code{character} containing the gene sequence
 match_id_sequence <- function(names, vdj_segments, id) {
   matches <- grep(names, pattern = id)
-  if(id == "TRBJ2-2"){
-    row <- matches[2]
-  } else {
-    row <- matches[1]
-  }
+  row <- matches[1]
   return(as.character(vdj_segments[row]))
 }
@@ -110,9 +106,8 @@ get_cys_coordinates <- function(alignment) {
   insertion <- unlist(Biostrings::insertion(alignment))
   deletion <- unlist(Biostrings::deletion(alignment))
   delta_coordinates <- handle_indels(insertion, deletion, cys, alignment)
-  read_start <- unlist(start(Biostrings::Views(alignment)))
-  cys_start <- cys$start + delta_coordinates$start + read_start - 1
-  cys_end <- cys$end + delta_coordinates$end + read_start
+  cys_start <- cys$start + delta_coordinates$start
+  cys_end <- cys$end + delta_coordinates$end
   return(list("start" = cys_start, "end" = cys_end))
 }
@@ -131,7 +126,7 @@ get_hvr_sequences <- function(sequences, vdj_segments, cores = detectCores()) {
   )
   cys_coordinates <- parallel::mclapply(v_alignment, FUN = get_cys_coordinates)
   cys_df <- as.data.frame(do.call(rbind, cys_coordinates))
-  remaining <- Biostrings::subseq(sequences, start = unlist(cys_df$end) + 1)
+  remaining <- Biostrings::subseq(sequences, start = unlist(cys_df$end))
   j_alignment <- parallel::mcmapply(remaining,
     df$j_seq,
     FUN = align_sequence,
@@ -150,4 +145,4 @@
 data <- parse_data(file = "data/curesim_sequence.fastq")
 hvr <- get_hvr_sequences(sequences = data[[1]], vdj_segments = data[[2]])
-Biostrings::writeXStringSet(hvr, "data/curesim-HVR.fastq", format = "fastq")
+Biostrings::writeXStringSet(hvr, "data/CuReSim-HVR.fastq", format = "fastq")
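
The last hunk renames the simulated-read output to data/CuReSim-HVR.fastq. A quick sanity check of both FASTQ outputs after a run, assuming the usual 4-line-per-record FASTQ layout:

```bash
# Count records (4 lines each) in the error-free and simulated outputs.
echo "$(($(wc -l < data/HVR.fastq) / 4)) CDR3 records without errors"
echo "$(($(wc -l < data/CuReSim-HVR.fastq) / 4)) CDR3 records with simulated errors"

# Peek at the first record of each file.
head -n 4 data/HVR.fastq data/CuReSim-HVR.fastq
```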