#!/bin/bash
shopt -s expand_aliases
alias ~=”:«’~bash’”
:«’~~~bash’
Usage
$ markerIndices=marker1,marker2,... \
minScores=score1,score2,... \
demultiplex.sh \
removeDuplicates_file
---
title: demultiplex.sh
---
flowchart TD
UNIQUE[(
removeDuplicates_file
| R1 | R2 | ... | # |
|---|---|---|---|
| ... | ... | ... | ... |
minScores
| score1 | score2 | ... |
|---|---|---|
| ... | ... | ... |
stdout
| R1 | R2 | ... | # | id | rstart1 | rend1 | qstart1 | qend1 | rstart2 | rend2 | qstart2 | qend2 | ... |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
markerNisbowtie2index of afastafile containing possible alignment targets ofRN.scoreNis feed tobowtie2to filter alignments with low scores. The alignments are in local mode instead of end-to-end mode, and there is no reverse complement. The full alignment setting is--norc --local -L 15 --ma 1 --mp 2,2 --rdg 3,1 --rfg 3,1 --score-min C,scoreN- The fasta name of
marker1andmarker2must the 0-based referenceid, see the core part of rearr. R1andR2are given inremoveDuplicates_file, which isstdoutofremoveDuplicates.sh.demultiplex.shonly output a row inremoveDuplicates_fileonly of bothR1andR2aligned to targets with consistant referenceid.R1,R2and#are copied fromremoveDuplicates_file.rstartNandrendNdenotes the left-close-right-open 0-based range of the aligned part of the target ofRN.qstartNandqendNdenotes that ofRN.- Multiple
marker(more than two) are supported in theory.
Source
mapMarker()
{
# Usage: mapMarker minScore markerIndex <reads
# Map reads to marker by bowtie2
# Input: plain reads
# Output: sam file without header
minScore=$1
markerIndex=$2
bowtie2 --quiet --mm --norc --local -L 15 --ma 1 --mp 2,2 --rdg 3,1 --rfg 3,1 --score-min C,${minScore} -r -x "${markerIndex}" -U - 2>/dev/null | samtools view
}
filterMarkers()
{
# Input: seq1|seq2|...|count|flag1|refId1|markerStart1|markerEnd1|seqStart1|seqEnd1|flag2|refId2|markerStart2|markerEnd2|seqStart2|seqEnd2|...
# Output: seq1|seq2|...|count|refId|markerStart1|markerEnd1|seqStart1|seqEnd1|markerStart2|markerEnd2|seqStart2|seqEnd2|...
# filter out rows with one of the following happens:
# 1. seqN failed to align markerIndexN
# 2. refIdM != refIdN for some M and N
gawk -F "\t" -v OFS="\t" -v firstFlagPos=$((${#markerIndexArray[@]}+2)) '
{
refId = $(firstFlagPos+1)
for (i = firstFlagPos; i <= NF; i += 6) {
if ($i != 0 || $(i+1) != refId) {
next
}
}
for (i = 1; i < firstFlagPos; ++i) {
printf("%s\t", $i)
}
printf("%s\t", $(firstFlagPos+1))
for (i = firstFlagPos; i < NF; ++i) {
if ((i - firstFlagPos) % 6 <= 1) {
continue
}
printf("%s\t", $i)
}
printf("%s\n", $NF)
}
'
}
inputFile=$1
IFS=',' read -r -a markerIndexArray <<< "${markerIndices}"
IFS=',' read -r -a minScoreArray <<< "$minScores"
maps=""
for ii in ${!markerIndexArray[@]}
do
maps="${maps} <(cut -f$((ii+1)) ${inputFile} | mapMarker ${minScoreArray[$ii]} ${markerIndexArray[$ii]} | gawk -f getAlignPos.awk)"
done
eval paste "${inputFile}" $maps | filterMarkers
alias ~~~=":" # This suppresses a warning and is not part of source.