#!/bin/bash

shopt -s expand_aliases

alias ~=”:«’~bash’”

:«’~~~bash’

Usage

spliterIndices=index1,index2,... minScores=score1,score2,... demultiplex.md inputFile >demultiplexFile

Introduction

Each spliterIndice is bowtie2 index of a fasta file containing possible alignment references (i.e. spliters) of the corresponding (depends on the order) output seq of removeDuplicates.md. The Nth spliters of spliterIndices are usually different. However, they must have the same name. The name denotes the 0-based reference id (refId).

bowtie2 either aligns seq to one of the spliters or failed to align. If all seqs in a line output of removeDuplicates.md successfully align to spliters of the same refId, then demultiplex.md will print the following to stdout.

seq1<tab>seq2<tab>...<tab>count<tab>refId<tab>rs1<tab>re1<tab>qs1|qe1|rs2|re2|qs2|qe2|...

seqN and count are copied from inputFile. rsN and reN denotes the left-close-right-open 0-based range of the aligned part of the Nth reference (spliter) in the local alignment. qsN and qeN denotes that of the query (seq).

Alignment details

minScores are feed to bowtie2 to filter alignments with low scores. The alignments are in local mode instead of end-to-end mode, and there is no reverse complement. The full alignment setting is

--norc --local -L 15 --ma 1 --mp 2,2 --rdg 3,1 --rfg 3,1 --score-min C,scoreN

Source

mapSpliter()
{
    # Usage: mapSpliter minScore spliterIndex <reads
    # Map reads to spliter by bowtie2
    # Input: plain reads
    # Output: sam file without header
    minScore=$1
    spliterIndex=$2
    bowtie2 --quiet --mm --norc --local -L 15 --ma 1 --mp 2,2 --rdg 3,1 --rfg 3,1 --score-min C,${minScore} -r -x "${spliterIndex}" -U - 2>/dev/null | samtools view
}

filterSpilters()
{
    # Input: seq1|seq2|...|count|flag1|refId1|spliterStart1|spliterEnd1|seqStart1|seqEnd1|flag2|refId2|spliterStart2|spliterEnd2|seqStart2|seqEnd2|...
    # Output: seq1|seq2|...|count|refId|spliterStart1|spliterEnd1|seqStart1|seqEnd1|spliterStart2|spliterEnd2|seqStart2|seqEnd2|...
    # filter out rows with one of the following happens:
    # 1. seqN failed to align spliterIndexN
    # 2. refIdM != refIdN for some M and N
    gawk -F "\t" -v OFS="\t" -v firstFlagPos=$((${#spliterIndexArray[@]}+2)) '
        {
            refId = $(firstFlagPos+1)
            for (i = firstFlagPos; i <= NF; i += 6) {
                if ($i != 0 || $(i+1) != refId) {
                    next
                }
            }
            for (i = 1; i < firstFlagPos; ++i) {
                printf("%s\t", $i)
            }
            printf("%s\t", $(firstFlagPos+1))
            for (i = firstFlagPos; i < NF; ++i) {
                if ((i - firstFlagPos) % 6 <= 1) {
                    continue
                }
                printf("%s\t", $i)
            }
            printf("%s\n", $NF)
        }
    '
}

inputFile=$1

IFS=',' read -r -a spliterIndexArray <<< "$spliterIndices"
IFS=',' read -r -a minScoreArray <<< "$minScores"
maps=""
for ii in ${!spliterIndexArray[@]}
do
    maps="${maps} <(cut -f$((ii+1)) ${inputFile} | mapSpliter ${minScoreArray[$ii]} ${spliterIndexArray[$ii]} | gawk -f getAlignPos.awk)"
done

eval paste "${inputFile}" $maps | filterSpilters
alias ~~~=":" # This suppresses a warning and is not part of source.