Usage

param1=value1 param2=value2 ... runWorkFlow.sh [options]

---
title: runWorkFlow.sh
---
flowchart TD
    PF[(
        plasmid_file
        
            
                adapter#0040;20bp#0041; + sgRNA#0040;20bp#0041; + scaffold#0040;83/93bp#0041; + query#0040;44bp#0041; + 3bp + RCbarcode#0040;18bp#0041; + RCprimer#0040;21bp#0041;
            
            
                ...
            
        
    )] --> GSPFR[getSxPlasmidFileRef.sh]
    GF[(genome_file)] --> GSPFR
    BI[(bowtie2index)] --> GSPFR
    EXT[(
        extensions
        
            
                ext1up
                ext1down
                ext2up
                ext2down
            
        
    )] --> GSPFR
    GSPFR --> REF[(
        reference_file
        
            
                start1
                ref1
                end1
                start2
                ref2
                end2
            
            
                ...
                ...
                ...
                ...
                ...
                ...
            
        
    )]

    PF --> SEM[sxExtractMarker.sh]
    SEM --> MK1[(
        stdout
        
            
                primer#0040;21bp#0041; + barcode#0040;18bp#0041;
            
            
                ...
            
        
    )]
    SEM --> MK2[(
        fd3
        
            
                adapter#0040;20bp#0041; + sgRNA#0040;20bp#0041; + scaffold#0040;83/93bp#0041;
            
            
                ...
            
        
    )]

    R1[(fastqR1)] --> RD[removeDuplicates.sh]
    R2[(fastqR2)] --> RD
    RD --> UNIQUE[(
        removeDuplicates_file
        
            
                R1
                R2
                #
            
            
                ...
                ...
                ...
            
        
    )]

    UNIQUE --> DM[demultiplex.sh]
    SCORE[(
        minScores
        
            
                score1
                score2
            
            
                ...
                ...
            
        
    )]
    MK1 --> DM
    MK2 --> DM
    SCORE --> DM
    DM --> ONTARGET[(
        demultiplex_file
        
            
                R1
                R2
                #
                id
                rstart1
                rend1
                qstart1
                qend1
                rstart2
                rend2
                qstart2
                qend2
            
            
                ...
                ...
                ...
                ...
                ...
                ...
                ...
                ...
                ...
                ...
                ...
                ...
            
        
    )]

    ONTARGET --> sxCRAFC[sxCutR2AdapterFilterCumulate.sh] --> QUERY[(
        input_file
        
            
                query
                #
                id
            
            
                ...
                ...
                ...
            
        
    )] --> REARR[rearrangement]
    REF --> REARR
    REARR --> ALG[(
        rearrangement_file
        
            
                idx
                #
                score
                id
            
            
                ref1
                ref2
            
            
                query
            
        
    )]

    REF --> CMH[correct_micro_homology.awk]
    DIRECTION[(
        direction_file
        
            
                up/down
            
            
                ...
            
        
    )] --> CMH
    ALG --> CMH
    CMH --> CORRECTED[(
        correct_micro_homology_file
        
            
                idx
                #
                score
                id
                udangle
                rstart1
                qstart1
                rend1
                qend1
                random
                rstart2
                qstart2
                rend2
                qend2
                ddangle
                cut1
                ref1+cut2
            
            
                ref1
                ref2
            
            
                query
            
        
    )]

adapter(20bp) + sgRNA(20bp) + scaffold(83/93bp) + query(44bp) + 3bp + RCbarcode(18bp) + RCprimer(21bp)
...

start1	ref1	end1	start2	ref2	end2
...	...	...	...	...	...

primer(21bp) + barcode(18bp)
...

adapter(20bp) + sgRNA(20bp) + scaffold(83/93bp)
...

R1	R2	#
...	...	...

score1	score2
...	...

R1	R2	#	id	rstart1	rend1	qstart1	qend1	rstart2	rend2	qstart2	qend2
...	...	...	...	...	...	...	...	...	...	...	...

query	#	id
...	...	...

up/down
...

Options are passed to the underlying make call. makeTarget is the file you want to generate. The underlying make engine uses file extensions to determine which step to run, so the file extension matters. Depending on makeTarget, you may need to provide additional parameters and input files.
To remove duplicates for paired (or multiply paired) fastq files, run
```
makeTarget=removeDuplicates_file.noDup \
fastqFiles=fastqR1,fastqR2,... \
runWorkFlow.sh
```
For more details, see removeDuplicates.sh.
To demultiplex removeDuplicates_file.noDup, run
```
makeTarget=demultiplex_file.demultiplex \
markerIndices=marker1,marker2,... \
minScores=score1,score2,... \
runWorkFlow.sh
```
For more details, see demultiplex.sh. If a marker is not indexed by bowtie2, runWorkFlow.sh will index it silently.
To align query.post to reference and correct microhomology, run
```
makeTarget=correct_micro_homology_file.alg \
refFile=reference_file \
directionFile=direction_file \
runWorkFlow.sh
```
Chimeric alignment scores used by rearrangement can be set as follows.
```
s0=-6
s1=4
s2=2
u=-3
v=-9
ru=0
rv=0
qu=0
qv=-5
```
For more details, see core part of rearr. If only refFile is provided, a default directionFile=${refFile}.direct will be created with all up. For more details, see workFlow.mak.
The output of demultiplex.sh does not fit the input of the core part of rearr. The transformation between them is highly dependent on the design of the experiment and changes from time to time. For our in-house data, this is done by sxCutR2AdapterFilterCumulate.sh as follows.
```
makeTarget=query.post \
minToMapShear=30 \
./runWorkFlow.sh
```
Our in-house data use plasmids in a plasmid_file. We extract demultiplex markers from those plasmids by sxExtractMarker.sh.
```
makeTarget=plasmid_file.target.fa \
./runWorkFlow.sh
```
Besides plasmid_file.target.fa, which is used as the demultiplex marker for R2, another file, plasmid_file.pair.fa, is generated as well and used as the demultiplex marker for R1.
The plasmid_file also contains reference sequences (sgRNAs). These references are extracted by getSxPlasmidFileRef.sh.
```
makeTarget=plasmid_file.ref \
genome=genome_file \
bowtie2index=bowtie2index_prefix \
./runWorkFlow.sh
```
Our in-house data use hg19.

To run the full workflow for our in-house data (our in-house data put fastqR2 before fastqR1),

makeTarget=correct_micro_homology_file.alg \
fastqFiles=fastqR2,fastqR1 \
markerIndices=plasmid_file.target.fa,plasmid_file.pair.fa \
genome=genome_file \
bowtie2index=bowtie2index_prefix \
refFile=plasmid_file.ref \
./runWorkFlow.sh

runWorkFlow.sh will run all steps above for you to generate correct_micro_homology_file.alg.

runWorkFlow.sh uses the make engine, which skips updating an output if no change is detected in the inputs necessary to generate it. This saves computation for you.

Source

# Workflow parameters.
# Each variable keeps a non-empty value supplied by the caller (e.g. via
# `name=value runWorkFlow.sh`); otherwise it falls back to the default below.

# Parameters that normally have to be replaced for real data; the defaults
# point at the bundled test data set.
: "${makeTarget:=test/test_work_flow/rearr.alg}"
: "${fastqFiles:=test/test_work_flow/A2-g1n-3.R2.fq.gz,test/test_work_flow/A2-g1n-3.fq.gz}"
: "${markerIndices:=test/test_work_flow/final_hgsgrna_libb_all_0811_NGG_scaffold_nor_G1.csv.target.fa,test/test_work_flow/final_hgsgrna_libb_all_0811_NGG_scaffold_nor_G1.csv.pair.fa}"
: "${minScores:=30,100}"

: "${minToMapShear:=30}"
: "${refFile:=test/test_work_flow/final_hgsgrna_libb_all_0811_NGG_scaffold_nor_G1.csv.ref}"
# The direction file defaults to a sibling of the reference file.
: "${directionFile:=${refFile}.direct}"
# NOTE(review): the four ext* values are not listed on the make command line
# in this script, so make sees them only when they are inherited from the
# caller's environment — confirm that this is intended.
: "${ext1up:=50}"
: "${ext1down:=0}"
: "${ext2up:=10}"
: "${ext2down:=100}"

# Parameters whose defaults suit most runs.
# genome / bowtie2index fall back to the GENOME / BOWTIE2INDEX variables.
: "${genome:=${GENOME}}"
: "${bowtie2index:=${BOWTIE2INDEX}}"
# Alignment scoring parameters (see the usage text for the documented values).
: "${s0:=-6}"
: "${s1:=4}"
: "${s2:=2}"
: "${u:=-3}"
: "${v:=-9}"
: "${ru:=0}"
: "${rv:=0}"
: "${qu:=0}"
: "${qv:=-5}"

# Installation prefix: an explicit ${prefix} wins, then ${CONDA_PREFIX},
# then ${HOME}/.local.
default_prefix=${CONDA_PREFIX:-"${HOME}/.local"}
: "${prefix:=${default_prefix}}"

# Use the installed makefile unless a workFlow.mak exists in the current
# directory, in which case the local copy takes precedence.
make_file="${prefix}/share/rearr/workFlow.mak"
if [ -f "workFlow.mak" ]
then
    make_file="workFlow.mak"
fi

# Build ${makeTarget} with the makefile selected in ${make_file}.
# Every command-line option given to this script is forwarded to make.
# "$@" is quoted so that each option reaches make as exactly one word, even
# when it contains whitespace or glob characters (an unquoted $@ would be
# word-split and pathname-expanded by the shell first).
# The workflow parameters are handed over as name=value arguments to make.
make "$@" -f "${make_file}" "${makeTarget}" \
    fastqFiles="${fastqFiles}" \
    markerIndices="${markerIndices}" \
    minScores="${minScores}" \
    genome="${genome}" \
    bowtie2index="${bowtie2index}" \
    refFile="${refFile}" \
    directionFile="${directionFile}" \
    s0="${s0}" \
    s1="${s1}" \
    s2="${s2}" \
    u="${u}" \
    v="${v}" \
    ru="${ru}" \
    rv="${rv}" \
    qu="${qu}" \
    qv="${qv}" \
    minToMapShear="${minToMapShear}"

# Defines an alias named "~~~" whose expansion is the no-op builtin ":".
# NOTE(review): presumably this neutralizes "~~~" fence lines left over from
# the literate-documentation tooling — confirm before relying on it.
alias ~~~=":" # This suppresses a warning and is not part of source.