1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env -S gawk -f

# Usage: getAlignPos.awk <samFile
# Get the start/end positions of the local alignment on both the marker and the sequence.
# Input: SAM file without header
# Output (tab-separated): flag, marker, markerStart, markerEnd, seqStart, seqEnd

# Read and write tab-delimited columns.
BEGIN {
    FS = OFS = "\t"
}

# Measure the aligned span described by a CIGAR string.
#   cigar : CIGAR string (column 6 of a SAM record)
#   span  : output array, cleared and then filled with
#             span["clip"]  - read bases clipped (S or H) before the alignment starts
#             span["query"] - read bases consumed by the alignment (M, I, =, X)
#             span["ref"]   - marker bases covered by the alignment (M, D, N, =, X)
# Names after the extra spaces in the parameter list are locals.
function cigarSpan(cigar, span,    segs, n, i, op, len, leading) {
    delete span
    span["clip"] = 0
    span["query"] = 0
    span["ref"] = 0
    leading = 1
    n = patsplit(cigar, segs, /[0-9]+[MIDNSHPX=]/)
    for (i = 1; i <= n; ++i) {
        # Each segment is <digits><one op letter>.
        op = substr(segs[i], length(segs[i]))
        len = substr(segs[i], 1, length(segs[i]) - 1) + 0
        if (leading && op ~ /[SH]/) {
            # A CIGAR may open with a hard clip followed by a soft clip
            # (e.g. 5H3S10M); every clip before the first aligned op
            # shifts where the alignment starts within the read.
            span["clip"] += len
            continue
        }
        leading = 0
        if (op ~ /[MI=X]/) {
            span["query"] += len
        }
        if (op ~ /[MDN=X]/) {
            # N (skipped region) advances along the reference just like D.
            span["ref"] += len
        }
    }
}

# For every SAM record print: flag, marker, markerStart, markerEnd, seqStart, seqEnd.
# Start values are 0-based; end values are start plus the covered length.
{
    if ($6 == "*") {
        # No CIGAR available: report all-zero coordinates.
        print $2, $3, 0, 0, 0, 0
    } else {
        cigarSpan($6, span)
        markerStart = $4 - 1 # SAM POS is 1-based, so minus 1
        markerEnd = markerStart + span["ref"]
        seqStart = span["clip"]
        seqEnd = seqStart + span["query"]
        print $2, $3, markerStart, markerEnd, seqStart, seqEnd
    }
}