CMH

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env -S gawk -f

# Usage: correct_micro_homology.awk -- reference_file direction_file < rearrangement_output

# reference_file is the same as the input of rearrangement. For each row of reference_file, direction_file has a row of fields being either up or down. Each field corresponds a junction of two adjacent references in the row of reference_file. For up\down, correct_micro_homology.awk try to remove the deletion or templated insertion of the up\down-stream DSB end. This is achieved by modifying the alignment up to the equivalence of microhomology.

# The header line of each alignment output by rearrangement only contains idx, #, score, id. correct_micro_homology.awk enriches the header by adding unaligned part of query and aligned ranges of both reference and query. The cut sites at junctions of adjacent references are also appended to the header line.

BEGIN{
    FS = "\t"
    refFile = ARGV[1]
    directionFile = ARGV[2]
    ref_id = 0
    # Read information from refFile.
    while (getline ref < refFile) {
        getline direction < directionFile
        n = split(ref, ref_arr, "\t")
        split(direction, direction_type[ref_id], "\t")
        ref_accum_len = 0
        for (i = 3; i < n; i += 3) {
            cut1s[ref_id, i / 3] = ref_arr[i]
            cut1s_accum[ref_id, i / 3] = ref_accum_len + cut1s[ref_id, i / 3]
            ref_accum_len += length(ref_arr[i - 1])
            cut2s[ref_id, i / 3] = ref_arr[i + 1]
            cut2s_accum[ref_id, i / 3] = ref_accum_len + cut2s[ref_id, i / 3]
        }
        ++ref_id
    }
    # Delete refFile and directionFile from arguments, so correct_micro_homology.AWK does not process them.
    for (i = 1; i <= 2; ++i) {
        delete ARGV[i]
    }
}

# From queryline, extract aligned parts (including internal and end deletion gaps of refline) to targets and unaligned parts to inserts.
function query_patsplit(refs, dashs, queryline, targets, inserts,       start, i) {
    start = 1
    for (i = 1; i <= length(refs); ++i) {
        inserts[i - 1] = substr(queryline, start, length(dashs[i - 1]))
        start += length(dashs[i - 1])
        targets[i] = substr(queryline, start, length(refs[i]))
        start += length(refs[i])
    }
    inserts[length(refs)] = substr(queryline, start, length(dashs[length(refs)]))
}

# Extract the aligned range of reference.
function query_seg_range(target, seg_range,       mat) {
    seg_range[1] = match(target, /([ACGTN][-ACGTN]*[ACGTN]|[ACGTN])/, mat) - 1
    seg_range[2] = seg_range[1] + length(mat[0])
}

# Since ref contains gap '-', the actual cut site in ref is push downstream by gaps upstream to it.
function get_gap_cut(ref, cut,       n, segs, gaps, accum_len, j, k) {
    n = patsplit(ref, segs, /[acgtnACGTN]+/, gaps)
    accum_len = 0
    for (j = 1; j <= n; ++j) {
        accum_len += length(segs[j])
        if (accum_len >= cut) {
            break
        }
    }
    gap_cut = cut
    for (k = 1; k < j; ++k) {
        gap_cut += length(gaps[k])
    }
    return gap_cut
}

# Get the length of the longest common prefix\suffix of string1 and string2. fix is either "prefix" or "suffix".
function longest_common_fix(string1, string2, fix,      i, start, rgx) {
    string1 = toupper(string1)
    string2 = toupper(string2)
    for (i = 1; i <= length(string2); ++i) {
        start = fix == "prefix" ? 1 : length(string2) - i + 1
        rgx = fix == "prefix" ? "^" substr(string2, start, i) : substr(string2, start, i) "$"
        if (string1 !~ rgx) {
            return i - 1
        }
    }
}

# Add additional information to the header line, including unaligned part of query and aligned ranges of both reference and query, as well as the cut sites at junctions of adjacent references.
function print_mark(insert, seg_range, ref, target,       ref_block, query_block) {
    printf("%s\t", insert)
    query_pos += length(insert)
    ref_pos += (seg_range[1] == -1 ? 0 : seg_range[1])
    printf("%d\t%d\t", ref_pos, query_pos)
    ref_block = substr(ref, seg_range[1] + 1, seg_range[2] - seg_range[1])
    gsub(/-/, "", ref_block)
    ref_pos += length(ref_block)
    query_block = substr(target, seg_range[1] + 1, seg_range[2] - seg_range[1])
    gsub(/-/, "", query_block)
    query_pos += length(query_block)
    printf("%d\t%d\t", ref_pos, query_pos)
    ref_pos += seg_range[1] == -1 ? length(ref) : length(ref) - seg_range[2]
}

{
    idx = $1
    count = $2
    score = $3
    ref_id = $4
    printf("%d\t%d\t%d\t%d\t", idx, count, score, ref_id)

    getline refline
    getline queryline

    patsplit(refline, refs, /[acgtn][-ACGTN]*[acgtn]/, dashs)
    query_patsplit(refs, dashs, queryline, targets, inserts)

    # Iteration over all reference junctions.
    ref_pos = query_pos = 0
    query_seg_range(targets[1], seg_range1)
    for (i = 1; i < length(refs); ++i) {
        query_seg_range(targets[i + 1], seg_range2)
        # Micro-homology correction is possible only if no unaligned part of query between the alignments of the two adjacent references, and neither reference is skipped.
        if (length(inserts[i]) == 0 && seg_range1[1] != 0 && seg_range2[1] != 0) {
            gap_cut1 = get_gap_cut(refs[i], cut1s[ref_id, i])
            gap_cut2 = get_gap_cut(refs[i + 1], cut2s[ref_id, i])
            
            # Determine the correct direction based on both direction_type in directionFile and the actual indel type.
            correct_direct = ""
            if (tolower(direction_type[ref_id][i]) == "up") {
                if (seg_range1[2] < gap_cut1) {
                    receiver = substr(refs[i], seg_range1[2] + 1, gap_cut1 - seg_range1[2])
                    provider = substr(refs[i + 1], seg_range2[1] + 1, seg_range2[2] - seg_range2[1])
                    correct_direct = "prefix"
                } else if (seg_range1[2] > gap_cut1) {
                    provider = substr(refs[i], gap_cut1 + 1, seg_range1[2] - gap_cut1)
                    receiver = substr(refs[i + 1], 1, seg_range2[1])
                    correct_direct = "suffix"
                }
            } else {
                if (seg_range2[1] < gap_cut2) {
                    provider = substr(refs[i + 1], seg_range2[1] + 1, gap_cut2 - seg_range2[1])
                    receiver = substr(refs[i], seg_range1[2] + 1)
                    correct_direct = "prefix"
                } else if (seg_range2[1] > gap_cut2) {
                    receiver = substr(refs[i + 1], gap_cut2 + 1, seg_range2[1] - gap_cut2)
                    provider = substr(refs[i], seg_range1[1] + 1, seg_range1[2] - seg_range1[1])
                    correct_direct = "suffix"
                }
            }

            # If there is neither deletion nor templated insertion, then neglect the correction step.
            if (correct_direct != "") {
                correct_length = longest_common_fix(receiver, provider, correct_direct)
                target1_split_start = correct_direct == "prefix" ? seg_range1[2] : seg_range1[2] - correct_length
                target2_split_start = correct_direct == "prefix" ? seg_range2[1] : seg_range2[1] - correct_length
                target1_exchange = substr(targets[i], target1_split_start + 1, correct_length)
                target2_exchange = substr(targets[i + 1], target2_split_start + 1, correct_length)
                targets[i] = substr(targets[i], 1, target1_split_start) target2_exchange substr(targets[i], target1_split_start + correct_length + 1)
                targets[i + 1] = substr(targets[i + 1], 1, target2_split_start) target1_exchange substr(targets[i + 1], target2_split_start + correct_length + 1)

                shift = correct_direct == "prefix" ? correct_length : -correct_length
                seg_range1[2] += shift
                seg_range2[1] += shift
            }
        }
        # Print additional header information upstream to the current junction.
        print_mark(inserts[i - 1], seg_range1, refs[i], targets[i])
        seg_range1[1] = seg_range2[1]
        seg_range1[2] = seg_range2[2]
    }
    # Print addtional header information downstream to the last junction.
    print_mark(inserts[i - 1], seg_range1, refs[i], targets[i])
    printf("%s", inserts[i])
    for (i = 1; i < length(refs); ++i) {
        printf("\t%d\t%d", cut1s_accum[ref_id, i], cut2s_accum[ref_id, i])
    }
    # Construct corrected queryline.
    queryline = inserts[0]
    for (i = 1; i <= length(refs); ++i) {
        queryline = queryline targets[i] inserts[i]
    }
    printf("\n%s\n%s\n", refline, queryline)
}