#include <htslib/sam.h> #include <htslib/khash.h> #include <ctype.h> #include <Rinternals.h> #include "../../plp_utils.h" // Parse MD tag and enumerate types and number of mismatches // Determine if read passes n_mis and n_types thresholds // // // returns: -1 if no MD tag // 0 if read passes filter // >0 with # of unique mismatches if doesn't pass // // based on cigar and MD parsing code from htsbox (written by Heng Li) // https://github.com/lh3/htsbox/blob/ffc1e8ad4f61291c676a323ed833ade3ee681f7c/samview.c#L117 int parse_mismatches(bam1_t* b, int n_types, int n_mis){ int ret = 0; if(n_types < 1 && n_mis < 1){ return ret; } const uint32_t *cigar = bam_get_cigar(b); int k, x, y, c; uint8_t *pMD = 0; char *p; int nm = 0; char vars[2]; vars[1] = '\0'; str2intmap_t vh; vh = kh_init(str2intmap); if ((pMD = bam_aux_get(b, "MD")) != 0) { for (k = x = y = 0; k < b->core.n_cigar; ++k) { int op = bam_cigar_op(cigar[k]); int len = bam_cigar_oplen(cigar[k]); if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { x += len, y += len; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += len; } p = bam_aux2Z(pMD); y = 0; while (isdigit(*p)) { y += strtol(p, &p, 10); if (*p == 0) { break; } else if (*p == '^') { // deletion ++p; while (isalpha(*p)) ++p; } else { while (isalpha(*p)) { if (y >= x) { y = -1; break; } c = y < b->core.l_qseq ? seq_nt16_str[bam_seqi(bam_get_seq(b), y)] : 'N'; nm += 1; vars[0] = c; char *var = strdup(vars); int rval = 0; kh_put(str2intmap, vh, var, &rval); if(rval == -1){ REprintf("[raer internal] issue tabulating variants per read at, %s\n", bam_get_qname(b)); clear_str2int_hashmap(vh); kh_destroy(str2intmap, vh); return -1; } else if (rval == 0){ free(var); } ++y, ++p; } if (y == -1) break; } } if (x != y) { REprintf("[raer internal] inconsistent MD for read '%s' (%d != %d); ignore MD\n", bam_get_qname(b), x, y); ret = -1; } else { if(kh_size(vh) >= n_types && nm >= n_mis){ ret = kh_size(vh); } } } clear_str2int_hashmap(vh); kh_destroy(str2intmap, vh); return ret; }