Logo Search packages:      
Sourcecode: samtools version File versions  Download package

bam_rmdupse.c

#include <math.h>
#include "sam.h"
#include "khash.h"
#include "klist.h"

#define QUEUE_CLEAR_SIZE 0x100000
#define MAX_POS 0x7fffffff

typedef struct {
      int endpos;
      uint32_t score:31, discarded:1;
      bam1_t *b;
} elem_t, *elem_p;
#define __free_elem(p) bam_destroy1((p)->data.b)
KLIST_INIT(q, elem_t, __free_elem)
typedef klist_t(q) queue_t;

KHASH_MAP_INIT_INT(best, elem_p)
typedef khash_t(best) besthash_t;

typedef struct {
      uint64_t n_checked, n_removed;
      besthash_t *left, *rght;
} lib_aux_t;
KHASH_MAP_INIT_STR(lib, lib_aux_t)

static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
{
      khint_t k = kh_get(lib, aux, lib);
      if (k == kh_end(aux)) {
            int ret;
            char *p = strdup(lib);
            lib_aux_t *q;
            k = kh_put(lib, aux, p, &ret);
            q = &kh_val(aux, k);
            q->left = kh_init(best);
            q->rght = kh_init(best);
            q->n_checked = q->n_removed = 0;
            return q;
      } else return &kh_val(aux, k);
}

static inline int sum_qual(const bam1_t *b)
{
      int i, q;
      uint8_t *qual = bam1_qual(b);
      for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
      return q;
}

static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
{
      elem_t *p = kl_pushp(q, queue);
      p->discarded = 0;
      p->endpos = endpos; p->score = score;
      if (p->b == 0) p->b = bam_init1();
      bam_copy1(p->b, b);
      return p;
}

static void clear_besthash(besthash_t *h, int32_t pos)
{
      khint_t k;
      for (k = kh_begin(h); k != kh_end(h); ++k)
            if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
                  kh_del(best, h, k);
}

static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
{
      if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
            khint_t k;
            while (1) {
                  elem_t *q;
                  if (queue->head == queue->tail) break;
                  q = &kl_val(queue->head);
                  if (q->discarded) {
                        q->b->data_len = 0;
                        kl_shift(q, queue, 0);
                        continue;
                  }
                  if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
                  samwrite(out, q->b);
                  q->b->data_len = 0;
                  kl_shift(q, queue, 0);
            }
            for (k = kh_begin(h); k != kh_end(h); ++k) {
                  if (kh_exist(h, k)) {
                        clear_besthash(kh_val(h, k).left, pos);
                        clear_besthash(kh_val(h, k).rght, pos);
                  }
            }
      }
}

void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
{
      bam1_t *b;
      queue_t *queue;
      khint_t k;
      int last_tid = -2;
      khash_t(lib) *aux;

      aux = kh_init(lib);
      b = bam_init1();
      queue = kl_init(q);
      while (samread(in, b) >= 0) {
            bam1_core_t *c = &b->core;
            int endpos = bam_calend(c, bam1_cigar(b));
            int score = sum_qual(b);
            
            if (last_tid != c->tid) {
                  if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
                  last_tid = c->tid;
            } else dump_alignment(out, queue, c->pos, aux);
            if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
                  push_queue(queue, b, endpos, score);
            } else {
                  const char *lib;
                  lib_aux_t *q;
                  besthash_t *h;
                  uint32_t key;
                  int ret;
                  lib = bam_get_library(in->header, b);
                  q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
                  ++q->n_checked;
                  h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
                  key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
                  k = kh_put(best, h, key, &ret);
                  if (ret == 0) { // in the hash table
                        elem_t *p = kh_val(h, k);
                        ++q->n_removed;
                        if (p->score < score) {
                              if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
                                    p->discarded = 1;
                                    kh_val(h, k) = push_queue(queue, b, endpos, score);
                              } else { // replace
                                    p->score = score; p->endpos = endpos;
                                    bam_copy1(p->b, b);
                              }
                        } // otherwise, discard the alignment
                  } else kh_val(h, k) = push_queue(queue, b, endpos, score);
            }
      }
      dump_alignment(out, queue, MAX_POS, aux);

      for (k = kh_begin(aux); k != kh_end(aux); ++k) {
            if (kh_exist(aux, k)) {
                  lib_aux_t *q = &kh_val(aux, k);
                  fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
                              (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
                  kh_destroy(best, q->left); kh_destroy(best, q->rght);
                  free((char*)kh_key(aux, k));
            }
      }
      kh_destroy(lib, aux);
      bam_destroy1(b);
      kl_destroy(q, queue);
}

Generated by  Doxygen 1.6.0   Back to index