BLASTの結果をSAM形式に変換するスクリプト

BLAST(blastn)の結果を次世代シークエンサで使われるSAM形式に変換するスクリプトです.SAM toolsの中にも,blast2sam.plというperlスクリプトがあり,これのruby移植です.
少し拡張してあって,SAM toolsのblast2sam.plだと配列やquarity valueを出してくれないせいで,tabletなどのSAM/BAMを見られるviewerで読めないことがあります.以下のスクリプトだと,quarity scoreも(適当に)出力するので,tabletで読み込むことが可能です.
使い方は,blastn2sam.rb とすると

% blastn2sam.rb -t 1e-5 input_blast_result > output_sam_file

です.-t の後に示した値以下のE-valueを持つ結果だけsamに変換します.指定しないと全部変換します.
BLASTだとblastnしか対応していませんが,BLATのtblastx相当のオプションでBLAST出力した場合(-t=dnax -q=dnax -out=blast)もparseできます.

#!/usr/bin/env ruby

require 'optparse'

maxeval = 1.0
show_seq = true
opt = OptionParser.new
opt.on("-t", "--threshold [VALUE]", Float){|v| maxeval = v}
opt.parse!(ARGV)

def blast2sam(filename, maxeval, show_seq)
  fp = open(filename, "r")
  query_count = 1
  query_name = ""
  sam = {
    :qname=>'', :flag=>0x00, :rname=>'', :pos=>nil, 
    :mapq=>255, :cigar=>'', :nrnm=>'*', :mpos=>0, 
    :isize=>0, :seq=>'', :qual=>'', :as=>'', :ev=>''}
  cmaux = {:cur_op=>nil, :count=>0} # [0,0,0,'']
  cigar = Array.new
  qlen = 0
  qend = 0
  rname = ""
  query_next = false
  skip_size = 0  # number of spaces before alignment figure
  while line = fp.gets
    if cigar.size !=0 && (line =~ /^Query=/ || line =~ /Score =.*bits.*Expect/) 
      blast_print_sam(sam, cigar, cmaux, qlen - qend) if sam[:ev].to_f <= maxeval
      cigar = Array.new
    end
    if line =~ /^Query= (\S+)/
      query_name = $1
      query_count = 1
    elsif line =~ /\((\S+)\s+letters\)/
      qlen_str = $1
      qlen_str.gsub!(/,/, "")
      qlen = qlen_str.to_i
    elsif line =~ /^>(\S+)/
      rname = $1
    elsif line =~ /Length = (\d+)/
      slen = $1
    elsif line =~ /Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/
      as = ($1.to_f + 0.499).to_i
      ev = $3
      ev = "1#{ev}" if ev =~ /^e/
      sam[:flag] = 0x49
      sam[:isize] = 0
      sam[:as] = as
      sam[:ev] = ev
      sam[:seq] = ""
      sam[:qual] = ""
      sam[:pos] = 0
      qbeg = 0
      cigar = Array.new
      cmaux = {:cur_op=>nil, :count=>0} # [0,0,0,'']  # @cmaux = (0, 0, 0, '');
      sam[:qname] = "#{query_name}_n#{query_count}" 
      sam[:rname] = rname
      query_count += 1
    elsif line =~ /Strand = (\S+) \/ (\S+)/
      sam[:flag] |= 0x10 if $2 == "Minus"
    elsif line =~ /Query\:\s(\d+)(\s*)(\S+)\s(\d+)/
      start = $1.to_i
      skip_size = 7 + $1.size + $2.size
      q = $3
      if qbeg == 0
        qbeg = start
        cigar << "#{(start-1)}H" if start > 1
      end
      qend = $4.to_i
      if show_seq == true
        sam[:seq] += q.gsub(/-/, "")
      end
      query_next = true
    elsif query_next == true
      query_next = false
      line.chomp!
      align = ""
      0.upto(q.size) do |i|
        align += line[skip_size+i,1] unless q[i,1] == "-"
      end
      align.gsub!(" ", ";")
      sam[:qual] += align
    elsif line =~ /Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/
      s = $2
      if sam[:flag] & 0x10 != 0
        sam[:pos] = $3.to_i
      else
        sam[:pos] = $1.to_i if sam[:pos] == 0
      end
      cigar = aln2cm(cigar, q, s, cmaux)
    end
  end    
  blast_print_sam(sam, cigar, cmaux, qlen - qend) if sam[:ev].to_f <= maxeval
end

def blast_print_sam(sam, cigar, cmaux, qrest)
  cigar << cmaux[:count].to_s + cmaux[:cur_op].to_s
  if sam[:flag] & 0x10 != 0
    cigar = cigar.reverse
    sam[:seq] = sam[:seq].reverse unless sam[:seq].nil?
    sam[:seq].tr!("atgcrymkswATGCRYMKSW","tacgyrkmswTACGYRKMSW")
    sam[:qual] = sam[:qual].reverse unless sam[:qual] == "*"
  end
  sam[:seq] = "*" if sam[:seq].nil?
  sam[:cigar] = cigar.join("")

  puts [sam[:qname],sam[:flag],sam[:rname],sam[:pos], 
        sam[:mapq],sam[:cigar],sam[:nrnm],sam[:mpos], 
        sam[:isize], sam[:seq], sam[:qual], 
        "AS:i:#{sam[:as]}", "EV:Z:#{sam[:ev]}"].join("\t")
end

def aln2cm(cigar, q, s, cmaux)
  l = q.size
  op = nil
  0.upto(l-1) do |i|
    if q[i,1] == "-"
      op = :D
    elsif s[i,1] == "-"
      op = :I
    else
      op = :M
    end
    if cmaux[:cur_op] == op
      cmaux[:count] += 1
    else
      cigar << cmaux[:count].to_s + cmaux[:cur_op].to_s unless cmaux[:cur_op].nil?
      cmaux[:cur_op] = op
      cmaux[:count] = 1
    end
  end
  cigar
end

blast2sam(ARGV[0], maxeval, show_seq)