BLASTの結果をSAM形式に変換するスクリプト
BLAST(blastn)の結果を次世代シークエンサで使われるSAM形式に変換するスクリプトです.SAM toolsの中にも,blast2sam.plというperlスクリプトがあり,これのruby移植です.
少し拡張してあって,SAM toolsのblast2sam.plだと配列やquarity valueを出してくれないせいで,tabletなどのSAM/BAMを見られるviewerで読めないことがあります.以下のスクリプトだと,quarity scoreも(適当に)出力するので,tabletで読み込むことが可能です.
使い方は,blastn2sam.rb とすると
% blastn2sam.rb -t 1e-5 input_blast_result > output_sam_file
です.-t の後に示した値以下のE-valueを持つ結果だけsamに変換します.指定しないと全部変換します.
BLASTだとblastnしか対応していませんが,BLATのtblastx相当のオプションでBLAST出力した場合(-t=dnax -q=dnax -out=blast)もparseできます.
#!/usr/bin/env ruby require 'optparse' maxeval = 1.0 show_seq = true opt = OptionParser.new opt.on("-t", "--threshold [VALUE]", Float){|v| maxeval = v} opt.parse!(ARGV) def blast2sam(filename, maxeval, show_seq) fp = open(filename, "r") query_count = 1 query_name = "" sam = { :qname=>'', :flag=>0x00, :rname=>'', :pos=>nil, :mapq=>255, :cigar=>'', :nrnm=>'*', :mpos=>0, :isize=>0, :seq=>'', :qual=>'', :as=>'', :ev=>''} cmaux = {:cur_op=>nil, :count=>0} # [0,0,0,''] cigar = Array.new qlen = 0 qend = 0 rname = "" query_next = false skip_size = 0 # number of spaces before alignment figure while line = fp.gets if cigar.size !=0 && (line =~ /^Query=/ || line =~ /Score =.*bits.*Expect/) blast_print_sam(sam, cigar, cmaux, qlen - qend) if sam[:ev].to_f <= maxeval cigar = Array.new end if line =~ /^Query= (\S+)/ query_name = $1 query_count = 1 elsif line =~ /\((\S+)\s+letters\)/ qlen_str = $1 qlen_str.gsub!(/,/, "") qlen = qlen_str.to_i elsif line =~ /^>(\S+)/ rname = $1 elsif line =~ /Length = (\d+)/ slen = $1 elsif line =~ /Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/ as = ($1.to_f + 0.499).to_i ev = $3 ev = "1#{ev}" if ev =~ /^e/ sam[:flag] = 0x49 sam[:isize] = 0 sam[:as] = as sam[:ev] = ev sam[:seq] = "" sam[:qual] = "" sam[:pos] = 0 qbeg = 0 cigar = Array.new cmaux = {:cur_op=>nil, :count=>0} # [0,0,0,''] # @cmaux = (0, 0, 0, ''); sam[:qname] = "#{query_name}_n#{query_count}" sam[:rname] = rname query_count += 1 elsif line =~ /Strand = (\S+) \/ (\S+)/ sam[:flag] |= 0x10 if $2 == "Minus" elsif line =~ /Query\:\s(\d+)(\s*)(\S+)\s(\d+)/ start = $1.to_i skip_size = 7 + $1.size + $2.size q = $3 if qbeg == 0 qbeg = start cigar << "#{(start-1)}H" if start > 1 end qend = $4.to_i if show_seq == true sam[:seq] += q.gsub(/-/, "") end query_next = true elsif query_next == true query_next = false line.chomp! align = "" 0.upto(q.size) do |i| align += line[skip_size+i,1] unless q[i,1] == "-" end align.gsub!(" ", ";") sam[:qual] += align elsif line =~ /Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/ s = $2 if sam[:flag] & 0x10 != 0 sam[:pos] = $3.to_i else sam[:pos] = $1.to_i if sam[:pos] == 0 end cigar = aln2cm(cigar, q, s, cmaux) end end blast_print_sam(sam, cigar, cmaux, qlen - qend) if sam[:ev].to_f <= maxeval end def blast_print_sam(sam, cigar, cmaux, qrest) cigar << cmaux[:count].to_s + cmaux[:cur_op].to_s if sam[:flag] & 0x10 != 0 cigar = cigar.reverse sam[:seq] = sam[:seq].reverse unless sam[:seq].nil? sam[:seq].tr!("atgcrymkswATGCRYMKSW","tacgyrkmswTACGYRKMSW") sam[:qual] = sam[:qual].reverse unless sam[:qual] == "*" end sam[:seq] = "*" if sam[:seq].nil? sam[:cigar] = cigar.join("") puts [sam[:qname],sam[:flag],sam[:rname],sam[:pos], sam[:mapq],sam[:cigar],sam[:nrnm],sam[:mpos], sam[:isize], sam[:seq], sam[:qual], "AS:i:#{sam[:as]}", "EV:Z:#{sam[:ev]}"].join("\t") end def aln2cm(cigar, q, s, cmaux) l = q.size op = nil 0.upto(l-1) do |i| if q[i,1] == "-" op = :D elsif s[i,1] == "-" op = :I else op = :M end if cmaux[:cur_op] == op cmaux[:count] += 1 else cigar << cmaux[:count].to_s + cmaux[:cur_op].to_s unless cmaux[:cur_op].nil? cmaux[:cur_op] = op cmaux[:count] = 1 end end cigar end blast2sam(ARGV[0], maxeval, show_seq)