#!/usr/bin/perl -w use strict; my @a_pSam; # Previous sam line my @aa_tmplocus; # Current locus (tab of tab sam line) my @a_locus; # Tab of locus (ref\tstart\tend\tlength\tnbSamLine) my %h_locusAln; my ($deb, $end, $cmp); my $locusId = 0; # First mapped read (sam line) while (my $line=) { @a_pSam = split('\s+', $line); ($cmp, $deb, $end) = (1, $a_pSam[3], $a_pSam[3]+length($a_pSam[9])-1); last if($a_pSam[1] != 4); } while (my $line=) { chomp $line; my @a_sam = split('\s+',$line); next if($a_sam[1] == 4); my @a_tmp = @a_pSam; push @aa_tmplocus, [ @a_tmp ]; if($a_sam[2] eq $a_pSam[2] && $a_sam[3] <= $end) { $cmp++; $end = max($a_sam[3]+length($a_sam[9])-1, $end); } else { $locusId++; my $strand = $aa_tmplocus[0][1]; for (my $i=0; $i<=$#aa_tmplocus; $i++) { $strand = -1 if($aa_tmplocus[$i][1] ne $strand); my $lenshift = $aa_tmplocus[$i][3] - $deb; my $shift = ""; for (my $j=$deb; $j<$aa_tmplocus[$i][3]; $j++) { $shift .= "-"; } $aa_tmplocus[$i][9] = $shift.$aa_tmplocus[$i][9]; push @{$aa_tmplocus[$i]}, "XX:Z:locus_$locusId"; print STDERR join("\t",@{$aa_tmplocus[$i]})."\n"; } if($end-$deb+1<=38 && $cmp>0) { ##25 cmp>1 push(@a_locus, "locus_$locusId\t$strand\t$a_pSam[2]\t$deb\t$end\t".($end-$deb+1)."\tisoform:$cmp"); for (my $i=0; $i<=$#aa_tmplocus; $i++) { $h_locusAln{"locus_$locusId"} .= "$aa_tmplocus[$i][0]\t$aa_tmplocus[$i][9]\n"; } } print STDERR "##locus_$locusId\t$strand\t$a_pSam[2]\t$deb\t$end\t".($end-$deb+1)."\tisoform:$cmp\n"; $cmp = 1; $deb = $a_sam[3]; $end = $a_sam[3]+length($a_sam[9])-1; @aa_tmplocus = (); } @a_pSam = @a_sam; } my $link = 0; my $refseq = ""; my @a_aln; my @a_locusBeg; my @a_locusEnd; my @a_locusDescr; my @a_miRNADescr; for (my $i=0; $i<=$#a_locus; $i++) { print "##\nlocus\t" if($link == 0); my @a_curr = split('\s+', $a_locus[$i]); my @a_next = split('\s+', $a_locus[$i+1]); if($a_curr[2] eq $a_next[2] && $a_curr[4]+30 > $a_next[3] && $a_curr[4]+4 < $a_next[3]) { if($link == 0) { push @a_locusDescr, "$a_locus[$i]"; push @a_aln, [ split('\n', $h_locusAln{$a_curr[0]}) ]; push @a_locusBeg, $a_curr[3]; push @a_locusEnd, $a_curr[4]; $link++; } push @a_locusDescr, "$a_locus[$i+1]"; push @a_aln, [ split('\n', $h_locusAln{$a_next[0]}) ]; push @a_locusBeg, $a_next[3]; push @a_locusEnd, $a_next[4]; $link++; $refseq = $a_next[2]; } else { if($link == 0) { push @a_locusDescr, "$a_locus[$i]"; push @a_aln, [ split('\n', $h_locusAln{$a_curr[0]}) ]; push @a_locusBeg, max(1, $a_curr[3]-50); push @a_locusEnd, $a_curr[4]+50; $link++; $refseq = $a_curr[2]; # Consensus (extractseq) and folding (RNAfold) my $exs = extractseq($refseq, max(1,$a_curr[3]-50), $a_curr[4]); my $fol = fastaOneLiner($exs); my $rfo = rnafold($fol); my ($rfoScore) = ( $rfo =~ /([+-]?\d+\.\d+)/ ); push @a_aln, [ split('\n', $rfo) ]; push @a_locusBeg, max(1, $a_curr[3]-50); push @a_locusEnd, $a_curr[4]; push @a_miRNADescr, "miRNA-miRNA*\t$refseq\t".max(1,$a_curr[3]-50)."\t$a_curr[4]". "\tfoldScore:$rfoScore\tfoldFlag:".computeRnafoldFlag($rfo); $exs = extractseq($refseq, $a_curr[3], $a_curr[4]+50); $fol = fastaOneLiner($exs); $rfo = rnafold($fol); ($rfoScore) = ( $rfo =~ /([+-]?\d+\.\d+)/ ); push @a_aln, [ split('\n', $rfo) ]; push @a_locusBeg, $a_curr[3]; push @a_locusEnd, $a_curr[4]+50; push @a_miRNADescr, "miRNA-miRNA*\t$refseq\t$a_curr[3]\t".($a_curr[4]+50). "\tfoldScore:$rfoScore\tfoldFlag:".computeRnafoldFlag($rfo); } else { for(my $x=0; $x<=($link-2); $x++) { # Consensus (extractseq) and folding (RNAfold) my $exs = extractseq($refseq, $a_locusBeg[0+$x], $a_locusEnd[1+$x]); my $fol = fastaOneLiner($exs); my $rfo = rnafold($fol); my ($rfoScore1) = ( $rfo =~ /([+-]?\d+\.\d+)/ ); my $rfoFlag1 = computeRnafoldFlag($rfo); push @a_aln, [ split('\n', $rfo) ]; push @a_locusBeg, $a_locusBeg[0+$x]; push @a_locusEnd, $a_locusEnd[1+$x]; # Folding with N in the loop my $l1 = $a_locusEnd[0+$x] - $a_locusBeg[0+$x] + 1; my $loop = $a_locusBeg[1+$x] - $a_locusEnd[0+$x] - 1; $fol =~ s/(.{$l1})([ATCGNatcgn]{$loop})(.*)$/$1.("N" x length($2)).$3/e; $rfo = rnafold($fol); my ($rfoScore2) = ( $rfo =~ /([+-]?\d+\.\d+)/ ); my $rfoFlag2 = computeRnafoldFlag($rfo); push @a_aln, [ split('\n', $rfo) ]; push @a_locusBeg, $a_locusBeg[0+$x]; push @a_locusEnd, $a_locusEnd[1+$x]; # Align miRNA vs miRNA* my $needleScore = 0; my @a_fol = split('\n', $fol); my @a_mirna = split('NNNN+', $a_fol[1]); ## Min 4 N for the loop if($#a_mirna == 1) { ## If only mirna + mirna* (one loop) my $revcmp = reverse($a_mirna[1]); $revcmp =~ tr/ACGTacgt/TGCAtgca/; open(TMP, ">sam2locus.$$.needle1") || die "Enable to create sam2locus.$$.needle1 file"; print TMP ">miRNA_locus1\n".$a_mirna[0]; close TMP; open(TMP, ">sam2locus.$$.needle2") || die "Enable to create sam2locus.$$.needle2 file"; print TMP ">miRNA_locus2\n".$revcmp; close TMP; `needle sam2locus.$$.needle1 sam2locus.$$.needle2 -gapopen 10 -gapextend 0.5 -outfile sam2locus.$$.needle 2> /dev/null`; open(NEEDLE, "sam2locus.$$.needle") || die "Enable to open sam2locus.$$.needle output file"; my $needleOutput = ""; $needleScore = 0; while (my $line=) { chomp $line; if($line =~ /^# Score:/) { $needleScore = $line; $needleScore =~ s/# Score:\s+//; } if($line =~ /^miRNA_locus1/) { $line =~ s/miRNA_locus1\s+/>miRNA_locus1\($needleScore\)\t/; $needleOutput = "$line \n"; } if($line =~ /^miRNA_locus2/) { $line =~ s/miRNA_locus2\s+/>miRNA_locus2\($needleScore\)\t/; $needleOutput .= "$line \n"; } } close NEEDLE; unlink("sam2locus.$$.needle1"); unlink("sam2locus.$$.needle2"); unlink("sam2locus.$$.needle"); push @a_aln, [ split('\n', $needleOutput) ]; push @a_locusBeg, $a_locusBeg[0+$x]; push @a_locusEnd, $a_locusEnd[1+$x]; } push @a_miRNADescr, "miRNA-miRNA*\t$refseq\t".$a_locusBeg[0+$x]."\t".$a_locusEnd[1+$x]. "\tfoldScore:$rfoScore1\tfoldFlag:$rfoFlag1". "\tNfoldScore:$rfoScore2\tNfoldFlag:$rfoFlag2". "\tneedleScore:$needleScore"; } } print "$a_curr[2]\t$a_locusBeg[0]\t$a_locusEnd[-1]\tlocusNumber:$link\n"; foreach my $descr (@a_locusDescr) { print "$descr\n"; } foreach my $descr (@a_miRNADescr) { print "$descr\n"; } print "#\n"; my $maxName = 0; for (my $j=0; $j<=$#a_aln; $j++) { for (my $k=0; $k<=$#{$a_aln[$j]}; $k++) { my @a_tmp = split('\t', $a_aln[$j][$k]); $maxName = max(length($a_tmp[0]), $maxName); } } for (my $j=0; $j<=$#a_aln; $j++) { for (my $k=0; $k<=$#{$a_aln[$j]}; $k++) { my @a_tmp = split('\t', $a_aln[$j][$k]); for (my $l=length($a_tmp[0]); $l<$maxName; $l++) { $a_tmp[0] .= " "; } if($link == 1 && $j<=$#a_aln-2) { for (my $l=0; $l<50; $l++) { $a_tmp[1] = "-".$a_tmp[1]; } } for (my $l=$a_locusBeg[0]; $l<$a_locusBeg[$j]; $l++) { $a_tmp[1] = "-".$a_tmp[1]; } for (my $l=length($a_tmp[1]); $l<$a_locusEnd[-1]-$a_locusBeg[0]+1; $l++) { $a_tmp[1] .= "-"; } print "$a_tmp[0] $a_tmp[1]\n"; } print "#\n" if($j<$#a_aln); } $link = 0; $refseq = ""; @a_aln = (); @a_locusBeg = (); @a_locusEnd = (); @a_locusDescr = (); @a_miRNADescr = (); } } sub max { if ($_[0] > $_[1]) { $_[0]; } else{ $_[1]; } } sub min { if ($_[0] < $_[1]) { $_[0]; } else{ $_[1]; } } sub extractseq { my ($refseq, $beg, $end) = @_; return `extractseq V4_454Scaffolds.fna:$refseq -regions $beg-$end -outseq stdout 2> /dev/null`; } sub fastaOneLiner { my ($fasta) = @_; my @a_lines = split(/\n/, $fasta); my $ac = shift @a_lines; $ac =~ /^(\S+)/; my $r = "$1\n".join('', @a_lines); return $r; } sub rnafold { my ($seq) = @_; ## 3 lines in RNAfold output (1:header fasta - 2:seq - 3:(((...())[...] (nrj)) my @a_res = split ('\n', `perl -e '{print "$seq"}' | RNAfold -noPS -noconv`); $a_res[2] =~ s/\( +/\(/; my @a_stru = split (' ', $a_res[2]); return "$a_res[0]\t$a_res[1]\n$a_res[0]$a_stru[1]\t$a_stru[0]\n"; } sub extractseq_rnafold { my ($refseq, $beg, $end) = @_; my $cmd_extractseq = "extractseq V4_454Scaffolds.fna:$refseq -regions $beg-$end -outseq stdout"; my $cmd_rnafold = "RNAfold -noPS -noconv"; ## 3 lines in RNAfold output (1:header fasta - 2:seq - 3:(((...())[...] (nrj)) my @a_res = split ('\n', `$cmd_extractseq | ./fastaOneliner.pl | $cmd_rnafold`); $a_res[2] =~ s/\( /\(/; my @a_stru = split (' ', $a_res[2]); return "$a_res[0]\t$a_res[1]\n$a_res[0]$a_stru[1]\t$a_stru[0]\n"; } sub computeRnafoldFlag { my ($rfo) = @_; my ($fold) = ($rfo =~ /\s+([\(\)\.]{10,})\s+/); if(!defined $fold) {print "###$rfo\n";exit();} return 0 if($fold =~ /^\.+$/); my ($fold1,$loop,$fold2) = ($fold =~ /^([\(\.]+\()(.+?)(\)[\)\.]+)$/); return 0 if(!defined $fold1 || !defined $loop || !defined $fold2); my $loopSize = length($loop); $loop =~ s/\.//g; if($loopSize <= 30 && length($loop) <= 4 && $fold1 !~ /\.{5,}/ && $fold2 !~ /\.{5,}/) { return 1; } return 0; } ### TODO : ### 1 Attention les deux mirna mirna* doivent etre sur le meme brin !!! => flag -1 ### 2 Faire descendre les comptages d'occurence ### 3 Produire un alignement avec les deux mirna et mirna* => ok ### 4 extraire la réf de la zone (avec contexte) ! ### splitfasta avec fastaexplode ### puis extractseq... # #foreach i (*locus) #grep "###" -c $i #end #foreach i ( *locus.stderr ) #grep -c "##locu" $i #end #foreach i (*locus) #grep "###" $i | grep -c "\-1" #end #foreach i (*locus) #grep "###" $i | grep -cPv '###2\n' #end #grep "###" s_6_uT25.fastq.cuta.nr.sort.bam.NM0_NoXA.redon.locus | perl -lane '{`extractseq V4_454Scaffolds.fna:$F[2].fa -regions $F[3]-$F[11] -outseq stdout | tee $F[2]_$F[3]-$F[11].fa | RNAfold -noPS -noconv >> $F[2]_$F[3]-$F[11].fa`}' #perl -lne '{if($_ =~ /^\>/){print "$_"} else { $seq.=$_ if($_ =~ /^[ATCGNatcgn]+$/); if($_=~/^[\.\(\)]+ /) {$_ =~s/ (.+)//; $str.=$_}; print "$seq\n$str" if ( eof ) ;} }' scaffold58014_1652-1717.fa # Banque Rfam 10.0 : /bank/sanger/rfam/current/blast/