# $Id: FASTAgrammar.pm,v 1.17 2002/10/31 20:54:44 achaudha Exp $ # Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). # All rights reserved. # # This software is provided "AS IS". TIGR makes no warranties, express or # implied, including no representation or warranty with respect to the # performance of the software and derivatives or their safety, # effectiveness, or commercial viability. TIGR does not warrant the # merchantability or fitness of the software and derivatives for any # particular purpose, or that they may be exploited without infringing the # copyrights, patent rights or property rights of others. # # This software program may not be sold, leased, transferred, exported or # otherwise disclaimed to anyone, in whole or in part, without the prior # written consent of TIGR. package TIGR::FASTAgrammar; { =head1 NAME FASTAgrammar - module for validating FASTA format records =head1 SYNOPSIS use TIGR::FASTAgrammar ':public'; $header = FASTA header here... $data = FASTA data here... $return_value = isValidFASTARecord($header, $data); ... =head1 DESCRIPTION This module provides functions for verifying compliance with TIGR's FASTA file and record definitions. =cut BEGIN { require 5.006_00; } use strict; require Exporter; ## internal variables and identifiers our @ISA = qw(Exporter); our $REVISION = (qw$Revision: 1.17 $)[-1]; our $VERSION = '1.2'; our $VERSION_STRING = "$VERSION (Build $REVISION)"; our @DEPEND = (); ## Export methods our %EXPORT_TAGS = ( 'public' => [ qw( isValidFASTArecord isValidFASTAheader isValidFASTAdata isValidFASTAlineLength setValidFASTAlineLength ) ] , 'private' => [ qw( _headerToIdentifier _isNucleotideData _isPeptideData ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'public'} }, @{ $EXPORT_TAGS{'private'} } ); ## data structures # IUPAC extended codes acceptable for sequence data # see WU-BLAST format on http://tigrblast.tigr.org/html/fasta.html our $NA_IUPAC_CODES = 'ATUGCMRWSYKVHDBN\.\-'; our $AA_IUPAC_CODES = 'ARNDBCQEZGHILKMFPSTUWXYVJ\*\-'; # FASTA parameters our $FASTA_SEPARATOR = '^>'; our $UNBOUND_FASTA_SEPARATOR = '>'; # note: BLAST can accept 80 bases per line; this just emits a warning our $OUTPUT_LINE_LENGTH = 60; my $RECORD_LINE_LENGTH = 0; # the TIGR FASTA header parse our $FASTA_HEADER_PARSE = '^>([[:graph:]]+)(?: +[ \ca[:graph:]]*$| *$)'; ## prototypes sub isValidFASTArecord(@); sub isValidFASTAheader($); sub isValidFASTAdata($); sub isValidFASTAlineLength($); sub setValidFASTAlineLength($); sub _isNucleotideData($); sub _isPeptideData($); sub _headerToIdentifier($); ## implementation =over =item $result = isValidFASTArecord(@record_defn); This method determines if a FASTA record, C<@record_defn>, fits the TIGR definition for a FASTA record. C<@record_defn> is an array of lines over which the record is defined. The first line should be the FASTA header, and subsequent lines the data definition. This method checks line width, character composition, and header format. If the record parses correctly, this method returns 1. Otherwise, this method returns 0. =cut sub isValidFASTArecord(@) { my $header = shift; my @data_lines = @_; my $valid_flag = 0; my $first_line_flag = 0; my $first_len = 0; # check conformance of header $valid_flag = isValidFASTAheader($header); # check conformance of data if ( ( $valid_flag != 0 ) && ( scalar(@data_lines) > 0 ) ) { my $data_scalar = join "", @data_lines; $data_scalar =~ s/\n//g; # extract new lines from scalar data $valid_flag = isValidFASTAdata($data_scalar); } # check conformance of line length while ( ( $valid_flag != 0 ) && ( defined ( my $line = shift @data_lines ) ) ) { chomp $line; if($first_line_flag == 0) { $first_len = setValidFASTAlineLength($line); $first_line_flag = 1; } if(defined($first_len)) { my $line_len_flag = isValidFASTAlineLength($line); if ( ( $line_len_flag > 0 ) || ( ( $line_len_flag < 0 ) && ( $#data_lines != -1 ) ) ) { $valid_flag = 0; } } } return $valid_flag; } =item $result = isValidFASTAheader($header); This method determines if the FASTA header description, C<$header>, is a valid description. It checks for a leading carot symbol and trailing non white space characters. Any number of white space characters may be interleaved throughout the text portion of the header, with the exception that there may be no space between the carot and the first word. If the header is valid, this method returns 1. Otherwise, this method returns 0. =cut sub isValidFASTAheader($) { my $header = shift; my $return_val = 0; my $identifier = undef; if ( ( defined ($header) ) && ( ($identifier) = $header =~ /$FASTA_HEADER_PARSE/ ) ) { if((defined $identifier) && ($identifier !~ /\//)) { $return_val = 1; } else { $return_val = 0; } } else { $return_val = 0; } return $return_val; } =item $result = isValidFASTAdata($data_def); This method takes the scalar data definition of a FASTA record, C<$data_def>. It tests the data and returns 1 if the data conforms to nucleotide data or if it conforms to peptide data or both. If the data is not recognizable or is undefined, it returns 0. =cut sub isValidFASTAdata($) { my $data_definition = shift; my $return_val = undef; if(($data_definition =~ /^[$NA_IUPAC_CODES]+$/i) || ($data_definition =~ /^[$AA_IUPAC_CODES]+$/i)) { $return_val = 1; } else { $return_val = 0; } } =item $result = isValidFASTAlineLength($line); This method returns -1 if the data line, C<$line> is less than the TIGR definition requirement for line length, 0 if the data line meets the TIGR definition requirement for line length, and 1 if the data line is greater than the TIGR definition requirement for line length. =cut sub isValidFASTAlineLength($) { my $line = shift; my $line_len = undef; my $return_val = undef; if ( defined ($line) ) { chomp $line; $line_len = length($line); if ( $line_len > $RECORD_LINE_LENGTH ) { $return_val = 1; } elsif ( $line_len < $RECORD_LINE_LENGTH ) { $return_val = -1; } else { $return_val = 0; } } } =item $result = setValidFASTAlineLength($); This method takes in the first data line in the data portion of a FASTA record. The function returns the length of this line if it is positive. This length determines the line length for all the data lines following this first line. The function returns undefined if unsuccessful. =cut sub setValidFASTAlineLength($) { my $line = shift; my $line_len = undef; my $ret_len = undef; if(defined ($line)) { chomp $line; $line_len = length($line); if($line_len > 0) { $ret_len = $line_len ; } } $RECORD_LINE_LENGTH = $ret_len; return $ret_len; } # $result = _isNucleotideData($data_def); #This method takes the scalar data definition of a FASTA record, C<$data_def>. #It tests it for conformance to a nucleotide data type. If the data are #nucleotide IUPAC characters, this method returns 1. If not, this method #returns 0. This method returns 0 if C<$data_def> is undefined. sub _isNucleotideData($) { my $data_def = shift; my $return_val = undef; if ( ( defined ( $data_def ) ) && ( $data_def =~ /^[$NA_IUPAC_CODES]+$/i ) ) { $return_val = 1; } else { $return_val = 0; } return $return_val; } # $result = _isPeptideData($data_def); #This method takes the scalar data definition of a FASTA record, C<$data_def>. #It tests it for conformance to a peptide data type. If the data are #peptide IUPAC characters, this method returns 1. If not, this method returns #zero. This method returns undefined if C<$data_def> is undefined. sub _isPeptideData($) { my $data_def = shift; my $return_val = undef; if ( ( defined ( $data_def ) ) && ( $data_def =~ /^[$AA_IUPAC_CODES]+$/i ) ) { $return_val = 1; } else { $return_val = 0; } return $return_val; } # $identifier = _headerToIdentifier($header); #This function takes a FASTA header as a parameter, returning a parsed #identifier for the record. If the supplied header is invalid or undefined, #this method returns undefined. sub _headerToIdentifier($) { my $header = shift; my $identifier = undef; my $return_val = undef; if ( ( defined ($header) ) && ( ($identifier) = $header =~ /$FASTA_HEADER_PARSE/ ) && ( defined ($identifier) ) ) { if($identifier =~ /\//) { $return_val = undef; } else { $return_val = $identifier; } } else { $return_val = undef; } return $return_val; } =back =head1 USAGE This module is not intended for developer use. Instead, use the front end modules C and C. =cut } 1;