# $Id: FASTArecord.pm,v 1.17 2002/10/31 21:13:23 achaudha Exp $ # Copyright @ 2002 - 2010 The Institute for Genomic Research (TIGR). # All rights reserved. # # This software is provided "AS IS". TIGR makes no warranties, express or # implied, including no representation or warranty with respect to the # performance of the software and derivatives or their safety, # effectiveness, or commercial viability. TIGR does not warrant the # merchantability or fitness of the software and derivatives for any # particular purpose, or that they may be exploited without infringing the # copyrights, patent rights or property rights of others. # # This software program may not be sold, leased, transferred, exported or # otherwise disclaimed to anyone, in whole or in part, without the prior # written consent of TIGR. package TIGR::FASTArecord; { =head1 NAME TIGR::FASTArecord - TIGR::FASTArecord class describing FASTA records =head1 SYNOPSIS use TIGR::FASTArecord; my $obj_instance = new TIGR::FASTArecord ($record_header, $record_data); =head1 DESCRIPTION This module provides an object definition for a FASTA record. It verifies data entry on creation, and returns information queried on the record. =cut BEGIN { require 5.006_00; } use strict; use TIGR::FASTAgrammar ':public'; use TIGR::FASTAgrammar ':private'; ## external variables my $OUTPUT_LINE_LENGTH_REF = \$TIGR::FASTAgrammar::OUTPUT_LINE_LENGTH; my $UNBOUND_FASTA_SEPARATOR = $TIGR::FASTAgrammar::UNBOUND_FASTA_SEPARATOR; ## internal variables and identifiers our $REVISION = (qw$Revision: 1.17 $)[-1]; our $VERSION = '1.2'; our $VERSION_STRING = "$VERSION (Build $REVISION)"; our @DEPEND = (); ## prototypes sub new($$); sub equals($); sub getHeader(); sub getIdentifier(); sub getData(); sub size(); sub toString(); sub reverseComplement($); sub reverseComplementData(); sub subSequence($;$); sub unGap(); ## implementation =over =item $obj_instance = new TIGR::FASTArecord ($header, $data_rec); This method returns a new instance of a TIGR::FASTArecord object. It takes a record header, C<$header>, and the record data, C<$data_rec> as parameters. The record header may optionally contain the leading header character, a C<> symbol. Both parameters are parsed. A new object instance is returned on success. If parsing fails or a record cannot be created, this method returns undefined. =cut sub new($$) { my $pkg = shift; my $header = shift; my $data_rec = shift; my $self = undef; my $identifier = undef; if ( ( defined ($header) ) && ( $header =~ s[^($UNBOUND_FASTA_SEPARATOR){0,1}(.*)] # accept no [$UNBOUND_FASTA_SEPARATOR$2] ) && # separator ( defined ($data_rec) ) && ( defined($identifier = _headerToIdentifier($header)) ) && # # parsable id. ( isValidFASTAdata($data_rec) != 0 ) ) { # valid data? $self = {}; bless $self, $pkg; chomp($header); chomp($data_rec); $self->{header} = $header; $self->{identifier} = $identifier; $self->{data_rec} = $data_rec; $self->{rec_size} = length($data_rec); # compute size } return $self; } =item $result = $obj_instance->equals($fasta_obj); This method takes a I object as a parameter. It compares the passed object with the calling object's internal structures to test for equality. If the two objects are equal, this method returns true (1). Otherwise, this method returns false (undefined). =cut sub equals($) { my $self = shift; my $other_obj = shift; my $return_val = undef; if ( ( defined ($other_obj) ) && ( $self->toString() eq $other_obj->toString() ) ) { $return_val = 1; } else { $return_val = undef; } return $return_val; } =item $identifier = $obj_instance->getHeader(); This method returns the header string for the record. =cut sub getHeader() { my $self = shift; my $header = $self->{header}; # should always be defined if (! isValidFASTAheader($header) ) { # need to prepend an $UNBOUND_FASTA_SEPARATOR $header = $UNBOUND_FASTA_SEPARATOR . $header; } return $header; } =item $identifier = $obj_instance->getIdentifier(); This method returns the identifier string for the record. =cut sub getIdentifier() { my $self = shift; return $self->{identifier}; # should always be defined } =item $data_contents = $obj_instance->getData(); This method returns the data contents of the record as an uninterrupted string. =cut sub getData() { my $self = shift; return $self->{data_rec}; # should always be defined } =item $size_of_rec = $obj_instance->size(); This method returns the size of the data string contained by the record. =cut sub size() { my $self = shift; return $self->{rec_size}; # should always be defined } =item $str_rep = $obj_instance->toString(); This method returns a string representation of the FASTA record. This string representation conforms to the TIGR definition of a FASTA record. =cut sub toString() { my $self = shift; my $data_copy = $self->{data_rec}; my $str_rep = $self->getHeader() . "\n"; my $seg = undef; my @segments = (); my $last_seg = undef; my $skip = undef; while((defined($seg = substr $data_copy, 0, $$OUTPUT_LINE_LENGTH_REF,'')) && ($seg ne '')) { $str_rep .= $seg . "\n"; } return $str_rep; } =item $rev_compl_str = $obj_instance->reverseComplement($NA_strand); This method takes in a string which represents a Nucleotide strand and returns the reverse complement of the strand. If the string does not represent a Nucleotide strand or is undefined, the method returns undefined. When an empty string is passed into the method, we get an empty string on return. =cut sub reverseComplement($) { my $self = shift; my $NA_strand = shift; my $error_condition = 1; my $rev_compl_str = undef; my $validData = 0; if((defined ($NA_strand)) && ($validData = _isNucleotideData($NA_strand)) && ($validData == 1)) { $rev_compl_str = ''; # taking the complement of the sequence if(!defined( $NA_strand =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbXxNn.-/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvXxNn.-/)) { $error_condition = undef; } # taking the reverse of the complemented sequence elsif(!defined($rev_compl_str = reverse($NA_strand))) { $error_condition = undef; } } else { $error_condition = undef; } return ( defined ( $error_condition ) ) ? $rev_compl_str : undef; } =item $rev_compl_str = $obj_instance->reverseComplementData(); This method returns the reverse complement of the FASTA record data. If the FASTA record data does not represent a Nucleotide strand or is undefined, the method returns undefined. =cut sub reverseComplementData() { my $self = shift; my $NA_strand = $self->{data_rec}; my $error_condition = 1; my $rev_compl_str = undef; my $validData = 0; if((defined ($NA_strand)) && ($validData = _isNucleotideData($NA_strand)) && ($validData == 1)) { $rev_compl_str = ''; # taking the complement of the sequence if(!defined( $NA_strand =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBbXxNn.-/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVvXxNn.-/)) { $error_condition = undef; } # taking the reverse of the complemented sequence elsif(!defined($rev_compl_str = reverse($NA_strand))) { $error_condition = undef; } } else { $error_condition = undef; } return ( defined ( $error_condition ) ) ? $rev_compl_str : undef; } =item $data_substr = $obj_instance->subSequence($startpos, $length); This method behaves like substr(). This method takes in two numbers $startpos and $length and returns a substring of the record data. The $startpos for the first base in the data is 0. The $length is optional. The substring is extracted starting at $startpos characters from the front of the data string. If $startpos is negative, the substring starts that far from the end of the string instead. If $length is omitted, everything to the end of the string is returned. If $length is negative, the length is calculated to leave that many characters off the end of the string. Otherwise, $length indicates the length of the substring to extract. If $startpos is undefined the function returns undefined. If either $startpos or $length are greater than the data length, the method returns undefined =cut sub subSequence($;$) { my $self = shift; my ($startpos, $length) = @_; my $data_str = $self->getData(); my $data_substr = undef; my $strlen = length($data_str); if((defined ($data_str)) && (defined ($startpos)) && (defined ($length)) && (abs($startpos) < $strlen) && (abs($length) <= $strlen)) { #both parameters are specified $data_substr = substr($data_str, $startpos, $length); } elsif((defined ($data_str)) && (defined ($startpos)) && (!defined ($length)) && (abs($startpos) < $strlen) ) { #only one parameter is specified. $data_substr = substr($data_str, $startpos); } else { #either $data_str is undefined or $startpos is undefined so do #nothing. } return $data_substr; } =item $result = $obj_instance->unGap(); This method removes all the gaps('-' characters) from the record data. The record size is changed after the gaps are removed. The method returns 1 on success and undefined otherwise. =cut sub unGap() { my $self = shift; my $data_str = $self->getData(); my $result = 1; if((defined ($data_str)) && ($data_str =~ /[-]+/)) { #removing the gaps in the data. $data_str =~ s/[-]+//g; $self->{data_rec} = $data_str; $self->{rec_size} = length($data_str); } elsif(!defined ($data_str)) { $result = undef; } return $result; } =back =head1 USAGE To use this module, load it using the C function. The object must be initialized with a header and a data string. An example follows. Please refer to the C package usage for more examples. #!/usr/local/bin/perl -w use strict; use TIGR::FASTArecord; MAIN: { # set up a simple example without using the leading carot. my $header1 = "ORF00001 The first ORF in the series"; my $data1 = "MEEISTPEGGVLVPISIETEVKRAYIDYSMSVIVSRALPDVRDGLKPVHRRILYAMEEKG" . "LRFSGPTRKCAKIVGDVLGSFHPHGDASVYDALVRLGQDFSLRYPVIHPQGNFGTIGGDP" . "PAAYRYTEAKMARIAESMVEDIKKETVSFVPNFDDSDVEPTVLPGRFPFLLANGSSGIAV" . "GMTTNMPPHNLREIAAAISAYIENPNLSIQELCDCINGPDFPTGGIIFGKNGIRQSYETG" . "RGKIVVRARFTIETDSKGRDTIIFTEVPYQVNTTMLVMRIGELARAKVIEGIANVNDETS" . "DRTGLRIVVELKKGTPAQVVLNHLFAKTPLQSSFNVINLALVEGRPRMLTLKDLVRYFVE" . "HRVDVVTRRAHFELRKAQERIHLVRALIRALDAIDKIITLIRHSQNTELAKQRLREQFDF" . "DNVQAQAIVDMQMKRLTGLEVESLRTELKDLTELISSLEELLTSPQKVLGVVKKETRDIA" . "DMFGDDRRTDIVSNEIEYLDVEDFIQKEEMVILISHLGYIKRVPVSAYRNQNRGGKGSSS" . "ANLAAHDFISQIFTASTHDYVMFVTSRGRAYWLKVYGIPESGRANRGSHIKSLLMVATDE" . "EITAIVSLREFSNKSYVFMATARGVVKKVTTDNFVNAKTRGIIALKLSGGDTLVSAVLVQ" . "DEDEVMLITRQGKALRMSGREVREMGRNSSGVIGIKLTSEDLVAGVLRVSEQRKVLIMTE" . "NGYGKRVSFSEFSVHGRGTAGQKIYTQTDRKGAIIGALAVLDTDECMCITGQGKTIRVDV" . "CAISVLGRGAQGVRVLDIEPSDLVVGLSCVMQG"; my $fasta_record = new TIGR::FASTArecord $header1, $data1; if ( defined ( $fasta_record ) ) { print STDOUT "Sequence " . $fasta_record->getIdentifier() . " is " . $fasta_record->size() . " residues.\n"; print STDOUT $fasta_record->toString(); } else { die "Invalid FASTA record 1"; } # but this header is also valid. my $header2 = ">ORF00001 The first ORF in the series"; my $data2 = "MEEISTPEGGVLVPISIETEVKRAYIDYSMSVIVSRALPDVRDGLKPVHRRILYAMEEKG" . "LRFSGPTRKCAKIVGDVLGSFHPHGDASVYDALVRLGQDFSLRYPVIHPQGNFGTIGGDP" . "PAAYRYTEAKMARIAESMVEDIKKETVSFVPNFDDSDVEPTVLPGRFPFLLANGSSGIAV" . "GMTTNMPPHNLREIAAAISAYIENPNLSIQELCDCINGPDFPTGGIIFGKNGIRQSYETG" . "RGKIVVRARFTIETDSKGRDTIIFTEVPYQVNTTMLVMRIGELARAKVIEGIANVNDETS" . "DRTGLRIVVELKKGTPAQVVLNHLFAKTPLQSSFNVINLALVEGRPRMLTLKDLVRYFVE" . "HRVDVVTRRAHFELRKAQERIHLVRALIRALDAIDKIITLIRHSQNTELAKQRLREQFDF" . "DNVQAQAIVDMQMKRLTGLEVESLRTELKDLTELISSLEELLTSPQKVLGVVKKETRDIA" . "DMFGDDRRTDIVSNEIEYLDVEDFIQKEEMVILISHLGYIKRVPVSAYRNQNRGGKGSSS" . "ANLAAHDFISQIFTASTHDYVMFVTSRGRAYWLKVYGIPESGRANRGSHIKSLLMVATDE" . "EITAIVSLREFSNKSYVFMATARGVVKKVTTDNFVNAKTRGIIALKLSGGDTLVSAVLVQ" . "DEDEVMLITRQGKALRMSGREVREMGRNSSGVIGIKLTSEDLVAGVLRVSEQRKVLIMTE" . "NGYGKRVSFSEFSVHGRGTAGQKIYTQTDRKGAIIGALAVLDTDECMCITGQGKTIRVDV" . "CAISVLGRGAQGVRVLDIEPSDLVVGLSCVMQG"; my $fasta_record2 = new TIGR::FASTArecord $header2, $data2; if ( defined ( $fasta_record2 ) ) { print STDOUT "Sequence " . $fasta_record2->getIdentifier() . " is " . $fasta_record2->size() . " residues.\n"; print STDOUT $fasta_record2->toString(); } else { die "Invalid FASTA record 2"; } # this entry fails; note the 'J' in the second line of data (8th char.) my $header3 = "ORF00001 The first ORF in the series"; my $data3 = "MEEISTPEGGVLVPISIETEVKRAYIDYSMSVIVSRALPDVRDGLKPVHRRILYAMEEKG" . "LRFSGPTJKCAKIVGDVLGSFHPHGDASVYDALVRLGQDFSLRYPVIHPQGNFGTIGGDP" . "PAAYRYTEAKMARIAESMVEDIKKETVSFVPNFDDSDVEPTVLPGRFPFLLANGSSGIAV" . "GMTTNMPPHNLREIAAAISAYIENPNLSIQELCDCINGPDFPTGGIIFGKNGIRQSYETG" . "RGKIVVRARFTIETDSKGRDTIIFTEVPYQVNTTMLVMRIGELARAKVIEGIANVNDETS" . "DRTGLRIVVELKKGTPAQVVLNHLFAKTPLQSSFNVINLALVEGRPRMLTLKDLVRYFVE" . "HRVDVVTRRAHFELRKAQERIHLVRALIRALDAIDKIITLIRHSQNTELAKQRLREQFDF" . "DNVQAQAIVDMQMKRLTGLEVESLRTELKDLTELISSLEELLTSPQKVLGVVKKETRDIA" . "DMFGDDRRTDIVSNEIEYLDVEDFIQKEEMVILISHLGYIKRVPVSAYRNQNRGGKGSSS" . "ANLAAHDFISQIFTASTHDYVMFVTSRGRAYWLKVYGIPESGRANRGSHIKSLLMVATDE" . "EITAIVSLREFSNKSYVFMATARGVVKKVTTDNFVNAKTRGIIALKLSGGDTLVSAVLVQ" . "DEDEVMLITRQGKALRMSGREVREMGRNSSGVIGIKLTSEDLVAGVLRVSEQRKVLIMTE" . "NGYGKRVSFSEFSVHGRGTAGQKIYTQTDRKGAIIGALAVLDTDECMCITGQGKTIRVDV" . "CAISVLGRGAQGVRVLDIEPSDLVVGLSCVMQG"; my $fasta_record3 = new TIGR::FASTArecord $header3, $data3; if ( defined ( $fasta_record3 ) ) { print STDOUT "Sequence " . $fasta_record3->getIdentifier() . " is " . $fasta_record3->size() . " residues.\n"; print STDOUT $fasta_record3->toString(); } else { die "Invalid FASTA record 3"; } } =cut } 1;