.\" Automatically generated by Pod::Man 4.11 (Pod::Simple 3.35)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{\
.    if \nF \{\
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{\
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\" ========================================================================
.\"
.IX Title "Bio::DB::Fasta 3pm"
.TH Bio::DB::Fasta 3pm "2020-10-28" "perl v5.30.3" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
Bio::DB::Fasta \- Fast indexed access to fasta files
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 1
\&  use Bio::DB::Fasta;
\&
\&  # Create database from a directory of Fasta files
\&  my $db       = Bio::DB::Fasta\->new(\*(Aq/path/to/fasta/files/\*(Aq);
\&  my @ids      = $db\->get_all_primary_ids;
\&
\&  # Simple access
\&  my $seqstr   = $db\->seq(\*(AqCHROMOSOME_I\*(Aq, 4_000_000 => 4_100_000);
\&  my $revseq   = $db\->seq(\*(AqCHROMOSOME_I\*(Aq, 4_100_000 => 4_000_000);
\&  my $length   = $db\->length(\*(AqCHROMOSOME_I\*(Aq);
\&  my $header   = $db\->header(\*(AqCHROMOSOME_I\*(Aq);
\&  my $alphabet = $db\->alphabet(\*(AqCHROMOSOME_I\*(Aq);
\&
\&  # Access to sequence objects. See Bio::PrimarySeqI.
\&  my $seq     = $db\->get_Seq_by_id(\*(AqCHROMOSOME_I\*(Aq);
\&  my $seqstr  = $seq\->seq;
\&  my $subseq  = $seq\->subseq(4_000_000 => 4_100_000);
\&  my $trunc   = $seq\->trunc(4_000_000 => 4_100_000);
\&  my $length  = $seq\->length;
\&
\&  # Loop through sequence objects
\&  my $stream  = $db\->get_PrimarySeq_stream;
\&  while (my $seq = $stream\->next_seq) {
\&    # Bio::PrimarySeqI stuff
\&  }
\&
\&  # Filehandle access
\&  my $fh = Bio::DB::Fasta\->newFh(\*(Aq/path/to/fasta/files/\*(Aq);
\&  while (my $seq = <$fh>) {
\&    # Bio::PrimarySeqI stuff
\&  }
\&
\&  # Tied hash access
\&  tie %sequences,\*(AqBio::DB::Fasta\*(Aq,\*(Aq/path/to/fasta/files/\*(Aq;
\&  print $sequences{\*(AqCHROMOSOME_I:1,20000\*(Aq};
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
or a directory of files. It provides persistent random access to each sequence
entry (either as a Bio::PrimarySeqI\-compliant object or a string), and to
subsequences within each entry, allowing you to retrieve portions of very large
sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
based on Bio::DB::IndexedBase. See that module's documentation for details.
.PP
The Fasta files may contain any combination of nucleotide and protein sequences;
during indexing the module guesses the molecular type. Entries may have any line
length up to 65,536 characters, and different line lengths are allowed in the
same file.  However, within a sequence entry, all lines must be the same length
except for the last. An error will be thrown if this is not the case.
.PP
The module uses /^>(\eS+)/ to extract the primary \s-1ID\s0 of each sequence
from the Fasta header. See the \-makeid option in Bio::DB::IndexedBase for how to
pass a callback routine that reversibly modifies this primary \s-1ID,\s0 e.g. if you
wish to extract a specific portion of gi|gb|abc|xyz GenBank IDs.
.SH "DATABASE CREATION AND INDEXING"
.IX Header "DATABASE CREATION AND INDEXING"
The object-oriented constructor is \fBnew()\fR, the filehandle constructor is \fBnewFh()\fR
and the tied hash constructor is \fBtie()\fR. They all allow one to index a single Fasta
file, several files, or a directory of files. See Bio::DB::IndexedBase.
.SH "SEE ALSO"
.IX Header "SEE ALSO"
Bio::DB::IndexedBase
.PP
Bio::DB::Qual
.PP
Bio::PrimarySeqI
.SH "AUTHOR"
.IX Header "AUTHOR"
Lincoln Stein <lstein@cshl.org>.
.PP
Copyright (c) 2001 Cold Spring Harbor Laboratory.
.PP
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.  See \s-1DISCLAIMER\s0.txt for
disclaimers of warranty.
.SH "APPENDIX"
.IX Header "APPENDIX"
The rest of the documentation details each of the object
methods. Internal methods are usually prefixed with an underscore (_).
.PP
For BioPerl-style access, the following methods are provided:
.SS "get_Seq_by_id"
.IX Subsection "get_Seq_by_id"
.Vb 9
\& Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
\& Usage   : my $seq = $db\->get_Seq_by_id($id);
\& Function: Given an ID, fetch the corresponding sequence from the database.
\& Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
\&           Note that to save resources, Bio::PrimarySeq::Fasta sequence objects
\&           only load the sequence string into memory when requested using seq().
\&           See Bio::PrimarySeqI for methods provided by the sequence objects
\&           returned from get_Seq_by_id() and get_PrimarySeq_stream().
\& Args    : ID
.Ve
.SS "get_PrimarySeq_stream"
.IX Subsection "get_PrimarySeq_stream"
.Vb 7
\& Title   : get_PrimarySeq_stream
\& Usage   : my $stream = $db\->get_PrimarySeq_stream();
\& Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
\&           single method, next_seq(). Each call to next_seq() returns a new
\&           Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
\& Returns : A Bio::DB::Indexed::Stream object
\& Args    : None
.Ve
.PP
For simple access, the following methods are provided:
.SS "new"
.IX Subsection "new"
.Vb 8
\& Title   : new
\& Usage   : my $db = Bio::DB::Fasta\->new( $path, %options);
\& Function: Initialize a new database object. When indexing a directory, files
\&           ending in .fa, .fasta, .fast, .dna, .fna, .faa or .fsa are indexed
\&           by default.
\& Returns : A new Bio::DB::Fasta object.
\& Args    : A single file, or path to dir, or arrayref of files
\&           Optional arguments: see Bio::DB::IndexedBase
.Ve
.SS "seq"
.IX Subsection "seq"
.Vb 10
\& Title   : seq, sequence, subseq
\& Usage   : # Entire sequence string
\&           my $seqstr    = $db\->seq($id);
\&           # Subsequence
\&           my $subseqstr = $db\->seq($id, $start, $stop, $strand);
\&           # or...
\&           my $subseqstr = $db\->seq($compound_id);
\& Function: Get a subseq of a sequence from the database. For your convenience,
\&           the sequence to extract can be specified with any of the following
\&           compound IDs:
\&              $db\->seq("$id:$start,$stop")
\&              $db\->seq("$id:$start..$stop")
\&              $db\->seq("$id:$start\-$stop")
\&              $db\->seq("$id:$start,$stop/$strand")
\&              $db\->seq("$id:$start..$stop/$strand")
\&              $db\->seq("$id:$start\-$stop/$strand")
\&              $db\->seq("$id/$strand")
\&           In the case of DNA or RNA sequences, if $stop is less than $start,
\&           the reverse complement of the sequence is returned. Avoid relying on
\&           this behavior if possible, since it goes against Bio::Seq conventions.
\& Returns : A string
\& Args    : ID of sequence to retrieve
\&             or
\&           Compound ID of subsequence to fetch
\&             or
\&           ID, optional start (defaults to 1), optional end (defaults to length
\&           of sequence) and optional strand (defaults to 1).
.Ve
.SS "length"
.IX Subsection "length"
.Vb 5
\& Title   : length
\& Usage   : my $length = $db\->length($id);
\& Function: Get the number of residues in the indicated sequence.
\& Returns : Number
\& Args    : ID of entry
.Ve
.SS "header"
.IX Subsection "header"
.Vb 6
\& Title   : header
\& Usage   : my $header = $db\->header($id);
\& Function: Get the header line (ID and description fields) of the specified
\&           sequence.
\& Returns : String
\& Args    : ID of sequence
.Ve
.SS "alphabet"
.IX Subsection "alphabet"
.Vb 5
\& Title   : alphabet
\& Usage   : my $alphabet = $db\->alphabet($id);
\& Function: Get the molecular type of the indicated sequence: dna, rna or protein.
\& Returns : String
\& Args    : ID of sequence
.Ve