在实际应用中,尤其是处理序列(sequence)数据时,应该尽可能避免把整个文件读入内存:文件可能有几个 G 甚至更大,一次性读入可能导致内存耗尽、程序崩溃。更好的做法是把数据放在数据库里,并建立索引以便检索。BioPerl 有一个模块 Bio::DB::Fasta 可以实现这一功能,这个模块在没有安装 BioPerl 的情况下也可以单独安装使用。

#!/usr/bin/env perl  
use strict; 
use warnings; 
use Bio::DB::Fasta; 
 
# Path of the FASTA file to open. It has to be a quoted string: a bare
# /home/... is parsed by Perl as a regex match and the script will not compile.
my $dbFile = '/home/ygc/Research/Data/3utr_hg18.fa';

# Open the FASTA file through Bio::DB::Fasta so records can be fetched by id
# instead of slurping the whole file into memory.
my $db = Bio::DB::Fasta->new($dbFile);

## simple access
my @id = $db->ids;    # every sequence id known to the database

# Fetch a single record by id. Guard against a missing id so the failure is
# reported clearly rather than as "Can't call method on undefined value".
my $wanted_id = 'NM_032129';
my $obj       = $db->get_Seq_by_id($wanted_id);
die "Sequence '$wanted_id' not found in '$dbFile'\n" if !defined $obj;

my $seq    = $obj->seq;       # the sequence string
my $length = $obj->length;    # the sequence length

## Bioperl-style access
# Reuse the handle opened above; there is no need to open the file a second
# time just to obtain a stream.
my $stream = $db->get_PrimarySeq_stream;

# Walk through every record in the database, one sequence object at a time.
while (my $obj = $stream->next_seq) {
    my $id  = $obj->display_id;
    my $seq = $obj->seq;
    ## process the sequence...
}