Skip to content
Snippets Groups Projects
count_fasta.pl 3.15 KiB
#!/usr/bin/perl -w
# count_fasta.pl
# AUTHOR: Joseph Fass (modified from script by Brad Sickler)
# LAST REVISED: November 2010
# 
# The Bioinformatics Core at UC Davis Genome Center
# http://bioinformatics.ucdavis.edu
# Copyright (c) 2009 The Regents of University of California, Davis Campus.
# All rights reserved.
use strict;
use POSIX;
use Getopt::Std;

my $usage = "\n\nusage: $0 [-i <interval size in # of residues>] <fasta file(s)>\n".
            "Produces a histogram of sequence lengths and other measures to standard out.\n\n".
            "-i #          specify bin size for histogram (default 100)\n\n";
our($opt_i); # histogram interval
getopts('i:') or die $usage;
if (!defined($opt_i) or !($opt_i =~ m/^[0-9]+$/)) {$opt_i = 100;}

if( ( $#ARGV + 1 ) < 1 ) {
	die $usage;
}

# Read in sequences from one or more fasta files
my @data_files;
for(my $i = 0; $i < ($#ARGV + 1); $i++){
	$data_files[$i] = $ARGV[$i];
}
my $Id;
my %seq;
foreach my $file (@data_files){
	open(FASTA, $file) or die"Can't open file $file\n";
	while (<FASTA>) {
#         if (/^>(.+)\s/)  { $Id = $1; }  # overwrites if id up to 1st whitespace is non-unique in file
        if (/^>(.*)$/)  { $Id = $1; }
# 		if (/(\w+)/) {
		elsif (/^(\S+)$/) 	{ $seq{$Id} .= $1 if $Id; }
	}
	close (FASTA);
}

# Count the number of sequences in the file and create a histogram of the distribution
my $n = 0;
my $int;
my $totalLength = 0;
my $gcCount = 0;
my %len= ();
my @seqLengths;
foreach my $id (keys %seq) {
	push @seqLengths, length($seq{$id}); # record length for N50 calc's
	$n++;
	$int = floor( length($seq{$id})/$opt_i );
	$totalLength += length($seq{$id});
	$gcCount += ($seq{$id} =~ tr/gGcC/gGcC/);
	if( !defined($len{$int}) ) {
		$len{$int} = 1;
	} else {
		$len{$int}++;
	}
}

# Calculate N25, N50, and N75 and counts
my $N25; my $N50; my $N75;
my $N25count=0; my $N50count=0; my $N75count=0;
my $frac_covered = $totalLength;
@seqLengths = reverse sort { $a <=> $b } @seqLengths;
$N25 = $seqLengths[0];
while ($frac_covered > $totalLength*3/4) {
	$N25 = shift(@seqLengths);