Yet Another CPAN Grep

MEME-Alphabet/lib/MEME/Alphabet.pm

=pod

=head1 NAME

MEME::Alphabet - Provides nucleobase alphabet capabilities for Perl code.

=head1 AUTHORS

James Johnson and Timothy L. Bailey.

=head1 LICENSE

Copyright
(1994 - 2017) The Regents of the University of California.
All Rights Reserved.

Permission to use, copy, modify, and distribute any part of
this software for educational, research and non-profit purposes,
without fee, and without a written agreement is hereby granted,
provided that the above copyright notice, this paragraph and
the following three paragraphs appear in all copies.

Those desiring to incorporate this software into commercial
products or use for commercial purposes should contact the
Technology Transfer Office, University of California, San Diego,
9500 Gilman Drive, La Jolla, California, 92093-0910,
Ph: (619) 534 5815.

IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO
ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR
CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF
THE USE OF THIS SOFTWARE, EVEN IF THE UNIVERSITY OF CALIFORNIA
HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE
UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE
MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
THE UNIVERSITY OF CALIFORNIA MAKES NO REPRESENTATIONS AND
EXTENDS NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR IMPLIED,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, OR THAT
THE USE OF THE MATERIAL WILL NOT INFRINGE ANY PATENT,
TRADEMARK OR OTHER RIGHTS.

=cut

package MEME::Alphabet;

use strict;
use warnings;
use feature 'unicode_strings';

use Carp;
use Fcntl qw(O_RDONLY);

# pinned to the latest MEME Suite version (x.yy.z), and then versioned from there
use version; our $VERSION = version->declare("v4.12.0.1.3");

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(rna dna moddna protein);

my $SYM_RE = qr/[A-Za-z0-9\?\.\*\-]/;
my $COLOUR_RE = qr/[0-9A-Fa-f]{6}/;
my $NAME_RE = qr/"((?:[^\\"]+|\\["\/bfnrt]|\\u[0-9A-Fa-f]{4})*)"/;
my $CORE_RE = qr/($SYM_RE)(?:\s+$NAME_RE)?(?:\s+($COLOUR_RE))?/;
my $COMMENT_RE = qr/^(\s*(?:[^#](?:[^"#]*|"(?:\\.|[^"\\]*)*")*)?)#.*$/;
my $HEADER_RE = qr/^\s*ALPHABET(?:\s+v1)?(?:\s+$NAME_RE)?(?:\s+(RNA|DNA|PROTEIN)-LIKE)?\s*$/;
my $CORE_SINGLE_RE = qr/^\s*$CORE_RE\s*$/;
my $CORE_PAIR_RE = qr/^\s*$CORE_RE\s*~\s*$CORE_RE\s*$/;
my $AMBIG_RE = qr/^\s*$CORE_RE\s*=\s*($SYM_RE*)\s*$/;

sub new {
  my $classname = shift;
  my $self = {};
  bless($self, $classname);
  $self->_init(@_);
  return $self;
}

sub _init {
  my $self = shift;
  my ($file) = @_;
  $self->{errors} = [];
  $self->{file_name} = undef;
  $self->{line_no} = 0;
  $self->{seen_header} = 0; #FALSE
  $self->{seen_symbol} = 0; #FALSE
  $self->{seen_ambig} = 0; #FALSE
  $self->{seen_lc} = 0; #FALSE
  $self->{seen_uc} = 0; #FALSE
  $self->{name} = undef;
  $self->{sym_lookup} = {};
  $self->{core_syms} = [];
  $self->{ambig_syms} = [];
  $self->{parsed} = 0; #FALSE
  if (defined($file)) {
    $self->parse_file($file);
    if (@{$self->{errors}}) {
      foreach my $error ( @{$self->{errors}} ) {
        warn($error);
      }
      croak("Invalid alphabet file \"$file\"\n");
    }
  }
}

sub _error {
  my $self = shift;
  my ($reason) = @_;
  my $line_info = '';
  my $file_info = '';
  $line_info = 'on line ' . $self->{line_no} . ' ' if ($self->{line_no});
  $file_info = 'file "' . $self->{file_name} . '"' if ($self->{file_name});
  push(@{$self->{errors}}, "Invalid alphabet $file_info- $line_info$reason\n");
}

sub parse_header {
  my $self = shift;
  my ($name, $like) = @_;
  die("Already finished parsing") if $self->{parsed};
  $self->_error('repeated header') if ($self->{seen_header});
  $self->_error('header after symbol') if ($self->{seen_symbol});
  $self->_error('like must be "DNA", "RNA" or "PROTEIN" (ignoring case)') if ($like && $like !~ m/^(RNA|DNA|PROTEIN)$/i);
  $self->{name} = $name;
  $self->{like} = ($like ? uc($like) : '');
  $self->{seen_header} = 1; #TRUE
}

sub uniq {
  my %seen;
  grep !$seen{$_}++, @_;
}

sub parse_symbol {
  my $self = shift;
  my ($sym, $name, $colour, $complement, $comprise, $aliases) = @_;
  die("Already finished parsing") if $self->{parsed};
  $self->_error('expected header but found symbol') unless ($self->{seen_header} || $self->{seen_symbol});
  if (defined($self->{sym_lookup}->{$sym})) {
    $self->_error("symbol $sym is already used");
    return; # can't really proceed with a duplicate
  }
  $self->{seen_lc} = 1 if ($sym ge 'a' && $sym le 'z');
  $self->{seen_uc} = 1 if ($sym ge 'A' && $sym le 'Z');
  my $symbol = {sym => $sym, aliases => [], name => $name, colour => $colour};
  if (defined($aliases)) {
    push(@{$symbol->{aliases}}, split(//, $aliases));
  }
  if (defined($comprise)) { # ambiguous symbol
    my @equals = split //, $comprise;
    @equals = sort _symbol_cmp @equals;
    @equals = &uniq(@equals);
    for (my $i = 0; $i < scalar(@equals); $i++) {
      my $csym = $self->{sym_lookup}->{$equals[$i]};
      if (!defined($csym)) {
        $self->_error('referenced symbol ' . $equals[$i] . ' is unknown');
      } elsif (defined($csym->{comprise})) {
        $self->_error('referenced symbol ' . $equals[$i] . ' is ambiguous');
      }
    }
    $symbol->{comprise} = join('', @equals);
    push(@{$self->{ambig_syms}}, $symbol);
    $self->{seen_ambig} = 1;
    if ($sym eq '?' && length($symbol->{comprise}) < scalar(@{$self->{core_syms}})) {
      $self->_error('symbol ? is reserved as a wildcard');
    }
  } else { # core symbol
    $self->_error('symbol ? is reserved as a wildcard') if ($sym eq '?');
    $self->_error('unexpected core symbol (as ambiguous symbols seen)') if ($self->{seen_ambig});
    $symbol->{complement} = $complement;
    push(@{$self->{core_syms}}, $symbol);
  }
  $self->{sym_lookup}->{$sym} = $symbol;
  $self->{seen_symbol} = 1;
}

sub parse_line {
  my $self = shift;
  die("Already finished parsing") if $self->{parsed};
  my ($line) = @_;
  $self->{line_no} += 1;
  $line =~ s/$COMMENT_RE/$1/;
  return unless ($line =~ m/\S/); # skip empty lines
  if ($line =~ m/$HEADER_RE/) {
    $self->parse_header($1, $2);
  } elsif ($line =~ m/$CORE_PAIR_RE/) {
    $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3), $4);
    $self->parse_symbol($4, &_decode_JSON_string($5), &_decode_colour($6), $1);
  } elsif ($line =~ m/$CORE_SINGLE_RE/) {
    $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3));
  } elsif ($line =~ m/$AMBIG_RE/) {
    $self->parse_symbol($1, &_decode_JSON_string($2), &_decode_colour($3), undef, $4);
  } else {
    $self->_error('unrecognised pattern');
  }
}

sub _add_lookup {
  my ($lookup, $target, $character, $both_case) = @_;
  $lookup->{$character} = $target;
  if ($both_case) {
    if ($character ge 'A' && $character le 'Z') {
      $lookup->{chr(ord($character) + 32)} = $target;
    } elsif ($character ge 'a' && $character le 'z') {
      $lookup->{chr(ord($character) - 32)} = $target;
    }
  }
}

sub parse_done {
  my $self = shift;
  die("Already finished parsing") if $self->{parsed};
  # now sort the symbols
  @{$self->{core_syms}} = sort _symobj_cmp @{$self->{core_syms}};
  @{$self->{ambig_syms}} = sort _symobj_cmp @{$self->{ambig_syms}};
  # check if there is a wildcard
  if (scalar(@{$self->{ambig_syms}}) == 0 || length($self->{ambig_syms}->[0]->{comprise}) != scalar(@{$self->{core_syms}})) {
    #warn("Warning: wildcard symbol automatically generated\n");
    my $wildcard_comprise = '';
    for (my $i = 0; $i < scalar(@{$self->{core_syms}}); $i++) {
      $wildcard_comprise .= $self->{core_syms}->[$i]->{sym};
    }
    unshift(@{$self->{ambig_syms}}, {sym => '?', aliases => [], comprise => $wildcard_comprise});
  }
  # create the final list of symbols and determine aliases
  my %comprise_lookup = ();
  $self->{ncore} = scalar(@{$self->{core_syms}});
  $self->{syms} = [];
  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $sym = $self->{core_syms}->[$i];
    $comprise_lookup{$sym->{sym}} = $sym;
    $sym->{index} = scalar(@{$self->{syms}});
    push(@{$self->{syms}}, $sym);
  }
  for (my $i = 0; $i < scalar(@{$self->{ambig_syms}}); $i++) {
    my $sym = $self->{ambig_syms}->[$i];
    my $real = $comprise_lookup{$sym->{comprise}};
    if (defined($real)) {
      push(@{$real->{aliases}}, $sym->{sym});
    } else {
      $comprise_lookup{$sym->{comprise}} = $sym;
      $sym->{index} = scalar(@{$self->{syms}});
      push(@{$self->{syms}}, $sym);
    }
  }
  # cleanup some entries we don't need
  delete $self->{sym_lookup};
  delete $self->{core_syms};
  delete $self->{ambig_syms};
  # map comprising characters to objects
  for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
    my $sym = $self->{syms}->[$i];
    my @list = map { $comprise_lookup{$_} } split(//, $sym->{comprise});
    $sym->{comprise} = \@list;
  }
  #determine complements
  my $fully_complementable = 1;
  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $sym = $self->{syms}->[$i];
    unless (defined($sym->{complement})) {
      $fully_complementable = 0;
      next;
    }
    $sym->{complement} = $comprise_lookup{$sym->{complement}};
  }
  AMBIG: for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
    my $sym = $self->{syms}->[$i];
    my @complements = ();
    for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
      my $target = $sym->{comprise}->[$j]->{complement};
      next AMBIG unless defined $target;
      push(@complements, $target->{sym});
    }
    @complements = sort _symbol_cmp @complements;
    my $complement = $comprise_lookup{join('', @complements)};
    next AMBIG unless defined $complement;
    $sym->{complement} = $complement;
  }
  # create lookup
  my $both_case = ($self->{seen_lc} != $self->{seen_uc});
  $self->{lookup} = {};
  for (my $i = 0; $i < scalar(@{$self->{syms}}); $i++) {
    my $sym = $self->{syms}->[$i];
    &_add_lookup($self->{lookup}, $sym, $sym->{sym}, $both_case);
    for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
      &_add_lookup($self->{lookup}, $sym, $sym->{aliases}->[$j], $both_case);
    }
  }
  $self->{comprise_lookup} = \%comprise_lookup;
  $self->{fully_complementable} = $fully_complementable;
  # create regular expressions to match something that isn't a valid symbol
  my $all_symbols = quotemeta(join('', (keys %{$self->{lookup}})));
  $self->{re_not_valid} = qr/[^$all_symbols]/;
  my $ambig_symbols = '';
  for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
    my $entry = $self->{syms}->[$i];
    my $prime_sym = $entry->{sym};
    $ambig_symbols .= $prime_sym;
    if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
      my $prime_sym_copy = $prime_sym; # copy
      $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
      $ambig_symbols .= $prime_sym_copy;
    }
    foreach my $alias_sym (@{$entry->{aliases}}) {
      $ambig_symbols .= $alias_sym;
      if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
        my $alias_sym_copy = $alias_sym; # copy
        $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
        $ambig_symbols .= $alias_sym_copy;
      }
    }
  }
  $self->{re_ambig} = qr/[\Q$ambig_symbols\E]/;
  $self->{re_ambig_only} = qr/^[\Q$ambig_symbols\E]*$/;
  #$ambig_symbols = quotemeta($ambig_symbols);
  #$self->{re_ambig} = qr/[$ambig_symbols]/;
  #$self->{re_ambig_only} = qr/^[$ambig_symbols]*$/;

  # create translatation strings
  # Strings to translate aliases of core symbols to prime representation
  my ($tr_src_core_alias_to_prime, $tr_dst_core_alias_to_prime) = ('', '');
  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $entry = $self->{syms}->[$i];
    my $prime_sym = $entry->{sym};
    # alternate case of primary symbol?
    if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
      my $prime_sym_copy = $prime_sym; # copy
      $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
      $tr_src_core_alias_to_prime .= $prime_sym_copy;
      $tr_dst_core_alias_to_prime .= $prime_sym;
    }
    # alias symbols
    foreach my $alias_sym (@{$entry->{aliases}}) {
      $tr_src_core_alias_to_prime .= $alias_sym;
      $tr_dst_core_alias_to_prime .= $prime_sym;
      # alternate case of alias symbol?
      if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
        my $alias_sym_copy = $alias_sym; # copy
        $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
        $tr_src_core_alias_to_prime .= $alias_sym_copy;
        $tr_dst_core_alias_to_prime .= $prime_sym;
      }
    }
  }
  $tr_src_core_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
  $tr_dst_core_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
  $self->{tr_src_core_alias_to_prime} = $tr_src_core_alias_to_prime;
  $self->{tr_dst_core_alias_to_prime} = $tr_dst_core_alias_to_prime;
  # Strings to translate aliases of ambiguous symbols to prime representation
  my ($tr_src_ambig_alias_to_prime, $tr_dst_ambig_alias_to_prime) = ('', '');
  for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
    my $entry = $self->{syms}->[$i];
    my $prime_sym = $entry->{sym};
    # alternate case of primary symbol?
    if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
      my $prime_sym_copy = $prime_sym; # copy
      $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/; # swap case
      $tr_src_ambig_alias_to_prime .= $prime_sym_copy;
      $tr_dst_ambig_alias_to_prime .= $prime_sym;
    }
    # alias symbols
    foreach my $alias_sym (@{$entry->{aliases}}) {
      $tr_src_ambig_alias_to_prime .= $alias_sym;
      $tr_dst_ambig_alias_to_prime .= $prime_sym;
      # alternate case of alias symbol?
      if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
        my $alias_sym_copy = $alias_sym;
        $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/;
        $tr_src_ambig_alias_to_prime .= $alias_sym_copy;
        $tr_dst_ambig_alias_to_prime .= $prime_sym;
      }
    }
  }
  $tr_src_ambig_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
  $tr_dst_ambig_alias_to_prime =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
  $self->{tr_src_ambig_alias_to_prime} = $tr_src_ambig_alias_to_prime;
  $self->{tr_dst_ambig_alias_to_prime} = $tr_dst_ambig_alias_to_prime;
  # Strings to translate ambiguous symbols and their aliases to the wildcard
  my ($tr_src_ambig_to_wild, $tr_dst_ambig_to_wild) = ('', '');
  my $wildcard = $self->{syms}->[$self->{ncore}]->{sym};
  for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
    my $entry = $self->{syms}->[$i];
    my $prime_sym = $entry->{sym};
    # primary symbol to wildcard (unless it is the wildcard)
    if ($i != $self->{ncore}) {
      $tr_src_ambig_to_wild .= $prime_sym;
      $tr_dst_ambig_to_wild .= $wildcard;
    }
    # alternate case of primary symbol?
    if ($both_case && $prime_sym =~ m/^[A-Za-z]$/) {
      my $prime_sym_copy = $prime_sym;
      $prime_sym_copy =~ tr/A-Za-z/a-zA-Z/;
      $tr_src_ambig_to_wild .= $prime_sym_copy;
      $tr_dst_ambig_to_wild .= $wildcard;
    }
    # alias symbols
    foreach my $alias_sym (@{$entry->{aliases}}) {
      $tr_src_ambig_to_wild .= $alias_sym;
      $tr_dst_ambig_to_wild .= $wildcard;
      # alternate case of alias symbol?
      if ($both_case && $alias_sym =~ m/^[A-Za-z]$/) {
        my $alias_sym_copy = $alias_sym;
        $alias_sym_copy =~ tr/A-Za-z/a-zA-Z/;
        $tr_src_ambig_to_wild .= $alias_sym_copy;
        $tr_dst_ambig_to_wild .= $wildcard;
      }
    }
  }
  $tr_src_ambig_to_wild =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
  $tr_dst_ambig_to_wild =~ s/([\/\-])/\\$1/g; # escape forward slash and dash
  $self->{tr_src_ambig_to_wild} = $tr_src_ambig_to_wild;
  $self->{tr_dst_ambig_to_wild} = $tr_dst_ambig_to_wild;
  # Strings to translate unknown symbols to the wildcard
  my ($tr_src_unknown_to_wild, $tr_dst_unknown_to_wild) = ('', '');
  my $range_start = 0;
  for (my $i = 33; $i <= 126; $i++) {
    if (defined($self->{lookup}->{chr($i)})) {
      if ($i > ($range_start + 1)) {
        $tr_src_unknown_to_wild .= sprintf("\\%03o-\\%03o", $range_start, $i - 1);
      } elsif ($i == ($range_start + 1)) {
        $tr_src_unknown_to_wild .= sprintf("\\%03o", $range_start);
      }
      $range_start = $i + 1;
    }
  }
  $tr_src_unknown_to_wild .= sprintf("\\%03o-\\377", $range_start);
  $tr_dst_unknown_to_wild = ($wildcard eq '-' || $wildcard eq '/' ? "\\" : "") .$wildcard; # tr will extend using the last character
  $self->{tr_src_unknown_to_wild} = $tr_src_unknown_to_wild;
  $self->{tr_dst_unknown_to_wild} = $tr_dst_unknown_to_wild;
  # check 'like' a standard alphabet
  if ($self->{like}) {
    my ($req_core, $req_comp, $name);
    if ($self->{like} eq 'RNA') {
      $req_core = 'ACGU';
      $name = 'RNA';
    } elsif ($self->{like} eq 'DNA') {
      $req_core = 'ACGT';
      $req_comp = 'TGCA';
      $name = 'DNA';
    } elsif ($self->{like} eq 'PROTEIN') {
      $req_core = 'ACDEFGHIKLMNPQRSTVWY';
      $name = 'protein';
    }
    if (defined $req_core) {
      for (my $i = 0; $i < length($req_core); $i++) {
        my $sym = substr($req_core, $i, 1);
        my $symobj = $self->{lookup}->{$sym};
        unless (defined $symobj) {
          $self->_error("not $name like - expected symbol '$sym'");
        }
        unless ($symobj->{index} < $self->{ncore}) {
          $self->_error("not $name like - symbol '$sym' must be a core symbol");
        }
        if (defined $req_comp) {
          my $comp = substr($req_comp, $i, 1);
          my $compobj = $self->{lookup}->{$comp};
          if (not defined $compobj || not defined $symobj->{complement} || $symobj->{complement} != $compobj) {
            $self->_error("not $name like - symbol '$sym' complement must be '$comp'");
          }
        } else {
          #if (defined $symobj->{complement}) {
          #  $self->_error("not $name like - symbol '$sym' must not have complement");
          #}
        }
      }
    }
  }
  # done
  $self->{parsed} = 1; # TRUE
}

sub parse_file {
  my $self = shift;
  die("Already finished parsing") if $self->{parsed};
  my ($file) = @_;
  $self->{file_name} = $file;
  my ($fh, $line);
  sysopen($fh, $file, O_RDONLY);
  while($line = <$fh>) {
    chomp($line);
    $self->parse_line($line);
  }
  close($fh);
  $self->parse_done();
}

sub _sym_to_text {
  my ($sym) = @_;
  my $name_info = '';
  if (defined($sym->{name})) {
    $name_info = ' "' . &_encode_JSON_string($sym->{name}) . '"';
  }
  my $colour_info = '';
  if (defined($sym->{colour})) {
    $colour_info = sprintf(" %.06X", $sym->{colour});
  }
  return $sym->{sym} . $name_info . $colour_info;
}

sub to_text {
  my $self = shift;
  my %options = @_;
  my $out = '';
  my $gap = undef;
  # set option defaults
  $options{print_header} = 1 unless defined $options{print_header};
  $options{print_footer} = 1 unless defined $options{print_footer};
  # output header
  if ($options{print_header}) {
    $out .= "ALPHABET " . $self->name(1) . 
        ($self->{like} ? ' ' . $self->{like} . '-LIKE' : '') . "\n";
  }
  # output all paired core symbols
  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $sym = $self->{syms}->[$i];
    my $csym = $sym->{complement};
    if (defined($csym) && $sym->{index} < $csym->{index}) {
      $out .= &_sym_to_text($sym) . ' ~ ' . &_sym_to_text($csym) . "\n";
    }
  }
  # output all unpared core symbols
  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $sym = $self->{syms}->[$i];
    if (!defined($sym->{complement})) {
      $out .=  &_sym_to_text($sym) . "\n";
    }
  }
  # output ambiguous symbols
  for (my $i = $self->{ncore}; $i < scalar(@{$self->{syms}}); $i++) {
    my $sym = $self->{syms}->[$i];
    if (scalar(@{$sym->{comprise}}) == 0) {
      $gap = $sym;
      next;
    }
    my $comprise = '';
    for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
      $comprise .= $sym->{comprise}->[$j]->{sym};
    }
    $out .= &_sym_to_text($sym) . ' = ' . $comprise . "\n";
    for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
      $out .= $sym->{aliases}->[$j] . ' = ' . $comprise . "\n";
    }
  }
  # output core symbol aliases
  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $sym = $self->{syms}->[$i];
    for (my $j = 0; $j < scalar(@{$sym->{aliases}}); $j++) {
      $out .= $sym->{aliases}->[$j] . ' = ' . $sym->{sym} . "\n";
    }
  }
  # output gap symbol and aliases
  if ($gap) {
    $out .= &_sym_to_text($gap) . " =\n"; 
    for (my $j = 0; $j < scalar(@{$gap->{aliases}}); $j++) {
      $out .= $gap->{aliases}->[$j] . " =\n";
    }
  }
  # output footer
  if ($options{print_footer}) {
    $out .= "END ALPHABET\n";
  }
  return $out;
}

sub to_json() {
  my $self = shift;
  my ($jw) = @_;
  $jw->start_object_value();
  $jw->str_prop("name", $self->{name});
  $jw->str_prop("like", lc($self->{like})) if ($self->{like});
  $jw->num_prop("ncore", $self->{ncore});
  $jw->property("symbols");
  $jw->start_array_value();
  for (my $i = 0; $i < scalar(@{$self->{syms}}); $i++) {
    my $sym = $self->{syms}->[$i];
    $jw->start_object_value();
    $jw->str_prop("symbol", $sym->{sym});
    $jw->str_prop("aliases", join("", @{$sym->{aliases}})) if (scalar(@{$sym->{aliases}}) > 0);
    $jw->str_prop("name", $sym->{name}) if (defined($sym->{name}));
    $jw->str_prop("colour", sprintf("%06X", $sym->{colour})) if (defined($sym->{colour}) && $sym->{colour} > 0);
    if ($i < $self->{ncore}) {
      $jw->str_prop("complement", $sym->{complement}->{sym}) if (defined($sym->{complement}));
    } else {
      my $comprise = '';
      for (my $j = 0; $j < scalar(@{$sym->{comprise}}); $j++) {
        $comprise .= $sym->{comprise}->[$j]->{sym};
      }
      $jw->str_prop("equals", $comprise);
    }
    $jw->end_object_value();
  }
  $jw->end_array_value();
  $jw->end_object_value();
}

sub has_errors {
  my $self = shift;
  return scalar(@{$self->{errors}});
}

sub get_errors {
  my $self = shift;
  return @{$self->{errors}};
}

sub name {
  my $self = shift;
  my ($should_encode) = @_;
  if ($should_encode) {
    return '"' . &_encode_JSON_string($self->{name}) . '"';
  } else {
    return $self->{name};
  }
}

sub has_complement {
  my $self = shift;
  die("Not finished parsing") unless $self->{parsed};
  return $self->{fully_complementable};
}

sub size_core {
  my $self = shift;
  return $self->{ncore};
}

sub size_full {
  my $self = shift;
  return scalar(@{$self->{syms}});
}

#
# Return the input list in list context
# and the flattened list in scalar context.
#
sub _flatten_list_in_scalar_context {
    if (wantarray) {
        return @_;
    } else {
        return join('' , @_);
    }
}

#
# Returns the concatenation of all the
# possible ambiguity codes of the alphabet.
# Returns a list in list context, otherwise a scalar.
#
sub get_ambig_syms {
    my $self = shift;
    my @ambig_syms;

    foreach my $sym ($self->get_syms()) {
        unless ($self->is_core($sym)) {
            push(@ambig_syms, $sym);
        }
    }

    return _flatten_list_in_scalar_context(@ambig_syms);
}

#
# Returns the concatenation of all the possible symbols
# of the alphabet.
# Returns a list in list context, otherwise a scalar.
#
sub get_syms {
    my $self = shift;
    my @syms;

    foreach my $symbol (@{$self->{syms}}) {
        push(@syms, $symbol->{sym});
    }

    return _flatten_list_in_scalar_context(@syms);
}

#
# Returns the concatenation of all the core symbols
# of the alphabet.
# Returns a list in list context, otherwise a scalar.
#
sub get_core {
    my $self = shift;
    my @core_syms;

    foreach my $sym ($self->get_syms()) {
        if ($self->is_core($sym)) {
            push(@core_syms, $sym);
        }
    }

    return _flatten_list_in_scalar_context(@core_syms);
}

#
# Returns a dictionary containing the core symbols that
# each ambiguity code of the alphabet is composed of.
#
sub get_alternates_dict {
    my $self = shift;
    my %alternates;
    foreach my $ambig_base (split(//, $self->get_ambig_syms())) {
        %alternates = (%alternates, %{$self->comprise($ambig_base)});
    }
    return %alternates;
}

#
# Is the symbol part of the core alphabet.
#
sub is_core {
  my $self = shift;
  my ($letter) = @_;
  my $sym = $self->{lookup}->{$letter};
  return defined($sym) && $sym->{index} < $self->{ncore};
}

#
# Is the symbol part of the alphabet.
#
sub is_known {
  my $self = shift;
  my ($letter) = @_;
  my $sym = $self->{lookup}->{$letter};
  return defined($sym);
}

#
# Does the sequence only contain ambiguous symbols.
# This might be useful if you want to throw away something containing only
# ambiguous symbols as that has no real useful information.
#
sub is_ambig_seq {
  my $self = shift;
  my ($seq) = @_;
  my $ambig_only_re = $self->{re_ambig_only};

  return scalar($seq =~ m/$ambig_only_re/);
}

#
# Does the sequence contain ambiguous symbols.
#
sub is_part_ambig_seq {
  my $self = shift;
  my ($seq) = @_;
  my $ambig_re = $self->{re_ambig};

  return scalar($seq =~ m/$ambig_re/);
}

sub char {
  my $self = shift;
  my ($index) = @_;

  return if ($index < 0 || $index > scalar(@{$self->{syms}}));

  return $self->{syms}->[$index]->{sym};
}

#
# Converts a symbol into the index in the alphabet.
#
sub index {
  my $self = shift;
  my ($letter) = @_;
  my $sym = $self->{lookup}->{$letter};

  return unless defined $sym;

  return $sym->{index};
}

#
# Converts a symbol that could be an alias into the primary symbol.
#
sub encode {
  my $self = shift;
  my ($letter) = @_;
  my $sym = $self->{lookup}->{$letter};

  return unless defined $sym;
  return $sym->{sym};
}

#
# Takes a letter and returns the set of comprising core symbols.
#
sub comprise {
  my $self = shift;
  my ($letter) = @_;
  my $sym = $self->{lookup}->{$letter};
  my %set = ();
  if (defined($sym->{comprise})) {
    for (my $i = 0; $i < scalar(@{$sym->{comprise}}); $i++) {
      $set{$sym->{comprise}->[$i]->{sym}} = 1;
    }
  } else {
    $set{$sym->{sym}} = 1;
  }
  return \%set;
}

#
# Converts a set of core symbols into either a matching ambigous symbol or
# a group of core symbols.
#
sub regex {
  my $self = shift;
  my ($set) = @_;
  my $key = join('', sort _symbol_cmp keys %{$set});
  my $sym = $self->{comprise_lookup}->{$key};
  if (defined($sym)) {
    return $sym->{sym};
  } else {
    return '['.$key.']';
  }
}

#
# Takes a set of core symbols and creates a set containing the other core symbols.
#
sub negate_set {
  my $self = shift;
  my ($set) = @_;
  my %negated_set = ();

  for (my $i = 0; $i < $self->{ncore}; $i++) {
    my $a = $self->{syms}->[$i]->{sym};
    if (!($set->{$a})) {
      $negated_set{$a} = 1;
    }
  }

  return \%negated_set;
}

#
# Finds the first unknown symbol.
#
sub find_unknown {
  my $self = shift;
  my ($seq) = @_;
  my $not_valid = $self->{re_not_valid};

  if ($seq =~ m/($not_valid)/) {
    return ($-[1], $1);
  } else {
    return;
  }
}

#
# Reverse complements a sequence using the alphabet.
#
sub rc_seq {
  my $self = shift;
  croak("Alphabet does not allow complementing.") unless $self->has_complement();
  my ($sequence) = @_;
  my @seq = split //, $sequence;

  @seq = reverse @seq;
  @seq = map { $self->{lookup}->{$_}->{complement}->{sym} } @seq;

  return join('', @seq);
}

#
# Translate a sequence using the alphabet.
#
# NO_ALIASES    Convert any alias symbols into the main symbol.
# NO_AMBIGS     Convert any ambiguous symbols into the main wildcard symbol.
# NO_UNKNOWN    Convert any unknown symbols into the main wildcard symbol.
#
sub translate_seq {
  my $self = shift;
  my ($sequence, %options) = @_;
  my $no_aliases = (defined($options{NO_ALIASES}) ? $options{NO_ALIASES} : 0);
  my $no_ambigs = (defined($options{NO_AMBIGS}) ? $options{NO_AMBIGS} : 0);
  my $no_unknown = (defined($options{NO_UNKNOWN}) ? $options{NO_UNKNOWN} : 0);
  my ($src, $dst) = ('', '');
  if ($no_aliases) {
    $src .= $self->{tr_src_core_alias_to_prime};
    $dst .= $self->{tr_dst_core_alias_to_prime};
    if ($no_ambigs) {
      $src .= $self->{tr_src_ambig_to_wild};
      $dst .= $self->{tr_dst_ambig_to_wild};
    } else {
      $src .= $self->{tr_src_ambig_alias_to_prime};
      $dst .= $self->{tr_dst_ambig_alias_to_prime};
    }
  } elsif ($no_ambigs) {
    $src .= $self->{tr_src_ambig_to_wild};
    $dst .= $self->{tr_dst_ambig_to_wild};
  }
  if ($no_unknown) {
    $src .= $self->{tr_src_unknown_to_wild};
    $dst .= $self->{tr_dst_unknown_to_wild};
  }

  eval {$sequence =~ tr/$src/$dst/};

  return $sequence;
}

#
# Returns a scalar of Weblogo 3 colour arguments:
# --color COLOR SYMBOLS DESCRIPTION ...
# Currently only operates over core symbols.
sub get_Weblogo_colour_args {
    my $self = shift;

    my $arg_string = "";

    foreach my $letter ($self->get_core()) {
        my $sym = $self->{lookup}->{$letter};

        # convert back from decoded hex, adding '#' prefix
        my $HTML_colour = sprintf("#%06X", $sym->{colour});

        $arg_string .= "--color '$HTML_colour' '$sym->{sym}' '$sym->{name}' ";
    }

    return $arg_string;
}

#
# Compare 2 different alphabet objects for equality
#
sub equals {
  my $self = shift;
  my ($other) = @_;
  return 0 if ($self->size_core() != $other->size_core());
  return 0 if ($self->size_full() != $other->size_full());
  return 0 if ($self->has_complement() != $other->has_complement());
  return 0 if (defined($self->{name}) != defined($other->{name}));
  if (defined($self->{name}) && defined($other->{name})) {
    return 0 if ($self->{name} ne $other->{name});
  }
  my $nsyms = $self->size_full();
  for (my $i = 0; $i < $nsyms; $i++) {
    my $self_sym = $self->{syms}->[$i];
    my $other_sym = $other->{syms}->[$i];
    return 0 if ($self_sym->{sym} ne $other_sym->{sym});
    return 0 if (defined($self_sym->{name}) != defined($other_sym->{name}));
    if (defined($self_sym->{name}) && defined($other_sym->{name})) {
      return 0 if ($self_sym->{name} ne $other_sym->{name});
    }
    return 0 if (defined($self_sym->{colour}) != defined($other_sym->{colour}));
    if (defined($self_sym->{colour}) && defined($other_sym->{colour})) {
      return 0 if ($self_sym->{colour} != $other_sym->{colour});
    }
    return 0 if (scalar(@{$self_sym->{aliases}}) != scalar(@{$other_sym->{aliases}}));
    my $naliases = scalar(@{$self_sym->{aliases}});
    for (my $j = 0; $j < $naliases; $j++) {
      return 0 if ($self_sym->{aliases}->[$j] ne $other_sym->{aliases}->[$j]);
    }
  }
  $nsyms = $self->size_core();
  for (my $i = 0; $i < $nsyms; $i++) {
    my $self_sym = $self->{syms}->[$i];
    my $other_sym = $other->{syms}->[$i];
    return 0 if (defined($self_sym->{complement}) != defined($other_sym->{complement}));
    if (defined($self_sym->{complement}) && defined($other_sym->{complement})) {
      return 0 if ($self_sym->{complement}->{sym} ne $other_sym->{complement}->{sym});
    }
  }
  $nsyms = $self->size_full();
  for (my $i = $self->size_core(); $i < $nsyms; $i++) {
    my $self_sym = $self->{syms}->[$i];
    my $other_sym = $other->{syms}->[$i];
    return 0 if (scalar(@{$self_sym->{comprise}}) != scalar(@{$other_sym->{comprise}}));
    for (my $j = 0; $j < scalar(@{$self_sym->{comprise}}); $j++) {
      return 0 if ($self_sym->{comprise}->[$j]->{sym} ne $other_sym->{comprise}->[$j]->{sym});
    }
  }
  return 1;
}

sub is_dna {
  my $self = shift;
  $self->{cached_is_dna} = $self->equals(&dna()) unless defined $self->{cached_is_dna};
  return $self->{cached_is_dna};
}

sub is_moddna {
  my $self = shift;
  $self->{cached_is_moddna} = $self->equals(&moddna()) unless defined $self->{cached_is_moddna};
  return $self->{cached_is_moddna};
}

sub is_rna {
  my $self = shift;
  $self->{cached_is_rna} = $self->equals(&rna()) unless defined $self->{cached_is_rna};
  return $self->{cached_is_rna};
}

sub is_protein {
  my $self = shift;
  $self->{cached_is_protein} = $self->equals(&protein()) unless defined $self->{cached_is_protein};
  return $self->{cached_is_protein};
}

#
# Helper function to decode a string encoded using the method defined in the JSON specification.
#
sub _decode_JSON_string {
  my ($text) = @_;

  return unless defined $text;

  $text =~ s/\\(["\\\/])/$1/g;
  $text =~ s/\\b/\x{8}/g;
  $text =~ s/\\f/\x{C}/g;
  $text =~ s/\\n/\x{A}/g;
  $text =~ s/\\r/\x{D}/g;
  $text =~ s/\\t/\x{9}/g;
  $text =~ s/\\u([0-9A-Fa-f]{4})/chr(hex($1))/ge;

  return $text;
}

#
# Helper function to encode a string using the method defined in the JSON specification.
#
sub _encode_JSON_string {
  my ($text) = @_;

  $text =~ s/(["\\\/])/\\$1/g;
  $text =~ s/\x{8}/\\b/g;
  $text =~ s/\x{C}/\\f/g;
  $text =~ s/\x{A}/\\n/g;
  $text =~ s/\x{D}/\\r/g;
  $text =~ s/\x{9}/\\t/g;
  $text =~ s/([\x{0}-\x{1F}\x{7F}-\x{9F}\x{2028}\x{2029}])/sprintf("\\u%.04X",ord($1))/ge;

  return $text;
}

#
# Helper function to convert the colour into a number.
#
sub _decode_colour {
  my ($text) = @_;

  return unless defined $text;

  return hex($text);
}

# _symbol_cmp
# Sorts so letters are before numbers which are before symbols.
# Otherwise sorts using normal string comparison.
sub _symbol_cmp($$) {  ## no critic
  my ($a, $b) = @_;

  if (length($a) == length($b)) {
    for (my $i = 0; $i < length($a); $i++) {
      my ($sym_a, $sym_b);
      $sym_a = ord(substr($a, $i, 1));
      $sym_b = ord(substr($b, $i, 1));
      # check to see if either is a letter
      my ($is_letter_a, $is_letter_b);
      $is_letter_a = (($sym_a >= ord('A') && $sym_a <= ord('Z')) || ($sym_a >= ord('a') && $sym_a <= ord('z')));
      $is_letter_b = (($sym_b >= ord('A') && $sym_b <= ord('Z')) || ($sym_b >= ord('a') && $sym_b <= ord('z')));
      if ($is_letter_a) {
        if ($is_letter_b) {
          if ($sym_a < $sym_b) {
            return -1;
          } elsif ($sym_b < $sym_a) {
            return 1;
          }
          next;
        } else {
          return -1;
        }
      } elsif ($is_letter_b) {
        return 1;
      }

      # check to see if either is a number
      my ($is_num_a, $is_num_b);
      $is_num_a = ($sym_a ge '0' && $sym_a le '9');
      $is_num_b = ($sym_b ge '0' && $sym_b le '9');
      if ($is_num_a) {
        if ($is_num_b) {
          if ($sym_a < $sym_b) {
            return -1;
          } elsif ($sym_b < $sym_a) {
            return 1;
          }
          next;
        } else {
          return -1;
        }
      } elsif ($is_num_b) {
        return 1;
      }

      # both must be symbols
      if ($sym_a < $sym_b) {
        return -1;
      } elsif ($sym_b < $sym_a) {
        return 1;
      }
    }

    return 0;
  } elsif (length($a) > length($b)) {
    return -1;
  } else {
    return 1;
  }
}

sub _symobj_cmp($$) {  ## no critic
  my ($sym1, $sym2) = @_;
  my $ret;

  # compare comprise
  if (defined($sym1->{comprise}) && defined($sym2->{comprise})) {
    $ret = _symbol_cmp($sym1->{comprise}, $sym2->{comprise});
    return $ret if $ret != 0;
  } elsif (defined($sym1->{comprise})) {
    return 1; # sym2 first because it is a core symbol
  } elsif (defined($sym2->{comprise})) {
    return -1; # sym1 first because it is a core symbol
  }

  # compare symbol
  $ret = _symbol_cmp($sym1->{sym}, $sym2->{sym});
  return $ret if $ret != 0;

  # compare complement
  if (defined($sym1->{complement}) && defined($sym2->{complement})) {
    $ret = _symbol_cmp($sym1->{complement}, $sym2->{complement});
    return $ret if $ret != 0;
  } elsif (defined($sym1->{complement})) {
    return 1;
  } elsif (defined($sym2->{complement})) {
    return -1;
  }

  # compare aliases
  if (scalar(@{$sym1->{aliases}}) != scalar(@{$sym2->{aliases}})) {
    # sort length decending
    return scalar(@{$sym2->{aliases}}) <=> scalar(@{$sym1->{aliases}});
  }
  for (my $i = 0; $i < scalar(@{$sym1->{aliases}}); $i++) {
    $ret = _symbol_cmp($sym1->{aliases}->[$i], $sym2->{aliases}->[$i]);
    return $ret if $ret != 0;
  }

  # compare name
  if (defined($sym1->{name}) && defined($sym2->{name})) {
    $ret = $sym1->{name} cmp $sym2->{name};
    return $ret if $ret != 0;
  } elsif (defined($sym1->{name})) {
    return 1;
  } elsif (defined($sym2->{name})) {
    return -1;
  }

  # compare colour
  if (defined($sym1->{colour}) && defined($sym2->{colour})) {
    return $sym1->{colour} <=> $sym2->{colour};
  } elsif (defined($sym1->{colour})) {
    return 1;
  } elsif (defined($sym2->{colour})) {
    return -1;
  }
  return 0;
}

#
# Return the RNA alphabet.
#
sub rna {
  my $alph = new __PACKAGE__;

  $alph->parse_header('RNA', 'RNA');

  # core symbols
  $alph->parse_symbol('A', 'Adenine', 0xCC0000);
  $alph->parse_symbol('C', 'Cytosine', 0x0000CC);
  $alph->parse_symbol('G', 'Guanine', 0xFFB300);
  $alph->parse_symbol('U', 'Uracil', 0x008000, undef, undef, 'T');
  # ambiguous symbols
  $alph->parse_symbol('W', 'Weak', undef, undef, 'AU');
  $alph->parse_symbol('S', 'Strong', undef, undef, 'CG');
  $alph->parse_symbol('M', 'Amino', undef, undef, 'AC');
  $alph->parse_symbol('K', 'Keto', undef, undef, 'GU');
  $alph->parse_symbol('R', 'Purine', undef, undef, 'AG');
  $alph->parse_symbol('Y', 'Pyrimidine', undef, undef, 'CU');
  $alph->parse_symbol('B', 'Not A', undef, undef, 'CGU');
  $alph->parse_symbol('D', 'Not C', undef, undef, 'AGU');
  $alph->parse_symbol('H', 'Not G', undef, undef, 'ACU');
  $alph->parse_symbol('V', 'Not U', undef, undef, 'ACG');
  $alph->parse_symbol('N', 'Any base', undef, undef, 'ACGU', 'X.');
  # for now treat '.' (gap) as a wildcard

  # process
  $alph->parse_done();

  # error check
  die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());

  return $alph;
}

#
# Parse core symbols in the DNA alphabet.
# This is done separately, to ease expansion.
# All core symbols must be parsed before any ambiguous symbols.
# Optionally accepts arguments to set colours (alphabetical order).
#
sub _parse_core_DNA_symbols {
  my $alph = shift;

  $$alph->parse_symbol('A', 'Adenine', shift || 0xCC0000, 'T');
  $$alph->parse_symbol('C', 'Cytosine', shift || 0x0000CC, 'G');
  $$alph->parse_symbol('G', 'Guanine', shift || 0xFFB300, 'C');
  $$alph->parse_symbol('T', 'Thymine', shift || 0x008000, 'A', undef, 'U');
}

#
# Parse ambiguous symbols in the DNA alphabet.
# This is done separately, to ease expansion.
#
sub _parse_ambiguous_DNA_symbols {
  my $alph = shift;

  $$alph->parse_symbol('W', 'Weak', undef, undef, 'AT');
  $$alph->parse_symbol('S', 'Strong', undef, undef, 'CG');
  $$alph->parse_symbol('M', 'Amino', undef, undef, 'AC');
  $$alph->parse_symbol('K', 'Keto', undef, undef, 'GT');
  $$alph->parse_symbol('R', 'Purine', undef, undef, 'AG');
  $$alph->parse_symbol('Y', 'Pyrimidine', undef, undef, 'CT');
  $$alph->parse_symbol('B', 'Not A', undef, undef, 'CGT');
  $$alph->parse_symbol('D', 'Not C', undef, undef, 'AGT');
  $$alph->parse_symbol('H', 'Not G', undef, undef, 'ACT');
  $$alph->parse_symbol('V', 'Not T', undef, undef, 'ACG');
  $$alph->parse_symbol('N', 'Any base', undef, undef, 'ACGT', 'X.');
}

#
# Return the DNA alphabet.
#
sub dna {
  my $alph = new __PACKAGE__;

  $alph->parse_header('DNA', 'DNA');

  _parse_core_DNA_symbols(\$alph);
  _parse_ambiguous_DNA_symbols(\$alph);

  # for now treat '.' (gap) as a wildcard
  # process
  $alph->parse_done();

  die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());

  return $alph;
}

#
# Return the DNA with cytosine modifications alphabet.
#
sub moddna {
  my $alph = new __PACKAGE__;

  $alph->parse_header('modDNA', 'DNA');

  _parse_core_DNA_symbols(\$alph, 0x8510A8, 0xA50026, 0x313695, 0xA89610);

  # additional core symbols
  $alph->parse_symbol('m', '5-Methylcytosine', 0xD73027, '1');
  $alph->parse_symbol('1', 'Guanine:5-Methylcytosine', 0x4575B4, 'm');
  $alph->parse_symbol('h', '5-Hydroxymethylcytosine', 0xF46D43, '2');
  $alph->parse_symbol('2', 'Guanine:5-Hydroxymethylcytosine', 0x74ADD1, 'h');
  $alph->parse_symbol('f', '5-Formylcytosine', 0xFDAE61, '3');
  $alph->parse_symbol('3', 'Guanine:5-Formylcytosine', 0xABD9E9, 'f');
  $alph->parse_symbol('c', '5-Carboxylcytosine', 0xFEE090, '4');
  $alph->parse_symbol('4', 'Guanine:5-Carboxylcytosine', 0xE0F3F8, 'c');

  _parse_ambiguous_DNA_symbols(\$alph);

  # additional ambiguous symbols
  $alph->parse_symbol('z', 'Any C mod', undef, undef, 'Cmhfc');
  $alph->parse_symbol('9', 'Guanine:any C mod', undef, undef, 'G1234');
  $alph->parse_symbol('y', 'C, not (hydroxy)methylcytosine', undef, undef, 'Cfc');
  $alph->parse_symbol('8', 'Guanine:C, not (hydroxy)methylcytosine', undef, undef, 'G34');
  $alph->parse_symbol('x', '(Hydroxy)methylcytosine', undef, undef, 'mh');
  $alph->parse_symbol('7', 'Guanine:(hydroxy)methylcytosine', undef, undef, '12');
  $alph->parse_symbol('w', '(Formyl/carboxyl)cytosine', undef, undef, 'fc');
  $alph->parse_symbol('6', 'Guanine:(formyl/carboxyl)cytosine', undef, undef, '34');

  # process
  $alph->parse_done();

  die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());

  return $alph;
}

#
# Return the Protein alphabet.
#
sub protein {
  my $alph = new __PACKAGE__;

  $alph->parse_header('Protein', 'PROTEIN');

  # core symbols
  $alph->parse_symbol('A', 'Alanine', 0x0000CC);
  $alph->parse_symbol('R', 'Arginine', 0xCC0000);
  $alph->parse_symbol('N', 'Asparagine', 0x008000);
  $alph->parse_symbol('D', 'Aspartic acid', 0xFF00FF);
  $alph->parse_symbol('C', 'Cysteine', 0x0000CC);
  $alph->parse_symbol('E', 'Glutamic acid', 0xFF00FF);
  $alph->parse_symbol('Q', 'Glutamine', 0x008000);
  $alph->parse_symbol('G', 'Glycine', 0xFFB300);
  $alph->parse_symbol('H', 'Histidine', 0xFFCCCC);
  $alph->parse_symbol('I', 'Isoleucine', 0x0000CC);
  $alph->parse_symbol('L', 'Leucine', 0x0000CC);
  $alph->parse_symbol('K', 'Lysine', 0xCC0000);
  $alph->parse_symbol('M', 'Methionine', 0x0000CC);
  $alph->parse_symbol('F', 'Phenylalanine', 0x0000CC);
  $alph->parse_symbol('P', 'Proline', 0xFFFF00);
  $alph->parse_symbol('S', 'Serine', 0x008000);
  $alph->parse_symbol('T', 'Threonine', 0x008000);
  $alph->parse_symbol('W', 'Tryptophan', 0x0000CC);
  $alph->parse_symbol('Y', 'Tyrosine', 0x33E6CC);
  $alph->parse_symbol('V', 'Valine', 0x0000CC);
  # ambiguous symbols
  $alph->parse_symbol('B', 'Asparagine or Aspartic acid', undef, undef, 'ND');
  $alph->parse_symbol('Z', 'Glutamine or Glutamic acid', undef, undef, 'QE');
  $alph->parse_symbol('J', 'Leucine or Isoleucine', undef, undef, 'LI');
  $alph->parse_symbol('X', 'Any amino acid', undef, undef, 'ARNDCEQGHILKMFPSTWYV', '*.');
  # for now treat '*' (stop codon) and '.' (gap) as a wildcard

  # process
  $alph->parse_done();

  # error check
  die("Uhoh, this shouldn't happen:\n" + join("\n", $alph->get_errors())) if ($alph->has_errors());

  return $alph;
}

1;
__END__