Yet Another CPAN Grep

Math-NumSeq/lib/Math/NumSeq/OEIS/File.pm

# Copyright 2011, 2012, 2013, 2014, 2016, 2018, 2019, 2020 Kevin Ryde

# This file is part of Math-NumSeq.
#
# Math-NumSeq is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3, or (at your option) any later
# version.
#
# Math-NumSeq is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with Math-NumSeq.  If not, see <http://www.gnu.org/licenses/>.


# http://oeis.org/wiki/Clear-cut_examples_of_keywords
#
# ENHANCE-ME: share most of the a-file/b-file reading with Math::NumSeq::File
#
# FIXME: When using a b-file only, does i_start come out right?  Meaning the
# first index in the file.

package Math::NumSeq::OEIS::File;
use 5.004;
use strict;
use Carp;
use POSIX ();
use File::Spec;
use Symbol 'gensym';

use vars '$VERSION','@ISA';
$VERSION = 75;

use Math::NumSeq;
@ISA = ('Math::NumSeq');
*_to_bigint = \&Math::NumSeq::_to_bigint;

use vars '$VERSION';
$VERSION = 75;

eval q{use Scalar::Util 'weaken'; 1}
  || eval q{sub weaken { $_[0] = undef }; 1 }
  || die "Oops, error making a weaken() fallback: $@";

# uncomment this to run the ### lines
# use Smart::Comments;


# use constant name => Math::NumSeq::__('OEIS File');
use Math::NumSeq::OEIS;
*parameter_info_array = \&Math::NumSeq::OEIS::parameter_info_array;

use constant::defer _HAVE_ENCODE => sub {
  eval { require Encode; 1 } || 0;
};

sub description {
  my ($class_or_self) = @_;
  if (ref $class_or_self && defined $class_or_self->{'description'}) {
    # instance
    return $class_or_self->{'description'};
  } else {
    # class
    return Math::NumSeq::__('OEIS sequence from file.');
  }
}

sub values_min {
  my ($self) = @_;
  ### OEIS-File values_min() ...
  return _analyze($self)->{'values_min'};
}
sub values_max {
  my ($self) = @_;
  ### OEIS-File values_max() ...
  return _analyze($self)->{'values_max'};
}

my %analyze_characteristics = (increasing            => 1,
                               increasing_from_i     => 1,
                               non_decreasing        => 1,
                               non_decreasing_from_i => 1,
                               smaller               => 1,
                              );
sub characteristic {
  my ($self, $key) = @_;
  if ($analyze_characteristics{$key}) {
    _analyze($self);
  }
  return shift->SUPER::characteristic(@_);
}

sub oeis_dir {
  require File::HomeDir;
  return File::Spec->catfile (File::HomeDir->my_home, 'OEIS');
}
sub anum_to_bfile {
  my ($anum, $prefix) = @_;
  $prefix ||= 'b';
  $anum =~ s/^A/$prefix/;
  return "$anum.txt";
}

#------------------------------------------------------------------------------
# Keep track of all instances which exist and on an ithread CLONE re-open
# any filehandles in the instances, so they have their own independent file
# positions in the new thread.

my %instances;
sub DESTROY {
  my ($self) = @_;
  delete $instances{$self+0};
}
sub CLONE {
  my ($class) = @_;
  foreach my $self (values %instances) {
    next unless $self;
    next unless $self->{'fh'};
    my $pos = _tell($self);
    my $fh = gensym;
    if (open $fh, "< $self->{'filename'}") {
      $self->{'fh'} = $fh;
      _seek ($self, $pos);
    } else {
      delete $self->{'fh'};
      delete $self->{'filename'};
    }
  }
}

#------------------------------------------------------------------------------

# The length in decimal digits of the biggest value which fits in a plain
# Perl integer.  For example on a 32-bit system this is 9 since 9 digit
# numbers such as "999_999_999" are the biggest which fit a signed IV
# (+2^31).
#
# The IV size is probed rather than using ~0 since under "perl -Minteger"
# have ~0 as -1 rather than the biggest UV ... except "use integer" is not
# normally global.
#
# The NV size is applied to the limit too since not sure should trust values
# to stay in IV or UV.  This means on a 64-bit integer with 53-bit NV
# "double" the limit is 53-bits.
#
use constant 1.02 _MAX_DIGIT_LENGTH => do {
  ### ~0 is: ~0

  my $iv = 0;
  for (1 .. 256) {
    my $new = ($iv << 1) | 1;
    unless ($new > $iv && ($new & 1) == 1) {
      last;
    }
    $iv = $new;
  }
  ### $iv

  require POSIX;
  my $nv = POSIX::FLT_RADIX() ** (POSIX::DBL_MANT_DIG()-5);
  ### $nv

  my $iv_len = length($iv) - 1;
  my $nv_len = length($nv) - 1;
  ($iv_len < $nv_len ? $iv_len : $nv_len)  # smaller of the two lengths;
};
### _MAX_DIGIT_LENGTH: _MAX_DIGIT_LENGTH()


#------------------------------------------------------------------------------

# special case a000000.txt files to exclude
#
my %afile_exclude
  = (
     # a003849.txt has replication level words rather than the individual
     # sequence values.
     'a003849.txt' => 1,

     # a027750.txt is unflattened divisors as lists.
     # Its first line is a correct looking "1 1" so _afile_is_good() doesn't
     # notice.
     'a027750.txt' => 1,
    );


# Fields:
#   fh          File handle ref, if reading B-file or A-file
#
#   next_seek   File pos to seek $fh for next() to read from.
#               ith() sets this when it moves the file position.
#
#   array       Arrayref of values if using .internal or .html.
#   array_pos   Index 0,1,2,... of next value of $array to return by next().
#
#   i           Next $i for next() to return.
#               When reading a file this is ignored, use the file i instead.

sub new {
  ### OEIS-File new() ...
  my $self = shift->SUPER::new(@_);

  delete $self->{'next_seek'}; # no initial seek
  $self->{'characteristic'}->{'integer'} = 1;

  my $anum = $self->{'anum'};
  (my $num = $anum) =~ s/^A//;
  if (defined(my $filename = $self->{'_b_filename'})) {
    $self->{'filename'} = $filename; # the B-file or A-file name
    my $fh = gensym();
    $self->{'fh'} = $fh;
    open $fh, "< $filename"
      or croak "Cannot open $filename: ", $!;
    $self->{'fh_i'} = $self->i_start;  # at first entry

  } else {
    foreach my $basefile ("a$num.txt",
                          "b$num.txt"
                         ) {
      next if $afile_exclude{$basefile};

      next if $self->{'_dont_use_afile'} && $basefile =~ /^a/;
      next if $self->{'_dont_use_bfile'} && $basefile =~ /^b/;

      my $filename = File::Spec->catfile (oeis_dir(), $basefile);
      ### $filename
      my $fh = gensym();
      if (! open $fh, "< $filename") {
        ### cannot open: $!
        next;
      }

      $self->{'filename'} = $filename; # the B-file or A-file name
      $self->{'fh'} = $fh;
      if (! _afile_is_good($self)) {
        ### this afile not good ...
        close delete $self->{'fh'};
        delete $self->{'filename'};
        next;
      }
      ### opened: $fh
      $self->{'fh_i'} = $self->i_start;  # at first entry
      last;
    }
  }


  my $have_info = (_read_internal_txt($self, $anum)
                   || _read_internal_html($self, $anum)
                   || _read_html($self, $anum));

  if (! $have_info && ! $self->{'fh'}) {
    croak 'OEIS file(s) not found for A-number "',$anum,'"';
  }

  weaken($instances{$self+0} = $self);
  return $self;
}

sub _analyze {
  my ($self) = @_;

  if ($self->{'analyze_done'}) {
    return $self;
  }
  $self->{'analyze_done'} = 1;

  ### _analyze() ...

  my $i_start = $self->i_start;
  my ($i, $value);
  my ($prev_i, $prev_value);

  my $values_min;
  my $values_max;
  my $increasing_from_i = $i_start;
  my $non_decreasing_from_i = $i_start;
  my $strictly_smaller_count = 0;
  my $smaller_count = 0;
  my $total_count = 0;

  my $analyze = sub {
    ### $prev_value
    ### $value
    if (! defined $values_min || $value < $values_min) {
      $values_min = $value;
    }
    if (! defined $values_max || $value > $values_max) {
      $values_max = $value;
    }

    if (defined $prev_value) {
      my $cmp = ($value <=> $prev_value);
      if ($cmp < 0) {
        # value < $prev_value
        $increasing_from_i = $i;
        $non_decreasing_from_i = $i;
      }
      if ($cmp <= 0) {
        # value <= $prev_value
        $increasing_from_i = $i;
      }
    }

    $total_count++;
    $smaller_count += (abs($value) <= $i);
    $strictly_smaller_count += ($value < $i);

    $prev_i = $value;
    $prev_value = $value;
  };

  if (my $fh = $self->{'fh'}) {
    my $oldpos = _tell($self);
    while (($i, $value) = _readline($self)) {
      $analyze->($value);
      last if $total_count > 200;
    }
    _seek ($self, $oldpos);
  } else {
    $i = $i_start;
    foreach (@{$self->{'array'}}) {
      $i++;
      $value = $_;
      $analyze->();
    }
  }

  my $range_is_small = (defined $values_max
                        && $values_max - $values_min <= 16);
  ### $range_is_small

  # "full" means whole sequence in sample values
  # "sign" means negatives in sequence
  if (! defined $self->{'values_min'}
      && ($range_is_small
          || $self->{'characteristic'}->{'OEIS_full'}
          || ! $self->{'characteristic'}->{'OEIS_sign'})) {
    ### set values_min: $values_min
    $self->{'values_min'} = $values_min;
  }
  if (! defined $self->{'values_max'}
      && ($range_is_small
          || $self->{'characteristic'}->{'OEIS_full'})) {
    ### set values_max: $values_max
    $self->{'values_max'} = $values_max;
  }

  $self->{'characteristic'}->{'smaller'}
    = ($total_count == 0
       || ($smaller_count / $total_count >= .9
           && $strictly_smaller_count > 0));
  ### decide smaller: $self->{'characteristic'}->{'smaller'}

  ### $increasing_from_i
  if (defined $prev_i && $increasing_from_i < $prev_i) {
    if ($increasing_from_i - $i_start < 20) {
      $self->{'characteristic'}->{'increasing_from_i'} = $increasing_from_i;
    }
    if ($increasing_from_i == $i_start) {
      $self->{'characteristic'}->{'increasing'} = 1;
    }
  }

  ### $non_decreasing_from_i
  if (defined $prev_i && $non_decreasing_from_i < $prev_i) {
    if ($non_decreasing_from_i - $i_start < 20) {
      $self->{'characteristic'}->{'non_decreasing_from_i'} = $non_decreasing_from_i;
    }
    if ($non_decreasing_from_i == $i_start) {
      $self->{'characteristic'}->{'non_decreasing'} = 1;
    }
  }

  return $self;
}

# # compare $x <=> $y but in strings in case they're bigger than IV or NV
# # my $cmp = _value_cmp ($value, $prev_value);
# sub _value_cmp {
#   my ($x, $y) = @_;
#   ### _value_cmp(): "$x  $y"
#   ### cmp: $x cmp $y
#
#   my $x_neg = substr($x,0,1) eq '-';
#   my $y_neg = substr($y,0,1) eq '-';
#   ### $x_neg
#   ### $y_neg
#
#   return ($y_neg <=> $x_neg
#           || ($x_neg ? -1 : 1) * (length($x) <=> length($y)
#                                   || $x cmp $y));
# }

sub _seek {
  my ($self, $pos) = @_;
  seek ($self->{'fh'}, $pos, 0)
    or croak "Cannot seek $self->{'filename'}: $!";
}
sub _tell {
  my ($self) = @_;
  my $pos = tell $self->{'fh'};
  if ($pos < 0) {
    croak "Cannot tell file position $self->{'filename'}: $!";
  }
  return $pos;
}

sub rewind {
  my ($self) = @_;
  ### OEIS-File rewind() ...

  $self->{'i'} = $self->i_start;
  $self->{'array_pos'} = 0;
  $self->{'next_seek'} = 0;
}

sub next {
  my ($self) = @_;
  ### OEIS-File next(): "i=$self->{'i'}"

  my $value;
  if (my $fh = $self->{'fh'}) {
    ### from readline ...
    if (defined (my $pos = delete $self->{'next_seek'})) {
      ### seek to: $pos
      _seek($self, $pos);
    }
    return _readline($self);

  } else {
    ### from array ...
    my ($value) = _array_value($self, $self->{'array_pos'}++)
      or return;
    return ($self->{'i'}++, $value);
  }
}

# Return $self->{'array'}->[$pos], or no values if $pos past end of array.
# Array values are promoted to BigInt if necessary.
sub _array_value {
  my ($self, $pos) = @_;
  ### _array_value(): $pos

  my $array = $self->{'array'};
  if ($pos > $#$array) {
    ### past end of array ...
    return;
  }
  my $value = $array->[$pos];

  # large values as Math::BigInt
  # initially $array has strings, make bigint objects when required
  if (! ref $value && length($value) > _MAX_DIGIT_LENGTH) {
    $value = $array->[$pos] = _to_bigint($value);
  }
  ### $value
  return $value;
}

# Read a line from an open B-file or A-file, return ($i,$value).
# At EOF return empty ().
#
sub _readline {
  my ($self) = @_;
  my $fh = $self->{'fh'};
  while (defined (my $line = <$fh>)) {
    chomp $line;
    $line =~ tr/\r//d;    # delete CR if CRLF line endings, eg. b009000.txt
    ### $line

    if ($line =~ /^\s*(#|$)/) {
      ### ignore blank or comment ...
      # comment lines with "#" eg. b002182.txt
      next;
    }

    # leading whitespace allowed as per b195467.txt
    if (my ($i, $value) = ($line =~ /^\s*
                                     ([0-9]+)      # i
                                     [ \t]+
                                     (-?[0-9]+)    # value
                                     [ \t]*
                                     $/x)) {
      ### _readline: "$i  $value"
      if (length($value) > _MAX_DIGIT_LENGTH) {
        $value = _to_bigint($value);
      }
      $self->{'fh_i'} = $i+1;
      return ($i, $value);
    }
  }
  undef $self->{'fh_i'};
  return;
}

# Return true if the a000000.txt file in $self->{'fh'} looks good.
# Various a-files are source code or tables rather than sequence values.
#
sub _afile_is_good {
  my ($self) = @_;
  my $fh = $self->{'fh'};
  my $good = 0;
  my $prev_i;
  while (defined (my $line = <$fh>)) {
    chomp $line;
    $line =~ tr/\r//d;    # delete CR if CRLF line endings, eg. b009000.txt
    ### $line

    if ($line =~ /^\s*(#|$)/) {
      ### ignore blank or comment ...
      next;
    }

    # Must have line like "0 123".  Can have negative OFFSET and so index i,
    # eg. A166242 (though that one doesn't have an A-file).
    my ($i,$value) = ($line =~ /^(-?[0-9]+)     # i
                                [ \t]+
                                (-?[0-9]+)    # value
                                [ \t]*
                                $/x)
      or last;

    if (defined $prev_i && $i != $prev_i+1) {
      ### bad A-file, initial "i" values not consecutive ...
      last;
    }
    $prev_i = $i;

    $good++;
    if ($good >= 3) {
      ### three good lines, A-file is good ...
      _seek ($self, 0);
      return 1;
    }
  }
  return 0;
}

sub _read_internal_txt {
  my ($self, $anum) = @_;
  ### _read_internal_txt(): $anum

  return 0 if $self->{'_dont_use_internal'};

  foreach my $basefile ("$anum.internal.txt") {
    my ($fullname, $contents) = _slurp_oeis_file($self,$basefile)
      or next;
    if (_HAVE_ENCODE) {
      # "Internal" text format is utf-8.
      $contents = Encode::decode('utf-8', $contents, Encode::FB_PERLQQ());
    }

    ### $contents

    # eg. "%O A007318 0,5" is OFFSET=0
    my $offset;
    if ($contents =~ /^%O\s+\Q$anum\E\s+(-?\d+)/m) {
      $offset = $1;
      ### %O line: $offset
    } else {
      $offset = 0;
    }

    # eg. "%N A007318 Pascal's triangle ..."
    if ($contents =~ m{^%N\s+\Q$anum\E\s+(.*)}im) {
      _set_description ($self, $1);
    } else {
      ### description not matched ...
    }

    # eg. "%K A007318 nonn,tabl,nice,easy,core,look,hear,changed"
    _set_characteristics ($self,
                          $contents =~ /^%K\s+\Q$anum\E\s+(.*)/im && $1);

    # the eishelp1.html says
    # %V,%W,%X lines for signed sequences
    # %S,%T,%U lines for non-negative sequences
    # though now %S is signed and unsigned both is it?
    #
    if (! $self->{'fh'}) {
      my @samples;
      # capital %STU etc, but any case <tt>
      while ($contents =~ m{^%[VWX]\s+\Q$anum\E\s+(.*)}mg) {
        push @samples, $1;
      }
      unless (@samples) {
        while ($contents =~ m{^%[STU]\s+\Q$anum\E\s+(.*)}mg) {
          push @samples, $1;
        }
        unless (@samples) {
          croak "Oops list of values not found in ",$self->{'filename'};
        }
      }
      # join multiple lines of samples
      _split_sample_values ($self, join(', ',@samples));
    }

    # %O "OFFSET" is subscript of first number.
    # Or for digit expansions it's the number of terms before the decimal
    # point, per http://oeis.org/eishelp2.html#RO
    #
    ### digits: $self->{'characteristic'}->{'digits'}
    unless ($self->{'characteristic'}->{'digits'}) {
      $self->{'i'} = $self->{'i_start'} = $offset;
    }
    ### i: $self->{'i'}
    ### i_start: $self->{'i_start'}

    return 1; # success
  }

  return 0; # file not found
}

sub _read_internal_html {
  my ($self, $anum) = @_;
  ### _read_internal_html(): $anum

  return 0 if $self->{'_dont_use_internal'};

  foreach my $basefile ("$anum.internal.html") {
    my ($fullname, $contents) = _slurp_oeis_file($self,$basefile)
      or next;
    # "Internal" files are served as html with a <meta> charset indicator
    $contents = _decode_html_charset($contents);
    ### $contents

    my $offset;
    if ($contents =~ /(^|<tt>)%O\s+(\d+)/im) {
      $offset = $2;
      ### %O line: $offset
    } else {
      $offset = 0;
    }

    if ($contents =~ m{(^|<tt>)%N (.*?)(<tt>|$)}im) {
      _set_description ($self, $2);
    } else {
      ### description not matched ...
    }

    _set_characteristics ($self,
                          $contents =~ /(^|<tt>)%K (.*?)(<tt>|$)/im
                          && $2);

    # the eishelp1.html says
    # %V,%W,%X lines for signed sequences
    # %S,%T,%U lines for non-negative sequences
    # though now %S is signed and unsigned both is it?
    #
    if (! $self->{'fh'}) {
      my @samples;
      # capital %STU etc, but any case <tt>
      while ($contents =~ m{(^|<[tT][tT]>)%[VWX] (.*?)(</[tT][tT]>|$)}mg) {
        push @samples, $2;
      }
      unless (@samples) {
        while ($contents =~ m{(^|<[tT][tT]>)%[STU] (.*?)(</[tT][tT]>|$)}mg) {
          push @samples, $2;
        }
        unless (@samples) {
          croak "Oops list of values not found in ",$self->{'filename'};
        }
      }
      # join multiple lines of samples
      _split_sample_values ($self, join(', ',@samples));
    }

    # %O "OFFSET" is subscript of first number.
    # Or for digit expansions it's the number of terms before the decimal
    # point, per http://oeis.org/eishelp2.html#RO
    #
    unless ($self->{'characteristic'}->{'digits'}) {
      $self->{'i'} = $self->{'i_start'} = $offset;
    }
    ### i: $self->{'i'}
    ### i_start: $self->{'i_start'}

    return 1; # success
  }

  return 0; # file not found
}

# Fill $self with contents of ~/OEIS/A000000.html but various fragile greps
# of the html.
# Return 1 if .html or .htm file exists, 0 if not.
#
sub _read_html {
  my ($self, $anum) = @_;
  ### _read_html(): $anum

  return 0 if $self->{'_dont_use_html'};

  foreach my $basefile ("$anum.html", "$anum.htm") {
    my ($fullname, $contents) = _slurp_oeis_file($self,$basefile)
      or next;
    $contents = _decode_html_charset($contents);

    if ($contents =~
        m{$anum[ \t]*\r?\n.*?       # target anum
          <td[^>]*>\s*(?:</td>)? # <td ...></td> empty
          <td[^>]*>              # <td ...>
          \s*
          (.*?)                  # text through to ...
          (<br>|</?td)           # <br> or </td> or <td>
       }isx) {
      _set_description ($self, $1);
    } else {
      ### description not matched ...
    }

    my $offset = ($contents =~ /OFFSET.*?<[tT][tT]>(\d+)/s
                  && $1);
    ### $offset

    # fragile grep out of the html ...
    my $keywords;
    if ($contents =~ m{KEYWORD.*?<[tT][tT][^>]*>(.*?)</[tT][tT]>}s) {
      $keywords = $1;
      ### html keywords match: $keywords
    } else {
      # die "Oops, KEYWORD not matched: $anum";
    }
    _set_characteristics ($self, $keywords);

    if (! $self->{'fh'}) {
      # fragile grep out of the html ...
      $contents =~ s{>graph</a>.*}{};
      $contents =~ m{.*<tt>([^<]+)</tt>}i;
      my $list = $1;
      _split_sample_values ($self, $list);
    }

    # %O "OFFSET" is subscript of first number, but for digit expansions
    # it's the position of the decimal point
    # http://oeis.org/eishelp2.html#RO
    if (! $self->{'characteristic'}->{'digits'}) {
      $self->{'i'} = $self->{'i_start'} = $offset;
    }
    ### i: $self->{'i'}
    ### i_start: $self->{'i_start'}

    return 1;
  }
  return 0;
}

# Return the contents of ~/OEIS/$filename.
# $filename is like "A000000.html" to be taken relative to oeis_dir().
# If $filename cannot be read then return undef.
sub _slurp_oeis_file {
  my ($self,$filename) = @_;
  $filename = File::Spec->catfile (oeis_dir(), $filename);
  ### $filename

  if (! open FH, "< $filename") {
    ### cannot open file: $!
    return;
  }
  my $contents = do { local $/; <FH> }; # slurp
  close FH
    or return;
  $self->{'filename'} ||= $filename;
  return ($filename, $contents);
}

sub _set_description {
  my ($self, $description) = @_;
  ### _set_description(): $description

  $description =~ s/\s+$//;       # trailing whitespace
  $description =~ s/\s+/ /g;      # collapse whitespace
  $description =~ s/<[^>]*?>//sg; # tags <foo ...>
  $description =~ s/&lt;/</ig;    # unentitize <
  $description =~ s/&gt;/>/ig;    # unentitize >
  $description =~ s/&amp;/&/ig;   # unentitize &
  $description =~ s/&#(\d+);/chr($1)/ge; # unentitize numeric ' and "

  # ENHANCE-ME: maybe __x() if made available, or an sprintf "... %s" would
  # be enough ...
  $description .= "\n";
  if ($self->{'fh'}) {
    $description .= sprintf(Math::NumSeq::__('Values from B-file %s'),
                            $self->{'filename'})
  } else {
    $description .= sprintf(Math::NumSeq::__('Values from %s'),
                            $self->{'filename'})
  }
  $self->{'description'} = $description;
}

sub _set_characteristics {
  my ($self, $keywords) = @_;
  ### _set_characteristics()
  ### $keywords

  if (! defined $keywords) {
    return; # if perhaps match of .html failed
  }

  $keywords =~ s{<[^>]*>}{}g;  # <foo ...> tags
  ### $keywords

  foreach my $key (split /[, \t]+/, ($keywords||'')) {
    ### $key
    $self->{'characteristic'}->{"OEIS_$key"} = 1;
  }

  # if ($self->{'characteristic'}->{'OEIS_cofr'}) {
  #   $self->{'characteristic'}->{'continued_fraction'} = 1;
  # }

  # "cons" means decimal digits of a constant
  # but don't reckon A000012 all-ones that way
  # "base" means non-decimal, it seems, maybe
  if ($self->{'characteristic'}->{'OEIS_cons'}
      && ! $self->{'characteristic'}->{'OEIS_base'}
      && $self->{'anum'} ne 'A000012'
      && $self->{'anum'} ne 'A000035') {  # 0,1 rep = mod 2
    $self->{'values_min'} = 0;
    $self->{'values_max'} = 9;
    $self->{'characteristic'}->{'digits'} = 10;
  }

  if (defined (my $description = $self->{'description'})) {
    if ($description =~ /expansion of .* in base (\d+)/i) {
      $self->{'values_min'} = 0;
      $self->{'values_max'} = $1 - 1;
      $self->{'characteristic'}->{'digits'} = $1;
    }
    if ($description =~ /^number of /i) {
      $self->{'characteristic'}->{'count'} = 1;
    }
  }
}

sub _split_sample_values {
  my ($self, $str) = @_;
  ### _split_sample_values(): $str
  unless (defined $str && $str =~ m{^([0-9,-]|\s)+$}) {
    croak "Oops list of sample values not recognised in ",$self->{'filename'},"\n",
      (defined $str ? $str : ());
  }
  $self->{'array'} = [ split /[, \t\r\n]+/, $str ];
}

sub _decode_html_charset {
  my ($contents) = @_;

  # eg. <META http-equiv="content-type" content="text/html; charset=utf-8">
  # HTTP::Message has a blob of code for this, using the full HTTP::Parser,
  # but a slack regexp should be enough for OEIS pages.
  #
  if (_HAVE_ENCODE
      && $contents =~ m{<META[^>]+
                        http-equiv=[^>]+
                        content-type[^>]+
                        charset=([a-z0-9-_]+)}isx) {
    return Encode::decode($1, $contents, Encode::FB_PERLQQ());
  } else {
    return $contents;
  }
}

#------------------------------------------------------------------------------

# Similar bsearch to Search::Dict, but Search::Dict doesn't allow for
# comment lines at the start of the file or blank lines at the end.
#
#use Smart::Comments;

sub ith {
  my ($self, $i) = @_;
  ### ith(): "$i  cf fh_i=".($self->{'fh_i'} || -999)

  if (my $fh = $self->{'fh'}) {
    if (! defined $self->{'next_seek'}) {
      $self->{'next_seek'} = tell($fh);
    }

    if (defined $self->{'fh_i'} && $i <= $self->{'fh_i'} + 20) {
      ### fh_i is target ...
      if (my ($line_i, $value) = _readline($self)) {
        if ($line_i == $i) {
          return $value;
        }
      }
    }

    my $lo = 0;
    my $hi = -s $fh;
    for (;;) {
      ### at: "lo=$lo hi=$hi  consider mid=".int(($lo+$hi)/2)
      my $mid = int(($lo+$hi)/2);
      _seek ($self, $mid);

      if (! defined(readline $fh)) {
        ### mid is EOF ...
        last;
      }
      ### skip partial line to: tell($fh)
      $mid = tell($fh);
      if ($mid >= $hi) {
        last;
      }

      my ($line_i,$value) = _readline($self)
        or last;  # only blank lines between $mid and EOF, go linear

      ### $line_i
      ### $value
      if ($line_i == $i) {
        ### found by binary search ...
        return $value;
      }
      if ($line_i < $i) {
        ### line_i before the target, advance lo ...
        $lo = tell($fh);
      } else {
        ### line_i after target, reduce hi ...
        $hi = $mid;
      }
    }

    _seek ($self, $lo);
    for (;;) {
      my ($line_i,$value) = _readline($self)
        or last;
      if ($line_i == $i) {
        ### found by linear search ...
        $self->{'fh_i'} = $line_i+1;
        return $value;
      }
      if ($line_i > $i) {
        return undef;
      }
    }
    return undef;

  } else {
    $i -= $self->i_start;
    unless ($i >= 0) {
      return undef; # negative or NaN
    }
    return $self->{'array'}->[$i];
  }
}

1;
__END__


#------------------------------------------------------------------------------

# foreach my $basefile (anum_to_html($anum), anum_to_html($anum,'.htm')) {
#   my $filename = File::Spec->catfile (oeis_dir(), $basefile);
#   ### $basefile
#   ### $filename
#   if (open FH, "<$filename") {
#     my $contents = do { local $/; <FH> }; # slurp
#     close FH or die;
#
#     # fragile grep out of the html ...
#     $contents =~ s{>graph</a>.*}{};
#     $contents =~ m{.*<tt>([^<]+)</tt>};
#     my $list = $1;
#     unless ($list =~ m{^([0-9,-]|\s)+$}) {
#       croak "Oops list of values not found in ",$filename;
#     }
#     my @array = split /[, \t\r\n]+/, $list;
#     ### $list
#     ### @array
#     return \@array;
#   }
#   ### no html: $!
# }




#------------------------------------------------------------------------------
# stripped.gz and names.gz
# no OFFSET, so i_start would be wrong, in general

# sub _read_stripped {
#   my ($self, $anum) = @_;
#   ### _read_stripped(): $anum
#
#   return 0 if $self->{'_dont_use_stripped'};
#   (my $num = $anum) =~ s/^A//;
#
#   my $filename = File::Spec->catfile (oeis_dir(), "stripped");
#   ### $filename
#   my $line;
#   my $cmpfunc = sub {
#     my ($line) = @_;
#     $line =~ /^A(\d+)/ or return -1;
#     return ($1 <=> $num);
#   };
#   if (open FH, "<$filename") {
#     $line = _bsearch_textfile (\*FH, $cmpfunc);
#   } else {
#     require IO::Zlib;
#     tie *FILE, 'IO::Zlib', $filename, "rb";
#     $line = _lsearch_textfile (\*FH, $cmpfunc);
#   }
#   if (! defined $line) {
#     return 0;
#   }
#
#   return 1; # success
# }
#
# sub _bsearch_textfile {
#   my ($fh, $cmpfunc) = @_;
#   my $lo = 0;
#   my $hi = -s $fh;
#   for (;;) {
#     my $mid = ($lo+$hi)/2;
#     seek $fh, $mid, 0
#       or last;
#
#     # skip partial line
#     defined(readline $fh)
#       or last; # EOF
#
#     # position start of line
#     $mid = tell($fh);
#     if ($mid >= $hi) {
#       last;
#     }
#
#     my $line = readline $fh;
#     defined $line
#       or last; # EOF
#
#     my $cmp = &$cmpfunc ($line);
#     if ($cmp == 0) {
#       return $line;
#     }
#     if ($cmp < 0) {
#       $lo = tell($fh);  # after
#     } else {
#       $hi = $mid;
#     }
#   }
#
#   seek $fh, $lo, 0;
#   while (defined (my $line = readline $fh)) {
#     my $cmp = &$cmpfunc($line);
#     if ($cmp == 0) {
#       return $line;
#     }
#     if ($cmp > 0) {
#       return undef;
#     }
#   }
#   return undef;
# }