use 5.014000;
use strict;
use warnings;
-our $VERSION = '0.000_001';
+our $VERSION = '0.002002';
use constant MAGIC => "!-1SLOB\x1F";
use Carp qw/croak verbose/;
use Encode;
+use Compress::Raw::Bzip2;
+use Compress::Raw::Lzma;
+use Compress::Raw::Zlib;
+
+# MD5 only used for debugging output in tests
+use Digest::MD5 qw/md5_hex/;
+
+our %UNCOMPRESS = (
+ '' => sub { $_[0] },
+ 'lzma2' => sub {
+ my ($input) = @_;
+ my ($lzma2, $code, $output);
+ ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
+ die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
+
+ $code = $lzma2->code($input, $output);
+ die "Did not reach end of stream\n" if $code == LZMA_OK;
+ die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
+ $output
+ },
+
+ 'bz2' => sub {
+ my ($input) = @_;
+ my ($bz2, $code, $output);
+ ($bz2, $code)= Compress::Raw::Bunzip2->new;
+ die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
+
+ $code = $bz2->bzinflate($input, $output);
+ die "Did not reach end of stream\n" if $code == BZ_OK;
+ die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
+
+ $output
+ },
+
+ 'zlib' => sub {
+ my ($input) = @_;
+ my ($zlib, $code, $output);
+ ($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
+ -WindowBits => WANT_GZIP_OR_ZLIB
+ );
+ die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
+
+ $code = $zlib->inflate($input, \$output, 1);
+ die "Did not reach end of stream\n" if $code == Z_OK;
+ die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
+ $output
+ }
+);
+
sub new {
my ($class, $path) = @_;
- open my $fh, '<', $path or croak "Cannot open \"$path\": $!";
+ my $fh;
+ if (ref $path eq 'IO') {
+ $fh = $path
+ } else {
+ open $fh, '<', $path or croak "Cannot open \"$path\": $!";
+ binmode $fh;
+ }
my $self = bless {path => $path, fh => $fh}, $class;
$self->{header} = $self->read_header;
$self
}
sub uncompress {
- my ($self, $data) = @_;
- $data
+ my ($self, $data) = @_;
+ my $compression = $self->{header}{compression};
+ if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') {
+ my $prefix = unpack 'H*', substr $data, 0, 10;
+ my $md5sum = md5_hex $data;
+ Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum";
+ }
+ $UNCOMPRESS{$compression}->($data)
}
sub read_header {
$self->{encoding} = $encoding;
my $compression = $self->read_tiny_text;
- die "Compression not yet supported" if $compression;
+ die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
my %tags = $self->read_tags;
my @content_types = $self->read_content_types;
my $blob_count = $self->read_int;
substr $start_of_data, 4, $length;
}
+sub seek_and_read_ref_and_data {
+ my ($self, $index) = @_;
+ my $ref = $self->seek_and_read_ref($index);
+ my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
+ my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
+ $ref->{data} = $data;
+ $ref
+}
+
1;
__END__
say "Value at position $second_ref->{item_index} is ",
$slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
+ # instead of the above, we can do
+ my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
+ say "Entry is for $second_ref_and_data->{key}";
+ say "Value is $second_ref_and_data->{data}";
+
=head1 DESCRIPTION
-No documentation yet, see SYNOPSIS.
+Slob is a compressed read-only format for storing natural language
+dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
+dictionaries in slob format.
+
+The following methods are available:
+
+=over
+
+=item Slob->B<new>(I<$path>)
+=item Slob->B<new>(I<$fh>)
+
+Create a new slob reader reading from the given path or filehandle.
+
+=item $slob->B<ref_count>
+
+The number of refs (keys) in the dictionary.
+
+=item $slob->B<seek_and_read_ref>(I<$index>)
+
+Read the ref (key) at the given index. Returns a hashref with the
+following keys:
+
+=over
+
+=item key
+
+The key
+
+=item bin_index
+
+The storage bin that contains the value for this key
+
+=item item_index
+
+The index in the bin_index storage bin of the value for this key
+
+=item fragment
+
+HTML fragment that, when applied to the HTML value, points to the
+definition of the key.
+
+=back
+
+=item $slob->B<seek_and_read_storage_bin>(I<$index>)
+
+Read the storage bin with the given index. Returns the storage bin,
+which can later be given to B<get_entry_of_storage_bin>.
+
+=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
+
+Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
+item index, returns the value at the index i nthe storage bin.
+
+=item $slob->B<seek_and_read_ref_and_data>($index)
+
+Convenience method that returns the key and value at a given index.
+Returns a hashref like C<seek_and_read_ref> with an extra key,
+I<data>, which is the value of the key.
+
+=back
+
+=head1 SEE ALSO
+
+L<https://github.com/itkach/slob>
=head1 AUTHOR
=head1 COPYRIGHT AND LICENSE
-Copyright (C) 2017 by Marius Gavrilescu
+Copyright (C) 2017-2018 by Marius Gavrilescu
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.26.1 or,