[slob.git] / lib / Slob.pm

package Slob;

use 5.014000;
use strict;
use warnings;
our $VERSION = '0.002_001';

use constant MAGIC => "!-1SLOB\x1F";

use Carp qw/croak verbose/;
use Encode;

use Compress::Raw::Bzip2;
use Compress::Raw::Lzma;
use Compress::Raw::Zlib;

# MD5 only used for debugging output in tests
use Digest::MD5 qw/md5_hex/;

our %UNCOMPRESS = (
	'' => sub { $_[0] },
	'lzma2' => sub {
		my ($input) = @_;
		my ($lzma2, $code, $output);
		($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
		die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;

		$code = $lzma2->code($input, $output);
		die "Did not reach end of stream\n" if $code == LZMA_OK;
		die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
		$output
	},

	'bz2' => sub {
		my ($input) = @_;
		my ($bz2, $code, $output);
		($bz2, $code)= Compress::Raw::Bunzip2->new;
		die "Error creating Bunzip2: $code\n" unless $code == Z_OK;

		$code = $bz2->bzinflate($input, $output);
		die "Did not reach end of stream\n" if $code == BZ_OK;
		die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;

		$output
	},

	'zlib' => sub {
		my ($input) = @_;
		my ($zlib, $code, $output);
		($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
			-WindowBits => WANT_GZIP_OR_ZLIB
		);
		die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;

		$code = $zlib->inflate($input, \$output, 1);
		die "Did not reach end of stream\n" if $code == Z_OK;
		die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
		$output
	}
);

sub new {
	my ($class, $path) = @_;
	my $fh;
	if (ref $path eq 'IO') {
		$fh = $path
	} else {
		open $fh, '<', $path or croak "Cannot open \"$path\": $!"
	}
	my $self = bless {path => $path, fh => $fh}, $class;
	$self->{header} = $self->read_header;
	$self
}

sub read_data {
	my ($self, $len) = @_;
	my $data;
	my $result = read $self->{fh}, $data, $len;
	if (!defined $result) {
		croak "Failed to read from $self->{path}: $!"
	} elsif ($result == $len) {
		$data
	} elsif ($result == 0) {
		croak "$self->{path} is at end of file"
	} elsif ($result < $len) {
		croak "Only read $result bytes of $self->{path} before reaching EOF"
	}
}

sub read_formatted {
	my ($self, $len_of_format, $format) = @_;
	unpack $format, $self->read_data($len_of_format);
}

sub read_char  { shift->read_formatted(1, 'C') }
sub read_short { shift->read_formatted(2, 'n') }
sub read_int   { shift->read_formatted(4, 'N') }
sub read_long  { shift->read_formatted(8, 'Q>') }

sub read_tiny_text {
	my ($self, $encoding) = @_;
	my $data = $self->read_data($self->read_char);
	if (length $data == 255) {
		$data = unpack 'Z*', $data;
	}
	$encoding //= $self->{encoding};
	decode $encoding, $data;
}

sub read_text {
	my ($self, $encoding) = @_;
	my $data = $self->read_data($self->read_short);
	$encoding //= $self->{encoding};
	decode $encoding, $data;
}

sub read_large_byte_string {
	my ($self) = @_;
	$self->read_data($self->read_short)
}

sub read_tag {
	my ($self) = @_;
	my $name  = $self->read_tiny_text;
	my $value = $self->read_tiny_text;
	($name, $value)
}

sub read_tags {
	my ($self) = @_;
	my $tag_count = $self->read_char;
	map { $self->read_tag } 1..$tag_count
}

sub read_content_types {
	my ($self) = @_;
	my $content_type_count = $self->read_char;
	map { $self->read_text } 1..$content_type_count
}

sub read_positions {
	my ($self) = @_;
	my $count = $self->read_int;
	my @positions = map { $self->read_long } 1..$count;
	my $relative_to = $self->ftell;
	map { $relative_to + $_ } @positions
}

sub fseek {
	my ($self, $position) = @_;
	seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
}

sub ftell {
	my ($self) = @_;
	my $result = tell $self->{fh};
	croak "Failed to tell position in file" if $result == -1;
	$result
}

sub uncompress {
       my ($self, $data) = @_;
       my $compression = $self->{header}{compression};
       if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') {
               my $prefix = unpack 'H*', substr $data, 0, 10;
               my $md5sum = md5_hex $data;
               Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum";
       }
       $UNCOMPRESS{$compression}->($data)
}

sub read_header {
	my ($self) = @_;
	my $magic = $self->read_data(length MAGIC);
	croak "Not a SLOB dictionary" unless MAGIC eq $magic;
	my $uuid = $self->read_data(16);

	my $encoding = $self->read_tiny_text('UTF-8');
	$self->{encoding} = $encoding;

	my $compression = $self->read_tiny_text;
	die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
	my %tags = $self->read_tags;
	my @content_types = $self->read_content_types;
	my $blob_count = $self->read_int;
	my $store_offset = $self->read_long;
	my $size = $self->read_long;
	my @refs = $self->read_positions;

	$self->fseek($store_offset);
	my @storage_bins = $self->read_positions;

	+{
		uuid => $uuid,
		encoding => $encoding,
		compression => $compression,
		tags => \%tags,
		content_types => \@content_types,
		blob_count => $blob_count,
		store_offset => $store_offset,
		size => $size,
		refs => \@refs,
		storage_bins => \@storage_bins,
	}
}

sub read_ref {
	my ($self) = @_;
	my $key = $self->read_text;
	my $bin_index = $self->read_int;
	my $item_index = $self->read_short;
	my $fragment = $self->read_tiny_text;
	+{
		key => $key,
		bin_index => $bin_index,
		item_index => $item_index,
		fragment => $fragment,
	}
}

sub read_storage_bin {
	my ($self) = @_;
	my $count = $self->read_int;
	my @content_types = map { $self->read_char } 1..$count;
	my $compressed_size = $self->read_int;
	my $compressed_data = $self->read_data($compressed_size);
	my $uncompressed_data = $self->uncompress($compressed_data);

	my @positions = unpack "N$count", $uncompressed_data;
	my $data = substr $uncompressed_data, $count * 4;
	+{
		positions => \@positions,
		data => $data
	}
}

sub ref_count { shift @{shift->{header}{refs}} }

sub seek_and_read_ref {
	my ($self, $index) = @_;
	croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
	$self->fseek($self->{header}{refs}[$index]);
	$self->read_ref
}

sub seek_and_read_storage_bin {
	my ($self, $index) = @_;
	croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
	$self->fseek($self->{header}{storage_bins}[$index]);
	$self->read_storage_bin
}

sub get_entry_of_storage_bin {
	my ($self, $storage_bin, $index) = @_;
	my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
	my $length = unpack 'N', $start_of_data;
	substr $start_of_data, 4, $length;
}

sub seek_and_read_ref_and_data {
	my ($self, $index) = @_;
	my $ref = $self->seek_and_read_ref($index);
	my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
	my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
	$ref->{data} = $data;
	$ref
}

1;
__END__

=encoding utf-8

=head1 NAME

Slob - Read .slob dictionaries (as used by Aard 2)

=head1 SYNOPSIS

  use feature qw/:5.14/;
  use Slob;
  my $slob = Slob->new('path/to/dict.slob');

  my $nr_of_entries = $slob->ref_count; # if the same content has two
                                        # keys pointing to it, this
                                        # counts it twice

  my $second_ref = $slob->seek_and_read_ref(4);
  say "Entry is for $second_ref->{key}";
  say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";

  my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
  say "Bin has ", (scalar @{$bin->{positions}}), " entries";
  say "Value at position $second_ref->{item_index} is ",
    $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});

  # instead of the above, we can do
  my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
  say "Entry is for $second_ref_and_data->{key}";
  say "Value is $second_ref_and_data->{data}";

=head1 DESCRIPTION

Slob is a compressed read-only format for storing natural language
dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
dictionaries in slob format.

The following methods are available:

=over

=item Slob->B<new>(I<$path>)
=item Slob->B<new>(I<$fh>)

Create a new slob reader reading from the given path or filehandle.

=item $slob->B<ref_count>

The number of refs (keys) in the dictionary.

=item $slob->B<seek_and_read_ref>(I<$index>)

Read the ref (key) at the given index. Returns a hashref with the
following keys:

=over

=item key

The key

=item bin_index

The storage bin that contains the value for this key

=item item_index

The index in the bin_index storage bin of the value for this key

=item fragment

HTML fragment that, when applied to the HTML value, points to the
definition of the key.

=back

=item $slob->B<seek_and_read_storage_bin>(I<$index>)

Read the storage bin with the given index. Returns the storage bin,
which can later be given to B<get_entry_of_storage_bin>.

=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)

Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
item index, returns the value at the index i nthe storage bin.

=item $slob->B<seek_and_read_ref_and_data>($index)

Convenience method that returns the key and value at a given index.
Returns a hashref like C<seek_and_read_ref> with an extra key,
I<data>, which is the value of the key.

=back

=head1 SEE ALSO

L<https://github.com/itkach/slob>

=head1 AUTHOR

Marius Gavrilescu, E<lt>marius@ieval.roE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2017-2018 by Marius Gavrilescu

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.26.1 or,
at your option, any later version of Perl 5 you may have available.


=cut
Commit	Line	Data
	1	package Slob;
	2
	3	use 5.014000;
	4	use strict;
	5	use warnings;
	6	our $VERSION = '0.002_001';
	7
	8	use constant MAGIC => "!-1SLOB\x1F";
	9
	10	use Carp qw/croak verbose/;
	11	use Encode;
	12
	13	use Compress::Raw::Bzip2;
	14	use Compress::Raw::Lzma;
	15	use Compress::Raw::Zlib;
	16
	17	# MD5 only used for debugging output in tests
	18	use Digest::MD5 qw/md5_hex/;
	19
	20	our %UNCOMPRESS = (
	21	'' => sub { $_[0] },
	22	'lzma2' => sub {
	23	my ($input) = @_;
	24	my ($lzma2, $code, $output);
	25	($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
	26	die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
	27
	28	$code = $lzma2->code($input, $output);
	29	die "Did not reach end of stream\n" if $code == LZMA_OK;
	30	die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
	31	$output
	32	},
	33
	34	'bz2' => sub {
	35	my ($input) = @_;
	36	my ($bz2, $code, $output);
	37	($bz2, $code)= Compress::Raw::Bunzip2->new;
	38	die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
	39
	40	$code = $bz2->bzinflate($input, $output);
	41	die "Did not reach end of stream\n" if $code == BZ_OK;
	42	die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
	43
	44	$output
	45	},
	46
	47	'zlib' => sub {
	48	my ($input) = @_;
	49	my ($zlib, $code, $output);
	50	($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
	51	-WindowBits => WANT_GZIP_OR_ZLIB
	52	);
	53	die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
	54
	55	$code = $zlib->inflate($input, \$output, 1);
	56	die "Did not reach end of stream\n" if $code == Z_OK;
	57	die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
	58	$output
	59	}
	60	);
	61
	62	sub new {
	63	my ($class, $path) = @_;
	64	my $fh;
	65	if (ref $path eq 'IO') {
	66	$fh = $path
	67	} else {
	68	open $fh, '<', $path or croak "Cannot open \"$path\": $!"
	69	}
	70	my $self = bless {path => $path, fh => $fh}, $class;
	71	$self->{header} = $self->read_header;
	72	$self
	73	}
	74
	75	sub read_data {
	76	my ($self, $len) = @_;
	77	my $data;
	78	my $result = read $self->{fh}, $data, $len;
	79	if (!defined $result) {
	80	croak "Failed to read from $self->{path}: $!"
	81	} elsif ($result == $len) {
	82	$data
	83	} elsif ($result == 0) {
	84	croak "$self->{path} is at end of file"
	85	} elsif ($result < $len) {
	86	croak "Only read $result bytes of $self->{path} before reaching EOF"
	87	}
	88	}
	89
	90	sub read_formatted {
	91	my ($self, $len_of_format, $format) = @_;
	92	unpack $format, $self->read_data($len_of_format);
	93	}
	94
	95	sub read_char { shift->read_formatted(1, 'C') }
	96	sub read_short { shift->read_formatted(2, 'n') }
	97	sub read_int { shift->read_formatted(4, 'N') }
	98	sub read_long { shift->read_formatted(8, 'Q>') }
	99
	100	sub read_tiny_text {
	101	my ($self, $encoding) = @_;
	102	my $data = $self->read_data($self->read_char);
	103	if (length $data == 255) {
	104	$data = unpack 'Z*', $data;
	105	}
	106	$encoding //= $self->{encoding};
	107	decode $encoding, $data;
	108	}
	109
	110	sub read_text {
	111	my ($self, $encoding) = @_;
	112	my $data = $self->read_data($self->read_short);
	113	$encoding //= $self->{encoding};
	114	decode $encoding, $data;
	115	}
	116
	117	sub read_large_byte_string {
	118	my ($self) = @_;
	119	$self->read_data($self->read_short)
	120	}
	121
	122	sub read_tag {
	123	my ($self) = @_;
	124	my $name = $self->read_tiny_text;
	125	my $value = $self->read_tiny_text;
	126	($name, $value)
	127	}
	128
	129	sub read_tags {
	130	my ($self) = @_;
	131	my $tag_count = $self->read_char;
	132	map { $self->read_tag } 1..$tag_count
	133	}
	134
	135	sub read_content_types {
	136	my ($self) = @_;
	137	my $content_type_count = $self->read_char;
	138	map { $self->read_text } 1..$content_type_count
	139	}
	140
	141	sub read_positions {
	142	my ($self) = @_;
	143	my $count = $self->read_int;
	144	my @positions = map { $self->read_long } 1..$count;
	145	my $relative_to = $self->ftell;
	146	map { $relative_to + $_ } @positions
	147	}
	148
	149	sub fseek {
	150	my ($self, $position) = @_;
	151	seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
	152	}
	153
	154	sub ftell {
	155	my ($self) = @_;
	156	my $result = tell $self->{fh};
	157	croak "Failed to tell position in file" if $result == -1;
	158	$result
	159	}
	160
	161	sub uncompress {
	162	my ($self, $data) = @_;
	163	my $compression = $self->{header}{compression};
	164	if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') {
	165	my $prefix = unpack 'H*', substr $data, 0, 10;
	166	my $md5sum = md5_hex $data;
	167	Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum";
	168	}
	169	$UNCOMPRESS{$compression}->($data)
	170	}
	171
	172	sub read_header {
	173	my ($self) = @_;
	174	my $magic = $self->read_data(length MAGIC);
	175	croak "Not a SLOB dictionary" unless MAGIC eq $magic;
	176	my $uuid = $self->read_data(16);
	177
	178	my $encoding = $self->read_tiny_text('UTF-8');
	179	$self->{encoding} = $encoding;
	180
	181	my $compression = $self->read_tiny_text;
	182	die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
	183	my %tags = $self->read_tags;
	184	my @content_types = $self->read_content_types;
	185	my $blob_count = $self->read_int;
	186	my $store_offset = $self->read_long;
	187	my $size = $self->read_long;
	188	my @refs = $self->read_positions;
	189
	190	$self->fseek($store_offset);
	191	my @storage_bins = $self->read_positions;
	192
	193	+{
	194	uuid => $uuid,
	195	encoding => $encoding,
	196	compression => $compression,
	197	tags => \%tags,
	198	content_types => \@content_types,
	199	blob_count => $blob_count,
	200	store_offset => $store_offset,
	201	size => $size,
	202	refs => \@refs,
	203	storage_bins => \@storage_bins,
	204	}
	205	}
	206
	207	sub read_ref {
	208	my ($self) = @_;
	209	my $key = $self->read_text;
	210	my $bin_index = $self->read_int;
	211	my $item_index = $self->read_short;
	212	my $fragment = $self->read_tiny_text;
	213	+{
	214	key => $key,
	215	bin_index => $bin_index,
	216	item_index => $item_index,
	217	fragment => $fragment,
	218	}
	219	}
	220
	221	sub read_storage_bin {
	222	my ($self) = @_;
	223	my $count = $self->read_int;
	224	my @content_types = map { $self->read_char } 1..$count;
	225	my $compressed_size = $self->read_int;
	226	my $compressed_data = $self->read_data($compressed_size);
	227	my $uncompressed_data = $self->uncompress($compressed_data);
	228
	229	my @positions = unpack "N$count", $uncompressed_data;
	230	my $data = substr $uncompressed_data, $count * 4;
	231	+{
	232	positions => \@positions,
	233	data => $data
	234	}
	235	}
	236
	237	sub ref_count { shift @{shift->{header}{refs}} }
	238
	239	sub seek_and_read_ref {
	240	my ($self, $index) = @_;
	241	croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
	242	$self->fseek($self->{header}{refs}[$index]);
	243	$self->read_ref
	244	}
	245
	246	sub seek_and_read_storage_bin {
	247	my ($self, $index) = @_;
	248	croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
	249	$self->fseek($self->{header}{storage_bins}[$index]);
	250	$self->read_storage_bin
	251	}
	252
	253	sub get_entry_of_storage_bin {
	254	my ($self, $storage_bin, $index) = @_;
	255	my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
	256	my $length = unpack 'N', $start_of_data;
	257	substr $start_of_data, 4, $length;
	258	}
	259
	260	sub seek_and_read_ref_and_data {
	261	my ($self, $index) = @_;
	262	my $ref = $self->seek_and_read_ref($index);
	263	my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
	264	my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
	265	$ref->{data} = $data;
	266	$ref
	267	}
	268
	269	1;
	270	__END__
	271
	272	=encoding utf-8
	273
	274	=head1 NAME
	275
	276	Slob - Read .slob dictionaries (as used by Aard 2)
	277
	278	=head1 SYNOPSIS
	279
	280	use feature qw/:5.14/;
	281	use Slob;
	282	my $slob = Slob->new('path/to/dict.slob');
	283
	284	my $nr_of_entries = $slob->ref_count; # if the same content has two
	285	# keys pointing to it, this
	286	# counts it twice
	287
	288	my $second_ref = $slob->seek_and_read_ref(4);
	289	say "Entry is for $second_ref->{key}";
	290	say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
	291
	292	my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
	293	say "Bin has ", (scalar @{$bin->{positions}}), " entries";
	294	say "Value at position $second_ref->{item_index} is ",
	295	$slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
	296
	297	# instead of the above, we can do
	298	my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
	299	say "Entry is for $second_ref_and_data->{key}";
	300	say "Value is $second_ref_and_data->{data}";
	301
	302	=head1 DESCRIPTION
	303
	304	Slob is a compressed read-only format for storing natural language
	305	dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
	306	dictionaries in slob format.
	307
	308	The following methods are available:
	309
	310	=over
	311
	312	=item Slob->B<new>(I<$path>)
	313	=item Slob->B<new>(I<$fh>)
	314
	315	Create a new slob reader reading from the given path or filehandle.
	316
	317	=item $slob->B<ref_count>
	318
	319	The number of refs (keys) in the dictionary.
	320
	321	=item $slob->B<seek_and_read_ref>(I<$index>)
	322
	323	Read the ref (key) at the given index. Returns a hashref with the
	324	following keys:
	325
	326	=over
	327
	328	=item key
	329
	330	The key
	331
	332	=item bin_index
	333
	334	The storage bin that contains the value for this key
	335
	336	=item item_index
	337
	338	The index in the bin_index storage bin of the value for this key
	339
	340	=item fragment
	341
	342	HTML fragment that, when applied to the HTML value, points to the
	343	definition of the key.
	344
	345	=back
	346
	347	=item $slob->B<seek_and_read_storage_bin>(I<$index>)
	348
	349	Read the storage bin with the given index. Returns the storage bin,
	350	which can later be given to B<get_entry_of_storage_bin>.
	351
	352	=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
	353
	354	Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
	355	item index, returns the value at the index i nthe storage bin.
	356
	357	=item $slob->B<seek_and_read_ref_and_data>($index)
	358
	359	Convenience method that returns the key and value at a given index.
	360	Returns a hashref like C<seek_and_read_ref> with an extra key,
	361	I<data>, which is the value of the key.
	362
	363	=back
	364
	365	=head1 SEE ALSO
	366
	367	L<https://github.com/itkach/slob>
	368
	369	=head1 AUTHOR
	370
	371	Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
	372
	373	=head1 COPYRIGHT AND LICENSE
	374
	375	Copyright (C) 2017-2018 by Marius Gavrilescu
	376
	377	This library is free software; you can redistribute it and/or modify
	378	it under the same terms as Perl itself, either Perl version 5.26.1 or,
	379	at your option, any later version of Perl 5 you may have available.
	380
	381
	382	=cut