[slob.git] / lib / Slob.pm

package Slob;

use 5.014000;
use strict;
use warnings;
our $VERSION = '0.002';

use constant MAGIC => "!-1SLOB\x1F";

use Carp qw/croak verbose/;
use Encode;

use Compress::Raw::Bzip2;
use Compress::Raw::Lzma;
use Compress::Raw::Zlib;

our %UNCOMPRESS = (
	'' => sub { $_[0] },
	'lzma2' => sub {
		my ($input) = @_;
		my ($lzma2, $code, $output);
		($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
		die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;

		$code = $lzma2->code($input, $output);
		die "Did not reach end of stream\n" if $code == LZMA_OK;
		die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
		$output
	},

	'bz2' => sub {
		my ($input) = @_;
		my ($bz2, $code, $output);
		($bz2, $code)= Compress::Raw::Bunzip2->new;
		die "Error creating Bunzip2: $code\n" unless $code == Z_OK;

		$code = $bz2->bzinflate($input, $output);
		die "Did not reach end of stream\n" if $code == BZ_OK;
		die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;

		$output
	},

	'zlib' => sub {
		my ($input) = @_;
		my ($zlib, $code, $output);
		($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
			-WindowBits => WANT_GZIP_OR_ZLIB
		);
		die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;

		$code = $zlib->inflate($input, \$output, 1);
		die "Did not reach end of stream\n" if $code == Z_OK;
		die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
		$output
	}
);

sub new {
	my ($class, $path) = @_;
	my $fh;
	if (ref $path eq 'IO') {
		$fh = $path
	} else {
		open $fh, '<', $path or croak "Cannot open \"$path\": $!"
	}
	my $self = bless {path => $path, fh => $fh}, $class;
	$self->{header} = $self->read_header;
	$self
}

sub read_data {
	my ($self, $len) = @_;
	my $data;
	my $result = read $self->{fh}, $data, $len;
	if (!defined $result) {
		croak "Failed to read from $self->{path}: $!"
	} elsif ($result == $len) {
		$data
	} elsif ($result == 0) {
		croak "$self->{path} is at end of file"
	} elsif ($result < $len) {
		croak "Only read $result bytes of $self->{path} before reaching EOF"
	}
}

sub read_formatted {
	my ($self, $len_of_format, $format) = @_;
	unpack $format, $self->read_data($len_of_format);
}

sub read_char  { shift->read_formatted(1, 'C') }
sub read_short { shift->read_formatted(2, 'n') }
sub read_int   { shift->read_formatted(4, 'N') }
sub read_long  { shift->read_formatted(8, 'Q>') }

sub read_tiny_text {
	my ($self, $encoding) = @_;
	my $data = $self->read_data($self->read_char);
	if (length $data == 255) {
		$data = unpack 'Z*', $data;
	}
	$encoding //= $self->{encoding};
	decode $encoding, $data;
}

sub read_text {
	my ($self, $encoding) = @_;
	my $data = $self->read_data($self->read_short);
	$encoding //= $self->{encoding};
	decode $encoding, $data;
}

sub read_large_byte_string {
	my ($self) = @_;
	$self->read_data($self->read_short)
}

sub read_tag {
	my ($self) = @_;
	my $name  = $self->read_tiny_text;
	my $value = $self->read_tiny_text;
	($name, $value)
}

sub read_tags {
	my ($self) = @_;
	my $tag_count = $self->read_char;
	map { $self->read_tag } 1..$tag_count
}

sub read_content_types {
	my ($self) = @_;
	my $content_type_count = $self->read_char;
	map { $self->read_text } 1..$content_type_count
}

sub read_positions {
	my ($self) = @_;
	my $count = $self->read_int;
	my @positions = map { $self->read_long } 1..$count;
	my $relative_to = $self->ftell;
	map { $relative_to + $_ } @positions
}

sub fseek {
	my ($self, $position) = @_;
	seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
}

sub ftell {
	my ($self) = @_;
	my $result = tell $self->{fh};
	croak "Failed to tell position in file" if $result == -1;
	$result
}

sub uncompress {
	my ($self, $data) = @_;
	$UNCOMPRESS{$self->{header}{compression}}->($data)
}

sub read_header {
	my ($self) = @_;
	my $magic = $self->read_data(length MAGIC);
	croak "Not a SLOB dictionary" unless MAGIC eq $magic;
	my $uuid = $self->read_data(16);

	my $encoding = $self->read_tiny_text('UTF-8');
	$self->{encoding} = $encoding;

	my $compression = $self->read_tiny_text;
	die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
	my %tags = $self->read_tags;
	my @content_types = $self->read_content_types;
	my $blob_count = $self->read_int;
	my $store_offset = $self->read_long;
	my $size = $self->read_long;
	my @refs = $self->read_positions;

	$self->fseek($store_offset);
	my @storage_bins = $self->read_positions;

	+{
		uuid => $uuid,
		encoding => $encoding,
		compression => $compression,
		tags => \%tags,
		content_types => \@content_types,
		blob_count => $blob_count,
		store_offset => $store_offset,
		size => $size,
		refs => \@refs,
		storage_bins => \@storage_bins,
	}
}

sub read_ref {
	my ($self) = @_;
	my $key = $self->read_text;
	my $bin_index = $self->read_int;
	my $item_index = $self->read_short;
	my $fragment = $self->read_tiny_text;
	+{
		key => $key,
		bin_index => $bin_index,
		item_index => $item_index,
		fragment => $fragment,
	}
}

sub read_storage_bin {
	my ($self) = @_;
	my $count = $self->read_int;
	my @content_types = map { $self->read_char } 1..$count;
	my $compressed_size = $self->read_int;
	my $compressed_data = $self->read_data($compressed_size);
	my $uncompressed_data = $self->uncompress($compressed_data);

	my @positions = unpack "N$count", $uncompressed_data;
	my $data = substr $uncompressed_data, $count * 4;
	+{
		positions => \@positions,
		data => $data
	}
}

sub ref_count { shift @{shift->{header}{refs}} }

sub seek_and_read_ref {
	my ($self, $index) = @_;
	croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
	$self->fseek($self->{header}{refs}[$index]);
	$self->read_ref
}

sub seek_and_read_storage_bin {
	my ($self, $index) = @_;
	croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
	$self->fseek($self->{header}{storage_bins}[$index]);
	$self->read_storage_bin
}

sub get_entry_of_storage_bin {
	my ($self, $storage_bin, $index) = @_;
	my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
	my $length = unpack 'N', $start_of_data;
	substr $start_of_data, 4, $length;
}

sub seek_and_read_ref_and_data {
	my ($self, $index) = @_;
	my $ref = $self->seek_and_read_ref($index);
	my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
	my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
	$ref->{data} = $data;
	$ref
}

1;
__END__

=encoding utf-8

=head1 NAME

Slob - Read .slob dictionaries (as used by Aard 2)

=head1 SYNOPSIS

  use feature qw/:5.14/;
  use Slob;
  my $slob = Slob->new('path/to/dict.slob');

  my $nr_of_entries = $slob->ref_count; # if the same content has two
                                        # keys pointing to it, this
                                        # counts it twice

  my $second_ref = $slob->seek_and_read_ref(4);
  say "Entry is for $second_ref->{key}";
  say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";

  my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
  say "Bin has ", (scalar @{$bin->{positions}}), " entries";
  say "Value at position $second_ref->{item_index} is ",
    $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});

  # instead of the above, we can do
  my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
  say "Entry is for $second_ref_and_data->{key}";
  say "Value is $second_ref_and_data->{data}";

=head1 DESCRIPTION

Slob is a compressed read-only format for storing natural language
dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
dictionaries in slob format.

The following methods are available:

=over

=item Slob->B<new>(I<$path>)
=item Slob->B<new>(I<$fh>)

Create a new slob reader reading from the given path or filehandle.

=item $slob->B<ref_count>

The number of refs (keys) in the dictionary.

=item $slob->B<seek_and_read_ref>(I<$index>)

Read the ref (key) at the given index. Returns a hashref with the
following keys:

=over

=item key

The key

=item bin_index

The storage bin that contains the value for this key

=item item_index

The index in the bin_index storage bin of the value for this key

=item fragment

HTML fragment that, when applied to the HTML value, points to the
definition of the key.

=back

=item $slob->B<seek_and_read_storage_bin>(I<$index>)

Read the storage bin with the given index. Returns the storage bin,
which can later be given to B<get_entry_of_storage_bin>.

=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)

Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
item index, returns the value at the index i nthe storage bin.

=item $slob->B<seek_and_read_ref_and_data>($index)

Convenience method that returns the key and value at a given index.
Returns a hashref like C<seek_and_read_ref> with an extra key,
I<data>, which is the value of the key.

=back

=head1 SEE ALSO

L<https://github.com/itkach/slob>

=head1 AUTHOR

Marius Gavrilescu, E<lt>marius@ieval.roE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2017-2018 by Marius Gavrilescu

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.26.1 or,
at your option, any later version of Perl 5 you may have available.


=cut
Commit	Line	Data
	1	package Slob;
	2
	3	use 5.014000;
	4	use strict;
	5	use warnings;
	6	our $VERSION = '0.002';
	7
	8	use constant MAGIC => "!-1SLOB\x1F";
	9
	10	use Carp qw/croak verbose/;
	11	use Encode;
	12
	13	use Compress::Raw::Bzip2;
	14	use Compress::Raw::Lzma;
	15	use Compress::Raw::Zlib;
	16
	17	our %UNCOMPRESS = (
	18	'' => sub { $_[0] },
	19	'lzma2' => sub {
	20	my ($input) = @_;
	21	my ($lzma2, $code, $output);
	22	($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
	23	die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
	24
	25	$code = $lzma2->code($input, $output);
	26	die "Did not reach end of stream\n" if $code == LZMA_OK;
	27	die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
	28	$output
	29	},
	30
	31	'bz2' => sub {
	32	my ($input) = @_;
	33	my ($bz2, $code, $output);
	34	($bz2, $code)= Compress::Raw::Bunzip2->new;
	35	die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
	36
	37	$code = $bz2->bzinflate($input, $output);
	38	die "Did not reach end of stream\n" if $code == BZ_OK;
	39	die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
	40
	41	$output
	42	},
	43
	44	'zlib' => sub {
	45	my ($input) = @_;
	46	my ($zlib, $code, $output);
	47	($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
	48	-WindowBits => WANT_GZIP_OR_ZLIB
	49	);
	50	die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
	51
	52	$code = $zlib->inflate($input, \$output, 1);
	53	die "Did not reach end of stream\n" if $code == Z_OK;
	54	die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
	55	$output
	56	}
	57	);
	58
	59	sub new {
	60	my ($class, $path) = @_;
	61	my $fh;
	62	if (ref $path eq 'IO') {
	63	$fh = $path
	64	} else {
	65	open $fh, '<', $path or croak "Cannot open \"$path\": $!"
	66	}
	67	my $self = bless {path => $path, fh => $fh}, $class;
	68	$self->{header} = $self->read_header;
	69	$self
	70	}
	71
	72	sub read_data {
	73	my ($self, $len) = @_;
	74	my $data;
	75	my $result = read $self->{fh}, $data, $len;
	76	if (!defined $result) {
	77	croak "Failed to read from $self->{path}: $!"
	78	} elsif ($result == $len) {
	79	$data
	80	} elsif ($result == 0) {
	81	croak "$self->{path} is at end of file"
	82	} elsif ($result < $len) {
	83	croak "Only read $result bytes of $self->{path} before reaching EOF"
	84	}
	85	}
	86
	87	sub read_formatted {
	88	my ($self, $len_of_format, $format) = @_;
	89	unpack $format, $self->read_data($len_of_format);
	90	}
	91
	92	sub read_char { shift->read_formatted(1, 'C') }
	93	sub read_short { shift->read_formatted(2, 'n') }
	94	sub read_int { shift->read_formatted(4, 'N') }
	95	sub read_long { shift->read_formatted(8, 'Q>') }
	96
	97	sub read_tiny_text {
	98	my ($self, $encoding) = @_;
	99	my $data = $self->read_data($self->read_char);
	100	if (length $data == 255) {
	101	$data = unpack 'Z*', $data;
	102	}
	103	$encoding //= $self->{encoding};
	104	decode $encoding, $data;
	105	}
	106
	107	sub read_text {
	108	my ($self, $encoding) = @_;
	109	my $data = $self->read_data($self->read_short);
	110	$encoding //= $self->{encoding};
	111	decode $encoding, $data;
	112	}
	113
	114	sub read_large_byte_string {
	115	my ($self) = @_;
	116	$self->read_data($self->read_short)
	117	}
	118
	119	sub read_tag {
	120	my ($self) = @_;
	121	my $name = $self->read_tiny_text;
	122	my $value = $self->read_tiny_text;
	123	($name, $value)
	124	}
	125
	126	sub read_tags {
	127	my ($self) = @_;
	128	my $tag_count = $self->read_char;
	129	map { $self->read_tag } 1..$tag_count
	130	}
	131
	132	sub read_content_types {
	133	my ($self) = @_;
	134	my $content_type_count = $self->read_char;
	135	map { $self->read_text } 1..$content_type_count
	136	}
	137
	138	sub read_positions {
	139	my ($self) = @_;
	140	my $count = $self->read_int;
	141	my @positions = map { $self->read_long } 1..$count;
	142	my $relative_to = $self->ftell;
	143	map { $relative_to + $_ } @positions
	144	}
	145
	146	sub fseek {
	147	my ($self, $position) = @_;
	148	seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
	149	}
	150
	151	sub ftell {
	152	my ($self) = @_;
	153	my $result = tell $self->{fh};
	154	croak "Failed to tell position in file" if $result == -1;
	155	$result
	156	}
	157
	158	sub uncompress {
	159	my ($self, $data) = @_;
	160	$UNCOMPRESS{$self->{header}{compression}}->($data)
	161	}
	162
	163	sub read_header {
	164	my ($self) = @_;
	165	my $magic = $self->read_data(length MAGIC);
	166	croak "Not a SLOB dictionary" unless MAGIC eq $magic;
	167	my $uuid = $self->read_data(16);
	168
	169	my $encoding = $self->read_tiny_text('UTF-8');
	170	$self->{encoding} = $encoding;
	171
	172	my $compression = $self->read_tiny_text;
	173	die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
	174	my %tags = $self->read_tags;
	175	my @content_types = $self->read_content_types;
	176	my $blob_count = $self->read_int;
	177	my $store_offset = $self->read_long;
	178	my $size = $self->read_long;
	179	my @refs = $self->read_positions;
	180
	181	$self->fseek($store_offset);
	182	my @storage_bins = $self->read_positions;
	183
	184	+{
	185	uuid => $uuid,
	186	encoding => $encoding,
	187	compression => $compression,
	188	tags => \%tags,
	189	content_types => \@content_types,
	190	blob_count => $blob_count,
	191	store_offset => $store_offset,
	192	size => $size,
	193	refs => \@refs,
	194	storage_bins => \@storage_bins,
	195	}
	196	}
	197
	198	sub read_ref {
	199	my ($self) = @_;
	200	my $key = $self->read_text;
	201	my $bin_index = $self->read_int;
	202	my $item_index = $self->read_short;
	203	my $fragment = $self->read_tiny_text;
	204	+{
	205	key => $key,
	206	bin_index => $bin_index,
	207	item_index => $item_index,
	208	fragment => $fragment,
	209	}
	210	}
	211
	212	sub read_storage_bin {
	213	my ($self) = @_;
	214	my $count = $self->read_int;
	215	my @content_types = map { $self->read_char } 1..$count;
	216	my $compressed_size = $self->read_int;
	217	my $compressed_data = $self->read_data($compressed_size);
	218	my $uncompressed_data = $self->uncompress($compressed_data);
	219
	220	my @positions = unpack "N$count", $uncompressed_data;
	221	my $data = substr $uncompressed_data, $count * 4;
	222	+{
	223	positions => \@positions,
	224	data => $data
	225	}
	226	}
	227
	228	sub ref_count { shift @{shift->{header}{refs}} }
	229
	230	sub seek_and_read_ref {
	231	my ($self, $index) = @_;
	232	croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
	233	$self->fseek($self->{header}{refs}[$index]);
	234	$self->read_ref
	235	}
	236
	237	sub seek_and_read_storage_bin {
	238	my ($self, $index) = @_;
	239	croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
	240	$self->fseek($self->{header}{storage_bins}[$index]);
	241	$self->read_storage_bin
	242	}
	243
	244	sub get_entry_of_storage_bin {
	245	my ($self, $storage_bin, $index) = @_;
	246	my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
	247	my $length = unpack 'N', $start_of_data;
	248	substr $start_of_data, 4, $length;
	249	}
	250
	251	sub seek_and_read_ref_and_data {
	252	my ($self, $index) = @_;
	253	my $ref = $self->seek_and_read_ref($index);
	254	my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
	255	my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
	256	$ref->{data} = $data;
	257	$ref
	258	}
	259
	260	1;
	261	__END__
	262
	263	=encoding utf-8
	264
	265	=head1 NAME
	266
	267	Slob - Read .slob dictionaries (as used by Aard 2)
	268
	269	=head1 SYNOPSIS
	270
	271	use feature qw/:5.14/;
	272	use Slob;
	273	my $slob = Slob->new('path/to/dict.slob');
	274
	275	my $nr_of_entries = $slob->ref_count; # if the same content has two
	276	# keys pointing to it, this
	277	# counts it twice
	278
	279	my $second_ref = $slob->seek_and_read_ref(4);
	280	say "Entry is for $second_ref->{key}";
	281	say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
	282
	283	my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
	284	say "Bin has ", (scalar @{$bin->{positions}}), " entries";
	285	say "Value at position $second_ref->{item_index} is ",
	286	$slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
	287
	288	# instead of the above, we can do
	289	my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
	290	say "Entry is for $second_ref_and_data->{key}";
	291	say "Value is $second_ref_and_data->{data}";
	292
	293	=head1 DESCRIPTION
	294
	295	Slob is a compressed read-only format for storing natural language
	296	dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
	297	dictionaries in slob format.
	298
	299	The following methods are available:
	300
	301	=over
	302
	303	=item Slob->B<new>(I<$path>)
	304	=item Slob->B<new>(I<$fh>)
	305
	306	Create a new slob reader reading from the given path or filehandle.
	307
	308	=item $slob->B<ref_count>
	309
	310	The number of refs (keys) in the dictionary.
	311
	312	=item $slob->B<seek_and_read_ref>(I<$index>)
	313
	314	Read the ref (key) at the given index. Returns a hashref with the
	315	following keys:
	316
	317	=over
	318
	319	=item key
	320
	321	The key
	322
	323	=item bin_index
	324
	325	The storage bin that contains the value for this key
	326
	327	=item item_index
	328
	329	The index in the bin_index storage bin of the value for this key
	330
	331	=item fragment
	332
	333	HTML fragment that, when applied to the HTML value, points to the
	334	definition of the key.
	335
	336	=back
	337
	338	=item $slob->B<seek_and_read_storage_bin>(I<$index>)
	339
	340	Read the storage bin with the given index. Returns the storage bin,
	341	which can later be given to B<get_entry_of_storage_bin>.
	342
	343	=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
	344
	345	Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
	346	item index, returns the value at the index i nthe storage bin.
	347
	348	=item $slob->B<seek_and_read_ref_and_data>($index)
	349
	350	Convenience method that returns the key and value at a given index.
	351	Returns a hashref like C<seek_and_read_ref> with an extra key,
	352	I<data>, which is the value of the key.
	353
	354	=back
	355
	356	=head1 SEE ALSO
	357
	358	L<https://github.com/itkach/slob>
	359
	360	=head1 AUTHOR
	361
	362	Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
	363
	364	=head1 COPYRIGHT AND LICENSE
	365
	366	Copyright (C) 2017-2018 by Marius Gavrilescu
	367
	368	This library is free software; you can redistribute it and/or modify
	369	it under the same terms as Perl itself, either Perl version 5.26.1 or,
	370	at your option, any later version of Perl 5 you may have available.
	371
	372
	373	=cut