[slob.git] / lib / Slob.pm

package Slob;

use 5.014000;
use strict;
use warnings;
our $VERSION = '0.001';

use constant MAGIC => "!-1SLOB\x1F";

use Carp qw/croak verbose/;
use Encode;

use Compress::Raw::Lzma;

our %UNCOMPRESS = (
	'' => sub { $_[0] },
	'lzma2' => sub {
		my ($input) = @_;
		my ($lzma2, $code, $output);
		($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
		die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
		$code = $lzma2->code($input, $output);
		die "Did not reach end of stream" if $code == LZMA_OK;
		die "Error decoding LZMA2: $code" if $code != LZMA_STREAM_END;
		$output
	}
);

sub new {
	my ($class, $path) = @_;
	my $fh;
	if (ref $path eq 'IO') {
		$fh = $path
	} else {
		open $fh, '<', $path or croak "Cannot open \"$path\": $!"
	}
	my $self = bless {path => $path, fh => $fh}, $class;
	$self->{header} = $self->read_header;
	$self
}

sub read_data {
	my ($self, $len) = @_;
	my $data;
	my $result = read $self->{fh}, $data, $len;
	if (!defined $result) {
		croak "Failed to read from $self->{path}: $!"
	} elsif ($result == $len) {
		$data
	} elsif ($result == 0) {
		croak "$self->{path} is at end of file"
	} elsif ($result < $len) {
		croak "Only read $result bytes of $self->{path} before reaching EOF"
	}
}

sub read_formatted {
	my ($self, $len_of_format, $format) = @_;
	unpack $format, $self->read_data($len_of_format);
}

sub read_char  { shift->read_formatted(1, 'C') }
sub read_short { shift->read_formatted(2, 'n') }
sub read_int   { shift->read_formatted(4, 'N') }
sub read_long  { shift->read_formatted(8, 'Q>') }

sub read_tiny_text {
	my ($self, $encoding) = @_;
	my $data = $self->read_data($self->read_char);
	if (length $data == 255) {
		$data = unpack 'Z*', $data;
	}
	$encoding //= $self->{encoding};
	decode $encoding, $data;
}

sub read_text {
	my ($self, $encoding) = @_;
	my $data = $self->read_data($self->read_short);
	$encoding //= $self->{encoding};
	decode $encoding, $data;
}

sub read_large_byte_string {
	my ($self) = @_;
	$self->read_data($self->read_short)
}

sub read_tag {
	my ($self) = @_;
	my $name  = $self->read_tiny_text;
	my $value = $self->read_tiny_text;
	($name, $value)
}

sub read_tags {
	my ($self) = @_;
	my $tag_count = $self->read_char;
	map { $self->read_tag } 1..$tag_count
}

sub read_content_types {
	my ($self) = @_;
	my $content_type_count = $self->read_char;
	map { $self->read_text } 1..$content_type_count
}

sub read_positions {
	my ($self) = @_;
	my $count = $self->read_int;
	my @positions = map { $self->read_long } 1..$count;
	my $relative_to = $self->ftell;
	map { $relative_to + $_ } @positions
}

sub fseek {
	my ($self, $position) = @_;
	seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
}

sub ftell {
	my ($self) = @_;
	my $result = tell $self->{fh};
	croak "Failed to tell position in file" if $result == -1;
	$result
}

sub uncompress {
	my ($self, $data) = @_;
	$UNCOMPRESS{$self->{header}{compression}}->($data)
}

sub read_header {
	my ($self) = @_;
	my $magic = $self->read_data(length MAGIC);
	croak "Not a SLOB dictionary" unless MAGIC eq $magic;
	my $uuid = $self->read_data(16);

	my $encoding = $self->read_tiny_text('UTF-8');
	$self->{encoding} = $encoding;

	my $compression = $self->read_tiny_text;
	die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
	my %tags = $self->read_tags;
	my @content_types = $self->read_content_types;
	my $blob_count = $self->read_int;
	my $store_offset = $self->read_long;
	my $size = $self->read_long;
	my @refs = $self->read_positions;

	$self->fseek($store_offset);
	my @storage_bins = $self->read_positions;

	+{
		uuid => $uuid,
		encoding => $encoding,
		compression => $compression,
		tags => \%tags,
		content_types => \@content_types,
		blob_count => $blob_count,
		store_offset => $store_offset,
		size => $size,
		refs => \@refs,
		storage_bins => \@storage_bins,
	}
}

sub read_ref {
	my ($self) = @_;
	my $key = $self->read_text;
	my $bin_index = $self->read_int;
	my $item_index = $self->read_short;
	my $fragment = $self->read_tiny_text;
	+{
		key => $key,
		bin_index => $bin_index,
		item_index => $item_index,
		fragment => $fragment,
	}
}

sub read_storage_bin {
	my ($self) = @_;
	my $count = $self->read_int;
	my @content_types = map { $self->read_char } 1..$count;
	my $compressed_size = $self->read_int;
	my $compressed_data = $self->read_data($compressed_size);
	my $uncompressed_data = $self->uncompress($compressed_data);

	my @positions = unpack "N$count", $uncompressed_data;
	my $data = substr $uncompressed_data, $count * 4;
	+{
		positions => \@positions,
		data => $data
	}
}

sub ref_count { shift @{shift->{header}{refs}} }

sub seek_and_read_ref {
	my ($self, $index) = @_;
	croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
	$self->fseek($self->{header}{refs}[$index]);
	$self->read_ref
}

sub seek_and_read_storage_bin {
	my ($self, $index) = @_;
	croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
	$self->fseek($self->{header}{storage_bins}[$index]);
	$self->read_storage_bin
}

sub get_entry_of_storage_bin {
	my ($self, $storage_bin, $index) = @_;
	my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
	my $length = unpack 'N', $start_of_data;
	substr $start_of_data, 4, $length;
}

sub seek_and_read_ref_and_data {
	my ($self, $index) = @_;
	my $ref = $self->seek_and_read_ref($index);
	my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
	my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
	$ref->{data} = $data;
	$ref
}

1;
__END__

=encoding utf-8

=head1 NAME

Slob - Read .slob dictionaries (as used by Aard 2)

=head1 SYNOPSIS

  use feature qw/:5.14/;
  use Slob;
  my $slob = Slob->new('path/to/dict.slob');

  my $nr_of_entries = $slob->ref_count; # if the same content has two
                                        # keys pointing to it, this
                                        # counts it twice

  my $second_ref = $slob->seek_and_read_ref(4);
  say "Entry is for $second_ref->{key}";
  say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";

  my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
  say "Bin has ", (scalar @{$bin->{positions}}), " entries";
  say "Value at position $second_ref->{item_index} is ",
    $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});

  # instead of the above, we can do
  my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
  say "Entry is for $second_ref_and_data->{key}";
  say "Value is $second_ref_and_data->{data}";

=head1 DESCRIPTION

Slob is a compressed read-only format for storing natural language
dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
dictionaries in slob format.

The following methods are available:

=over

=item Slob->B<new>(I<$path>)
=item Slob->B<new>(I<$fh>)

Create a new slob reader reading from the given path or filehandle.

=item $slob->B<ref_count>

The number of refs (keys) in the dictionary.

=item $slob->B<seek_and_read_ref>(I<$index>)

Read the ref (key) at the given index. Returns a hashref with the
following keys:

=over

=item key

The key

=item bin_index

The storage bin that contains the value for this key

=item item_index

The index in the bin_index storage bin of the value for this key

=item fragment

HTML fragment that, when applied to the HTML value, points to the
definition of the key.

=back

=item $slob->B<seek_and_read_storage_bin>(I<$index>)

Read the storage bin with the given index. Returns the storage bin,
which can later be given to B<get_entry_of_storage_bin>.

=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)

Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
item index, returns the value at the index i nthe storage bin.

=item $slob->B<seek_and_read_ref_and_data>($index)

Convenience method that returns the key and value at a given index.
Returns a hashref like C<seek_and_read_ref> with an extra key,
I<data>, which is the value of the key.

=back

=head1 SEE ALSO

L<https://github.com/itkach/slob>

=head1 AUTHOR

Marius Gavrilescu, E<lt>marius@ieval.roE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2017 by Marius Gavrilescu

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.26.1 or,
at your option, any later version of Perl 5 you may have available.


=cut
Commit	Line	Data
	1	package Slob;
	2
	3	use 5.014000;
	4	use strict;
	5	use warnings;
	6	our $VERSION = '0.001';
	7
	8	use constant MAGIC => "!-1SLOB\x1F";
	9
	10	use Carp qw/croak verbose/;
	11	use Encode;
	12
	13	use Compress::Raw::Lzma;
	14
	15	our %UNCOMPRESS = (
	16	'' => sub { $_[0] },
	17	'lzma2' => sub {
	18	my ($input) = @_;
	19	my ($lzma2, $code, $output);
	20	($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
	21	die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
	22	$code = $lzma2->code($input, $output);
	23	die "Did not reach end of stream" if $code == LZMA_OK;
	24	die "Error decoding LZMA2: $code" if $code != LZMA_STREAM_END;
	25	$output
	26	}
	27	);
	28
	29	sub new {
	30	my ($class, $path) = @_;
	31	my $fh;
	32	if (ref $path eq 'IO') {
	33	$fh = $path
	34	} else {
	35	open $fh, '<', $path or croak "Cannot open \"$path\": $!"
	36	}
	37	my $self = bless {path => $path, fh => $fh}, $class;
	38	$self->{header} = $self->read_header;
	39	$self
	40	}
	41
	42	sub read_data {
	43	my ($self, $len) = @_;
	44	my $data;
	45	my $result = read $self->{fh}, $data, $len;
	46	if (!defined $result) {
	47	croak "Failed to read from $self->{path}: $!"
	48	} elsif ($result == $len) {
	49	$data
	50	} elsif ($result == 0) {
	51	croak "$self->{path} is at end of file"
	52	} elsif ($result < $len) {
	53	croak "Only read $result bytes of $self->{path} before reaching EOF"
	54	}
	55	}
	56
	57	sub read_formatted {
	58	my ($self, $len_of_format, $format) = @_;
	59	unpack $format, $self->read_data($len_of_format);
	60	}
	61
	62	sub read_char { shift->read_formatted(1, 'C') }
	63	sub read_short { shift->read_formatted(2, 'n') }
	64	sub read_int { shift->read_formatted(4, 'N') }
	65	sub read_long { shift->read_formatted(8, 'Q>') }
	66
	67	sub read_tiny_text {
	68	my ($self, $encoding) = @_;
	69	my $data = $self->read_data($self->read_char);
	70	if (length $data == 255) {
	71	$data = unpack 'Z*', $data;
	72	}
	73	$encoding //= $self->{encoding};
	74	decode $encoding, $data;
	75	}
	76
	77	sub read_text {
	78	my ($self, $encoding) = @_;
	79	my $data = $self->read_data($self->read_short);
	80	$encoding //= $self->{encoding};
	81	decode $encoding, $data;
	82	}
	83
	84	sub read_large_byte_string {
	85	my ($self) = @_;
	86	$self->read_data($self->read_short)
	87	}
	88
	89	sub read_tag {
	90	my ($self) = @_;
	91	my $name = $self->read_tiny_text;
	92	my $value = $self->read_tiny_text;
	93	($name, $value)
	94	}
	95
	96	sub read_tags {
	97	my ($self) = @_;
	98	my $tag_count = $self->read_char;
	99	map { $self->read_tag } 1..$tag_count
	100	}
	101
	102	sub read_content_types {
	103	my ($self) = @_;
	104	my $content_type_count = $self->read_char;
	105	map { $self->read_text } 1..$content_type_count
	106	}
	107
	108	sub read_positions {
	109	my ($self) = @_;
	110	my $count = $self->read_int;
	111	my @positions = map { $self->read_long } 1..$count;
	112	my $relative_to = $self->ftell;
	113	map { $relative_to + $_ } @positions
	114	}
	115
	116	sub fseek {
	117	my ($self, $position) = @_;
	118	seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
	119	}
	120
	121	sub ftell {
	122	my ($self) = @_;
	123	my $result = tell $self->{fh};
	124	croak "Failed to tell position in file" if $result == -1;
	125	$result
	126	}
	127
	128	sub uncompress {
	129	my ($self, $data) = @_;
	130	$UNCOMPRESS{$self->{header}{compression}}->($data)
	131	}
	132
	133	sub read_header {
	134	my ($self) = @_;
	135	my $magic = $self->read_data(length MAGIC);
	136	croak "Not a SLOB dictionary" unless MAGIC eq $magic;
	137	my $uuid = $self->read_data(16);
	138
	139	my $encoding = $self->read_tiny_text('UTF-8');
	140	$self->{encoding} = $encoding;
	141
	142	my $compression = $self->read_tiny_text;
	143	die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
	144	my %tags = $self->read_tags;
	145	my @content_types = $self->read_content_types;
	146	my $blob_count = $self->read_int;
	147	my $store_offset = $self->read_long;
	148	my $size = $self->read_long;
	149	my @refs = $self->read_positions;
	150
	151	$self->fseek($store_offset);
	152	my @storage_bins = $self->read_positions;
	153
	154	+{
	155	uuid => $uuid,
	156	encoding => $encoding,
	157	compression => $compression,
	158	tags => \%tags,
	159	content_types => \@content_types,
	160	blob_count => $blob_count,
	161	store_offset => $store_offset,
	162	size => $size,
	163	refs => \@refs,
	164	storage_bins => \@storage_bins,
	165	}
	166	}
	167
	168	sub read_ref {
	169	my ($self) = @_;
	170	my $key = $self->read_text;
	171	my $bin_index = $self->read_int;
	172	my $item_index = $self->read_short;
	173	my $fragment = $self->read_tiny_text;
	174	+{
	175	key => $key,
	176	bin_index => $bin_index,
	177	item_index => $item_index,
	178	fragment => $fragment,
	179	}
	180	}
	181
	182	sub read_storage_bin {
	183	my ($self) = @_;
	184	my $count = $self->read_int;
	185	my @content_types = map { $self->read_char } 1..$count;
	186	my $compressed_size = $self->read_int;
	187	my $compressed_data = $self->read_data($compressed_size);
	188	my $uncompressed_data = $self->uncompress($compressed_data);
	189
	190	my @positions = unpack "N$count", $uncompressed_data;
	191	my $data = substr $uncompressed_data, $count * 4;
	192	+{
	193	positions => \@positions,
	194	data => $data
	195	}
	196	}
	197
	198	sub ref_count { shift @{shift->{header}{refs}} }
	199
	200	sub seek_and_read_ref {
	201	my ($self, $index) = @_;
	202	croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
	203	$self->fseek($self->{header}{refs}[$index]);
	204	$self->read_ref
	205	}
	206
	207	sub seek_and_read_storage_bin {
	208	my ($self, $index) = @_;
	209	croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
	210	$self->fseek($self->{header}{storage_bins}[$index]);
	211	$self->read_storage_bin
	212	}
	213
	214	sub get_entry_of_storage_bin {
	215	my ($self, $storage_bin, $index) = @_;
	216	my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
	217	my $length = unpack 'N', $start_of_data;
	218	substr $start_of_data, 4, $length;
	219	}
	220
	221	sub seek_and_read_ref_and_data {
	222	my ($self, $index) = @_;
	223	my $ref = $self->seek_and_read_ref($index);
	224	my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
	225	my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
	226	$ref->{data} = $data;
	227	$ref
	228	}
	229
	230	1;
	231	__END__
	232
	233	=encoding utf-8
	234
	235	=head1 NAME
	236
	237	Slob - Read .slob dictionaries (as used by Aard 2)
	238
	239	=head1 SYNOPSIS
	240
	241	use feature qw/:5.14/;
	242	use Slob;
	243	my $slob = Slob->new('path/to/dict.slob');
	244
	245	my $nr_of_entries = $slob->ref_count; # if the same content has two
	246	# keys pointing to it, this
	247	# counts it twice
	248
	249	my $second_ref = $slob->seek_and_read_ref(4);
	250	say "Entry is for $second_ref->{key}";
	251	say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
	252
	253	my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
	254	say "Bin has ", (scalar @{$bin->{positions}}), " entries";
	255	say "Value at position $second_ref->{item_index} is ",
	256	$slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
	257
	258	# instead of the above, we can do
	259	my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
	260	say "Entry is for $second_ref_and_data->{key}";
	261	say "Value is $second_ref_and_data->{data}";
	262
	263	=head1 DESCRIPTION
	264
	265	Slob is a compressed read-only format for storing natural language
	266	dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
	267	dictionaries in slob format.
	268
	269	The following methods are available:
	270
	271	=over
	272
	273	=item Slob->B<new>(I<$path>)
	274	=item Slob->B<new>(I<$fh>)
	275
	276	Create a new slob reader reading from the given path or filehandle.
	277
	278	=item $slob->B<ref_count>
	279
	280	The number of refs (keys) in the dictionary.
	281
	282	=item $slob->B<seek_and_read_ref>(I<$index>)
	283
	284	Read the ref (key) at the given index. Returns a hashref with the
	285	following keys:
	286
	287	=over
	288
	289	=item key
	290
	291	The key
	292
	293	=item bin_index
	294
	295	The storage bin that contains the value for this key
	296
	297	=item item_index
	298
	299	The index in the bin_index storage bin of the value for this key
	300
	301	=item fragment
	302
	303	HTML fragment that, when applied to the HTML value, points to the
	304	definition of the key.
	305
	306	=back
	307
	308	=item $slob->B<seek_and_read_storage_bin>(I<$index>)
	309
	310	Read the storage bin with the given index. Returns the storage bin,
	311	which can later be given to B<get_entry_of_storage_bin>.
	312
	313	=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
	314
	315	Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
	316	item index, returns the value at the index i nthe storage bin.
	317
	318	=item $slob->B<seek_and_read_ref_and_data>($index)
	319
	320	Convenience method that returns the key and value at a given index.
	321	Returns a hashref like C<seek_and_read_ref> with an extra key,
	322	I<data>, which is the value of the key.
	323
	324	=back
	325
	326	=head1 SEE ALSO
	327
	328	L<https://github.com/itkach/slob>
	329
	330	=head1 AUTHOR
	331
	332	Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
	333
	334	=head1 COPYRIGHT AND LICENSE
	335
	336	Copyright (C) 2017 by Marius Gavrilescu
	337
	338	This library is free software; you can redistribute it and/or modify
	339	it under the same terms as Perl itself, either Perl version 5.26.1 or,
	340	at your option, any later version of Perl 5 you may have available.
	341
	342
	343	=cut