6 our $VERSION = '0.001';
8 use constant MAGIC
=> "!-1SLOB\x1F";
10 use Carp qw
/croak verbose/;
13 use Compress
::Raw
::Lzma
;
19 my ($lzma2, $code, $output);
20 ($lzma2, $code) = Compress
::Raw
::Lzma
::RawDecoder
->new(Filter
=> Lzma
::Filter
::Lzma2
());
21 die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK
;
22 $code = $lzma2->code($input, $output);
23 die "Did not reach end of stream" if $code == LZMA_OK
;
24 die "Error decoding LZMA2: $code" if $code != LZMA_STREAM_END
;
30 my ($class, $path) = @_;
32 if (ref $path eq 'IO') {
35 open $fh, '<', $path or croak
"Cannot open \"$path\": $!"
37 my $self = bless {path
=> $path, fh
=> $fh}, $class;
38 $self->{header
} = $self->read_header;
43 my ($self, $len) = @_;
45 my $result = read $self->{fh
}, $data, $len;
46 if (!defined $result) {
47 croak
"Failed to read from $self->{path}: $!"
48 } elsif ($result == $len) {
50 } elsif ($result == 0) {
51 croak
"$self->{path} is at end of file"
52 } elsif ($result < $len) {
53 croak
"Only read $result bytes of $self->{path} before reaching EOF"
58 my ($self, $len_of_format, $format) = @_;
59 unpack $format, $self->read_data($len_of_format);
62 sub read_char
{ shift->read_formatted(1, 'C') }
63 sub read_short
{ shift->read_formatted(2, 'n') }
64 sub read_int
{ shift->read_formatted(4, 'N') }
65 sub read_long
{ shift->read_formatted(8, 'Q>') }
68 my ($self, $encoding) = @_;
69 my $data = $self->read_data($self->read_char);
70 if (length $data == 255) {
71 $data = unpack 'Z*', $data;
73 $encoding //= $self->{encoding
};
74 decode
$encoding, $data;
78 my ($self, $encoding) = @_;
79 my $data = $self->read_data($self->read_short);
80 $encoding //= $self->{encoding
};
81 decode
$encoding, $data;
84 sub read_large_byte_string
{
86 $self->read_data($self->read_short)
91 my $name = $self->read_tiny_text;
92 my $value = $self->read_tiny_text;
98 my $tag_count = $self->read_char;
99 map { $self->read_tag } 1..$tag_count
102 sub read_content_types
{
104 my $content_type_count = $self->read_char;
105 map { $self->read_text } 1..$content_type_count
110 my $count = $self->read_int;
111 my @positions = map { $self->read_long } 1..$count;
112 my $relative_to = $self->ftell;
113 map { $relative_to + $_ } @positions
117 my ($self, $position) = @_;
118 seek $self->{fh
}, $position, 0 or croak
"Failed to seek to byte $position"
123 my $result = tell $self->{fh
};
124 croak
"Failed to tell position in file" if $result == -1;
129 my ($self, $data) = @_;
130 $UNCOMPRESS{$self->{header
}{compression
}}->($data)
135 my $magic = $self->read_data(length MAGIC
);
136 croak
"Not a SLOB dictionary" unless MAGIC
eq $magic;
137 my $uuid = $self->read_data(16);
139 my $encoding = $self->read_tiny_text('UTF-8');
140 $self->{encoding
} = $encoding;
142 my $compression = $self->read_tiny_text;
143 die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
144 my %tags = $self->read_tags;
145 my @content_types = $self->read_content_types;
146 my $blob_count = $self->read_int;
147 my $store_offset = $self->read_long;
148 my $size = $self->read_long;
149 my @refs = $self->read_positions;
151 $self->fseek($store_offset);
152 my @storage_bins = $self->read_positions;
156 encoding
=> $encoding,
157 compression
=> $compression,
159 content_types
=> \
@content_types,
160 blob_count
=> $blob_count,
161 store_offset
=> $store_offset,
164 storage_bins
=> \
@storage_bins,
170 my $key = $self->read_text;
171 my $bin_index = $self->read_int;
172 my $item_index = $self->read_short;
173 my $fragment = $self->read_tiny_text;
176 bin_index
=> $bin_index,
177 item_index
=> $item_index,
178 fragment
=> $fragment,
182 sub read_storage_bin
{
184 my $count = $self->read_int;
185 my @content_types = map { $self->read_char } 1..$count;
186 my $compressed_size = $self->read_int;
187 my $compressed_data = $self->read_data($compressed_size);
188 my $uncompressed_data = $self->uncompress($compressed_data);
190 my @positions = unpack "N$count", $uncompressed_data;
191 my $data = substr $uncompressed_data, $count * 4;
193 positions
=> \
@positions,
198 sub ref_count
{ shift @
{shift->{header
}{refs
}} }
200 sub seek_and_read_ref
{
201 my ($self, $index) = @_;
202 croak
"No ref has index $index" unless exists $self->{header
}{refs
}[$index];
203 $self->fseek($self->{header
}{refs
}[$index]);
207 sub seek_and_read_storage_bin
{
208 my ($self, $index) = @_;
209 croak
"No storage bin has index $index" unless exists $self->{header
}{storage_bins
}[$index];
210 $self->fseek($self->{header
}{storage_bins
}[$index]);
211 $self->read_storage_bin
214 sub get_entry_of_storage_bin
{
215 my ($self, $storage_bin, $index) = @_;
216 my $start_of_data = substr $storage_bin->{data
}, $storage_bin->{positions
}[$index];
217 my $length = unpack 'N', $start_of_data;
218 substr $start_of_data, 4, $length;
221 sub seek_and_read_ref_and_data
{
222 my ($self, $index) = @_;
223 my $ref = $self->seek_and_read_ref($index);
224 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index
});
225 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index
});
226 $ref->{data
} = $data;
237 Slob - Read .slob dictionaries (as used by Aard 2)
241 use feature qw/:5.14/;
243 my $slob = Slob->new('path/to/dict.slob');
245 my $nr_of_entries = $slob->ref_count; # if the same content has two
246 # keys pointing to it, this
249 my $second_ref = $slob->seek_and_read_ref(4);
250 say "Entry is for $second_ref->{key}";
251 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
253 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
254 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
255 say "Value at position $second_ref->{item_index} is ",
256 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
258 # instead of the above, we can do
259 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
260 say "Entry is for $second_ref_and_data->{key}";
261 say "Value is $second_ref_and_data->{data}";
265 Slob is a compressed read-only format for storing natural language
266 dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
267 dictionaries in slob format.
269 The following methods are available:
273 =item Slob->B<new>(I<$path>)
274 =item Slob->B<new>(I<$fh>)
276 Create a new slob reader reading from the given path or filehandle.
278 =item $slob->B<ref_count>
280 The number of refs (keys) in the dictionary.
282 =item $slob->B<seek_and_read_ref>(I<$index>)
284 Read the ref (key) at the given index. Returns a hashref with the
295 The storage bin that contains the value for this key
299 The index in the bin_index storage bin of the value for this key
303 HTML fragment that, when applied to the HTML value, points to the
304 definition of the key.
308 =item $slob->B<seek_and_read_storage_bin>(I<$index>)
310 Read the storage bin with the given index. Returns the storage bin,
311 which can later be given to B<get_entry_of_storage_bin>.
313 =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
315 Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
316 item index, returns the value at the index i nthe storage bin.
318 =item $slob->B<seek_and_read_ref_and_data>($index)
320 Convenience method that returns the key and value at a given index.
321 Returns a hashref like C<seek_and_read_ref> with an extra key,
322 I<data>, which is the value of the key.
328 L<https://github.com/itkach/slob>
332 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
334 =head1 COPYRIGHT AND LICENSE
336 Copyright (C) 2017 by Marius Gavrilescu
338 This library is free software; you can redistribute it and/or modify
339 it under the same terms as Perl itself, either Perl version 5.26.1 or,
340 at your option, any later version of Perl 5 you may have available.