6 our $VERSION = '0.000_001';
8 use constant MAGIC
=> "!-1SLOB\x1F";
10 use Carp qw
/croak verbose/;
14 my ($class, $path) = @_;
18 : open my $fh, '<', $path or croak
"Cannot open \"$path\": $!";
19 my $self = bless {path
=> $path, fh
=> $fh}, $class;
20 $self->{header
} = $self->read_header;
25 my ($self, $len) = @_;
27 my $result = read $self->{fh
}, $data, $len;
28 if (!defined $result) {
29 croak
"Failed to read from $self->{path}: $!"
30 } elsif ($result == $len) {
32 } elsif ($result == 0) {
33 croak
"$self->{path} is at end of file"
34 } elsif ($result < $len) {
35 croak
"Only read $result bytes of $self->{path} before reaching EOF"
40 my ($self, $len_of_format, $format) = @_;
41 unpack $format, $self->read_data($len_of_format);
44 sub read_char
{ shift->read_formatted(1, 'C') }
45 sub read_short
{ shift->read_formatted(2, 'n') }
46 sub read_int
{ shift->read_formatted(4, 'N') }
47 sub read_long
{ shift->read_formatted(8, 'Q>') }
50 my ($self, $encoding) = @_;
51 my $data = $self->read_data($self->read_char);
52 if (length $data == 255) {
53 $data = unpack 'Z*', $data;
55 $encoding //= $self->{encoding
};
56 decode
$encoding, $data;
60 my ($self, $encoding) = @_;
61 my $data = $self->read_data($self->read_short);
62 $encoding //= $self->{encoding
};
63 decode
$encoding, $data;
66 sub read_large_byte_string
{
68 $self->read_data($self->read_short)
73 my $name = $self->read_tiny_text;
74 my $value = $self->read_tiny_text;
80 my $tag_count = $self->read_char;
81 map { $self->read_tag } 1..$tag_count
84 sub read_content_types
{
86 my $content_type_count = $self->read_char;
87 map { $self->read_text } 1..$content_type_count
92 my $count = $self->read_int;
93 my @positions = map { $self->read_long } 1..$count;
94 my $relative_to = $self->ftell;
95 map { $relative_to + $_ } @positions
99 my ($self, $position) = @_;
100 seek $self->{fh
}, $position, 0 or croak
"Failed to seek to byte $position"
105 my $result = tell $self->{fh
};
106 croak
"Failed to tell position in file" if $result == -1;
111 my ($self, $data) = @_;
117 my $magic = $self->read_data(length MAGIC
);
118 croak
"Not a SLOB dictionary" unless MAGIC
eq $magic;
119 my $uuid = $self->read_data(16);
121 my $encoding = $self->read_tiny_text('UTF-8');
122 $self->{encoding
} = $encoding;
124 my $compression = $self->read_tiny_text;
125 die "Compression not yet supported" if $compression;
126 my %tags = $self->read_tags;
127 my @content_types = $self->read_content_types;
128 my $blob_count = $self->read_int;
129 my $store_offset = $self->read_long;
130 my $size = $self->read_long;
131 my @refs = $self->read_positions;
133 $self->fseek($store_offset);
134 my @storage_bins = $self->read_positions;
138 encoding
=> $encoding,
139 compression
=> $compression,
141 content_types
=> \
@content_types,
142 blob_count
=> $blob_count,
143 store_offset
=> $store_offset,
146 storage_bins
=> \
@storage_bins,
152 my $key = $self->read_text;
153 my $bin_index = $self->read_int;
154 my $item_index = $self->read_short;
155 my $fragment = $self->read_tiny_text;
158 bin_index
=> $bin_index,
159 item_index
=> $item_index,
160 fragment
=> $fragment,
164 sub read_storage_bin
{
166 my $count = $self->read_int;
167 my @content_types = map { $self->read_char } 1..$count;
168 my $compressed_size = $self->read_int;
169 my $compressed_data = $self->read_data($compressed_size);
170 my $uncompressed_data = $self->uncompress($compressed_data);
172 my @positions = unpack "N$count", $uncompressed_data;
173 my $data = substr $uncompressed_data, $count * 4;
175 positions
=> \
@positions,
180 sub ref_count
{ shift @
{shift->{header
}{refs
}} }
182 sub seek_and_read_ref
{
183 my ($self, $index) = @_;
184 croak
"No ref has index $index" unless exists $self->{header
}{refs
}[$index];
185 $self->fseek($self->{header
}{refs
}[$index]);
189 sub seek_and_read_storage_bin
{
190 my ($self, $index) = @_;
191 croak
"No storage bin has index $index" unless exists $self->{header
}{storage_bins
}[$index];
192 $self->fseek($self->{header
}{storage_bins
}[$index]);
193 $self->read_storage_bin
196 sub get_entry_of_storage_bin
{
197 my ($self, $storage_bin, $index) = @_;
198 my $start_of_data = substr $storage_bin->{data
}, $storage_bin->{positions
}[$index];
199 my $length = unpack 'N', $start_of_data;
200 substr $start_of_data, 4, $length;
203 sub seek_and_read_ref_and_data
{
204 my ($self, $index) = @_;
205 my $ref = $self->seek_and_read_ref($index);
206 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index
});
207 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index
});
208 $ref->{data
} = $data;
219 Slob - Read .slob dictionaries (as used by Aard 2)
223 use feature qw/:5.14/;
225 my $slob = Slob->new('path/to/dict.slob');
227 my $nr_of_entries = $slob->ref_count; # if the same content has two
228 # keys pointing to it, this
231 my $second_ref = $slob->seek_and_read_ref(4);
232 say "Entry is for $second_ref->{key}";
233 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
235 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
236 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
237 say "Value at position $second_ref->{item_index} is ",
238 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
240 # instead of the above, we can do
241 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
242 say "Entry is for $second_ref_and_data->{key}";
243 say "Value is $second_ref_and_data->{data}";
247 Slob is a compressed read-only format for storing natural language
248 dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
249 dictionaries in slob format.
251 The following methods are available:
255 =item Slob->B<new>(I<$path>)
256 =item Slob->B<new>(I<$fh>)
258 Create a new slob reader reading from the given path or filehandle.
260 =item $slob->B<ref_count>
262 The number of refs (keys) in the dictionary.
264 =item $slob->B<seek_and_read_ref>(I<$index>)
266 Read the ref (key) at the given index. Returns a hashref with the
277 The storage bin that contains the value for this key
281 The index in the bin_index storage bin of the value for this key
285 HTML fragment that, when applied to the HTML value, points to the
286 definition of the key.
290 =item $slob->B<seek_and_read_storage_bin>(I<$index>)
292 Read the storage bin with the given index. Returns the storage bin,
293 which can later be given to B<get_entry_of_storage_bin>.
295 =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
297 Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
298 item index, returns the value at the index i nthe storage bin.
300 =item $slob->B<seek_and_read_ref_and_data>($index)
302 Convenience method that returns the key and value at a given index.
303 Returns a hashref like C<seek_and_read_ref> with an extra key,
304 I<data>, which is the value of the key.
310 L<https://github.com/itkach/slob>
314 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
316 =head1 COPYRIGHT AND LICENSE
318 Copyright (C) 2017 by Marius Gavrilescu
320 This library is free software; you can redistribute it and/or modify
321 it under the same terms as Perl itself, either Perl version 5.26.1 or,
322 at your option, any later version of Perl 5 you may have available.