Bump version and update Changes
[slob.git] / lib / Slob.pm
CommitLineData
d3779104
MG
1package Slob;
2
3use 5.014000;
4use strict;
5use warnings;
ec907799 6our $VERSION = '0.002_001';
d3779104
MG
7
8use constant MAGIC => "!-1SLOB\x1F";
9
10use Carp qw/croak verbose/;
11use Encode;
12
1ca60f55 13use Compress::Raw::Bzip2;
d50f3958 14use Compress::Raw::Lzma;
1ca60f55 15use Compress::Raw::Zlib;
d50f3958 16
c4911b1f
MG
17# MD5 only used for debugging output in tests
18use Digest::MD5 qw/md5_hex/;
19
d50f3958
MG
20our %UNCOMPRESS = (
21 '' => sub { $_[0] },
22 'lzma2' => sub {
23 my ($input) = @_;
24 my ($lzma2, $code, $output);
25 ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
26 die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
1ca60f55 27
d50f3958 28 $code = $lzma2->code($input, $output);
1ca60f55
MG
29 die "Did not reach end of stream\n" if $code == LZMA_OK;
30 die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
31 $output
32 },
33
34 'bz2' => sub {
35 my ($input) = @_;
36 my ($bz2, $code, $output);
37 ($bz2, $code)= Compress::Raw::Bunzip2->new;
38 die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
39
40 $code = $bz2->bzinflate($input, $output);
41 die "Did not reach end of stream\n" if $code == BZ_OK;
42 die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
43
44 $output
45 },
46
47 'zlib' => sub {
48 my ($input) = @_;
49 my ($zlib, $code, $output);
50 ($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
51 -WindowBits => WANT_GZIP_OR_ZLIB
52 );
53 die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
54
55 $code = $zlib->inflate($input, \$output, 1);
56 die "Did not reach end of stream\n" if $code == Z_OK;
57 die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
d50f3958
MG
58 $output
59 }
60);
61
d3779104
MG
62sub new {
63 my ($class, $path) = @_;
d50f3958
MG
64 my $fh;
65 if (ref $path eq 'IO') {
66 $fh = $path
67 } else {
68 open $fh, '<', $path or croak "Cannot open \"$path\": $!"
69 }
d3779104
MG
70 my $self = bless {path => $path, fh => $fh}, $class;
71 $self->{header} = $self->read_header;
72 $self
73}
74
75sub read_data {
76 my ($self, $len) = @_;
77 my $data;
78 my $result = read $self->{fh}, $data, $len;
79 if (!defined $result) {
80 croak "Failed to read from $self->{path}: $!"
81 } elsif ($result == $len) {
82 $data
83 } elsif ($result == 0) {
84 croak "$self->{path} is at end of file"
85 } elsif ($result < $len) {
86 croak "Only read $result bytes of $self->{path} before reaching EOF"
87 }
88}
89
90sub read_formatted {
91 my ($self, $len_of_format, $format) = @_;
92 unpack $format, $self->read_data($len_of_format);
93}
94
95sub read_char { shift->read_formatted(1, 'C') }
96sub read_short { shift->read_formatted(2, 'n') }
97sub read_int { shift->read_formatted(4, 'N') }
98sub read_long { shift->read_formatted(8, 'Q>') }
99
100sub read_tiny_text {
101 my ($self, $encoding) = @_;
102 my $data = $self->read_data($self->read_char);
103 if (length $data == 255) {
104 $data = unpack 'Z*', $data;
105 }
106 $encoding //= $self->{encoding};
107 decode $encoding, $data;
108}
109
110sub read_text {
111 my ($self, $encoding) = @_;
112 my $data = $self->read_data($self->read_short);
113 $encoding //= $self->{encoding};
114 decode $encoding, $data;
115}
116
117sub read_large_byte_string {
118 my ($self) = @_;
119 $self->read_data($self->read_short)
120}
121
122sub read_tag {
123 my ($self) = @_;
124 my $name = $self->read_tiny_text;
125 my $value = $self->read_tiny_text;
126 ($name, $value)
127}
128
129sub read_tags {
130 my ($self) = @_;
131 my $tag_count = $self->read_char;
132 map { $self->read_tag } 1..$tag_count
133}
134
135sub read_content_types {
136 my ($self) = @_;
137 my $content_type_count = $self->read_char;
138 map { $self->read_text } 1..$content_type_count
139}
140
141sub read_positions {
142 my ($self) = @_;
143 my $count = $self->read_int;
144 my @positions = map { $self->read_long } 1..$count;
145 my $relative_to = $self->ftell;
146 map { $relative_to + $_ } @positions
147}
148
149sub fseek {
150 my ($self, $position) = @_;
151 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
152}
153
154sub ftell {
155 my ($self) = @_;
156 my $result = tell $self->{fh};
157 croak "Failed to tell position in file" if $result == -1;
158 $result
159}
160
161sub uncompress {
c4911b1f
MG
162 my ($self, $data) = @_;
163 my $compression = $self->{header}{compression};
164 if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') {
165 my $prefix = unpack 'H*', substr $data, 0, 10;
166 my $md5sum = md5_hex $data;
167 Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum";
168 }
169 $UNCOMPRESS{$compression}->($data)
d3779104
MG
170}
171
172sub read_header {
173 my ($self) = @_;
174 my $magic = $self->read_data(length MAGIC);
175 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
176 my $uuid = $self->read_data(16);
177
178 my $encoding = $self->read_tiny_text('UTF-8');
179 $self->{encoding} = $encoding;
180
181 my $compression = $self->read_tiny_text;
d50f3958 182 die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
d3779104
MG
183 my %tags = $self->read_tags;
184 my @content_types = $self->read_content_types;
185 my $blob_count = $self->read_int;
186 my $store_offset = $self->read_long;
187 my $size = $self->read_long;
188 my @refs = $self->read_positions;
189
190 $self->fseek($store_offset);
191 my @storage_bins = $self->read_positions;
192
193 +{
194 uuid => $uuid,
195 encoding => $encoding,
196 compression => $compression,
197 tags => \%tags,
198 content_types => \@content_types,
199 blob_count => $blob_count,
200 store_offset => $store_offset,
201 size => $size,
202 refs => \@refs,
203 storage_bins => \@storage_bins,
204 }
205}
206
207sub read_ref {
208 my ($self) = @_;
209 my $key = $self->read_text;
210 my $bin_index = $self->read_int;
211 my $item_index = $self->read_short;
212 my $fragment = $self->read_tiny_text;
213 +{
214 key => $key,
215 bin_index => $bin_index,
216 item_index => $item_index,
217 fragment => $fragment,
218 }
219}
220
221sub read_storage_bin {
222 my ($self) = @_;
223 my $count = $self->read_int;
224 my @content_types = map { $self->read_char } 1..$count;
225 my $compressed_size = $self->read_int;
226 my $compressed_data = $self->read_data($compressed_size);
227 my $uncompressed_data = $self->uncompress($compressed_data);
228
229 my @positions = unpack "N$count", $uncompressed_data;
230 my $data = substr $uncompressed_data, $count * 4;
231 +{
232 positions => \@positions,
233 data => $data
234 }
235}
236
237sub ref_count { shift @{shift->{header}{refs}} }
238
239sub seek_and_read_ref {
240 my ($self, $index) = @_;
241 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
242 $self->fseek($self->{header}{refs}[$index]);
243 $self->read_ref
244}
245
246sub seek_and_read_storage_bin {
247 my ($self, $index) = @_;
248 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
249 $self->fseek($self->{header}{storage_bins}[$index]);
250 $self->read_storage_bin
251}
252
253sub get_entry_of_storage_bin {
254 my ($self, $storage_bin, $index) = @_;
255 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
256 my $length = unpack 'N', $start_of_data;
257 substr $start_of_data, 4, $length;
258}
259
0e15b496
MG
260sub seek_and_read_ref_and_data {
261 my ($self, $index) = @_;
262 my $ref = $self->seek_and_read_ref($index);
263 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
264 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
265 $ref->{data} = $data;
266 $ref
267}
268
d3779104
MG
2691;
270__END__
271
272=encoding utf-8
273
274=head1 NAME
275
276Slob - Read .slob dictionaries (as used by Aard 2)
277
278=head1 SYNOPSIS
279
280 use feature qw/:5.14/;
281 use Slob;
282 my $slob = Slob->new('path/to/dict.slob');
283
284 my $nr_of_entries = $slob->ref_count; # if the same content has two
285 # keys pointing to it, this
286 # counts it twice
287
288 my $second_ref = $slob->seek_and_read_ref(4);
289 say "Entry is for $second_ref->{key}";
290 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
291
292 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
293 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
294 say "Value at position $second_ref->{item_index} is ",
295 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
296
0e15b496
MG
297 # instead of the above, we can do
298 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
299 say "Entry is for $second_ref_and_data->{key}";
300 say "Value is $second_ref_and_data->{data}";
301
d3779104
MG
302=head1 DESCRIPTION
303
632d3de3
MG
304Slob is a compressed read-only format for storing natural language
305dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
306dictionaries in slob format.
307
308The following methods are available:
309
310=over
311
312=item Slob->B<new>(I<$path>)
313=item Slob->B<new>(I<$fh>)
314
315Create a new slob reader reading from the given path or filehandle.
316
317=item $slob->B<ref_count>
318
319The number of refs (keys) in the dictionary.
320
321=item $slob->B<seek_and_read_ref>(I<$index>)
322
323Read the ref (key) at the given index. Returns a hashref with the
324following keys:
325
326=over
327
328=item key
329
330The key
331
332=item bin_index
333
334The storage bin that contains the value for this key
335
336=item item_index
337
338The index in the bin_index storage bin of the value for this key
339
340=item fragment
341
342HTML fragment that, when applied to the HTML value, points to the
343definition of the key.
344
345=back
346
347=item $slob->B<seek_and_read_storage_bin>(I<$index>)
348
349Read the storage bin with the given index. Returns the storage bin,
350which can later be given to B<get_entry_of_storage_bin>.
351
352=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
353
354Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
355item index, returns the value at the index i nthe storage bin.
356
357=item $slob->B<seek_and_read_ref_and_data>($index)
358
359Convenience method that returns the key and value at a given index.
360Returns a hashref like C<seek_and_read_ref> with an extra key,
361I<data>, which is the value of the key.
362
363=back
364
365=head1 SEE ALSO
366
367L<https://github.com/itkach/slob>
d3779104
MG
368
369=head1 AUTHOR
370
371Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
372
373=head1 COPYRIGHT AND LICENSE
374
35224ef9 375Copyright (C) 2017-2018 by Marius Gavrilescu
d3779104
MG
376
377This library is free software; you can redistribute it and/or modify
378it under the same terms as Perl itself, either Perl version 5.26.1 or,
379at your option, any later version of Perl 5 you may have available.
380
381
382=cut
This page took 0.03068 seconds and 4 git commands to generate.