Binmode + Bump version and update Changes
[slob.git] / lib / Slob.pm
1 package Slob;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 our $VERSION = '0.002002';
7
8 use constant MAGIC => "!-1SLOB\x1F";
9
10 use Carp qw/croak verbose/;
11 use Encode;
12
13 use Compress::Raw::Bzip2;
14 use Compress::Raw::Lzma;
15 use Compress::Raw::Zlib;
16
17 # MD5 only used for debugging output in tests
18 use Digest::MD5 qw/md5_hex/;
19
20 our %UNCOMPRESS = (
21 '' => sub { $_[0] },
22 'lzma2' => sub {
23 my ($input) = @_;
24 my ($lzma2, $code, $output);
25 ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
26 die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
27
28 $code = $lzma2->code($input, $output);
29 die "Did not reach end of stream\n" if $code == LZMA_OK;
30 die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
31 $output
32 },
33
34 'bz2' => sub {
35 my ($input) = @_;
36 my ($bz2, $code, $output);
37 ($bz2, $code)= Compress::Raw::Bunzip2->new;
38 die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
39
40 $code = $bz2->bzinflate($input, $output);
41 die "Did not reach end of stream\n" if $code == BZ_OK;
42 die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
43
44 $output
45 },
46
47 'zlib' => sub {
48 my ($input) = @_;
49 my ($zlib, $code, $output);
50 ($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
51 -WindowBits => WANT_GZIP_OR_ZLIB
52 );
53 die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
54
55 $code = $zlib->inflate($input, \$output, 1);
56 die "Did not reach end of stream\n" if $code == Z_OK;
57 die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
58 $output
59 }
60 );
61
62 sub new {
63 my ($class, $path) = @_;
64 my $fh;
65 if (ref $path eq 'IO') {
66 $fh = $path
67 } else {
68 open $fh, '<', $path or croak "Cannot open \"$path\": $!";
69 binmode $fh;
70 }
71 my $self = bless {path => $path, fh => $fh}, $class;
72 $self->{header} = $self->read_header;
73 $self
74 }
75
76 sub read_data {
77 my ($self, $len) = @_;
78 my $data;
79 my $result = read $self->{fh}, $data, $len;
80 if (!defined $result) {
81 croak "Failed to read from $self->{path}: $!"
82 } elsif ($result == $len) {
83 $data
84 } elsif ($result == 0) {
85 croak "$self->{path} is at end of file"
86 } elsif ($result < $len) {
87 croak "Only read $result bytes of $self->{path} before reaching EOF"
88 }
89 }
90
91 sub read_formatted {
92 my ($self, $len_of_format, $format) = @_;
93 unpack $format, $self->read_data($len_of_format);
94 }
95
96 sub read_char { shift->read_formatted(1, 'C') }
97 sub read_short { shift->read_formatted(2, 'n') }
98 sub read_int { shift->read_formatted(4, 'N') }
99 sub read_long { shift->read_formatted(8, 'Q>') }
100
101 sub read_tiny_text {
102 my ($self, $encoding) = @_;
103 my $data = $self->read_data($self->read_char);
104 if (length $data == 255) {
105 $data = unpack 'Z*', $data;
106 }
107 $encoding //= $self->{encoding};
108 decode $encoding, $data;
109 }
110
111 sub read_text {
112 my ($self, $encoding) = @_;
113 my $data = $self->read_data($self->read_short);
114 $encoding //= $self->{encoding};
115 decode $encoding, $data;
116 }
117
118 sub read_large_byte_string {
119 my ($self) = @_;
120 $self->read_data($self->read_short)
121 }
122
123 sub read_tag {
124 my ($self) = @_;
125 my $name = $self->read_tiny_text;
126 my $value = $self->read_tiny_text;
127 ($name, $value)
128 }
129
130 sub read_tags {
131 my ($self) = @_;
132 my $tag_count = $self->read_char;
133 map { $self->read_tag } 1..$tag_count
134 }
135
136 sub read_content_types {
137 my ($self) = @_;
138 my $content_type_count = $self->read_char;
139 map { $self->read_text } 1..$content_type_count
140 }
141
142 sub read_positions {
143 my ($self) = @_;
144 my $count = $self->read_int;
145 my @positions = map { $self->read_long } 1..$count;
146 my $relative_to = $self->ftell;
147 map { $relative_to + $_ } @positions
148 }
149
150 sub fseek {
151 my ($self, $position) = @_;
152 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
153 }
154
155 sub ftell {
156 my ($self) = @_;
157 my $result = tell $self->{fh};
158 croak "Failed to tell position in file" if $result == -1;
159 $result
160 }
161
162 sub uncompress {
163 my ($self, $data) = @_;
164 my $compression = $self->{header}{compression};
165 if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') {
166 my $prefix = unpack 'H*', substr $data, 0, 10;
167 my $md5sum = md5_hex $data;
168 Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum";
169 }
170 $UNCOMPRESS{$compression}->($data)
171 }
172
173 sub read_header {
174 my ($self) = @_;
175 my $magic = $self->read_data(length MAGIC);
176 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
177 my $uuid = $self->read_data(16);
178
179 my $encoding = $self->read_tiny_text('UTF-8');
180 $self->{encoding} = $encoding;
181
182 my $compression = $self->read_tiny_text;
183 die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
184 my %tags = $self->read_tags;
185 my @content_types = $self->read_content_types;
186 my $blob_count = $self->read_int;
187 my $store_offset = $self->read_long;
188 my $size = $self->read_long;
189 my @refs = $self->read_positions;
190
191 $self->fseek($store_offset);
192 my @storage_bins = $self->read_positions;
193
194 +{
195 uuid => $uuid,
196 encoding => $encoding,
197 compression => $compression,
198 tags => \%tags,
199 content_types => \@content_types,
200 blob_count => $blob_count,
201 store_offset => $store_offset,
202 size => $size,
203 refs => \@refs,
204 storage_bins => \@storage_bins,
205 }
206 }
207
208 sub read_ref {
209 my ($self) = @_;
210 my $key = $self->read_text;
211 my $bin_index = $self->read_int;
212 my $item_index = $self->read_short;
213 my $fragment = $self->read_tiny_text;
214 +{
215 key => $key,
216 bin_index => $bin_index,
217 item_index => $item_index,
218 fragment => $fragment,
219 }
220 }
221
222 sub read_storage_bin {
223 my ($self) = @_;
224 my $count = $self->read_int;
225 my @content_types = map { $self->read_char } 1..$count;
226 my $compressed_size = $self->read_int;
227 my $compressed_data = $self->read_data($compressed_size);
228 my $uncompressed_data = $self->uncompress($compressed_data);
229
230 my @positions = unpack "N$count", $uncompressed_data;
231 my $data = substr $uncompressed_data, $count * 4;
232 +{
233 positions => \@positions,
234 data => $data
235 }
236 }
237
238 sub ref_count { shift @{shift->{header}{refs}} }
239
240 sub seek_and_read_ref {
241 my ($self, $index) = @_;
242 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
243 $self->fseek($self->{header}{refs}[$index]);
244 $self->read_ref
245 }
246
247 sub seek_and_read_storage_bin {
248 my ($self, $index) = @_;
249 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
250 $self->fseek($self->{header}{storage_bins}[$index]);
251 $self->read_storage_bin
252 }
253
254 sub get_entry_of_storage_bin {
255 my ($self, $storage_bin, $index) = @_;
256 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
257 my $length = unpack 'N', $start_of_data;
258 substr $start_of_data, 4, $length;
259 }
260
261 sub seek_and_read_ref_and_data {
262 my ($self, $index) = @_;
263 my $ref = $self->seek_and_read_ref($index);
264 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
265 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
266 $ref->{data} = $data;
267 $ref
268 }
269
270 1;
271 __END__
272
273 =encoding utf-8
274
275 =head1 NAME
276
277 Slob - Read .slob dictionaries (as used by Aard 2)
278
279 =head1 SYNOPSIS
280
281 use feature qw/:5.14/;
282 use Slob;
283 my $slob = Slob->new('path/to/dict.slob');
284
285 my $nr_of_entries = $slob->ref_count; # if the same content has two
286 # keys pointing to it, this
287 # counts it twice
288
289 my $second_ref = $slob->seek_and_read_ref(4);
290 say "Entry is for $second_ref->{key}";
291 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
292
293 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
294 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
295 say "Value at position $second_ref->{item_index} is ",
296 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
297
298 # instead of the above, we can do
299 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
300 say "Entry is for $second_ref_and_data->{key}";
301 say "Value is $second_ref_and_data->{data}";
302
303 =head1 DESCRIPTION
304
305 Slob is a compressed read-only format for storing natural language
306 dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
307 dictionaries in slob format.
308
309 The following methods are available:
310
311 =over
312
313 =item Slob->B<new>(I<$path>)
314 =item Slob->B<new>(I<$fh>)
315
316 Create a new slob reader reading from the given path or filehandle.
317
318 =item $slob->B<ref_count>
319
320 The number of refs (keys) in the dictionary.
321
322 =item $slob->B<seek_and_read_ref>(I<$index>)
323
324 Read the ref (key) at the given index. Returns a hashref with the
325 following keys:
326
327 =over
328
329 =item key
330
331 The key
332
333 =item bin_index
334
335 The storage bin that contains the value for this key
336
337 =item item_index
338
339 The index in the bin_index storage bin of the value for this key
340
341 =item fragment
342
343 HTML fragment that, when applied to the HTML value, points to the
344 definition of the key.
345
346 =back
347
348 =item $slob->B<seek_and_read_storage_bin>(I<$index>)
349
350 Read the storage bin with the given index. Returns the storage bin,
351 which can later be given to B<get_entry_of_storage_bin>.
352
353 =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
354
355 Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
356 item index, returns the value at the index i nthe storage bin.
357
358 =item $slob->B<seek_and_read_ref_and_data>($index)
359
360 Convenience method that returns the key and value at a given index.
361 Returns a hashref like C<seek_and_read_ref> with an extra key,
362 I<data>, which is the value of the key.
363
364 =back
365
366 =head1 SEE ALSO
367
368 L<https://github.com/itkach/slob>
369
370 =head1 AUTHOR
371
372 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
373
374 =head1 COPYRIGHT AND LICENSE
375
376 Copyright (C) 2017-2018 by Marius Gavrilescu
377
378 This library is free software; you can redistribute it and/or modify
379 it under the same terms as Perl itself, either Perl version 5.26.1 or,
380 at your option, any later version of Perl 5 you may have available.
381
382
383 =cut
This page took 0.040388 seconds and 4 git commands to generate.