]>
Commit | Line | Data |
---|---|---|
1 | package Slob; | |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | our $VERSION = '0.002002'; | |
7 | ||
8 | use constant MAGIC => "!-1SLOB\x1F"; | |
9 | ||
10 | use Carp qw/croak verbose/; | |
11 | use Encode; | |
12 | ||
13 | use Compress::Raw::Bzip2; | |
14 | use Compress::Raw::Lzma; | |
15 | use Compress::Raw::Zlib; | |
16 | ||
17 | # MD5 only used for debugging output in tests | |
18 | use Digest::MD5 qw/md5_hex/; | |
19 | ||
20 | our %UNCOMPRESS = ( | |
21 | '' => sub { $_[0] }, | |
22 | 'lzma2' => sub { | |
23 | my ($input) = @_; | |
24 | my ($lzma2, $code, $output); | |
25 | ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2()); | |
26 | die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK; | |
27 | ||
28 | $code = $lzma2->code($input, $output); | |
29 | die "Did not reach end of stream\n" if $code == LZMA_OK; | |
30 | die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END; | |
31 | $output | |
32 | }, | |
33 | ||
34 | 'bz2' => sub { | |
35 | my ($input) = @_; | |
36 | my ($bz2, $code, $output); | |
37 | ($bz2, $code)= Compress::Raw::Bunzip2->new; | |
38 | die "Error creating Bunzip2: $code\n" unless $code == Z_OK; | |
39 | ||
40 | $code = $bz2->bzinflate($input, $output); | |
41 | die "Did not reach end of stream\n" if $code == BZ_OK; | |
42 | die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END; | |
43 | ||
44 | $output | |
45 | }, | |
46 | ||
47 | 'zlib' => sub { | |
48 | my ($input) = @_; | |
49 | my ($zlib, $code, $output); | |
50 | ($zlib, $code) = Compress::Raw::Zlib::Inflate->new( | |
51 | -WindowBits => WANT_GZIP_OR_ZLIB | |
52 | ); | |
53 | die "Error creating Zlib inflate: $code\n" unless $code == Z_OK; | |
54 | ||
55 | $code = $zlib->inflate($input, \$output, 1); | |
56 | die "Did not reach end of stream\n" if $code == Z_OK; | |
57 | die "Error inflating zlib: $code\n" if $code != Z_STREAM_END; | |
58 | $output | |
59 | } | |
60 | ); | |
61 | ||
62 | sub new { | |
63 | my ($class, $path) = @_; | |
64 | my $fh; | |
65 | if (ref $path eq 'IO') { | |
66 | $fh = $path | |
67 | } else { | |
68 | open $fh, '<', $path or croak "Cannot open \"$path\": $!"; | |
69 | binmode $fh; | |
70 | } | |
71 | my $self = bless {path => $path, fh => $fh}, $class; | |
72 | $self->{header} = $self->read_header; | |
73 | $self | |
74 | } | |
75 | ||
76 | sub read_data { | |
77 | my ($self, $len) = @_; | |
78 | my $data; | |
79 | my $result = read $self->{fh}, $data, $len; | |
80 | if (!defined $result) { | |
81 | croak "Failed to read from $self->{path}: $!" | |
82 | } elsif ($result == $len) { | |
83 | $data | |
84 | } elsif ($result == 0) { | |
85 | croak "$self->{path} is at end of file" | |
86 | } elsif ($result < $len) { | |
87 | croak "Only read $result bytes of $self->{path} before reaching EOF" | |
88 | } | |
89 | } | |
90 | ||
91 | sub read_formatted { | |
92 | my ($self, $len_of_format, $format) = @_; | |
93 | unpack $format, $self->read_data($len_of_format); | |
94 | } | |
95 | ||
96 | sub read_char { shift->read_formatted(1, 'C') } | |
97 | sub read_short { shift->read_formatted(2, 'n') } | |
98 | sub read_int { shift->read_formatted(4, 'N') } | |
99 | sub read_long { shift->read_formatted(8, 'Q>') } | |
100 | ||
101 | sub read_tiny_text { | |
102 | my ($self, $encoding) = @_; | |
103 | my $data = $self->read_data($self->read_char); | |
104 | if (length $data == 255) { | |
105 | $data = unpack 'Z*', $data; | |
106 | } | |
107 | $encoding //= $self->{encoding}; | |
108 | decode $encoding, $data; | |
109 | } | |
110 | ||
111 | sub read_text { | |
112 | my ($self, $encoding) = @_; | |
113 | my $data = $self->read_data($self->read_short); | |
114 | $encoding //= $self->{encoding}; | |
115 | decode $encoding, $data; | |
116 | } | |
117 | ||
118 | sub read_large_byte_string { | |
119 | my ($self) = @_; | |
120 | $self->read_data($self->read_short) | |
121 | } | |
122 | ||
123 | sub read_tag { | |
124 | my ($self) = @_; | |
125 | my $name = $self->read_tiny_text; | |
126 | my $value = $self->read_tiny_text; | |
127 | ($name, $value) | |
128 | } | |
129 | ||
130 | sub read_tags { | |
131 | my ($self) = @_; | |
132 | my $tag_count = $self->read_char; | |
133 | map { $self->read_tag } 1..$tag_count | |
134 | } | |
135 | ||
136 | sub read_content_types { | |
137 | my ($self) = @_; | |
138 | my $content_type_count = $self->read_char; | |
139 | map { $self->read_text } 1..$content_type_count | |
140 | } | |
141 | ||
142 | sub read_positions { | |
143 | my ($self) = @_; | |
144 | my $count = $self->read_int; | |
145 | my @positions = map { $self->read_long } 1..$count; | |
146 | my $relative_to = $self->ftell; | |
147 | map { $relative_to + $_ } @positions | |
148 | } | |
149 | ||
150 | sub fseek { | |
151 | my ($self, $position) = @_; | |
152 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" | |
153 | } | |
154 | ||
155 | sub ftell { | |
156 | my ($self) = @_; | |
157 | my $result = tell $self->{fh}; | |
158 | croak "Failed to tell position in file" if $result == -1; | |
159 | $result | |
160 | } | |
161 | ||
162 | sub uncompress { | |
163 | my ($self, $data) = @_; | |
164 | my $compression = $self->{header}{compression}; | |
165 | if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') { | |
166 | my $prefix = unpack 'H*', substr $data, 0, 10; | |
167 | my $md5sum = md5_hex $data; | |
168 | Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum"; | |
169 | } | |
170 | $UNCOMPRESS{$compression}->($data) | |
171 | } | |
172 | ||
173 | sub read_header { | |
174 | my ($self) = @_; | |
175 | my $magic = $self->read_data(length MAGIC); | |
176 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; | |
177 | my $uuid = $self->read_data(16); | |
178 | ||
179 | my $encoding = $self->read_tiny_text('UTF-8'); | |
180 | $self->{encoding} = $encoding; | |
181 | ||
182 | my $compression = $self->read_tiny_text; | |
183 | die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression}; | |
184 | my %tags = $self->read_tags; | |
185 | my @content_types = $self->read_content_types; | |
186 | my $blob_count = $self->read_int; | |
187 | my $store_offset = $self->read_long; | |
188 | my $size = $self->read_long; | |
189 | my @refs = $self->read_positions; | |
190 | ||
191 | $self->fseek($store_offset); | |
192 | my @storage_bins = $self->read_positions; | |
193 | ||
194 | +{ | |
195 | uuid => $uuid, | |
196 | encoding => $encoding, | |
197 | compression => $compression, | |
198 | tags => \%tags, | |
199 | content_types => \@content_types, | |
200 | blob_count => $blob_count, | |
201 | store_offset => $store_offset, | |
202 | size => $size, | |
203 | refs => \@refs, | |
204 | storage_bins => \@storage_bins, | |
205 | } | |
206 | } | |
207 | ||
208 | sub read_ref { | |
209 | my ($self) = @_; | |
210 | my $key = $self->read_text; | |
211 | my $bin_index = $self->read_int; | |
212 | my $item_index = $self->read_short; | |
213 | my $fragment = $self->read_tiny_text; | |
214 | +{ | |
215 | key => $key, | |
216 | bin_index => $bin_index, | |
217 | item_index => $item_index, | |
218 | fragment => $fragment, | |
219 | } | |
220 | } | |
221 | ||
222 | sub read_storage_bin { | |
223 | my ($self) = @_; | |
224 | my $count = $self->read_int; | |
225 | my @content_types = map { $self->read_char } 1..$count; | |
226 | my $compressed_size = $self->read_int; | |
227 | my $compressed_data = $self->read_data($compressed_size); | |
228 | my $uncompressed_data = $self->uncompress($compressed_data); | |
229 | ||
230 | my @positions = unpack "N$count", $uncompressed_data; | |
231 | my $data = substr $uncompressed_data, $count * 4; | |
232 | +{ | |
233 | positions => \@positions, | |
234 | data => $data | |
235 | } | |
236 | } | |
237 | ||
238 | sub ref_count { shift @{shift->{header}{refs}} } | |
239 | ||
240 | sub seek_and_read_ref { | |
241 | my ($self, $index) = @_; | |
242 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; | |
243 | $self->fseek($self->{header}{refs}[$index]); | |
244 | $self->read_ref | |
245 | } | |
246 | ||
247 | sub seek_and_read_storage_bin { | |
248 | my ($self, $index) = @_; | |
249 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; | |
250 | $self->fseek($self->{header}{storage_bins}[$index]); | |
251 | $self->read_storage_bin | |
252 | } | |
253 | ||
254 | sub get_entry_of_storage_bin { | |
255 | my ($self, $storage_bin, $index) = @_; | |
256 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; | |
257 | my $length = unpack 'N', $start_of_data; | |
258 | substr $start_of_data, 4, $length; | |
259 | } | |
260 | ||
261 | sub seek_and_read_ref_and_data { | |
262 | my ($self, $index) = @_; | |
263 | my $ref = $self->seek_and_read_ref($index); | |
264 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); | |
265 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); | |
266 | $ref->{data} = $data; | |
267 | $ref | |
268 | } | |
269 | ||
270 | 1; | |
271 | __END__ | |
272 | ||
273 | =encoding utf-8 | |
274 | ||
275 | =head1 NAME | |
276 | ||
277 | Slob - Read .slob dictionaries (as used by Aard 2) | |
278 | ||
279 | =head1 SYNOPSIS | |
280 | ||
281 | use feature qw/:5.14/; | |
282 | use Slob; | |
283 | my $slob = Slob->new('path/to/dict.slob'); | |
284 | ||
285 | my $nr_of_entries = $slob->ref_count; # if the same content has two | |
286 | # keys pointing to it, this | |
287 | # counts it twice | |
288 | ||
289 | my $second_ref = $slob->seek_and_read_ref(4); | |
290 | say "Entry is for $second_ref->{key}"; | |
291 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; | |
292 | ||
293 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); | |
294 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; | |
295 | say "Value at position $second_ref->{item_index} is ", | |
296 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); | |
297 | ||
298 | # instead of the above, we can do | |
299 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); | |
300 | say "Entry is for $second_ref_and_data->{key}"; | |
301 | say "Value is $second_ref_and_data->{data}"; | |
302 | ||
303 | =head1 DESCRIPTION | |
304 | ||
305 | Slob is a compressed read-only format for storing natural language | |
306 | dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads | |
307 | dictionaries in slob format. | |
308 | ||
309 | The following methods are available: | |
310 | ||
311 | =over | |
312 | ||
313 | =item Slob->B<new>(I<$path>) | |
314 | =item Slob->B<new>(I<$fh>) | |
315 | ||
316 | Create a new slob reader reading from the given path or filehandle. | |
317 | ||
318 | =item $slob->B<ref_count> | |
319 | ||
320 | The number of refs (keys) in the dictionary. | |
321 | ||
322 | =item $slob->B<seek_and_read_ref>(I<$index>) | |
323 | ||
324 | Read the ref (key) at the given index. Returns a hashref with the | |
325 | following keys: | |
326 | ||
327 | =over | |
328 | ||
329 | =item key | |
330 | ||
331 | The key | |
332 | ||
333 | =item bin_index | |
334 | ||
335 | The storage bin that contains the value for this key | |
336 | ||
337 | =item item_index | |
338 | ||
339 | The index in the bin_index storage bin of the value for this key | |
340 | ||
341 | =item fragment | |
342 | ||
343 | HTML fragment that, when applied to the HTML value, points to the | |
344 | definition of the key. | |
345 | ||
346 | =back | |
347 | ||
348 | =item $slob->B<seek_and_read_storage_bin>(I<$index>) | |
349 | ||
350 | Read the storage bin with the given index. Returns the storage bin, | |
351 | which can later be given to B<get_entry_of_storage_bin>. | |
352 | ||
353 | =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>) | |
354 | ||
355 | Given a storage bin (as returned by C<seek_and_read_storage_bin>) and | |
356 | item index, returns the value at the index i nthe storage bin. | |
357 | ||
358 | =item $slob->B<seek_and_read_ref_and_data>($index) | |
359 | ||
360 | Convenience method that returns the key and value at a given index. | |
361 | Returns a hashref like C<seek_and_read_ref> with an extra key, | |
362 | I<data>, which is the value of the key. | |
363 | ||
364 | =back | |
365 | ||
366 | =head1 SEE ALSO | |
367 | ||
368 | L<https://github.com/itkach/slob> | |
369 | ||
370 | =head1 AUTHOR | |
371 | ||
372 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
373 | ||
374 | =head1 COPYRIGHT AND LICENSE | |
375 | ||
376 | Copyright (C) 2017-2018 by Marius Gavrilescu | |
377 | ||
378 | This library is free software; you can redistribute it and/or modify | |
379 | it under the same terms as Perl itself, either Perl version 5.26.1 or, | |
380 | at your option, any later version of Perl 5 you may have available. | |
381 | ||
382 | ||
383 | =cut |