]>
Commit | Line | Data |
---|---|---|
1 | package Slob; | |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | our $VERSION = '0.002_001'; | |
7 | ||
8 | use constant MAGIC => "!-1SLOB\x1F"; | |
9 | ||
10 | use Carp qw/croak verbose/; | |
11 | use Encode; | |
12 | ||
13 | use Compress::Raw::Bzip2; | |
14 | use Compress::Raw::Lzma; | |
15 | use Compress::Raw::Zlib; | |
16 | ||
17 | # MD5 only used for debugging output in tests | |
18 | use Digest::MD5 qw/md5_hex/; | |
19 | ||
20 | our %UNCOMPRESS = ( | |
21 | '' => sub { $_[0] }, | |
22 | 'lzma2' => sub { | |
23 | my ($input) = @_; | |
24 | my ($lzma2, $code, $output); | |
25 | ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2()); | |
26 | die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK; | |
27 | ||
28 | $code = $lzma2->code($input, $output); | |
29 | die "Did not reach end of stream\n" if $code == LZMA_OK; | |
30 | die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END; | |
31 | $output | |
32 | }, | |
33 | ||
34 | 'bz2' => sub { | |
35 | my ($input) = @_; | |
36 | my ($bz2, $code, $output); | |
37 | ($bz2, $code)= Compress::Raw::Bunzip2->new; | |
38 | die "Error creating Bunzip2: $code\n" unless $code == Z_OK; | |
39 | ||
40 | $code = $bz2->bzinflate($input, $output); | |
41 | die "Did not reach end of stream\n" if $code == BZ_OK; | |
42 | die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END; | |
43 | ||
44 | $output | |
45 | }, | |
46 | ||
47 | 'zlib' => sub { | |
48 | my ($input) = @_; | |
49 | my ($zlib, $code, $output); | |
50 | ($zlib, $code) = Compress::Raw::Zlib::Inflate->new( | |
51 | -WindowBits => WANT_GZIP_OR_ZLIB | |
52 | ); | |
53 | die "Error creating Zlib inflate: $code\n" unless $code == Z_OK; | |
54 | ||
55 | $code = $zlib->inflate($input, \$output, 1); | |
56 | die "Did not reach end of stream\n" if $code == Z_OK; | |
57 | die "Error inflating zlib: $code\n" if $code != Z_STREAM_END; | |
58 | $output | |
59 | } | |
60 | ); | |
61 | ||
62 | sub new { | |
63 | my ($class, $path) = @_; | |
64 | my $fh; | |
65 | if (ref $path eq 'IO') { | |
66 | $fh = $path | |
67 | } else { | |
68 | open $fh, '<', $path or croak "Cannot open \"$path\": $!" | |
69 | } | |
70 | my $self = bless {path => $path, fh => $fh}, $class; | |
71 | $self->{header} = $self->read_header; | |
72 | $self | |
73 | } | |
74 | ||
75 | sub read_data { | |
76 | my ($self, $len) = @_; | |
77 | my $data; | |
78 | my $result = read $self->{fh}, $data, $len; | |
79 | if (!defined $result) { | |
80 | croak "Failed to read from $self->{path}: $!" | |
81 | } elsif ($result == $len) { | |
82 | $data | |
83 | } elsif ($result == 0) { | |
84 | croak "$self->{path} is at end of file" | |
85 | } elsif ($result < $len) { | |
86 | croak "Only read $result bytes of $self->{path} before reaching EOF" | |
87 | } | |
88 | } | |
89 | ||
90 | sub read_formatted { | |
91 | my ($self, $len_of_format, $format) = @_; | |
92 | unpack $format, $self->read_data($len_of_format); | |
93 | } | |
94 | ||
95 | sub read_char { shift->read_formatted(1, 'C') } | |
96 | sub read_short { shift->read_formatted(2, 'n') } | |
97 | sub read_int { shift->read_formatted(4, 'N') } | |
98 | sub read_long { shift->read_formatted(8, 'Q>') } | |
99 | ||
100 | sub read_tiny_text { | |
101 | my ($self, $encoding) = @_; | |
102 | my $data = $self->read_data($self->read_char); | |
103 | if (length $data == 255) { | |
104 | $data = unpack 'Z*', $data; | |
105 | } | |
106 | $encoding //= $self->{encoding}; | |
107 | decode $encoding, $data; | |
108 | } | |
109 | ||
110 | sub read_text { | |
111 | my ($self, $encoding) = @_; | |
112 | my $data = $self->read_data($self->read_short); | |
113 | $encoding //= $self->{encoding}; | |
114 | decode $encoding, $data; | |
115 | } | |
116 | ||
117 | sub read_large_byte_string { | |
118 | my ($self) = @_; | |
119 | $self->read_data($self->read_short) | |
120 | } | |
121 | ||
122 | sub read_tag { | |
123 | my ($self) = @_; | |
124 | my $name = $self->read_tiny_text; | |
125 | my $value = $self->read_tiny_text; | |
126 | ($name, $value) | |
127 | } | |
128 | ||
129 | sub read_tags { | |
130 | my ($self) = @_; | |
131 | my $tag_count = $self->read_char; | |
132 | map { $self->read_tag } 1..$tag_count | |
133 | } | |
134 | ||
135 | sub read_content_types { | |
136 | my ($self) = @_; | |
137 | my $content_type_count = $self->read_char; | |
138 | map { $self->read_text } 1..$content_type_count | |
139 | } | |
140 | ||
141 | sub read_positions { | |
142 | my ($self) = @_; | |
143 | my $count = $self->read_int; | |
144 | my @positions = map { $self->read_long } 1..$count; | |
145 | my $relative_to = $self->ftell; | |
146 | map { $relative_to + $_ } @positions | |
147 | } | |
148 | ||
149 | sub fseek { | |
150 | my ($self, $position) = @_; | |
151 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" | |
152 | } | |
153 | ||
154 | sub ftell { | |
155 | my ($self) = @_; | |
156 | my $result = tell $self->{fh}; | |
157 | croak "Failed to tell position in file" if $result == -1; | |
158 | $result | |
159 | } | |
160 | ||
161 | sub uncompress { | |
162 | my ($self, $data) = @_; | |
163 | my $compression = $self->{header}{compression}; | |
164 | if ($ENV{HARNESS_ACTIVE} && $compression eq 'lzma2') { | |
165 | my $prefix = unpack 'H*', substr $data, 0, 10; | |
166 | my $md5sum = md5_hex $data; | |
167 | Test::More::diag "Uncompressing data starting '$prefix', md5sum $md5sum"; | |
168 | } | |
169 | $UNCOMPRESS{$compression}->($data) | |
170 | } | |
171 | ||
172 | sub read_header { | |
173 | my ($self) = @_; | |
174 | my $magic = $self->read_data(length MAGIC); | |
175 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; | |
176 | my $uuid = $self->read_data(16); | |
177 | ||
178 | my $encoding = $self->read_tiny_text('UTF-8'); | |
179 | $self->{encoding} = $encoding; | |
180 | ||
181 | my $compression = $self->read_tiny_text; | |
182 | die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression}; | |
183 | my %tags = $self->read_tags; | |
184 | my @content_types = $self->read_content_types; | |
185 | my $blob_count = $self->read_int; | |
186 | my $store_offset = $self->read_long; | |
187 | my $size = $self->read_long; | |
188 | my @refs = $self->read_positions; | |
189 | ||
190 | $self->fseek($store_offset); | |
191 | my @storage_bins = $self->read_positions; | |
192 | ||
193 | +{ | |
194 | uuid => $uuid, | |
195 | encoding => $encoding, | |
196 | compression => $compression, | |
197 | tags => \%tags, | |
198 | content_types => \@content_types, | |
199 | blob_count => $blob_count, | |
200 | store_offset => $store_offset, | |
201 | size => $size, | |
202 | refs => \@refs, | |
203 | storage_bins => \@storage_bins, | |
204 | } | |
205 | } | |
206 | ||
207 | sub read_ref { | |
208 | my ($self) = @_; | |
209 | my $key = $self->read_text; | |
210 | my $bin_index = $self->read_int; | |
211 | my $item_index = $self->read_short; | |
212 | my $fragment = $self->read_tiny_text; | |
213 | +{ | |
214 | key => $key, | |
215 | bin_index => $bin_index, | |
216 | item_index => $item_index, | |
217 | fragment => $fragment, | |
218 | } | |
219 | } | |
220 | ||
221 | sub read_storage_bin { | |
222 | my ($self) = @_; | |
223 | my $count = $self->read_int; | |
224 | my @content_types = map { $self->read_char } 1..$count; | |
225 | my $compressed_size = $self->read_int; | |
226 | my $compressed_data = $self->read_data($compressed_size); | |
227 | my $uncompressed_data = $self->uncompress($compressed_data); | |
228 | ||
229 | my @positions = unpack "N$count", $uncompressed_data; | |
230 | my $data = substr $uncompressed_data, $count * 4; | |
231 | +{ | |
232 | positions => \@positions, | |
233 | data => $data | |
234 | } | |
235 | } | |
236 | ||
237 | sub ref_count { shift @{shift->{header}{refs}} } | |
238 | ||
239 | sub seek_and_read_ref { | |
240 | my ($self, $index) = @_; | |
241 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; | |
242 | $self->fseek($self->{header}{refs}[$index]); | |
243 | $self->read_ref | |
244 | } | |
245 | ||
246 | sub seek_and_read_storage_bin { | |
247 | my ($self, $index) = @_; | |
248 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; | |
249 | $self->fseek($self->{header}{storage_bins}[$index]); | |
250 | $self->read_storage_bin | |
251 | } | |
252 | ||
253 | sub get_entry_of_storage_bin { | |
254 | my ($self, $storage_bin, $index) = @_; | |
255 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; | |
256 | my $length = unpack 'N', $start_of_data; | |
257 | substr $start_of_data, 4, $length; | |
258 | } | |
259 | ||
260 | sub seek_and_read_ref_and_data { | |
261 | my ($self, $index) = @_; | |
262 | my $ref = $self->seek_and_read_ref($index); | |
263 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); | |
264 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); | |
265 | $ref->{data} = $data; | |
266 | $ref | |
267 | } | |
268 | ||
269 | 1; | |
270 | __END__ | |
271 | ||
272 | =encoding utf-8 | |
273 | ||
274 | =head1 NAME | |
275 | ||
276 | Slob - Read .slob dictionaries (as used by Aard 2) | |
277 | ||
278 | =head1 SYNOPSIS | |
279 | ||
280 | use feature qw/:5.14/; | |
281 | use Slob; | |
282 | my $slob = Slob->new('path/to/dict.slob'); | |
283 | ||
284 | my $nr_of_entries = $slob->ref_count; # if the same content has two | |
285 | # keys pointing to it, this | |
286 | # counts it twice | |
287 | ||
288 | my $second_ref = $slob->seek_and_read_ref(4); | |
289 | say "Entry is for $second_ref->{key}"; | |
290 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; | |
291 | ||
292 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); | |
293 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; | |
294 | say "Value at position $second_ref->{item_index} is ", | |
295 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); | |
296 | ||
297 | # instead of the above, we can do | |
298 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); | |
299 | say "Entry is for $second_ref_and_data->{key}"; | |
300 | say "Value is $second_ref_and_data->{data}"; | |
301 | ||
302 | =head1 DESCRIPTION | |
303 | ||
304 | Slob is a compressed read-only format for storing natural language | |
305 | dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads | |
306 | dictionaries in slob format. | |
307 | ||
308 | The following methods are available: | |
309 | ||
310 | =over | |
311 | ||
312 | =item Slob->B<new>(I<$path>) | |
313 | =item Slob->B<new>(I<$fh>) | |
314 | ||
315 | Create a new slob reader reading from the given path or filehandle. | |
316 | ||
317 | =item $slob->B<ref_count> | |
318 | ||
319 | The number of refs (keys) in the dictionary. | |
320 | ||
321 | =item $slob->B<seek_and_read_ref>(I<$index>) | |
322 | ||
323 | Read the ref (key) at the given index. Returns a hashref with the | |
324 | following keys: | |
325 | ||
326 | =over | |
327 | ||
328 | =item key | |
329 | ||
330 | The key | |
331 | ||
332 | =item bin_index | |
333 | ||
334 | The storage bin that contains the value for this key | |
335 | ||
336 | =item item_index | |
337 | ||
338 | The index in the bin_index storage bin of the value for this key | |
339 | ||
340 | =item fragment | |
341 | ||
342 | HTML fragment that, when applied to the HTML value, points to the | |
343 | definition of the key. | |
344 | ||
345 | =back | |
346 | ||
347 | =item $slob->B<seek_and_read_storage_bin>(I<$index>) | |
348 | ||
349 | Read the storage bin with the given index. Returns the storage bin, | |
350 | which can later be given to B<get_entry_of_storage_bin>. | |
351 | ||
352 | =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>) | |
353 | ||
354 | Given a storage bin (as returned by C<seek_and_read_storage_bin>) and | |
355 | item index, returns the value at the index i nthe storage bin. | |
356 | ||
357 | =item $slob->B<seek_and_read_ref_and_data>($index) | |
358 | ||
359 | Convenience method that returns the key and value at a given index. | |
360 | Returns a hashref like C<seek_and_read_ref> with an extra key, | |
361 | I<data>, which is the value of the key. | |
362 | ||
363 | =back | |
364 | ||
365 | =head1 SEE ALSO | |
366 | ||
367 | L<https://github.com/itkach/slob> | |
368 | ||
369 | =head1 AUTHOR | |
370 | ||
371 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
372 | ||
373 | =head1 COPYRIGHT AND LICENSE | |
374 | ||
375 | Copyright (C) 2017-2018 by Marius Gavrilescu | |
376 | ||
377 | This library is free software; you can redistribute it and/or modify | |
378 | it under the same terms as Perl itself, either Perl version 5.26.1 or, | |
379 | at your option, any later version of Perl 5 you may have available. | |
380 | ||
381 | ||
382 | =cut |