]>
Commit | Line | Data |
---|---|---|
1 | package Slob; | |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | our $VERSION = '0.001'; | |
7 | ||
8 | use constant MAGIC => "!-1SLOB\x1F"; | |
9 | ||
10 | use Carp qw/croak verbose/; | |
11 | use Encode; | |
12 | ||
13 | use Compress::Raw::Lzma; | |
14 | ||
15 | our %UNCOMPRESS = ( | |
16 | '' => sub { $_[0] }, | |
17 | 'lzma2' => sub { | |
18 | my ($input) = @_; | |
19 | my ($lzma2, $code, $output); | |
20 | ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2()); | |
21 | die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK; | |
22 | $code = $lzma2->code($input, $output); | |
23 | die "Did not reach end of stream" if $code == LZMA_OK; | |
24 | die "Error decoding LZMA2: $code" if $code != LZMA_STREAM_END; | |
25 | $output | |
26 | } | |
27 | ); | |
28 | ||
29 | sub new { | |
30 | my ($class, $path) = @_; | |
31 | my $fh; | |
32 | if (ref $path eq 'IO') { | |
33 | $fh = $path | |
34 | } else { | |
35 | open $fh, '<', $path or croak "Cannot open \"$path\": $!" | |
36 | } | |
37 | my $self = bless {path => $path, fh => $fh}, $class; | |
38 | $self->{header} = $self->read_header; | |
39 | $self | |
40 | } | |
41 | ||
42 | sub read_data { | |
43 | my ($self, $len) = @_; | |
44 | my $data; | |
45 | my $result = read $self->{fh}, $data, $len; | |
46 | if (!defined $result) { | |
47 | croak "Failed to read from $self->{path}: $!" | |
48 | } elsif ($result == $len) { | |
49 | $data | |
50 | } elsif ($result == 0) { | |
51 | croak "$self->{path} is at end of file" | |
52 | } elsif ($result < $len) { | |
53 | croak "Only read $result bytes of $self->{path} before reaching EOF" | |
54 | } | |
55 | } | |
56 | ||
57 | sub read_formatted { | |
58 | my ($self, $len_of_format, $format) = @_; | |
59 | unpack $format, $self->read_data($len_of_format); | |
60 | } | |
61 | ||
62 | sub read_char { shift->read_formatted(1, 'C') } | |
63 | sub read_short { shift->read_formatted(2, 'n') } | |
64 | sub read_int { shift->read_formatted(4, 'N') } | |
65 | sub read_long { shift->read_formatted(8, 'Q>') } | |
66 | ||
67 | sub read_tiny_text { | |
68 | my ($self, $encoding) = @_; | |
69 | my $data = $self->read_data($self->read_char); | |
70 | if (length $data == 255) { | |
71 | $data = unpack 'Z*', $data; | |
72 | } | |
73 | $encoding //= $self->{encoding}; | |
74 | decode $encoding, $data; | |
75 | } | |
76 | ||
77 | sub read_text { | |
78 | my ($self, $encoding) = @_; | |
79 | my $data = $self->read_data($self->read_short); | |
80 | $encoding //= $self->{encoding}; | |
81 | decode $encoding, $data; | |
82 | } | |
83 | ||
84 | sub read_large_byte_string { | |
85 | my ($self) = @_; | |
86 | $self->read_data($self->read_short) | |
87 | } | |
88 | ||
89 | sub read_tag { | |
90 | my ($self) = @_; | |
91 | my $name = $self->read_tiny_text; | |
92 | my $value = $self->read_tiny_text; | |
93 | ($name, $value) | |
94 | } | |
95 | ||
96 | sub read_tags { | |
97 | my ($self) = @_; | |
98 | my $tag_count = $self->read_char; | |
99 | map { $self->read_tag } 1..$tag_count | |
100 | } | |
101 | ||
102 | sub read_content_types { | |
103 | my ($self) = @_; | |
104 | my $content_type_count = $self->read_char; | |
105 | map { $self->read_text } 1..$content_type_count | |
106 | } | |
107 | ||
108 | sub read_positions { | |
109 | my ($self) = @_; | |
110 | my $count = $self->read_int; | |
111 | my @positions = map { $self->read_long } 1..$count; | |
112 | my $relative_to = $self->ftell; | |
113 | map { $relative_to + $_ } @positions | |
114 | } | |
115 | ||
116 | sub fseek { | |
117 | my ($self, $position) = @_; | |
118 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" | |
119 | } | |
120 | ||
121 | sub ftell { | |
122 | my ($self) = @_; | |
123 | my $result = tell $self->{fh}; | |
124 | croak "Failed to tell position in file" if $result == -1; | |
125 | $result | |
126 | } | |
127 | ||
128 | sub uncompress { | |
129 | my ($self, $data) = @_; | |
130 | $UNCOMPRESS{$self->{header}{compression}}->($data) | |
131 | } | |
132 | ||
133 | sub read_header { | |
134 | my ($self) = @_; | |
135 | my $magic = $self->read_data(length MAGIC); | |
136 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; | |
137 | my $uuid = $self->read_data(16); | |
138 | ||
139 | my $encoding = $self->read_tiny_text('UTF-8'); | |
140 | $self->{encoding} = $encoding; | |
141 | ||
142 | my $compression = $self->read_tiny_text; | |
143 | die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression}; | |
144 | my %tags = $self->read_tags; | |
145 | my @content_types = $self->read_content_types; | |
146 | my $blob_count = $self->read_int; | |
147 | my $store_offset = $self->read_long; | |
148 | my $size = $self->read_long; | |
149 | my @refs = $self->read_positions; | |
150 | ||
151 | $self->fseek($store_offset); | |
152 | my @storage_bins = $self->read_positions; | |
153 | ||
154 | +{ | |
155 | uuid => $uuid, | |
156 | encoding => $encoding, | |
157 | compression => $compression, | |
158 | tags => \%tags, | |
159 | content_types => \@content_types, | |
160 | blob_count => $blob_count, | |
161 | store_offset => $store_offset, | |
162 | size => $size, | |
163 | refs => \@refs, | |
164 | storage_bins => \@storage_bins, | |
165 | } | |
166 | } | |
167 | ||
168 | sub read_ref { | |
169 | my ($self) = @_; | |
170 | my $key = $self->read_text; | |
171 | my $bin_index = $self->read_int; | |
172 | my $item_index = $self->read_short; | |
173 | my $fragment = $self->read_tiny_text; | |
174 | +{ | |
175 | key => $key, | |
176 | bin_index => $bin_index, | |
177 | item_index => $item_index, | |
178 | fragment => $fragment, | |
179 | } | |
180 | } | |
181 | ||
182 | sub read_storage_bin { | |
183 | my ($self) = @_; | |
184 | my $count = $self->read_int; | |
185 | my @content_types = map { $self->read_char } 1..$count; | |
186 | my $compressed_size = $self->read_int; | |
187 | my $compressed_data = $self->read_data($compressed_size); | |
188 | my $uncompressed_data = $self->uncompress($compressed_data); | |
189 | ||
190 | my @positions = unpack "N$count", $uncompressed_data; | |
191 | my $data = substr $uncompressed_data, $count * 4; | |
192 | +{ | |
193 | positions => \@positions, | |
194 | data => $data | |
195 | } | |
196 | } | |
197 | ||
198 | sub ref_count { shift @{shift->{header}{refs}} } | |
199 | ||
200 | sub seek_and_read_ref { | |
201 | my ($self, $index) = @_; | |
202 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; | |
203 | $self->fseek($self->{header}{refs}[$index]); | |
204 | $self->read_ref | |
205 | } | |
206 | ||
207 | sub seek_and_read_storage_bin { | |
208 | my ($self, $index) = @_; | |
209 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; | |
210 | $self->fseek($self->{header}{storage_bins}[$index]); | |
211 | $self->read_storage_bin | |
212 | } | |
213 | ||
214 | sub get_entry_of_storage_bin { | |
215 | my ($self, $storage_bin, $index) = @_; | |
216 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; | |
217 | my $length = unpack 'N', $start_of_data; | |
218 | substr $start_of_data, 4, $length; | |
219 | } | |
220 | ||
221 | sub seek_and_read_ref_and_data { | |
222 | my ($self, $index) = @_; | |
223 | my $ref = $self->seek_and_read_ref($index); | |
224 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); | |
225 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); | |
226 | $ref->{data} = $data; | |
227 | $ref | |
228 | } | |
229 | ||
230 | 1; | |
231 | __END__ | |
232 | ||
233 | =encoding utf-8 | |
234 | ||
235 | =head1 NAME | |
236 | ||
237 | Slob - Read .slob dictionaries (as used by Aard 2) | |
238 | ||
239 | =head1 SYNOPSIS | |
240 | ||
241 | use feature qw/:5.14/; | |
242 | use Slob; | |
243 | my $slob = Slob->new('path/to/dict.slob'); | |
244 | ||
245 | my $nr_of_entries = $slob->ref_count; # if the same content has two | |
246 | # keys pointing to it, this | |
247 | # counts it twice | |
248 | ||
249 | my $second_ref = $slob->seek_and_read_ref(4); | |
250 | say "Entry is for $second_ref->{key}"; | |
251 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; | |
252 | ||
253 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); | |
254 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; | |
255 | say "Value at position $second_ref->{item_index} is ", | |
256 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); | |
257 | ||
258 | # instead of the above, we can do | |
259 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); | |
260 | say "Entry is for $second_ref_and_data->{key}"; | |
261 | say "Value is $second_ref_and_data->{data}"; | |
262 | ||
263 | =head1 DESCRIPTION | |
264 | ||
265 | Slob is a compressed read-only format for storing natural language | |
266 | dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads | |
267 | dictionaries in slob format. | |
268 | ||
269 | The following methods are available: | |
270 | ||
271 | =over | |
272 | ||
273 | =item Slob->B<new>(I<$path>) | |
274 | =item Slob->B<new>(I<$fh>) | |
275 | ||
276 | Create a new slob reader reading from the given path or filehandle. | |
277 | ||
278 | =item $slob->B<ref_count> | |
279 | ||
280 | The number of refs (keys) in the dictionary. | |
281 | ||
282 | =item $slob->B<seek_and_read_ref>(I<$index>) | |
283 | ||
284 | Read the ref (key) at the given index. Returns a hashref with the | |
285 | following keys: | |
286 | ||
287 | =over | |
288 | ||
289 | =item key | |
290 | ||
291 | The key | |
292 | ||
293 | =item bin_index | |
294 | ||
295 | The storage bin that contains the value for this key | |
296 | ||
297 | =item item_index | |
298 | ||
299 | The index in the bin_index storage bin of the value for this key | |
300 | ||
301 | =item fragment | |
302 | ||
303 | HTML fragment that, when applied to the HTML value, points to the | |
304 | definition of the key. | |
305 | ||
306 | =back | |
307 | ||
308 | =item $slob->B<seek_and_read_storage_bin>(I<$index>) | |
309 | ||
310 | Read the storage bin with the given index. Returns the storage bin, | |
311 | which can later be given to B<get_entry_of_storage_bin>. | |
312 | ||
313 | =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>) | |
314 | ||
315 | Given a storage bin (as returned by C<seek_and_read_storage_bin>) and | |
316 | item index, returns the value at the index i nthe storage bin. | |
317 | ||
318 | =item $slob->B<seek_and_read_ref_and_data>($index) | |
319 | ||
320 | Convenience method that returns the key and value at a given index. | |
321 | Returns a hashref like C<seek_and_read_ref> with an extra key, | |
322 | I<data>, which is the value of the key. | |
323 | ||
324 | =back | |
325 | ||
326 | =head1 SEE ALSO | |
327 | ||
328 | L<https://github.com/itkach/slob> | |
329 | ||
330 | =head1 AUTHOR | |
331 | ||
332 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
333 | ||
334 | =head1 COPYRIGHT AND LICENSE | |
335 | ||
336 | Copyright (C) 2017 by Marius Gavrilescu | |
337 | ||
338 | This library is free software; you can redistribute it and/or modify | |
339 | it under the same terms as Perl itself, either Perl version 5.26.1 or, | |
340 | at your option, any later version of Perl 5 you may have available. | |
341 | ||
342 | ||
343 | =cut |