Commit | Line | Data |
---|---|---|
d3779104 MG |
1 | package Slob; |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | our $VERSION = '0.000_001'; | |
7 | ||
8 | use constant MAGIC => "!-1SLOB\x1F"; | |
9 | ||
10 | use Carp qw/croak verbose/; | |
11 | use Encode; | |
12 | ||
13 | sub new { | |
14 | my ($class, $path) = @_; | |
632d3de3 MG |
15 | my $fh = |
16 | ref $path eq 'IO' | |
17 | ? $path | |
18 | : open my $fh, '<', $path or croak "Cannot open \"$path\": $!"; | |
d3779104 MG |
19 | my $self = bless {path => $path, fh => $fh}, $class; |
20 | $self->{header} = $self->read_header; | |
21 | $self | |
22 | } | |
23 | ||
24 | sub read_data { | |
25 | my ($self, $len) = @_; | |
26 | my $data; | |
27 | my $result = read $self->{fh}, $data, $len; | |
28 | if (!defined $result) { | |
29 | croak "Failed to read from $self->{path}: $!" | |
30 | } elsif ($result == $len) { | |
31 | $data | |
32 | } elsif ($result == 0) { | |
33 | croak "$self->{path} is at end of file" | |
34 | } elsif ($result < $len) { | |
35 | croak "Only read $result bytes of $self->{path} before reaching EOF" | |
36 | } | |
37 | } | |
38 | ||
39 | sub read_formatted { | |
40 | my ($self, $len_of_format, $format) = @_; | |
41 | unpack $format, $self->read_data($len_of_format); | |
42 | } | |
43 | ||
44 | sub read_char { shift->read_formatted(1, 'C') } | |
45 | sub read_short { shift->read_formatted(2, 'n') } | |
46 | sub read_int { shift->read_formatted(4, 'N') } | |
47 | sub read_long { shift->read_formatted(8, 'Q>') } | |
48 | ||
49 | sub read_tiny_text { | |
50 | my ($self, $encoding) = @_; | |
51 | my $data = $self->read_data($self->read_char); | |
52 | if (length $data == 255) { | |
53 | $data = unpack 'Z*', $data; | |
54 | } | |
55 | $encoding //= $self->{encoding}; | |
56 | decode $encoding, $data; | |
57 | } | |
58 | ||
59 | sub read_text { | |
60 | my ($self, $encoding) = @_; | |
61 | my $data = $self->read_data($self->read_short); | |
62 | $encoding //= $self->{encoding}; | |
63 | decode $encoding, $data; | |
64 | } | |
65 | ||
66 | sub read_large_byte_string { | |
67 | my ($self) = @_; | |
68 | $self->read_data($self->read_short) | |
69 | } | |
70 | ||
71 | sub read_tag { | |
72 | my ($self) = @_; | |
73 | my $name = $self->read_tiny_text; | |
74 | my $value = $self->read_tiny_text; | |
75 | ($name, $value) | |
76 | } | |
77 | ||
78 | sub read_tags { | |
79 | my ($self) = @_; | |
80 | my $tag_count = $self->read_char; | |
81 | map { $self->read_tag } 1..$tag_count | |
82 | } | |
83 | ||
84 | sub read_content_types { | |
85 | my ($self) = @_; | |
86 | my $content_type_count = $self->read_char; | |
87 | map { $self->read_text } 1..$content_type_count | |
88 | } | |
89 | ||
90 | sub read_positions { | |
91 | my ($self) = @_; | |
92 | my $count = $self->read_int; | |
93 | my @positions = map { $self->read_long } 1..$count; | |
94 | my $relative_to = $self->ftell; | |
95 | map { $relative_to + $_ } @positions | |
96 | } | |
97 | ||
98 | sub fseek { | |
99 | my ($self, $position) = @_; | |
100 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" | |
101 | } | |
102 | ||
103 | sub ftell { | |
104 | my ($self) = @_; | |
105 | my $result = tell $self->{fh}; | |
106 | croak "Failed to tell position in file" if $result == -1; | |
107 | $result | |
108 | } | |
109 | ||
110 | sub uncompress { | |
111 | my ($self, $data) = @_; | |
112 | $data | |
113 | } | |
114 | ||
115 | sub read_header { | |
116 | my ($self) = @_; | |
117 | my $magic = $self->read_data(length MAGIC); | |
118 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; | |
119 | my $uuid = $self->read_data(16); | |
120 | ||
121 | my $encoding = $self->read_tiny_text('UTF-8'); | |
122 | $self->{encoding} = $encoding; | |
123 | ||
124 | my $compression = $self->read_tiny_text; | |
125 | die "Compression not yet supported" if $compression; | |
126 | my %tags = $self->read_tags; | |
127 | my @content_types = $self->read_content_types; | |
128 | my $blob_count = $self->read_int; | |
129 | my $store_offset = $self->read_long; | |
130 | my $size = $self->read_long; | |
131 | my @refs = $self->read_positions; | |
132 | ||
133 | $self->fseek($store_offset); | |
134 | my @storage_bins = $self->read_positions; | |
135 | ||
136 | +{ | |
137 | uuid => $uuid, | |
138 | encoding => $encoding, | |
139 | compression => $compression, | |
140 | tags => \%tags, | |
141 | content_types => \@content_types, | |
142 | blob_count => $blob_count, | |
143 | store_offset => $store_offset, | |
144 | size => $size, | |
145 | refs => \@refs, | |
146 | storage_bins => \@storage_bins, | |
147 | } | |
148 | } | |
149 | ||
150 | sub read_ref { | |
151 | my ($self) = @_; | |
152 | my $key = $self->read_text; | |
153 | my $bin_index = $self->read_int; | |
154 | my $item_index = $self->read_short; | |
155 | my $fragment = $self->read_tiny_text; | |
156 | +{ | |
157 | key => $key, | |
158 | bin_index => $bin_index, | |
159 | item_index => $item_index, | |
160 | fragment => $fragment, | |
161 | } | |
162 | } | |
163 | ||
164 | sub read_storage_bin { | |
165 | my ($self) = @_; | |
166 | my $count = $self->read_int; | |
167 | my @content_types = map { $self->read_char } 1..$count; | |
168 | my $compressed_size = $self->read_int; | |
169 | my $compressed_data = $self->read_data($compressed_size); | |
170 | my $uncompressed_data = $self->uncompress($compressed_data); | |
171 | ||
172 | my @positions = unpack "N$count", $uncompressed_data; | |
173 | my $data = substr $uncompressed_data, $count * 4; | |
174 | +{ | |
175 | positions => \@positions, | |
176 | data => $data | |
177 | } | |
178 | } | |
179 | ||
180 | sub ref_count { shift @{shift->{header}{refs}} } | |
181 | ||
182 | sub seek_and_read_ref { | |
183 | my ($self, $index) = @_; | |
184 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; | |
185 | $self->fseek($self->{header}{refs}[$index]); | |
186 | $self->read_ref | |
187 | } | |
188 | ||
189 | sub seek_and_read_storage_bin { | |
190 | my ($self, $index) = @_; | |
191 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; | |
192 | $self->fseek($self->{header}{storage_bins}[$index]); | |
193 | $self->read_storage_bin | |
194 | } | |
195 | ||
196 | sub get_entry_of_storage_bin { | |
197 | my ($self, $storage_bin, $index) = @_; | |
198 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; | |
199 | my $length = unpack 'N', $start_of_data; | |
200 | substr $start_of_data, 4, $length; | |
201 | } | |
202 | ||
0e15b496 MG |
203 | sub seek_and_read_ref_and_data { |
204 | my ($self, $index) = @_; | |
205 | my $ref = $self->seek_and_read_ref($index); | |
206 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); | |
207 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); | |
208 | $ref->{data} = $data; | |
209 | $ref | |
210 | } | |
211 | ||
d3779104 MG |
212 | 1; |
213 | __END__ | |
214 | ||
215 | =encoding utf-8 | |
216 | ||
217 | =head1 NAME | |
218 | ||
219 | Slob - Read .slob dictionaries (as used by Aard 2) | |
220 | ||
221 | =head1 SYNOPSIS | |
222 | ||
223 | use feature qw/:5.14/; | |
224 | use Slob; | |
225 | my $slob = Slob->new('path/to/dict.slob'); | |
226 | ||
227 | my $nr_of_entries = $slob->ref_count; # if the same content has two | |
228 | # keys pointing to it, this | |
229 | # counts it twice | |
230 | ||
231 | my $second_ref = $slob->seek_and_read_ref(4); | |
232 | say "Entry is for $second_ref->{key}"; | |
233 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; | |
234 | ||
235 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); | |
236 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; | |
237 | say "Value at position $second_ref->{item_index} is ", | |
238 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); | |
239 | ||
0e15b496 MG |
240 | # instead of the above, we can do |
241 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); | |
242 | say "Entry is for $second_ref_and_data->{key}"; | |
243 | say "Value is $second_ref_and_data->{data}"; | |
244 | ||
d3779104 MG |
245 | =head1 DESCRIPTION |
246 | ||
632d3de3 MG |
247 | Slob is a compressed read-only format for storing natural language |
248 | dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads | |
249 | dictionaries in slob format. | |
250 | ||
251 | The following methods are available: | |
252 | ||
253 | =over | |
254 | ||
255 | =item Slob->B<new>(I<$path>) | |
256 | =item Slob->B<new>(I<$fh>) | |
257 | ||
258 | Create a new slob reader reading from the given path or filehandle. | |
259 | ||
260 | =item $slob->B<ref_count> | |
261 | ||
262 | The number of refs (keys) in the dictionary. | |
263 | ||
264 | =item $slob->B<seek_and_read_ref>(I<$index>) | |
265 | ||
266 | Read the ref (key) at the given index. Returns a hashref with the | |
267 | following keys: | |
268 | ||
269 | =over | |
270 | ||
271 | =item key | |
272 | ||
273 | The key | |
274 | ||
275 | =item bin_index | |
276 | ||
277 | The storage bin that contains the value for this key | |
278 | ||
279 | =item item_index | |
280 | ||
281 | The index in the bin_index storage bin of the value for this key | |
282 | ||
283 | =item fragment | |
284 | ||
285 | HTML fragment that, when applied to the HTML value, points to the | |
286 | definition of the key. | |
287 | ||
288 | =back | |
289 | ||
290 | =item $slob->B<seek_and_read_storage_bin>(I<$index>) | |
291 | ||
292 | Read the storage bin with the given index. Returns the storage bin, | |
293 | which can later be given to B<get_entry_of_storage_bin>. | |
294 | ||
295 | =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>) | |
296 | ||
297 | Given a storage bin (as returned by C<seek_and_read_storage_bin>) and | |
298 | item index, returns the value at the index i nthe storage bin. | |
299 | ||
300 | =item $slob->B<seek_and_read_ref_and_data>($index) | |
301 | ||
302 | Convenience method that returns the key and value at a given index. | |
303 | Returns a hashref like C<seek_and_read_ref> with an extra key, | |
304 | I<data>, which is the value of the key. | |
305 | ||
306 | =back | |
307 | ||
308 | =head1 SEE ALSO | |
309 | ||
310 | L<https://github.com/itkach/slob> | |
d3779104 MG |
311 | |
312 | =head1 AUTHOR | |
313 | ||
314 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
315 | ||
316 | =head1 COPYRIGHT AND LICENSE | |
317 | ||
318 | Copyright (C) 2017 by Marius Gavrilescu | |
319 | ||
320 | This library is free software; you can redistribute it and/or modify | |
321 | it under the same terms as Perl itself, either Perl version 5.26.1 or, | |
322 | at your option, any later version of Perl 5 you may have available. | |
323 | ||
324 | ||
325 | =cut |