Bump version and update Changes
[slob.git] / lib / Slob.pm
1 package Slob;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 our $VERSION = '0.001';
7
8 use constant MAGIC => "!-1SLOB\x1F";
9
10 use Carp qw/croak verbose/;
11 use Encode;
12
13 use Compress::Raw::Lzma;
14
15 our %UNCOMPRESS = (
16 '' => sub { $_[0] },
17 'lzma2' => sub {
18 my ($input) = @_;
19 my ($lzma2, $code, $output);
20 ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
21 die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
22 $code = $lzma2->code($input, $output);
23 die "Did not reach end of stream" if $code == LZMA_OK;
24 die "Error decoding LZMA2: $code" if $code != LZMA_STREAM_END;
25 $output
26 }
27 );
28
29 sub new {
30 my ($class, $path) = @_;
31 my $fh;
32 if (ref $path eq 'IO') {
33 $fh = $path
34 } else {
35 open $fh, '<', $path or croak "Cannot open \"$path\": $!"
36 }
37 my $self = bless {path => $path, fh => $fh}, $class;
38 $self->{header} = $self->read_header;
39 $self
40 }
41
42 sub read_data {
43 my ($self, $len) = @_;
44 my $data;
45 my $result = read $self->{fh}, $data, $len;
46 if (!defined $result) {
47 croak "Failed to read from $self->{path}: $!"
48 } elsif ($result == $len) {
49 $data
50 } elsif ($result == 0) {
51 croak "$self->{path} is at end of file"
52 } elsif ($result < $len) {
53 croak "Only read $result bytes of $self->{path} before reaching EOF"
54 }
55 }
56
57 sub read_formatted {
58 my ($self, $len_of_format, $format) = @_;
59 unpack $format, $self->read_data($len_of_format);
60 }
61
62 sub read_char { shift->read_formatted(1, 'C') }
63 sub read_short { shift->read_formatted(2, 'n') }
64 sub read_int { shift->read_formatted(4, 'N') }
65 sub read_long { shift->read_formatted(8, 'Q>') }
66
67 sub read_tiny_text {
68 my ($self, $encoding) = @_;
69 my $data = $self->read_data($self->read_char);
70 if (length $data == 255) {
71 $data = unpack 'Z*', $data;
72 }
73 $encoding //= $self->{encoding};
74 decode $encoding, $data;
75 }
76
77 sub read_text {
78 my ($self, $encoding) = @_;
79 my $data = $self->read_data($self->read_short);
80 $encoding //= $self->{encoding};
81 decode $encoding, $data;
82 }
83
84 sub read_large_byte_string {
85 my ($self) = @_;
86 $self->read_data($self->read_short)
87 }
88
89 sub read_tag {
90 my ($self) = @_;
91 my $name = $self->read_tiny_text;
92 my $value = $self->read_tiny_text;
93 ($name, $value)
94 }
95
96 sub read_tags {
97 my ($self) = @_;
98 my $tag_count = $self->read_char;
99 map { $self->read_tag } 1..$tag_count
100 }
101
102 sub read_content_types {
103 my ($self) = @_;
104 my $content_type_count = $self->read_char;
105 map { $self->read_text } 1..$content_type_count
106 }
107
108 sub read_positions {
109 my ($self) = @_;
110 my $count = $self->read_int;
111 my @positions = map { $self->read_long } 1..$count;
112 my $relative_to = $self->ftell;
113 map { $relative_to + $_ } @positions
114 }
115
116 sub fseek {
117 my ($self, $position) = @_;
118 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
119 }
120
121 sub ftell {
122 my ($self) = @_;
123 my $result = tell $self->{fh};
124 croak "Failed to tell position in file" if $result == -1;
125 $result
126 }
127
128 sub uncompress {
129 my ($self, $data) = @_;
130 $UNCOMPRESS{$self->{header}{compression}}->($data)
131 }
132
133 sub read_header {
134 my ($self) = @_;
135 my $magic = $self->read_data(length MAGIC);
136 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
137 my $uuid = $self->read_data(16);
138
139 my $encoding = $self->read_tiny_text('UTF-8');
140 $self->{encoding} = $encoding;
141
142 my $compression = $self->read_tiny_text;
143 die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
144 my %tags = $self->read_tags;
145 my @content_types = $self->read_content_types;
146 my $blob_count = $self->read_int;
147 my $store_offset = $self->read_long;
148 my $size = $self->read_long;
149 my @refs = $self->read_positions;
150
151 $self->fseek($store_offset);
152 my @storage_bins = $self->read_positions;
153
154 +{
155 uuid => $uuid,
156 encoding => $encoding,
157 compression => $compression,
158 tags => \%tags,
159 content_types => \@content_types,
160 blob_count => $blob_count,
161 store_offset => $store_offset,
162 size => $size,
163 refs => \@refs,
164 storage_bins => \@storage_bins,
165 }
166 }
167
168 sub read_ref {
169 my ($self) = @_;
170 my $key = $self->read_text;
171 my $bin_index = $self->read_int;
172 my $item_index = $self->read_short;
173 my $fragment = $self->read_tiny_text;
174 +{
175 key => $key,
176 bin_index => $bin_index,
177 item_index => $item_index,
178 fragment => $fragment,
179 }
180 }
181
182 sub read_storage_bin {
183 my ($self) = @_;
184 my $count = $self->read_int;
185 my @content_types = map { $self->read_char } 1..$count;
186 my $compressed_size = $self->read_int;
187 my $compressed_data = $self->read_data($compressed_size);
188 my $uncompressed_data = $self->uncompress($compressed_data);
189
190 my @positions = unpack "N$count", $uncompressed_data;
191 my $data = substr $uncompressed_data, $count * 4;
192 +{
193 positions => \@positions,
194 data => $data
195 }
196 }
197
198 sub ref_count { shift @{shift->{header}{refs}} }
199
200 sub seek_and_read_ref {
201 my ($self, $index) = @_;
202 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
203 $self->fseek($self->{header}{refs}[$index]);
204 $self->read_ref
205 }
206
207 sub seek_and_read_storage_bin {
208 my ($self, $index) = @_;
209 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
210 $self->fseek($self->{header}{storage_bins}[$index]);
211 $self->read_storage_bin
212 }
213
214 sub get_entry_of_storage_bin {
215 my ($self, $storage_bin, $index) = @_;
216 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
217 my $length = unpack 'N', $start_of_data;
218 substr $start_of_data, 4, $length;
219 }
220
221 sub seek_and_read_ref_and_data {
222 my ($self, $index) = @_;
223 my $ref = $self->seek_and_read_ref($index);
224 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
225 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
226 $ref->{data} = $data;
227 $ref
228 }
229
230 1;
231 __END__
232
233 =encoding utf-8
234
235 =head1 NAME
236
237 Slob - Read .slob dictionaries (as used by Aard 2)
238
239 =head1 SYNOPSIS
240
241 use feature qw/:5.14/;
242 use Slob;
243 my $slob = Slob->new('path/to/dict.slob');
244
245 my $nr_of_entries = $slob->ref_count; # if the same content has two
246 # keys pointing to it, this
247 # counts it twice
248
249 my $second_ref = $slob->seek_and_read_ref(4);
250 say "Entry is for $second_ref->{key}";
251 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
252
253 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
254 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
255 say "Value at position $second_ref->{item_index} is ",
256 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
257
258 # instead of the above, we can do
259 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
260 say "Entry is for $second_ref_and_data->{key}";
261 say "Value is $second_ref_and_data->{data}";
262
263 =head1 DESCRIPTION
264
265 Slob is a compressed read-only format for storing natural language
266 dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
267 dictionaries in slob format.
268
269 The following methods are available:
270
271 =over
272
273 =item Slob->B<new>(I<$path>)
274 =item Slob->B<new>(I<$fh>)
275
276 Create a new slob reader reading from the given path or filehandle.
277
278 =item $slob->B<ref_count>
279
280 The number of refs (keys) in the dictionary.
281
282 =item $slob->B<seek_and_read_ref>(I<$index>)
283
284 Read the ref (key) at the given index. Returns a hashref with the
285 following keys:
286
287 =over
288
289 =item key
290
291 The key
292
293 =item bin_index
294
295 The storage bin that contains the value for this key
296
297 =item item_index
298
299 The index in the bin_index storage bin of the value for this key
300
301 =item fragment
302
303 HTML fragment that, when applied to the HTML value, points to the
304 definition of the key.
305
306 =back
307
308 =item $slob->B<seek_and_read_storage_bin>(I<$index>)
309
310 Read the storage bin with the given index. Returns the storage bin,
311 which can later be given to B<get_entry_of_storage_bin>.
312
313 =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
314
315 Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
316 item index, returns the value at the index i nthe storage bin.
317
318 =item $slob->B<seek_and_read_ref_and_data>($index)
319
320 Convenience method that returns the key and value at a given index.
321 Returns a hashref like C<seek_and_read_ref> with an extra key,
322 I<data>, which is the value of the key.
323
324 =back
325
326 =head1 SEE ALSO
327
328 L<https://github.com/itkach/slob>
329
330 =head1 AUTHOR
331
332 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
333
334 =head1 COPYRIGHT AND LICENSE
335
336 Copyright (C) 2017 by Marius Gavrilescu
337
338 This library is free software; you can redistribute it and/or modify
339 it under the same terms as Perl itself, either Perl version 5.26.1 or,
340 at your option, any later version of Perl 5 you may have available.
341
342
343 =cut
This page took 0.040258 seconds and 4 git commands to generate.