Bump version and update Changes
[slob.git] / lib / Slob.pm
1 package Slob;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 our $VERSION = '0.002';
7
8 use constant MAGIC => "!-1SLOB\x1F";
9
10 use Carp qw/croak verbose/;
11 use Encode;
12
13 use Compress::Raw::Bzip2;
14 use Compress::Raw::Lzma;
15 use Compress::Raw::Zlib;
16
17 our %UNCOMPRESS = (
18 '' => sub { $_[0] },
19 'lzma2' => sub {
20 my ($input) = @_;
21 my ($lzma2, $code, $output);
22 ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
23 die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
24
25 $code = $lzma2->code($input, $output);
26 die "Did not reach end of stream\n" if $code == LZMA_OK;
27 die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
28 $output
29 },
30
31 'bz2' => sub {
32 my ($input) = @_;
33 my ($bz2, $code, $output);
34 ($bz2, $code)= Compress::Raw::Bunzip2->new;
35 die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
36
37 $code = $bz2->bzinflate($input, $output);
38 die "Did not reach end of stream\n" if $code == BZ_OK;
39 die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
40
41 $output
42 },
43
44 'zlib' => sub {
45 my ($input) = @_;
46 my ($zlib, $code, $output);
47 ($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
48 -WindowBits => WANT_GZIP_OR_ZLIB
49 );
50 die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
51
52 $code = $zlib->inflate($input, \$output, 1);
53 die "Did not reach end of stream\n" if $code == Z_OK;
54 die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
55 $output
56 }
57 );
58
59 sub new {
60 my ($class, $path) = @_;
61 my $fh;
62 if (ref $path eq 'IO') {
63 $fh = $path
64 } else {
65 open $fh, '<', $path or croak "Cannot open \"$path\": $!"
66 }
67 my $self = bless {path => $path, fh => $fh}, $class;
68 $self->{header} = $self->read_header;
69 $self
70 }
71
72 sub read_data {
73 my ($self, $len) = @_;
74 my $data;
75 my $result = read $self->{fh}, $data, $len;
76 if (!defined $result) {
77 croak "Failed to read from $self->{path}: $!"
78 } elsif ($result == $len) {
79 $data
80 } elsif ($result == 0) {
81 croak "$self->{path} is at end of file"
82 } elsif ($result < $len) {
83 croak "Only read $result bytes of $self->{path} before reaching EOF"
84 }
85 }
86
87 sub read_formatted {
88 my ($self, $len_of_format, $format) = @_;
89 unpack $format, $self->read_data($len_of_format);
90 }
91
92 sub read_char { shift->read_formatted(1, 'C') }
93 sub read_short { shift->read_formatted(2, 'n') }
94 sub read_int { shift->read_formatted(4, 'N') }
95 sub read_long { shift->read_formatted(8, 'Q>') }
96
97 sub read_tiny_text {
98 my ($self, $encoding) = @_;
99 my $data = $self->read_data($self->read_char);
100 if (length $data == 255) {
101 $data = unpack 'Z*', $data;
102 }
103 $encoding //= $self->{encoding};
104 decode $encoding, $data;
105 }
106
107 sub read_text {
108 my ($self, $encoding) = @_;
109 my $data = $self->read_data($self->read_short);
110 $encoding //= $self->{encoding};
111 decode $encoding, $data;
112 }
113
114 sub read_large_byte_string {
115 my ($self) = @_;
116 $self->read_data($self->read_short)
117 }
118
119 sub read_tag {
120 my ($self) = @_;
121 my $name = $self->read_tiny_text;
122 my $value = $self->read_tiny_text;
123 ($name, $value)
124 }
125
126 sub read_tags {
127 my ($self) = @_;
128 my $tag_count = $self->read_char;
129 map { $self->read_tag } 1..$tag_count
130 }
131
132 sub read_content_types {
133 my ($self) = @_;
134 my $content_type_count = $self->read_char;
135 map { $self->read_text } 1..$content_type_count
136 }
137
138 sub read_positions {
139 my ($self) = @_;
140 my $count = $self->read_int;
141 my @positions = map { $self->read_long } 1..$count;
142 my $relative_to = $self->ftell;
143 map { $relative_to + $_ } @positions
144 }
145
146 sub fseek {
147 my ($self, $position) = @_;
148 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
149 }
150
151 sub ftell {
152 my ($self) = @_;
153 my $result = tell $self->{fh};
154 croak "Failed to tell position in file" if $result == -1;
155 $result
156 }
157
158 sub uncompress {
159 my ($self, $data) = @_;
160 $UNCOMPRESS{$self->{header}{compression}}->($data)
161 }
162
163 sub read_header {
164 my ($self) = @_;
165 my $magic = $self->read_data(length MAGIC);
166 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
167 my $uuid = $self->read_data(16);
168
169 my $encoding = $self->read_tiny_text('UTF-8');
170 $self->{encoding} = $encoding;
171
172 my $compression = $self->read_tiny_text;
173 die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
174 my %tags = $self->read_tags;
175 my @content_types = $self->read_content_types;
176 my $blob_count = $self->read_int;
177 my $store_offset = $self->read_long;
178 my $size = $self->read_long;
179 my @refs = $self->read_positions;
180
181 $self->fseek($store_offset);
182 my @storage_bins = $self->read_positions;
183
184 +{
185 uuid => $uuid,
186 encoding => $encoding,
187 compression => $compression,
188 tags => \%tags,
189 content_types => \@content_types,
190 blob_count => $blob_count,
191 store_offset => $store_offset,
192 size => $size,
193 refs => \@refs,
194 storage_bins => \@storage_bins,
195 }
196 }
197
198 sub read_ref {
199 my ($self) = @_;
200 my $key = $self->read_text;
201 my $bin_index = $self->read_int;
202 my $item_index = $self->read_short;
203 my $fragment = $self->read_tiny_text;
204 +{
205 key => $key,
206 bin_index => $bin_index,
207 item_index => $item_index,
208 fragment => $fragment,
209 }
210 }
211
212 sub read_storage_bin {
213 my ($self) = @_;
214 my $count = $self->read_int;
215 my @content_types = map { $self->read_char } 1..$count;
216 my $compressed_size = $self->read_int;
217 my $compressed_data = $self->read_data($compressed_size);
218 my $uncompressed_data = $self->uncompress($compressed_data);
219
220 my @positions = unpack "N$count", $uncompressed_data;
221 my $data = substr $uncompressed_data, $count * 4;
222 +{
223 positions => \@positions,
224 data => $data
225 }
226 }
227
228 sub ref_count { shift @{shift->{header}{refs}} }
229
230 sub seek_and_read_ref {
231 my ($self, $index) = @_;
232 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
233 $self->fseek($self->{header}{refs}[$index]);
234 $self->read_ref
235 }
236
237 sub seek_and_read_storage_bin {
238 my ($self, $index) = @_;
239 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
240 $self->fseek($self->{header}{storage_bins}[$index]);
241 $self->read_storage_bin
242 }
243
244 sub get_entry_of_storage_bin {
245 my ($self, $storage_bin, $index) = @_;
246 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
247 my $length = unpack 'N', $start_of_data;
248 substr $start_of_data, 4, $length;
249 }
250
251 sub seek_and_read_ref_and_data {
252 my ($self, $index) = @_;
253 my $ref = $self->seek_and_read_ref($index);
254 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
255 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
256 $ref->{data} = $data;
257 $ref
258 }
259
260 1;
261 __END__
262
263 =encoding utf-8
264
265 =head1 NAME
266
267 Slob - Read .slob dictionaries (as used by Aard 2)
268
269 =head1 SYNOPSIS
270
271 use feature qw/:5.14/;
272 use Slob;
273 my $slob = Slob->new('path/to/dict.slob');
274
275 my $nr_of_entries = $slob->ref_count; # if the same content has two
276 # keys pointing to it, this
277 # counts it twice
278
279 my $second_ref = $slob->seek_and_read_ref(4);
280 say "Entry is for $second_ref->{key}";
281 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
282
283 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
284 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
285 say "Value at position $second_ref->{item_index} is ",
286 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
287
288 # instead of the above, we can do
289 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
290 say "Entry is for $second_ref_and_data->{key}";
291 say "Value is $second_ref_and_data->{data}";
292
293 =head1 DESCRIPTION
294
295 Slob is a compressed read-only format for storing natural language
296 dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
297 dictionaries in slob format.
298
299 The following methods are available:
300
301 =over
302
303 =item Slob->B<new>(I<$path>)
304 =item Slob->B<new>(I<$fh>)
305
306 Create a new slob reader reading from the given path or filehandle.
307
308 =item $slob->B<ref_count>
309
310 The number of refs (keys) in the dictionary.
311
312 =item $slob->B<seek_and_read_ref>(I<$index>)
313
314 Read the ref (key) at the given index. Returns a hashref with the
315 following keys:
316
317 =over
318
319 =item key
320
321 The key
322
323 =item bin_index
324
325 The storage bin that contains the value for this key
326
327 =item item_index
328
329 The index in the bin_index storage bin of the value for this key
330
331 =item fragment
332
333 HTML fragment that, when applied to the HTML value, points to the
334 definition of the key.
335
336 =back
337
338 =item $slob->B<seek_and_read_storage_bin>(I<$index>)
339
340 Read the storage bin with the given index. Returns the storage bin,
341 which can later be given to B<get_entry_of_storage_bin>.
342
343 =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
344
345 Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
346 item index, returns the value at the index i nthe storage bin.
347
348 =item $slob->B<seek_and_read_ref_and_data>($index)
349
350 Convenience method that returns the key and value at a given index.
351 Returns a hashref like C<seek_and_read_ref> with an extra key,
352 I<data>, which is the value of the key.
353
354 =back
355
356 =head1 SEE ALSO
357
358 L<https://github.com/itkach/slob>
359
360 =head1 AUTHOR
361
362 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
363
364 =head1 COPYRIGHT AND LICENSE
365
366 Copyright (C) 2017-2018 by Marius Gavrilescu
367
368 This library is free software; you can redistribute it and/or modify
369 it under the same terms as Perl itself, either Perl version 5.26.1 or,
370 at your option, any later version of Perl 5 you may have available.
371
372
373 =cut
This page took 0.043075 seconds and 4 git commands to generate.