Bump version and update Changes
[slob.git] / lib / Slob.pm
CommitLineData
d3779104
MG
1package Slob;
2
3use 5.014000;
4use strict;
5use warnings;
35224ef9 6our $VERSION = '0.002';
d3779104
MG
7
8use constant MAGIC => "!-1SLOB\x1F";
9
10use Carp qw/croak verbose/;
11use Encode;
12
1ca60f55 13use Compress::Raw::Bzip2;
d50f3958 14use Compress::Raw::Lzma;
1ca60f55 15use Compress::Raw::Zlib;
d50f3958
MG
16
17our %UNCOMPRESS = (
18 '' => sub { $_[0] },
19 'lzma2' => sub {
20 my ($input) = @_;
21 my ($lzma2, $code, $output);
22 ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2());
23 die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK;
1ca60f55 24
d50f3958 25 $code = $lzma2->code($input, $output);
1ca60f55
MG
26 die "Did not reach end of stream\n" if $code == LZMA_OK;
27 die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END;
28 $output
29 },
30
31 'bz2' => sub {
32 my ($input) = @_;
33 my ($bz2, $code, $output);
34 ($bz2, $code)= Compress::Raw::Bunzip2->new;
35 die "Error creating Bunzip2: $code\n" unless $code == Z_OK;
36
37 $code = $bz2->bzinflate($input, $output);
38 die "Did not reach end of stream\n" if $code == BZ_OK;
39 die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END;
40
41 $output
42 },
43
44 'zlib' => sub {
45 my ($input) = @_;
46 my ($zlib, $code, $output);
47 ($zlib, $code) = Compress::Raw::Zlib::Inflate->new(
48 -WindowBits => WANT_GZIP_OR_ZLIB
49 );
50 die "Error creating Zlib inflate: $code\n" unless $code == Z_OK;
51
52 $code = $zlib->inflate($input, \$output, 1);
53 die "Did not reach end of stream\n" if $code == Z_OK;
54 die "Error inflating zlib: $code\n" if $code != Z_STREAM_END;
d50f3958
MG
55 $output
56 }
57);
58
d3779104
MG
59sub new {
60 my ($class, $path) = @_;
d50f3958
MG
61 my $fh;
62 if (ref $path eq 'IO') {
63 $fh = $path
64 } else {
65 open $fh, '<', $path or croak "Cannot open \"$path\": $!"
66 }
d3779104
MG
67 my $self = bless {path => $path, fh => $fh}, $class;
68 $self->{header} = $self->read_header;
69 $self
70}
71
72sub read_data {
73 my ($self, $len) = @_;
74 my $data;
75 my $result = read $self->{fh}, $data, $len;
76 if (!defined $result) {
77 croak "Failed to read from $self->{path}: $!"
78 } elsif ($result == $len) {
79 $data
80 } elsif ($result == 0) {
81 croak "$self->{path} is at end of file"
82 } elsif ($result < $len) {
83 croak "Only read $result bytes of $self->{path} before reaching EOF"
84 }
85}
86
87sub read_formatted {
88 my ($self, $len_of_format, $format) = @_;
89 unpack $format, $self->read_data($len_of_format);
90}
91
92sub read_char { shift->read_formatted(1, 'C') }
93sub read_short { shift->read_formatted(2, 'n') }
94sub read_int { shift->read_formatted(4, 'N') }
95sub read_long { shift->read_formatted(8, 'Q>') }
96
97sub read_tiny_text {
98 my ($self, $encoding) = @_;
99 my $data = $self->read_data($self->read_char);
100 if (length $data == 255) {
101 $data = unpack 'Z*', $data;
102 }
103 $encoding //= $self->{encoding};
104 decode $encoding, $data;
105}
106
107sub read_text {
108 my ($self, $encoding) = @_;
109 my $data = $self->read_data($self->read_short);
110 $encoding //= $self->{encoding};
111 decode $encoding, $data;
112}
113
114sub read_large_byte_string {
115 my ($self) = @_;
116 $self->read_data($self->read_short)
117}
118
119sub read_tag {
120 my ($self) = @_;
121 my $name = $self->read_tiny_text;
122 my $value = $self->read_tiny_text;
123 ($name, $value)
124}
125
126sub read_tags {
127 my ($self) = @_;
128 my $tag_count = $self->read_char;
129 map { $self->read_tag } 1..$tag_count
130}
131
132sub read_content_types {
133 my ($self) = @_;
134 my $content_type_count = $self->read_char;
135 map { $self->read_text } 1..$content_type_count
136}
137
138sub read_positions {
139 my ($self) = @_;
140 my $count = $self->read_int;
141 my @positions = map { $self->read_long } 1..$count;
142 my $relative_to = $self->ftell;
143 map { $relative_to + $_ } @positions
144}
145
146sub fseek {
147 my ($self, $position) = @_;
148 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
149}
150
151sub ftell {
152 my ($self) = @_;
153 my $result = tell $self->{fh};
154 croak "Failed to tell position in file" if $result == -1;
155 $result
156}
157
158sub uncompress {
159 my ($self, $data) = @_;
d50f3958 160 $UNCOMPRESS{$self->{header}{compression}}->($data)
d3779104
MG
161}
162
163sub read_header {
164 my ($self) = @_;
165 my $magic = $self->read_data(length MAGIC);
166 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
167 my $uuid = $self->read_data(16);
168
169 my $encoding = $self->read_tiny_text('UTF-8');
170 $self->{encoding} = $encoding;
171
172 my $compression = $self->read_tiny_text;
d50f3958 173 die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression};
d3779104
MG
174 my %tags = $self->read_tags;
175 my @content_types = $self->read_content_types;
176 my $blob_count = $self->read_int;
177 my $store_offset = $self->read_long;
178 my $size = $self->read_long;
179 my @refs = $self->read_positions;
180
181 $self->fseek($store_offset);
182 my @storage_bins = $self->read_positions;
183
184 +{
185 uuid => $uuid,
186 encoding => $encoding,
187 compression => $compression,
188 tags => \%tags,
189 content_types => \@content_types,
190 blob_count => $blob_count,
191 store_offset => $store_offset,
192 size => $size,
193 refs => \@refs,
194 storage_bins => \@storage_bins,
195 }
196}
197
198sub read_ref {
199 my ($self) = @_;
200 my $key = $self->read_text;
201 my $bin_index = $self->read_int;
202 my $item_index = $self->read_short;
203 my $fragment = $self->read_tiny_text;
204 +{
205 key => $key,
206 bin_index => $bin_index,
207 item_index => $item_index,
208 fragment => $fragment,
209 }
210}
211
212sub read_storage_bin {
213 my ($self) = @_;
214 my $count = $self->read_int;
215 my @content_types = map { $self->read_char } 1..$count;
216 my $compressed_size = $self->read_int;
217 my $compressed_data = $self->read_data($compressed_size);
218 my $uncompressed_data = $self->uncompress($compressed_data);
219
220 my @positions = unpack "N$count", $uncompressed_data;
221 my $data = substr $uncompressed_data, $count * 4;
222 +{
223 positions => \@positions,
224 data => $data
225 }
226}
227
228sub ref_count { shift @{shift->{header}{refs}} }
229
230sub seek_and_read_ref {
231 my ($self, $index) = @_;
232 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
233 $self->fseek($self->{header}{refs}[$index]);
234 $self->read_ref
235}
236
237sub seek_and_read_storage_bin {
238 my ($self, $index) = @_;
239 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
240 $self->fseek($self->{header}{storage_bins}[$index]);
241 $self->read_storage_bin
242}
243
244sub get_entry_of_storage_bin {
245 my ($self, $storage_bin, $index) = @_;
246 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
247 my $length = unpack 'N', $start_of_data;
248 substr $start_of_data, 4, $length;
249}
250
0e15b496
MG
251sub seek_and_read_ref_and_data {
252 my ($self, $index) = @_;
253 my $ref = $self->seek_and_read_ref($index);
254 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
255 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
256 $ref->{data} = $data;
257 $ref
258}
259
d3779104
MG
2601;
261__END__
262
263=encoding utf-8
264
265=head1 NAME
266
267Slob - Read .slob dictionaries (as used by Aard 2)
268
269=head1 SYNOPSIS
270
271 use feature qw/:5.14/;
272 use Slob;
273 my $slob = Slob->new('path/to/dict.slob');
274
275 my $nr_of_entries = $slob->ref_count; # if the same content has two
276 # keys pointing to it, this
277 # counts it twice
278
279 my $second_ref = $slob->seek_and_read_ref(4);
280 say "Entry is for $second_ref->{key}";
281 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
282
283 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
284 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
285 say "Value at position $second_ref->{item_index} is ",
286 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
287
0e15b496
MG
288 # instead of the above, we can do
289 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
290 say "Entry is for $second_ref_and_data->{key}";
291 say "Value is $second_ref_and_data->{data}";
292
d3779104
MG
293=head1 DESCRIPTION
294
632d3de3
MG
295Slob is a compressed read-only format for storing natural language
296dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
297dictionaries in slob format.
298
299The following methods are available:
300
301=over
302
303=item Slob->B<new>(I<$path>)
304=item Slob->B<new>(I<$fh>)
305
306Create a new slob reader reading from the given path or filehandle.
307
308=item $slob->B<ref_count>
309
310The number of refs (keys) in the dictionary.
311
312=item $slob->B<seek_and_read_ref>(I<$index>)
313
314Read the ref (key) at the given index. Returns a hashref with the
315following keys:
316
317=over
318
319=item key
320
321The key
322
323=item bin_index
324
325The storage bin that contains the value for this key
326
327=item item_index
328
329The index in the bin_index storage bin of the value for this key
330
331=item fragment
332
333HTML fragment that, when applied to the HTML value, points to the
334definition of the key.
335
336=back
337
338=item $slob->B<seek_and_read_storage_bin>(I<$index>)
339
340Read the storage bin with the given index. Returns the storage bin,
341which can later be given to B<get_entry_of_storage_bin>.
342
343=item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
344
345Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
346item index, returns the value at the index i nthe storage bin.
347
348=item $slob->B<seek_and_read_ref_and_data>($index)
349
350Convenience method that returns the key and value at a given index.
351Returns a hashref like C<seek_and_read_ref> with an extra key,
352I<data>, which is the value of the key.
353
354=back
355
356=head1 SEE ALSO
357
358L<https://github.com/itkach/slob>
d3779104
MG
359
360=head1 AUTHOR
361
362Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
363
364=head1 COPYRIGHT AND LICENSE
365
35224ef9 366Copyright (C) 2017-2018 by Marius Gavrilescu
d3779104
MG
367
368This library is free software; you can redistribute it and/or modify
369it under the same terms as Perl itself, either Perl version 5.26.1 or,
370at your option, any later version of Perl 5 you may have available.
371
372
373=cut
This page took 0.030047 seconds and 4 git commands to generate.