Better new + doc
[slob.git] / lib / Slob.pm
1 package Slob;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 our $VERSION = '0.000_001';
7
8 use constant MAGIC => "!-1SLOB\x1F";
9
10 use Carp qw/croak verbose/;
11 use Encode;
12
13 sub new {
14 my ($class, $path) = @_;
15 my $fh =
16 ref $path eq 'IO'
17 ? $path
18 : open my $fh, '<', $path or croak "Cannot open \"$path\": $!";
19 my $self = bless {path => $path, fh => $fh}, $class;
20 $self->{header} = $self->read_header;
21 $self
22 }
23
24 sub read_data {
25 my ($self, $len) = @_;
26 my $data;
27 my $result = read $self->{fh}, $data, $len;
28 if (!defined $result) {
29 croak "Failed to read from $self->{path}: $!"
30 } elsif ($result == $len) {
31 $data
32 } elsif ($result == 0) {
33 croak "$self->{path} is at end of file"
34 } elsif ($result < $len) {
35 croak "Only read $result bytes of $self->{path} before reaching EOF"
36 }
37 }
38
39 sub read_formatted {
40 my ($self, $len_of_format, $format) = @_;
41 unpack $format, $self->read_data($len_of_format);
42 }
43
44 sub read_char { shift->read_formatted(1, 'C') }
45 sub read_short { shift->read_formatted(2, 'n') }
46 sub read_int { shift->read_formatted(4, 'N') }
47 sub read_long { shift->read_formatted(8, 'Q>') }
48
49 sub read_tiny_text {
50 my ($self, $encoding) = @_;
51 my $data = $self->read_data($self->read_char);
52 if (length $data == 255) {
53 $data = unpack 'Z*', $data;
54 }
55 $encoding //= $self->{encoding};
56 decode $encoding, $data;
57 }
58
59 sub read_text {
60 my ($self, $encoding) = @_;
61 my $data = $self->read_data($self->read_short);
62 $encoding //= $self->{encoding};
63 decode $encoding, $data;
64 }
65
66 sub read_large_byte_string {
67 my ($self) = @_;
68 $self->read_data($self->read_short)
69 }
70
71 sub read_tag {
72 my ($self) = @_;
73 my $name = $self->read_tiny_text;
74 my $value = $self->read_tiny_text;
75 ($name, $value)
76 }
77
78 sub read_tags {
79 my ($self) = @_;
80 my $tag_count = $self->read_char;
81 map { $self->read_tag } 1..$tag_count
82 }
83
84 sub read_content_types {
85 my ($self) = @_;
86 my $content_type_count = $self->read_char;
87 map { $self->read_text } 1..$content_type_count
88 }
89
90 sub read_positions {
91 my ($self) = @_;
92 my $count = $self->read_int;
93 my @positions = map { $self->read_long } 1..$count;
94 my $relative_to = $self->ftell;
95 map { $relative_to + $_ } @positions
96 }
97
98 sub fseek {
99 my ($self, $position) = @_;
100 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
101 }
102
103 sub ftell {
104 my ($self) = @_;
105 my $result = tell $self->{fh};
106 croak "Failed to tell position in file" if $result == -1;
107 $result
108 }
109
110 sub uncompress {
111 my ($self, $data) = @_;
112 $data
113 }
114
115 sub read_header {
116 my ($self) = @_;
117 my $magic = $self->read_data(length MAGIC);
118 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
119 my $uuid = $self->read_data(16);
120
121 my $encoding = $self->read_tiny_text('UTF-8');
122 $self->{encoding} = $encoding;
123
124 my $compression = $self->read_tiny_text;
125 die "Compression not yet supported" if $compression;
126 my %tags = $self->read_tags;
127 my @content_types = $self->read_content_types;
128 my $blob_count = $self->read_int;
129 my $store_offset = $self->read_long;
130 my $size = $self->read_long;
131 my @refs = $self->read_positions;
132
133 $self->fseek($store_offset);
134 my @storage_bins = $self->read_positions;
135
136 +{
137 uuid => $uuid,
138 encoding => $encoding,
139 compression => $compression,
140 tags => \%tags,
141 content_types => \@content_types,
142 blob_count => $blob_count,
143 store_offset => $store_offset,
144 size => $size,
145 refs => \@refs,
146 storage_bins => \@storage_bins,
147 }
148 }
149
150 sub read_ref {
151 my ($self) = @_;
152 my $key = $self->read_text;
153 my $bin_index = $self->read_int;
154 my $item_index = $self->read_short;
155 my $fragment = $self->read_tiny_text;
156 +{
157 key => $key,
158 bin_index => $bin_index,
159 item_index => $item_index,
160 fragment => $fragment,
161 }
162 }
163
164 sub read_storage_bin {
165 my ($self) = @_;
166 my $count = $self->read_int;
167 my @content_types = map { $self->read_char } 1..$count;
168 my $compressed_size = $self->read_int;
169 my $compressed_data = $self->read_data($compressed_size);
170 my $uncompressed_data = $self->uncompress($compressed_data);
171
172 my @positions = unpack "N$count", $uncompressed_data;
173 my $data = substr $uncompressed_data, $count * 4;
174 +{
175 positions => \@positions,
176 data => $data
177 }
178 }
179
180 sub ref_count { shift @{shift->{header}{refs}} }
181
182 sub seek_and_read_ref {
183 my ($self, $index) = @_;
184 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
185 $self->fseek($self->{header}{refs}[$index]);
186 $self->read_ref
187 }
188
189 sub seek_and_read_storage_bin {
190 my ($self, $index) = @_;
191 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
192 $self->fseek($self->{header}{storage_bins}[$index]);
193 $self->read_storage_bin
194 }
195
196 sub get_entry_of_storage_bin {
197 my ($self, $storage_bin, $index) = @_;
198 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
199 my $length = unpack 'N', $start_of_data;
200 substr $start_of_data, 4, $length;
201 }
202
203 sub seek_and_read_ref_and_data {
204 my ($self, $index) = @_;
205 my $ref = $self->seek_and_read_ref($index);
206 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
207 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
208 $ref->{data} = $data;
209 $ref
210 }
211
212 1;
213 __END__
214
215 =encoding utf-8
216
217 =head1 NAME
218
219 Slob - Read .slob dictionaries (as used by Aard 2)
220
221 =head1 SYNOPSIS
222
223 use feature qw/:5.14/;
224 use Slob;
225 my $slob = Slob->new('path/to/dict.slob');
226
227 my $nr_of_entries = $slob->ref_count; # if the same content has two
228 # keys pointing to it, this
229 # counts it twice
230
231 my $second_ref = $slob->seek_and_read_ref(4);
232 say "Entry is for $second_ref->{key}";
233 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
234
235 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
236 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
237 say "Value at position $second_ref->{item_index} is ",
238 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
239
240 # instead of the above, we can do
241 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
242 say "Entry is for $second_ref_and_data->{key}";
243 say "Value is $second_ref_and_data->{data}";
244
245 =head1 DESCRIPTION
246
247 Slob is a compressed read-only format for storing natural language
248 dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads
249 dictionaries in slob format.
250
251 The following methods are available:
252
253 =over
254
255 =item Slob->B<new>(I<$path>)
256 =item Slob->B<new>(I<$fh>)
257
258 Create a new slob reader reading from the given path or filehandle.
259
260 =item $slob->B<ref_count>
261
262 The number of refs (keys) in the dictionary.
263
264 =item $slob->B<seek_and_read_ref>(I<$index>)
265
266 Read the ref (key) at the given index. Returns a hashref with the
267 following keys:
268
269 =over
270
271 =item key
272
273 The key
274
275 =item bin_index
276
277 The storage bin that contains the value for this key
278
279 =item item_index
280
281 The index in the bin_index storage bin of the value for this key
282
283 =item fragment
284
285 HTML fragment that, when applied to the HTML value, points to the
286 definition of the key.
287
288 =back
289
290 =item $slob->B<seek_and_read_storage_bin>(I<$index>)
291
292 Read the storage bin with the given index. Returns the storage bin,
293 which can later be given to B<get_entry_of_storage_bin>.
294
295 =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>)
296
297 Given a storage bin (as returned by C<seek_and_read_storage_bin>) and
298 item index, returns the value at the index i nthe storage bin.
299
300 =item $slob->B<seek_and_read_ref_and_data>($index)
301
302 Convenience method that returns the key and value at a given index.
303 Returns a hashref like C<seek_and_read_ref> with an extra key,
304 I<data>, which is the value of the key.
305
306 =back
307
308 =head1 SEE ALSO
309
310 L<https://github.com/itkach/slob>
311
312 =head1 AUTHOR
313
314 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
315
316 =head1 COPYRIGHT AND LICENSE
317
318 Copyright (C) 2017 by Marius Gavrilescu
319
320 This library is free software; you can redistribute it and/or modify
321 it under the same terms as Perl itself, either Perl version 5.26.1 or,
322 at your option, any later version of Perl 5 you may have available.
323
324
325 =cut
This page took 0.037587 seconds and 4 git commands to generate.