| 1 | package Slob; |
| 2 | |
| 3 | use 5.014000; |
| 4 | use strict; |
| 5 | use warnings; |
| 6 | our $VERSION = '0.002'; |
| 7 | |
| 8 | use constant MAGIC => "!-1SLOB\x1F"; |
| 9 | |
| 10 | use Carp qw/croak verbose/; |
| 11 | use Encode; |
| 12 | |
| 13 | use Compress::Raw::Bzip2; |
| 14 | use Compress::Raw::Lzma; |
| 15 | use Compress::Raw::Zlib; |
| 16 | |
| 17 | our %UNCOMPRESS = ( |
| 18 | '' => sub { $_[0] }, |
| 19 | 'lzma2' => sub { |
| 20 | my ($input) = @_; |
| 21 | my ($lzma2, $code, $output); |
| 22 | ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2()); |
| 23 | die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK; |
| 24 | |
| 25 | $code = $lzma2->code($input, $output); |
| 26 | die "Did not reach end of stream\n" if $code == LZMA_OK; |
| 27 | die "Error decoding LZMA2: $code\n" if $code != LZMA_STREAM_END; |
| 28 | $output |
| 29 | }, |
| 30 | |
| 31 | 'bz2' => sub { |
| 32 | my ($input) = @_; |
| 33 | my ($bz2, $code, $output); |
| 34 | ($bz2, $code)= Compress::Raw::Bunzip2->new; |
| 35 | die "Error creating Bunzip2: $code\n" unless $code == Z_OK; |
| 36 | |
| 37 | $code = $bz2->bzinflate($input, $output); |
| 38 | die "Did not reach end of stream\n" if $code == BZ_OK; |
| 39 | die "Error decoding Bzip2: $code\n" if $code != BZ_STREAM_END; |
| 40 | |
| 41 | $output |
| 42 | }, |
| 43 | |
| 44 | 'zlib' => sub { |
| 45 | my ($input) = @_; |
| 46 | my ($zlib, $code, $output); |
| 47 | ($zlib, $code) = Compress::Raw::Zlib::Inflate->new( |
| 48 | -WindowBits => WANT_GZIP_OR_ZLIB |
| 49 | ); |
| 50 | die "Error creating Zlib inflate: $code\n" unless $code == Z_OK; |
| 51 | |
| 52 | $code = $zlib->inflate($input, \$output, 1); |
| 53 | die "Did not reach end of stream\n" if $code == Z_OK; |
| 54 | die "Error inflating zlib: $code\n" if $code != Z_STREAM_END; |
| 55 | $output |
| 56 | } |
| 57 | ); |
| 58 | |
| 59 | sub new { |
| 60 | my ($class, $path) = @_; |
| 61 | my $fh; |
| 62 | if (ref $path eq 'IO') { |
| 63 | $fh = $path |
| 64 | } else { |
| 65 | open $fh, '<', $path or croak "Cannot open \"$path\": $!" |
| 66 | } |
| 67 | my $self = bless {path => $path, fh => $fh}, $class; |
| 68 | $self->{header} = $self->read_header; |
| 69 | $self |
| 70 | } |
| 71 | |
| 72 | sub read_data { |
| 73 | my ($self, $len) = @_; |
| 74 | my $data; |
| 75 | my $result = read $self->{fh}, $data, $len; |
| 76 | if (!defined $result) { |
| 77 | croak "Failed to read from $self->{path}: $!" |
| 78 | } elsif ($result == $len) { |
| 79 | $data |
| 80 | } elsif ($result == 0) { |
| 81 | croak "$self->{path} is at end of file" |
| 82 | } elsif ($result < $len) { |
| 83 | croak "Only read $result bytes of $self->{path} before reaching EOF" |
| 84 | } |
| 85 | } |
| 86 | |
| 87 | sub read_formatted { |
| 88 | my ($self, $len_of_format, $format) = @_; |
| 89 | unpack $format, $self->read_data($len_of_format); |
| 90 | } |
| 91 | |
| 92 | sub read_char { shift->read_formatted(1, 'C') } |
| 93 | sub read_short { shift->read_formatted(2, 'n') } |
| 94 | sub read_int { shift->read_formatted(4, 'N') } |
| 95 | sub read_long { shift->read_formatted(8, 'Q>') } |
| 96 | |
| 97 | sub read_tiny_text { |
| 98 | my ($self, $encoding) = @_; |
| 99 | my $data = $self->read_data($self->read_char); |
| 100 | if (length $data == 255) { |
| 101 | $data = unpack 'Z*', $data; |
| 102 | } |
| 103 | $encoding //= $self->{encoding}; |
| 104 | decode $encoding, $data; |
| 105 | } |
| 106 | |
| 107 | sub read_text { |
| 108 | my ($self, $encoding) = @_; |
| 109 | my $data = $self->read_data($self->read_short); |
| 110 | $encoding //= $self->{encoding}; |
| 111 | decode $encoding, $data; |
| 112 | } |
| 113 | |
| 114 | sub read_large_byte_string { |
| 115 | my ($self) = @_; |
| 116 | $self->read_data($self->read_short) |
| 117 | } |
| 118 | |
| 119 | sub read_tag { |
| 120 | my ($self) = @_; |
| 121 | my $name = $self->read_tiny_text; |
| 122 | my $value = $self->read_tiny_text; |
| 123 | ($name, $value) |
| 124 | } |
| 125 | |
| 126 | sub read_tags { |
| 127 | my ($self) = @_; |
| 128 | my $tag_count = $self->read_char; |
| 129 | map { $self->read_tag } 1..$tag_count |
| 130 | } |
| 131 | |
| 132 | sub read_content_types { |
| 133 | my ($self) = @_; |
| 134 | my $content_type_count = $self->read_char; |
| 135 | map { $self->read_text } 1..$content_type_count |
| 136 | } |
| 137 | |
| 138 | sub read_positions { |
| 139 | my ($self) = @_; |
| 140 | my $count = $self->read_int; |
| 141 | my @positions = map { $self->read_long } 1..$count; |
| 142 | my $relative_to = $self->ftell; |
| 143 | map { $relative_to + $_ } @positions |
| 144 | } |
| 145 | |
| 146 | sub fseek { |
| 147 | my ($self, $position) = @_; |
| 148 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" |
| 149 | } |
| 150 | |
| 151 | sub ftell { |
| 152 | my ($self) = @_; |
| 153 | my $result = tell $self->{fh}; |
| 154 | croak "Failed to tell position in file" if $result == -1; |
| 155 | $result |
| 156 | } |
| 157 | |
| 158 | sub uncompress { |
| 159 | my ($self, $data) = @_; |
| 160 | $UNCOMPRESS{$self->{header}{compression}}->($data) |
| 161 | } |
| 162 | |
| 163 | sub read_header { |
| 164 | my ($self) = @_; |
| 165 | my $magic = $self->read_data(length MAGIC); |
| 166 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; |
| 167 | my $uuid = $self->read_data(16); |
| 168 | |
| 169 | my $encoding = $self->read_tiny_text('UTF-8'); |
| 170 | $self->{encoding} = $encoding; |
| 171 | |
| 172 | my $compression = $self->read_tiny_text; |
| 173 | die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression}; |
| 174 | my %tags = $self->read_tags; |
| 175 | my @content_types = $self->read_content_types; |
| 176 | my $blob_count = $self->read_int; |
| 177 | my $store_offset = $self->read_long; |
| 178 | my $size = $self->read_long; |
| 179 | my @refs = $self->read_positions; |
| 180 | |
| 181 | $self->fseek($store_offset); |
| 182 | my @storage_bins = $self->read_positions; |
| 183 | |
| 184 | +{ |
| 185 | uuid => $uuid, |
| 186 | encoding => $encoding, |
| 187 | compression => $compression, |
| 188 | tags => \%tags, |
| 189 | content_types => \@content_types, |
| 190 | blob_count => $blob_count, |
| 191 | store_offset => $store_offset, |
| 192 | size => $size, |
| 193 | refs => \@refs, |
| 194 | storage_bins => \@storage_bins, |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | sub read_ref { |
| 199 | my ($self) = @_; |
| 200 | my $key = $self->read_text; |
| 201 | my $bin_index = $self->read_int; |
| 202 | my $item_index = $self->read_short; |
| 203 | my $fragment = $self->read_tiny_text; |
| 204 | +{ |
| 205 | key => $key, |
| 206 | bin_index => $bin_index, |
| 207 | item_index => $item_index, |
| 208 | fragment => $fragment, |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | sub read_storage_bin { |
| 213 | my ($self) = @_; |
| 214 | my $count = $self->read_int; |
| 215 | my @content_types = map { $self->read_char } 1..$count; |
| 216 | my $compressed_size = $self->read_int; |
| 217 | my $compressed_data = $self->read_data($compressed_size); |
| 218 | my $uncompressed_data = $self->uncompress($compressed_data); |
| 219 | |
| 220 | my @positions = unpack "N$count", $uncompressed_data; |
| 221 | my $data = substr $uncompressed_data, $count * 4; |
| 222 | +{ |
| 223 | positions => \@positions, |
| 224 | data => $data |
| 225 | } |
| 226 | } |
| 227 | |
| 228 | sub ref_count { shift @{shift->{header}{refs}} } |
| 229 | |
| 230 | sub seek_and_read_ref { |
| 231 | my ($self, $index) = @_; |
| 232 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; |
| 233 | $self->fseek($self->{header}{refs}[$index]); |
| 234 | $self->read_ref |
| 235 | } |
| 236 | |
| 237 | sub seek_and_read_storage_bin { |
| 238 | my ($self, $index) = @_; |
| 239 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; |
| 240 | $self->fseek($self->{header}{storage_bins}[$index]); |
| 241 | $self->read_storage_bin |
| 242 | } |
| 243 | |
| 244 | sub get_entry_of_storage_bin { |
| 245 | my ($self, $storage_bin, $index) = @_; |
| 246 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; |
| 247 | my $length = unpack 'N', $start_of_data; |
| 248 | substr $start_of_data, 4, $length; |
| 249 | } |
| 250 | |
| 251 | sub seek_and_read_ref_and_data { |
| 252 | my ($self, $index) = @_; |
| 253 | my $ref = $self->seek_and_read_ref($index); |
| 254 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); |
| 255 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); |
| 256 | $ref->{data} = $data; |
| 257 | $ref |
| 258 | } |
| 259 | |
| 260 | 1; |
| 261 | __END__ |
| 262 | |
| 263 | =encoding utf-8 |
| 264 | |
| 265 | =head1 NAME |
| 266 | |
| 267 | Slob - Read .slob dictionaries (as used by Aard 2) |
| 268 | |
| 269 | =head1 SYNOPSIS |
| 270 | |
| 271 | use feature qw/:5.14/; |
| 272 | use Slob; |
| 273 | my $slob = Slob->new('path/to/dict.slob'); |
| 274 | |
| 275 | my $nr_of_entries = $slob->ref_count; # if the same content has two |
| 276 | # keys pointing to it, this |
| 277 | # counts it twice |
| 278 | |
| 279 | my $second_ref = $slob->seek_and_read_ref(4); |
| 280 | say "Entry is for $second_ref->{key}"; |
| 281 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; |
| 282 | |
| 283 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); |
| 284 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; |
| 285 | say "Value at position $second_ref->{item_index} is ", |
| 286 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); |
| 287 | |
| 288 | # instead of the above, we can do |
| 289 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); |
| 290 | say "Entry is for $second_ref_and_data->{key}"; |
| 291 | say "Value is $second_ref_and_data->{data}"; |
| 292 | |
| 293 | =head1 DESCRIPTION |
| 294 | |
| 295 | Slob is a compressed read-only format for storing natural language |
| 296 | dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads |
| 297 | dictionaries in slob format. |
| 298 | |
| 299 | The following methods are available: |
| 300 | |
| 301 | =over |
| 302 | |
| 303 | =item Slob->B<new>(I<$path>) |
| 304 | =item Slob->B<new>(I<$fh>) |
| 305 | |
| 306 | Create a new slob reader reading from the given path or filehandle. |
| 307 | |
| 308 | =item $slob->B<ref_count> |
| 309 | |
| 310 | The number of refs (keys) in the dictionary. |
| 311 | |
| 312 | =item $slob->B<seek_and_read_ref>(I<$index>) |
| 313 | |
| 314 | Read the ref (key) at the given index. Returns a hashref with the |
| 315 | following keys: |
| 316 | |
| 317 | =over |
| 318 | |
| 319 | =item key |
| 320 | |
| 321 | The key |
| 322 | |
| 323 | =item bin_index |
| 324 | |
| 325 | The storage bin that contains the value for this key |
| 326 | |
| 327 | =item item_index |
| 328 | |
| 329 | The index in the bin_index storage bin of the value for this key |
| 330 | |
| 331 | =item fragment |
| 332 | |
| 333 | HTML fragment that, when applied to the HTML value, points to the |
| 334 | definition of the key. |
| 335 | |
| 336 | =back |
| 337 | |
| 338 | =item $slob->B<seek_and_read_storage_bin>(I<$index>) |
| 339 | |
| 340 | Read the storage bin with the given index. Returns the storage bin, |
| 341 | which can later be given to B<get_entry_of_storage_bin>. |
| 342 | |
| 343 | =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>) |
| 344 | |
| 345 | Given a storage bin (as returned by C<seek_and_read_storage_bin>) and |
| 346 | item index, returns the value at the index i nthe storage bin. |
| 347 | |
| 348 | =item $slob->B<seek_and_read_ref_and_data>($index) |
| 349 | |
| 350 | Convenience method that returns the key and value at a given index. |
| 351 | Returns a hashref like C<seek_and_read_ref> with an extra key, |
| 352 | I<data>, which is the value of the key. |
| 353 | |
| 354 | =back |
| 355 | |
| 356 | =head1 SEE ALSO |
| 357 | |
| 358 | L<https://github.com/itkach/slob> |
| 359 | |
| 360 | =head1 AUTHOR |
| 361 | |
| 362 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> |
| 363 | |
| 364 | =head1 COPYRIGHT AND LICENSE |
| 365 | |
| 366 | Copyright (C) 2017-2018 by Marius Gavrilescu |
| 367 | |
| 368 | This library is free software; you can redistribute it and/or modify |
| 369 | it under the same terms as Perl itself, either Perl version 5.26.1 or, |
| 370 | at your option, any later version of Perl 5 you may have available. |
| 371 | |
| 372 | |
| 373 | =cut |