| 1 | package Slob; |
| 2 | |
| 3 | use 5.014000; |
| 4 | use strict; |
| 5 | use warnings; |
| 6 | our $VERSION = '0.001'; |
| 7 | |
| 8 | use constant MAGIC => "!-1SLOB\x1F"; |
| 9 | |
| 10 | use Carp qw/croak verbose/; |
| 11 | use Encode; |
| 12 | |
| 13 | use Compress::Raw::Lzma; |
| 14 | |
| 15 | our %UNCOMPRESS = ( |
| 16 | '' => sub { $_[0] }, |
| 17 | 'lzma2' => sub { |
| 18 | my ($input) = @_; |
| 19 | my ($lzma2, $code, $output); |
| 20 | ($lzma2, $code) = Compress::Raw::Lzma::RawDecoder->new(Filter => Lzma::Filter::Lzma2()); |
| 21 | die "Error creating LZMA2 decoder: $code\n" unless $code == LZMA_OK; |
| 22 | $code = $lzma2->code($input, $output); |
| 23 | die "Did not reach end of stream" if $code == LZMA_OK; |
| 24 | die "Error decoding LZMA2: $code" if $code != LZMA_STREAM_END; |
| 25 | $output |
| 26 | } |
| 27 | ); |
| 28 | |
| 29 | sub new { |
| 30 | my ($class, $path) = @_; |
| 31 | my $fh; |
| 32 | if (ref $path eq 'IO') { |
| 33 | $fh = $path |
| 34 | } else { |
| 35 | open $fh, '<', $path or croak "Cannot open \"$path\": $!" |
| 36 | } |
| 37 | my $self = bless {path => $path, fh => $fh}, $class; |
| 38 | $self->{header} = $self->read_header; |
| 39 | $self |
| 40 | } |
| 41 | |
| 42 | sub read_data { |
| 43 | my ($self, $len) = @_; |
| 44 | my $data; |
| 45 | my $result = read $self->{fh}, $data, $len; |
| 46 | if (!defined $result) { |
| 47 | croak "Failed to read from $self->{path}: $!" |
| 48 | } elsif ($result == $len) { |
| 49 | $data |
| 50 | } elsif ($result == 0) { |
| 51 | croak "$self->{path} is at end of file" |
| 52 | } elsif ($result < $len) { |
| 53 | croak "Only read $result bytes of $self->{path} before reaching EOF" |
| 54 | } |
| 55 | } |
| 56 | |
| 57 | sub read_formatted { |
| 58 | my ($self, $len_of_format, $format) = @_; |
| 59 | unpack $format, $self->read_data($len_of_format); |
| 60 | } |
| 61 | |
| 62 | sub read_char { shift->read_formatted(1, 'C') } |
| 63 | sub read_short { shift->read_formatted(2, 'n') } |
| 64 | sub read_int { shift->read_formatted(4, 'N') } |
| 65 | sub read_long { shift->read_formatted(8, 'Q>') } |
| 66 | |
| 67 | sub read_tiny_text { |
| 68 | my ($self, $encoding) = @_; |
| 69 | my $data = $self->read_data($self->read_char); |
| 70 | if (length $data == 255) { |
| 71 | $data = unpack 'Z*', $data; |
| 72 | } |
| 73 | $encoding //= $self->{encoding}; |
| 74 | decode $encoding, $data; |
| 75 | } |
| 76 | |
| 77 | sub read_text { |
| 78 | my ($self, $encoding) = @_; |
| 79 | my $data = $self->read_data($self->read_short); |
| 80 | $encoding //= $self->{encoding}; |
| 81 | decode $encoding, $data; |
| 82 | } |
| 83 | |
| 84 | sub read_large_byte_string { |
| 85 | my ($self) = @_; |
| 86 | $self->read_data($self->read_short) |
| 87 | } |
| 88 | |
| 89 | sub read_tag { |
| 90 | my ($self) = @_; |
| 91 | my $name = $self->read_tiny_text; |
| 92 | my $value = $self->read_tiny_text; |
| 93 | ($name, $value) |
| 94 | } |
| 95 | |
| 96 | sub read_tags { |
| 97 | my ($self) = @_; |
| 98 | my $tag_count = $self->read_char; |
| 99 | map { $self->read_tag } 1..$tag_count |
| 100 | } |
| 101 | |
| 102 | sub read_content_types { |
| 103 | my ($self) = @_; |
| 104 | my $content_type_count = $self->read_char; |
| 105 | map { $self->read_text } 1..$content_type_count |
| 106 | } |
| 107 | |
| 108 | sub read_positions { |
| 109 | my ($self) = @_; |
| 110 | my $count = $self->read_int; |
| 111 | my @positions = map { $self->read_long } 1..$count; |
| 112 | my $relative_to = $self->ftell; |
| 113 | map { $relative_to + $_ } @positions |
| 114 | } |
| 115 | |
| 116 | sub fseek { |
| 117 | my ($self, $position) = @_; |
| 118 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" |
| 119 | } |
| 120 | |
| 121 | sub ftell { |
| 122 | my ($self) = @_; |
| 123 | my $result = tell $self->{fh}; |
| 124 | croak "Failed to tell position in file" if $result == -1; |
| 125 | $result |
| 126 | } |
| 127 | |
| 128 | sub uncompress { |
| 129 | my ($self, $data) = @_; |
| 130 | $UNCOMPRESS{$self->{header}{compression}}->($data) |
| 131 | } |
| 132 | |
| 133 | sub read_header { |
| 134 | my ($self) = @_; |
| 135 | my $magic = $self->read_data(length MAGIC); |
| 136 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; |
| 137 | my $uuid = $self->read_data(16); |
| 138 | |
| 139 | my $encoding = $self->read_tiny_text('UTF-8'); |
| 140 | $self->{encoding} = $encoding; |
| 141 | |
| 142 | my $compression = $self->read_tiny_text; |
| 143 | die "Compression '$compression' not yet supported" unless exists $UNCOMPRESS{$compression}; |
| 144 | my %tags = $self->read_tags; |
| 145 | my @content_types = $self->read_content_types; |
| 146 | my $blob_count = $self->read_int; |
| 147 | my $store_offset = $self->read_long; |
| 148 | my $size = $self->read_long; |
| 149 | my @refs = $self->read_positions; |
| 150 | |
| 151 | $self->fseek($store_offset); |
| 152 | my @storage_bins = $self->read_positions; |
| 153 | |
| 154 | +{ |
| 155 | uuid => $uuid, |
| 156 | encoding => $encoding, |
| 157 | compression => $compression, |
| 158 | tags => \%tags, |
| 159 | content_types => \@content_types, |
| 160 | blob_count => $blob_count, |
| 161 | store_offset => $store_offset, |
| 162 | size => $size, |
| 163 | refs => \@refs, |
| 164 | storage_bins => \@storage_bins, |
| 165 | } |
| 166 | } |
| 167 | |
| 168 | sub read_ref { |
| 169 | my ($self) = @_; |
| 170 | my $key = $self->read_text; |
| 171 | my $bin_index = $self->read_int; |
| 172 | my $item_index = $self->read_short; |
| 173 | my $fragment = $self->read_tiny_text; |
| 174 | +{ |
| 175 | key => $key, |
| 176 | bin_index => $bin_index, |
| 177 | item_index => $item_index, |
| 178 | fragment => $fragment, |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | sub read_storage_bin { |
| 183 | my ($self) = @_; |
| 184 | my $count = $self->read_int; |
| 185 | my @content_types = map { $self->read_char } 1..$count; |
| 186 | my $compressed_size = $self->read_int; |
| 187 | my $compressed_data = $self->read_data($compressed_size); |
| 188 | my $uncompressed_data = $self->uncompress($compressed_data); |
| 189 | |
| 190 | my @positions = unpack "N$count", $uncompressed_data; |
| 191 | my $data = substr $uncompressed_data, $count * 4; |
| 192 | +{ |
| 193 | positions => \@positions, |
| 194 | data => $data |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | sub ref_count { shift @{shift->{header}{refs}} } |
| 199 | |
| 200 | sub seek_and_read_ref { |
| 201 | my ($self, $index) = @_; |
| 202 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; |
| 203 | $self->fseek($self->{header}{refs}[$index]); |
| 204 | $self->read_ref |
| 205 | } |
| 206 | |
| 207 | sub seek_and_read_storage_bin { |
| 208 | my ($self, $index) = @_; |
| 209 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; |
| 210 | $self->fseek($self->{header}{storage_bins}[$index]); |
| 211 | $self->read_storage_bin |
| 212 | } |
| 213 | |
| 214 | sub get_entry_of_storage_bin { |
| 215 | my ($self, $storage_bin, $index) = @_; |
| 216 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; |
| 217 | my $length = unpack 'N', $start_of_data; |
| 218 | substr $start_of_data, 4, $length; |
| 219 | } |
| 220 | |
| 221 | sub seek_and_read_ref_and_data { |
| 222 | my ($self, $index) = @_; |
| 223 | my $ref = $self->seek_and_read_ref($index); |
| 224 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); |
| 225 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); |
| 226 | $ref->{data} = $data; |
| 227 | $ref |
| 228 | } |
| 229 | |
| 230 | 1; |
| 231 | __END__ |
| 232 | |
| 233 | =encoding utf-8 |
| 234 | |
| 235 | =head1 NAME |
| 236 | |
| 237 | Slob - Read .slob dictionaries (as used by Aard 2) |
| 238 | |
| 239 | =head1 SYNOPSIS |
| 240 | |
| 241 | use feature qw/:5.14/; |
| 242 | use Slob; |
| 243 | my $slob = Slob->new('path/to/dict.slob'); |
| 244 | |
| 245 | my $nr_of_entries = $slob->ref_count; # if the same content has two |
| 246 | # keys pointing to it, this |
| 247 | # counts it twice |
| 248 | |
| 249 | my $second_ref = $slob->seek_and_read_ref(4); |
| 250 | say "Entry is for $second_ref->{key}"; |
| 251 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; |
| 252 | |
| 253 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); |
| 254 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; |
| 255 | say "Value at position $second_ref->{item_index} is ", |
| 256 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); |
| 257 | |
| 258 | # instead of the above, we can do |
| 259 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); |
| 260 | say "Entry is for $second_ref_and_data->{key}"; |
| 261 | say "Value is $second_ref_and_data->{data}"; |
| 262 | |
| 263 | =head1 DESCRIPTION |
| 264 | |
| 265 | Slob is a compressed read-only format for storing natural language |
| 266 | dictionaries. It is used in Aard 2. C<Slob.pm> is a module that reads |
| 267 | dictionaries in slob format. |
| 268 | |
| 269 | The following methods are available: |
| 270 | |
| 271 | =over |
| 272 | |
| 273 | =item Slob->B<new>(I<$path>) |
| 274 | =item Slob->B<new>(I<$fh>) |
| 275 | |
| 276 | Create a new slob reader reading from the given path or filehandle. |
| 277 | |
| 278 | =item $slob->B<ref_count> |
| 279 | |
| 280 | The number of refs (keys) in the dictionary. |
| 281 | |
| 282 | =item $slob->B<seek_and_read_ref>(I<$index>) |
| 283 | |
| 284 | Read the ref (key) at the given index. Returns a hashref with the |
| 285 | following keys: |
| 286 | |
| 287 | =over |
| 288 | |
| 289 | =item key |
| 290 | |
| 291 | The key |
| 292 | |
| 293 | =item bin_index |
| 294 | |
| 295 | The storage bin that contains the value for this key |
| 296 | |
| 297 | =item item_index |
| 298 | |
| 299 | The index in the bin_index storage bin of the value for this key |
| 300 | |
| 301 | =item fragment |
| 302 | |
| 303 | HTML fragment that, when applied to the HTML value, points to the |
| 304 | definition of the key. |
| 305 | |
| 306 | =back |
| 307 | |
| 308 | =item $slob->B<seek_and_read_storage_bin>(I<$index>) |
| 309 | |
| 310 | Read the storage bin with the given index. Returns the storage bin, |
| 311 | which can later be given to B<get_entry_of_storage_bin>. |
| 312 | |
| 313 | =item $slob->B<get_entry_of_storage_bin>(I<$bin>, I<$index>) |
| 314 | |
| 315 | Given a storage bin (as returned by C<seek_and_read_storage_bin>) and |
| 316 | item index, returns the value at the index i nthe storage bin. |
| 317 | |
| 318 | =item $slob->B<seek_and_read_ref_and_data>($index) |
| 319 | |
| 320 | Convenience method that returns the key and value at a given index. |
| 321 | Returns a hashref like C<seek_and_read_ref> with an extra key, |
| 322 | I<data>, which is the value of the key. |
| 323 | |
| 324 | =back |
| 325 | |
| 326 | =head1 SEE ALSO |
| 327 | |
| 328 | L<https://github.com/itkach/slob> |
| 329 | |
| 330 | =head1 AUTHOR |
| 331 | |
| 332 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> |
| 333 | |
| 334 | =head1 COPYRIGHT AND LICENSE |
| 335 | |
| 336 | Copyright (C) 2017 by Marius Gavrilescu |
| 337 | |
| 338 | This library is free software; you can redistribute it and/or modify |
| 339 | it under the same terms as Perl itself, either Perl version 5.26.1 or, |
| 340 | at your option, any later version of Perl 5 you may have available. |
| 341 | |
| 342 | |
| 343 | =cut |