8298becd0e964bb8e1e3508ff9f811793cc67cdd
[slob.git] / lib / Slob.pm
1 package Slob;
2
3 use 5.014000;
4 use strict;
5 use warnings;
6 our $VERSION = '0.000_001';
7
8 use constant MAGIC => "!-1SLOB\x1F";
9
10 use Carp qw/croak verbose/;
11 use Encode;
12
13 sub new {
14 my ($class, $path) = @_;
15 open my $fh, '<', $path or croak "Cannot open \"$path\": $!";
16 my $self = bless {path => $path, fh => $fh}, $class;
17 $self->{header} = $self->read_header;
18 $self
19 }
20
21 sub read_data {
22 my ($self, $len) = @_;
23 my $data;
24 my $result = read $self->{fh}, $data, $len;
25 if (!defined $result) {
26 croak "Failed to read from $self->{path}: $!"
27 } elsif ($result == $len) {
28 $data
29 } elsif ($result == 0) {
30 croak "$self->{path} is at end of file"
31 } elsif ($result < $len) {
32 croak "Only read $result bytes of $self->{path} before reaching EOF"
33 }
34 }
35
36 sub read_formatted {
37 my ($self, $len_of_format, $format) = @_;
38 unpack $format, $self->read_data($len_of_format);
39 }
40
41 sub read_char { shift->read_formatted(1, 'C') }
42 sub read_short { shift->read_formatted(2, 'n') }
43 sub read_int { shift->read_formatted(4, 'N') }
44 sub read_long { shift->read_formatted(8, 'Q>') }
45
46 sub read_tiny_text {
47 my ($self, $encoding) = @_;
48 my $data = $self->read_data($self->read_char);
49 if (length $data == 255) {
50 $data = unpack 'Z*', $data;
51 }
52 $encoding //= $self->{encoding};
53 decode $encoding, $data;
54 }
55
56 sub read_text {
57 my ($self, $encoding) = @_;
58 my $data = $self->read_data($self->read_short);
59 $encoding //= $self->{encoding};
60 decode $encoding, $data;
61 }
62
63 sub read_large_byte_string {
64 my ($self) = @_;
65 $self->read_data($self->read_short)
66 }
67
68 sub read_tag {
69 my ($self) = @_;
70 my $name = $self->read_tiny_text;
71 my $value = $self->read_tiny_text;
72 ($name, $value)
73 }
74
75 sub read_tags {
76 my ($self) = @_;
77 my $tag_count = $self->read_char;
78 map { $self->read_tag } 1..$tag_count
79 }
80
81 sub read_content_types {
82 my ($self) = @_;
83 my $content_type_count = $self->read_char;
84 map { $self->read_text } 1..$content_type_count
85 }
86
87 sub read_positions {
88 my ($self) = @_;
89 my $count = $self->read_int;
90 my @positions = map { $self->read_long } 1..$count;
91 my $relative_to = $self->ftell;
92 map { $relative_to + $_ } @positions
93 }
94
95 sub fseek {
96 my ($self, $position) = @_;
97 seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position"
98 }
99
100 sub ftell {
101 my ($self) = @_;
102 my $result = tell $self->{fh};
103 croak "Failed to tell position in file" if $result == -1;
104 $result
105 }
106
107 sub uncompress {
108 my ($self, $data) = @_;
109 $data
110 }
111
112 sub read_header {
113 my ($self) = @_;
114 my $magic = $self->read_data(length MAGIC);
115 croak "Not a SLOB dictionary" unless MAGIC eq $magic;
116 my $uuid = $self->read_data(16);
117
118 my $encoding = $self->read_tiny_text('UTF-8');
119 $self->{encoding} = $encoding;
120
121 my $compression = $self->read_tiny_text;
122 die "Compression not yet supported" if $compression;
123 my %tags = $self->read_tags;
124 my @content_types = $self->read_content_types;
125 my $blob_count = $self->read_int;
126 my $store_offset = $self->read_long;
127 my $size = $self->read_long;
128 my @refs = $self->read_positions;
129
130 $self->fseek($store_offset);
131 my @storage_bins = $self->read_positions;
132
133 +{
134 uuid => $uuid,
135 encoding => $encoding,
136 compression => $compression,
137 tags => \%tags,
138 content_types => \@content_types,
139 blob_count => $blob_count,
140 store_offset => $store_offset,
141 size => $size,
142 refs => \@refs,
143 storage_bins => \@storage_bins,
144 }
145 }
146
147 sub read_ref {
148 my ($self) = @_;
149 my $key = $self->read_text;
150 my $bin_index = $self->read_int;
151 my $item_index = $self->read_short;
152 my $fragment = $self->read_tiny_text;
153 +{
154 key => $key,
155 bin_index => $bin_index,
156 item_index => $item_index,
157 fragment => $fragment,
158 }
159 }
160
161 sub read_storage_bin {
162 my ($self) = @_;
163 my $count = $self->read_int;
164 my @content_types = map { $self->read_char } 1..$count;
165 my $compressed_size = $self->read_int;
166 my $compressed_data = $self->read_data($compressed_size);
167 my $uncompressed_data = $self->uncompress($compressed_data);
168
169 my @positions = unpack "N$count", $uncompressed_data;
170 my $data = substr $uncompressed_data, $count * 4;
171 +{
172 positions => \@positions,
173 data => $data
174 }
175 }
176
177 sub ref_count { shift @{shift->{header}{refs}} }
178
179 sub seek_and_read_ref {
180 my ($self, $index) = @_;
181 croak "No ref has index $index" unless exists $self->{header}{refs}[$index];
182 $self->fseek($self->{header}{refs}[$index]);
183 $self->read_ref
184 }
185
186 sub seek_and_read_storage_bin {
187 my ($self, $index) = @_;
188 croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index];
189 $self->fseek($self->{header}{storage_bins}[$index]);
190 $self->read_storage_bin
191 }
192
193 sub get_entry_of_storage_bin {
194 my ($self, $storage_bin, $index) = @_;
195 my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index];
196 my $length = unpack 'N', $start_of_data;
197 substr $start_of_data, 4, $length;
198 }
199
200 sub seek_and_read_ref_and_data {
201 my ($self, $index) = @_;
202 my $ref = $self->seek_and_read_ref($index);
203 my $bin = $self->seek_and_read_storage_bin($ref->{bin_index});
204 my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index});
205 $ref->{data} = $data;
206 $ref
207 }
208
209 1;
210 __END__
211
212 =encoding utf-8
213
214 =head1 NAME
215
216 Slob - Read .slob dictionaries (as used by Aard 2)
217
218 =head1 SYNOPSIS
219
220 use feature qw/:5.14/;
221 use Slob;
222 my $slob = Slob->new('path/to/dict.slob');
223
224 my $nr_of_entries = $slob->ref_count; # if the same content has two
225 # keys pointing to it, this
226 # counts it twice
227
228 my $second_ref = $slob->seek_and_read_ref(4);
229 say "Entry is for $second_ref->{key}";
230 say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}";
231
232 my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index});
233 say "Bin has ", (scalar @{$bin->{positions}}), " entries";
234 say "Value at position $second_ref->{item_index} is ",
235 $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index});
236
237 # instead of the above, we can do
238 my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4);
239 say "Entry is for $second_ref_and_data->{key}";
240 say "Value is $second_ref_and_data->{data}";
241
242 =head1 DESCRIPTION
243
244 No documentation yet, see SYNOPSIS.
245
246 =head1 AUTHOR
247
248 Marius Gavrilescu, E<lt>marius@ieval.roE<gt>
249
250 =head1 COPYRIGHT AND LICENSE
251
252 Copyright (C) 2017 by Marius Gavrilescu
253
254 This library is free software; you can redistribute it and/or modify
255 it under the same terms as Perl itself, either Perl version 5.26.1 or,
256 at your option, any later version of Perl 5 you may have available.
257
258
259 =cut
This page took 0.035843 seconds and 3 git commands to generate.