]>
Commit | Line | Data |
---|---|---|
d3779104 MG |
1 | package Slob; |
2 | ||
3 | use 5.014000; | |
4 | use strict; | |
5 | use warnings; | |
6 | our $VERSION = '0.000_001'; | |
7 | ||
8 | use constant MAGIC => "!-1SLOB\x1F"; | |
9 | ||
10 | use Carp qw/croak verbose/; | |
11 | use Encode; | |
12 | ||
13 | sub new { | |
14 | my ($class, $path) = @_; | |
15 | open my $fh, '<', $path or croak "Cannot open \"$path\": $!"; | |
16 | my $self = bless {path => $path, fh => $fh}, $class; | |
17 | $self->{header} = $self->read_header; | |
18 | $self | |
19 | } | |
20 | ||
21 | sub read_data { | |
22 | my ($self, $len) = @_; | |
23 | my $data; | |
24 | my $result = read $self->{fh}, $data, $len; | |
25 | if (!defined $result) { | |
26 | croak "Failed to read from $self->{path}: $!" | |
27 | } elsif ($result == $len) { | |
28 | $data | |
29 | } elsif ($result == 0) { | |
30 | croak "$self->{path} is at end of file" | |
31 | } elsif ($result < $len) { | |
32 | croak "Only read $result bytes of $self->{path} before reaching EOF" | |
33 | } | |
34 | } | |
35 | ||
36 | sub read_formatted { | |
37 | my ($self, $len_of_format, $format) = @_; | |
38 | unpack $format, $self->read_data($len_of_format); | |
39 | } | |
40 | ||
41 | sub read_char { shift->read_formatted(1, 'C') } | |
42 | sub read_short { shift->read_formatted(2, 'n') } | |
43 | sub read_int { shift->read_formatted(4, 'N') } | |
44 | sub read_long { shift->read_formatted(8, 'Q>') } | |
45 | ||
46 | sub read_tiny_text { | |
47 | my ($self, $encoding) = @_; | |
48 | my $data = $self->read_data($self->read_char); | |
49 | if (length $data == 255) { | |
50 | $data = unpack 'Z*', $data; | |
51 | } | |
52 | $encoding //= $self->{encoding}; | |
53 | decode $encoding, $data; | |
54 | } | |
55 | ||
56 | sub read_text { | |
57 | my ($self, $encoding) = @_; | |
58 | my $data = $self->read_data($self->read_short); | |
59 | $encoding //= $self->{encoding}; | |
60 | decode $encoding, $data; | |
61 | } | |
62 | ||
63 | sub read_large_byte_string { | |
64 | my ($self) = @_; | |
65 | $self->read_data($self->read_short) | |
66 | } | |
67 | ||
68 | sub read_tag { | |
69 | my ($self) = @_; | |
70 | my $name = $self->read_tiny_text; | |
71 | my $value = $self->read_tiny_text; | |
72 | ($name, $value) | |
73 | } | |
74 | ||
75 | sub read_tags { | |
76 | my ($self) = @_; | |
77 | my $tag_count = $self->read_char; | |
78 | map { $self->read_tag } 1..$tag_count | |
79 | } | |
80 | ||
81 | sub read_content_types { | |
82 | my ($self) = @_; | |
83 | my $content_type_count = $self->read_char; | |
84 | map { $self->read_text } 1..$content_type_count | |
85 | } | |
86 | ||
87 | sub read_positions { | |
88 | my ($self) = @_; | |
89 | my $count = $self->read_int; | |
90 | my @positions = map { $self->read_long } 1..$count; | |
91 | my $relative_to = $self->ftell; | |
92 | map { $relative_to + $_ } @positions | |
93 | } | |
94 | ||
95 | sub fseek { | |
96 | my ($self, $position) = @_; | |
97 | seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" | |
98 | } | |
99 | ||
100 | sub ftell { | |
101 | my ($self) = @_; | |
102 | my $result = tell $self->{fh}; | |
103 | croak "Failed to tell position in file" if $result == -1; | |
104 | $result | |
105 | } | |
106 | ||
107 | sub uncompress { | |
108 | my ($self, $data) = @_; | |
109 | $data | |
110 | } | |
111 | ||
112 | sub read_header { | |
113 | my ($self) = @_; | |
114 | my $magic = $self->read_data(length MAGIC); | |
115 | croak "Not a SLOB dictionary" unless MAGIC eq $magic; | |
116 | my $uuid = $self->read_data(16); | |
117 | ||
118 | my $encoding = $self->read_tiny_text('UTF-8'); | |
119 | $self->{encoding} = $encoding; | |
120 | ||
121 | my $compression = $self->read_tiny_text; | |
122 | die "Compression not yet supported" if $compression; | |
123 | my %tags = $self->read_tags; | |
124 | my @content_types = $self->read_content_types; | |
125 | my $blob_count = $self->read_int; | |
126 | my $store_offset = $self->read_long; | |
127 | my $size = $self->read_long; | |
128 | my @refs = $self->read_positions; | |
129 | ||
130 | $self->fseek($store_offset); | |
131 | my @storage_bins = $self->read_positions; | |
132 | ||
133 | +{ | |
134 | uuid => $uuid, | |
135 | encoding => $encoding, | |
136 | compression => $compression, | |
137 | tags => \%tags, | |
138 | content_types => \@content_types, | |
139 | blob_count => $blob_count, | |
140 | store_offset => $store_offset, | |
141 | size => $size, | |
142 | refs => \@refs, | |
143 | storage_bins => \@storage_bins, | |
144 | } | |
145 | } | |
146 | ||
147 | sub read_ref { | |
148 | my ($self) = @_; | |
149 | my $key = $self->read_text; | |
150 | my $bin_index = $self->read_int; | |
151 | my $item_index = $self->read_short; | |
152 | my $fragment = $self->read_tiny_text; | |
153 | +{ | |
154 | key => $key, | |
155 | bin_index => $bin_index, | |
156 | item_index => $item_index, | |
157 | fragment => $fragment, | |
158 | } | |
159 | } | |
160 | ||
161 | sub read_storage_bin { | |
162 | my ($self) = @_; | |
163 | my $count = $self->read_int; | |
164 | my @content_types = map { $self->read_char } 1..$count; | |
165 | my $compressed_size = $self->read_int; | |
166 | my $compressed_data = $self->read_data($compressed_size); | |
167 | my $uncompressed_data = $self->uncompress($compressed_data); | |
168 | ||
169 | my @positions = unpack "N$count", $uncompressed_data; | |
170 | my $data = substr $uncompressed_data, $count * 4; | |
171 | +{ | |
172 | positions => \@positions, | |
173 | data => $data | |
174 | } | |
175 | } | |
176 | ||
177 | sub ref_count { shift @{shift->{header}{refs}} } | |
178 | ||
179 | sub seek_and_read_ref { | |
180 | my ($self, $index) = @_; | |
181 | croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; | |
182 | $self->fseek($self->{header}{refs}[$index]); | |
183 | $self->read_ref | |
184 | } | |
185 | ||
186 | sub seek_and_read_storage_bin { | |
187 | my ($self, $index) = @_; | |
188 | croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; | |
189 | $self->fseek($self->{header}{storage_bins}[$index]); | |
190 | $self->read_storage_bin | |
191 | } | |
192 | ||
193 | sub get_entry_of_storage_bin { | |
194 | my ($self, $storage_bin, $index) = @_; | |
195 | my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; | |
196 | my $length = unpack 'N', $start_of_data; | |
197 | substr $start_of_data, 4, $length; | |
198 | } | |
199 | ||
0e15b496 MG |
200 | sub seek_and_read_ref_and_data { |
201 | my ($self, $index) = @_; | |
202 | my $ref = $self->seek_and_read_ref($index); | |
203 | my $bin = $self->seek_and_read_storage_bin($ref->{bin_index}); | |
204 | my $data = $self->get_entry_of_storage_bin($bin, $ref->{item_index}); | |
205 | $ref->{data} = $data; | |
206 | $ref | |
207 | } | |
208 | ||
d3779104 MG |
209 | 1; |
210 | __END__ | |
211 | ||
212 | =encoding utf-8 | |
213 | ||
214 | =head1 NAME | |
215 | ||
216 | Slob - Read .slob dictionaries (as used by Aard 2) | |
217 | ||
218 | =head1 SYNOPSIS | |
219 | ||
220 | use feature qw/:5.14/; | |
221 | use Slob; | |
222 | my $slob = Slob->new('path/to/dict.slob'); | |
223 | ||
224 | my $nr_of_entries = $slob->ref_count; # if the same content has two | |
225 | # keys pointing to it, this | |
226 | # counts it twice | |
227 | ||
228 | my $second_ref = $slob->seek_and_read_ref(4); | |
229 | say "Entry is for $second_ref->{key}"; | |
230 | say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; | |
231 | ||
232 | my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); | |
233 | say "Bin has ", (scalar @{$bin->{positions}}), " entries"; | |
234 | say "Value at position $second_ref->{item_index} is ", | |
235 | $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); | |
236 | ||
0e15b496 MG |
237 | # instead of the above, we can do |
238 | my $second_ref_and_data = $slob->seek_and_read_ref_and_data(4); | |
239 | say "Entry is for $second_ref_and_data->{key}"; | |
240 | say "Value is $second_ref_and_data->{data}"; | |
241 | ||
d3779104 MG |
242 | =head1 DESCRIPTION |
243 | ||
244 | No documentation yet, see SYNOPSIS. | |
245 | ||
246 | =head1 AUTHOR | |
247 | ||
248 | Marius Gavrilescu, E<lt>marius@ieval.roE<gt> | |
249 | ||
250 | =head1 COPYRIGHT AND LICENSE | |
251 | ||
252 | Copyright (C) 2017 by Marius Gavrilescu | |
253 | ||
254 | This library is free software; you can redistribute it and/or modify | |
255 | it under the same terms as Perl itself, either Perl version 5.26.1 or, | |
256 | at your option, any later version of Perl 5 you may have available. | |
257 | ||
258 | ||
259 | =cut |