From: Marius Gavrilescu Date: Thu, 30 Nov 2017 21:45:48 +0000 (+0000) Subject: Initial commit X-Git-Tag: 0.000_001^0 X-Git-Url: http://git.ieval.ro/?p=slob.git;a=commitdiff_plain;h=d3779104ff18ac718524de579f1d5247c30cc48a Initial commit --- d3779104ff18ac718524de579f1d5247c30cc48a diff --git a/Changes b/Changes new file mode 100644 index 0000000..bbbd437 --- /dev/null +++ b/Changes @@ -0,0 +1,4 @@ +Revision history for Perl extension Slob. + +0.000_001 2017-11-30T21:45+00:00 + - Initial release diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..823ea14 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,7 @@ +Changes +Makefile.PL +MANIFEST +README +t/Slob.t +t/freedict-01.slob +lib/Slob.pm diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..2c380c0 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,19 @@ +use 5.014000; +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'Slob', + VERSION_FROM => 'lib/Slob.pm', + ABSTRACT_FROM => 'lib/Slob.pm', + AUTHOR => 'Marius Gavrilescu ', + MIN_PERL_VERSION => '5.14.0', + LICENSE => 'perl', + SIGN => 1, + PREREQ_PM => {}, + META_ADD => { + dynamic_config => 0, + resources => { + repository => 'https://git.ieval.ro/?p=slob.git', + }, + } +); diff --git a/README b/README new file mode 100644 index 0000000..9186721 --- /dev/null +++ b/README @@ -0,0 +1,25 @@ +Slob version 0.000_001 +====================== + +INSTALLATION + +To install this module type the following: + + perl Makefile.PL + make + make test + make install + +DEPENDENCIES + +This module requires no non-core modules and libraries. + +COPYRIGHT AND LICENCE + +Copyright (C) 2017 by Marius Gavrilescu + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself, either Perl version 5.26.1 or, +at your option, any later version of Perl 5 you may have available. + + diff --git a/ex.pl b/ex.pl new file mode 100644 index 0000000..16aa432 --- /dev/null +++ b/ex.pl @@ -0,0 +1,17 @@ + + use feature qw/:5.14/; + use Slob; + my $slob = Slob->new('t/freedict-01.slob'); + + my $nr_of_entries = $slob->ref_count; # if the same content has two + # keys pointing to it, this + # counts it twice + + my $second_ref = $slob->seek_and_read_ref(4); + say "Entry is for $second_ref->{key}"; + say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; + + my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); + say "Bin has ", (scalar @{$bin->{positions}}), " entries"; + say "Value at position $second_ref->{item_index} is ", + $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); diff --git a/lib/Slob.pm b/lib/Slob.pm new file mode 100644 index 0000000..b1eb4f8 --- /dev/null +++ b/lib/Slob.pm @@ -0,0 +1,245 @@ +package Slob; + +use 5.014000; +use strict; +use warnings; +our $VERSION = '0.000_001'; + +use constant MAGIC => "!-1SLOB\x1F"; + +use Carp qw/croak verbose/; +use Encode; + +sub new { + my ($class, $path) = @_; + open my $fh, '<', $path or croak "Cannot open \"$path\": $!"; + my $self = bless {path => $path, fh => $fh}, $class; + $self->{header} = $self->read_header; + $self +} + +sub read_data { + my ($self, $len) = @_; + my $data; + my $result = read $self->{fh}, $data, $len; + if (!defined $result) { + croak "Failed to read from $self->{path}: $!" + } elsif ($result == $len) { + $data + } elsif ($result == 0) { + croak "$self->{path} is at end of file" + } elsif ($result < $len) { + croak "Only read $result bytes of $self->{path} before reaching EOF" + } +} + +sub read_formatted { + my ($self, $len_of_format, $format) = @_; + unpack $format, $self->read_data($len_of_format); +} + +sub read_char { shift->read_formatted(1, 'C') } +sub read_short { shift->read_formatted(2, 'n') } +sub read_int { shift->read_formatted(4, 'N') } +sub read_long { shift->read_formatted(8, 'Q>') } + +sub read_tiny_text { + my ($self, $encoding) = @_; + my $data = $self->read_data($self->read_char); + if (length $data == 255) { + $data = unpack 'Z*', $data; + } + $encoding //= $self->{encoding}; + decode $encoding, $data; +} + +sub read_text { + my ($self, $encoding) = @_; + my $data = $self->read_data($self->read_short); + $encoding //= $self->{encoding}; + decode $encoding, $data; +} + +sub read_large_byte_string { + my ($self) = @_; + $self->read_data($self->read_short) +} + +sub read_tag { + my ($self) = @_; + my $name = $self->read_tiny_text; + my $value = $self->read_tiny_text; + ($name, $value) +} + +sub read_tags { + my ($self) = @_; + my $tag_count = $self->read_char; + map { $self->read_tag } 1..$tag_count +} + +sub read_content_types { + my ($self) = @_; + my $content_type_count = $self->read_char; + map { $self->read_text } 1..$content_type_count +} + +sub read_positions { + my ($self) = @_; + my $count = $self->read_int; + my @positions = map { $self->read_long } 1..$count; + my $relative_to = $self->ftell; + map { $relative_to + $_ } @positions +} + +sub fseek { + my ($self, $position) = @_; + seek $self->{fh}, $position, 0 or croak "Failed to seek to byte $position" +} + +sub ftell { + my ($self) = @_; + my $result = tell $self->{fh}; + croak "Failed to tell position in file" if $result == -1; + $result +} + +sub uncompress { + my ($self, $data) = @_; + $data +} + +sub read_header { + my ($self) = @_; + my $magic = $self->read_data(length MAGIC); + croak "Not a SLOB dictionary" unless MAGIC eq $magic; + my $uuid = $self->read_data(16); + + my $encoding = $self->read_tiny_text('UTF-8'); + $self->{encoding} = $encoding; + + my $compression = $self->read_tiny_text; + die "Compression not yet supported" if $compression; + my %tags = $self->read_tags; + my @content_types = $self->read_content_types; + my $blob_count = $self->read_int; + my $store_offset = $self->read_long; + my $size = $self->read_long; + my @refs = $self->read_positions; + + $self->fseek($store_offset); + my @storage_bins = $self->read_positions; + + +{ + uuid => $uuid, + encoding => $encoding, + compression => $compression, + tags => \%tags, + content_types => \@content_types, + blob_count => $blob_count, + store_offset => $store_offset, + size => $size, + refs => \@refs, + storage_bins => \@storage_bins, + } +} + +sub read_ref { + my ($self) = @_; + my $key = $self->read_text; + my $bin_index = $self->read_int; + my $item_index = $self->read_short; + my $fragment = $self->read_tiny_text; + +{ + key => $key, + bin_index => $bin_index, + item_index => $item_index, + fragment => $fragment, + } +} + +sub read_storage_bin { + my ($self) = @_; + my $count = $self->read_int; + my @content_types = map { $self->read_char } 1..$count; + my $compressed_size = $self->read_int; + my $compressed_data = $self->read_data($compressed_size); + my $uncompressed_data = $self->uncompress($compressed_data); + + my @positions = unpack "N$count", $uncompressed_data; + my $data = substr $uncompressed_data, $count * 4; + +{ + positions => \@positions, + data => $data + } +} + +sub ref_count { shift @{shift->{header}{refs}} } + +sub seek_and_read_ref { + my ($self, $index) = @_; + croak "No ref has index $index" unless exists $self->{header}{refs}[$index]; + $self->fseek($self->{header}{refs}[$index]); + $self->read_ref +} + +sub seek_and_read_storage_bin { + my ($self, $index) = @_; + croak "No storage bin has index $index" unless exists $self->{header}{storage_bins}[$index]; + $self->fseek($self->{header}{storage_bins}[$index]); + $self->read_storage_bin +} + +sub get_entry_of_storage_bin { + my ($self, $storage_bin, $index) = @_; + my $start_of_data = substr $storage_bin->{data}, $storage_bin->{positions}[$index]; + my $length = unpack 'N', $start_of_data; + substr $start_of_data, 4, $length; +} + +1; +__END__ + +=encoding utf-8 + +=head1 NAME + +Slob - Read .slob dictionaries (as used by Aard 2) + +=head1 SYNOPSIS + + use feature qw/:5.14/; + use Slob; + my $slob = Slob->new('path/to/dict.slob'); + + my $nr_of_entries = $slob->ref_count; # if the same content has two + # keys pointing to it, this + # counts it twice + + my $second_ref = $slob->seek_and_read_ref(4); + say "Entry is for $second_ref->{key}"; + say "Data is in bin $second_ref->{bin_index} at position $second_ref->{item_index}"; + + my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); + say "Bin has ", (scalar @{$bin->{positions}}), " entries"; + say "Value at position $second_ref->{item_index} is ", + $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}); + +=head1 DESCRIPTION + +No documentation yet, see SYNOPSIS. + +=head1 AUTHOR + +Marius Gavrilescu, Emarius@ieval.roE + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2017 by Marius Gavrilescu + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself, either Perl version 5.26.1 or, +at your option, any later version of Perl 5 you may have available. + + +=cut diff --git a/t/Slob.t b/t/Slob.t new file mode 100644 index 0000000..b0ab28f --- /dev/null +++ b/t/Slob.t @@ -0,0 +1,29 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use Test::More tests => 6; +BEGIN { use_ok('Slob') }; + +my $slob = Slob->new('t/freedict-01.slob'); + +my $nr_of_entries = $slob->ref_count; + +my $second_ref = $slob->seek_and_read_ref(4); +my $bin = $slob->seek_and_read_storage_bin($second_ref->{bin_index}); + +is $second_ref->{key}, 'abacus'; +is $second_ref->{bin_index}, 0; +is $second_ref->{item_index}, 161; +my $count = scalar @{$bin->{positions}}; +is $count, 637; + +my $expected = <<'EOF'; +
+
abacus
æbəkəs
  1. +
    1. +
      1. Rechenbrett
      +
      m
+EOF +chomp $expected; +is $slob->get_entry_of_storage_bin($bin, $second_ref->{item_index}), $expected; diff --git a/t/freedict-01.slob b/t/freedict-01.slob new file mode 100644 index 0000000..c836579 Binary files /dev/null and b/t/freedict-01.slob differ