From 9dbb6c4f6cc2bbc5824ac268a77141eacd12ca35 Mon Sep 17 00:00:00 2001 From: Dan Zeman Date: Fri, 12 May 2023 00:18:11 +0200 Subject: [PATCH] Preparing a driver for UniMorph. --- Changes.txt | 5 + lib/Lingua/Interset/Tagset/MUL/Unimorph.pm | 109 +++++++++++++++++++++ xt/author/test.t | 4 +- 3 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 lib/Lingua/Interset/Tagset/MUL/Unimorph.pm diff --git a/Changes.txt b/Changes.txt index d40667a..0a79437 100644 --- a/Changes.txt +++ b/Changes.txt @@ -1,5 +1,10 @@ {{$NEXT}} +Drivers: +* MUL::Unimorph + + + 3.015 2022-03-05 11:53:12+01:00 Europe/Prague Features: diff --git a/lib/Lingua/Interset/Tagset/MUL/Unimorph.pm b/lib/Lingua/Interset/Tagset/MUL/Unimorph.pm new file mode 100644 index 0000000..13bdb7c --- /dev/null +++ b/lib/Lingua/Interset/Tagset/MUL/Unimorph.pm @@ -0,0 +1,109 @@ +# ABSTRACT: Driver for the UniMorph features. +# https://unimorph.github.io/ +# Copyright © 2023 Dan Zeman + +package Lingua::Interset::Tagset::MUL::Unimorph; +use strict; +use warnings; +our $VERSION = '3.016'; + +use utf8; +use open ':utf8'; +use namespace::autoclean; +use Moose; +use Lingua::Interset::FeatureStructure; +extends 'Lingua::Interset::Tagset'; + + + +#------------------------------------------------------------------------------ +# Returns the tagset id that should be set as the value of the 'tagset' feature +# during decoding. Every derived class must (re)define this method! The result +# should correspond to the last two parts in package name, lowercased. +# Specifically, it should be the ISO 639-2 language code, followed by '::' and +# a language-specific tagset id. Example: 'cs::multext'. +#------------------------------------------------------------------------------ +sub get_tagset_id +{ + return 'mul::unimorph'; +} + + + +#------------------------------------------------------------------------------ +# Decodes a physical tag (string) and returns the corresponding feature +# structure. 
+#------------------------------------------------------------------------------
+sub decode
+{
+    my ($self, $tag) = @_;
+    # There is a string of feature values, separated by semicolons. The order of
+    # the features is not significant, except that the main part of speech always
+    # comes first.
+    my @features = split(/;/, $tag);
+    my $fs = Lingua::Interset::FeatureStructure->new(); # must exist before set_tagset(); strict forbids an undeclared $fs
+    $fs->set_tagset('mul::unimorph');
+    ###!!!
+    # DECODE THE FEATURES HERE.
+    ###!!!
+    return $fs;
+}
+
+
+
+#------------------------------------------------------------------------------
+# Takes feature structure and returns the corresponding physical tag (string).
+#------------------------------------------------------------------------------
+sub encode
+{
+    my ($self, $fs) = @_; # $fs is a Lingua::Interset::FeatureStructure
+    my @features = (); # UniMorph feature values to be joined; stays empty until the encoding below is implemented
+    ###!!!
+    # ENCODE THE FEATURES HERE.
+    ###!!!
+    my $tag = join(';', @features);
+    return $tag;
+}
+
+
+
+#------------------------------------------------------------------------------
+# Returns reference to list of known tags.
+###!!! WHAT DO WE DO FOR UNIMORPH HERE? THERE ARE TOO MANY POSSIBLE COMBINATIONS.
+#------------------------------------------------------------------------------
+sub list
+{
+    my $self = shift;
+    my @list = ();
+    return \@list;
+}
+
+
+
+1;
+
+=head1 SYNOPSIS
+
+  use Lingua::Interset::Tagset::MUL::Unimorph;
+  my $driver = Lingua::Interset::Tagset::MUL::Unimorph->new();
+  my $fs = $driver->decode("N;MASC;SG;NOM");
+
+or
+
+  use Lingua::Interset qw(decode);
+  my $fs = decode('mul::unimorph', "N;MASC;SG;NOM");
+
+=head1 DESCRIPTION
+
+Interset driver for UniMorph 4.0 feature strings,
+see L.
+
+=head1 SEE ALSO
+
+L
+L,
+L,
+L,
+L
+
+=cut
diff --git a/xt/author/test.t b/xt/author/test.t
index 6e14b5d..140b7df 100644
--- a/xt/author/test.t
+++ b/xt/author/test.t
@@ -10,7 +10,7 @@ binmode(STDOUT, ':utf8');
 binmode(STDERR, ':utf8');
 # We must declare in advance how many tests we are going to perform.
 # There are currently three tests per tagset driver.
-use Test::More tests => 63*3; +use Test::More tests => 64*3; use Lingua::Interset qw(get_driver_object); use Lingua::Interset::Tagset; @@ -43,7 +43,7 @@ my @tagsets = 'la::conll', 'la::it', 'la::itconll', 'lt::jablonskis', 'lt::multext', 'mt::mlss', - 'mul::google', 'mul::upos', + 'mul::google', 'mul::upos', 'mul::unimorph', 'nl::cgn', 'nl::conll', 'no::conll', 'pl::ipipan',