diff --git a/.gitignore b/.gitignore index d9734d08..108b66b9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ engine/templates_c/* -engine/include/vendor +vendor config/config.local.php docker/www/config/config.local.php secured_data/ccls diff --git a/composer.json b/composer.json index c3f6e3c9..500edcf7 100644 --- a/composer.json +++ b/composer.json @@ -1,7 +1,6 @@ { "config": { - "platform": {"php":"5.6"}, - "vendor-dir": "engine/include/vendor" + "platform": {"php":"5.6"} }, "autoload": { diff --git a/docker-compose.yml b/docker-compose.yml index bdcc6585..de4fb4eb 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,6 +13,7 @@ services: - ./docker/www/config:/home/inforex/config - ./secured_data:/home/inforex/secured_data - ./phpunit:/home/inforex/phpunit + - ./vendor:/home/inforex/vendor depends_on: - db restart: always diff --git a/docker-dev-up.sh b/docker-dev-up.sh index 7ce37345..9f940b24 100755 --- a/docker-dev-up.sh +++ b/docker-dev-up.sh @@ -2,7 +2,7 @@ composer update -AUTOLOAD=engine/include/vendor/autoload.php +AUTOLOAD=vendor/autoload.php if [ -f $AUTOLOAD ]; then docker-compose build @@ -10,4 +10,4 @@ if [ -f $AUTOLOAD ]; then else echo -e "[\e[31mERROR\e[0m] $AUTOLOAD not found" echo -e "Make sure that '\e[32mcomposer\e[0m' is installed in order to run '\e[32mcomposer update\e[0m' and generate $AUTOLOAD" -fi \ No newline at end of file +fi diff --git a/engine/external/pear/HTML/Select.php b/engine/external/pear/HTML/Select.php index 47f4bd88..54dce9f1 100644 --- a/engine/external/pear/HTML/Select.php +++ b/engine/external/pear/HTML/Select.php @@ -13,7 +13,7 @@ * @link http://pear.php.net/package/HTML_Select */ -require_once __DIR__ . '/../../../include/vendor/autoload.php'; +require_once __DIR__ . '/../../../../vendor/autoload.php'; /** * Class to dynamically create an HTML SELECT diff --git a/engine/include.php b/engine/include.php index 2c43a4fa..189eba19 100644 --- a/engine/include.php +++ b/engine/include.php @@ -5,4 +5,4 @@ $include_paths[] = get_include_path(); set_include_path( implode(PATH_SEPARATOR, $include_paths) ); -require_once __DIR__ . '/include/vendor/autoload.php'; \ No newline at end of file +require_once __DIR__ . '/../vendor/autoload.php'; diff --git a/engine/include/database/CDbAnnotation.php b/engine/include/database/CDbAnnotation.php index a11aa456..5d58a9f9 100755 --- a/engine/include/database/CDbAnnotation.php +++ b/engine/include/database/CDbAnnotation.php @@ -166,7 +166,7 @@ static function getAnnotationTypesBySets($report_ids, $relation_ids){ static function getAnnotationsBySets($report_ids=null, $annotation_layers=null, $annotation_names=null, $stage = null){ global $db; - $sql = "SELECT *, raa.`value` AS `prop` " . + $sql = "SELECT ra.*, at.*, raa.annotation_id, raa.annotation_attribute_id, raa.`user_id` AS `attr_user_id`, raa.`value` AS `prop` " . " FROM reports_annotations ra" . " LEFT JOIN annotation_types at ON (ra.type=at.name) " . " LEFT JOIN reports_annotations_attributes raa ON (ra.id=raa.annotation_id) "; @@ -203,7 +203,7 @@ static function getAnnotationsBySets($report_ids=null, $annotation_layers=null, */ static function getAnnotationsBySubsets($report_ids=null, $annotation_subset_ids=null){ global $db; - $sql = "SELECT *, ra.type, raa.`value` AS `prop` " . + $sql = "SELECT ra.*, at.*, raa.annotation_id, raa.annotation_attribute_id, raa.`user_id` AS `attr_user_id`, raa.`value` AS `prop` " . " FROM reports_annotations ra" . " LEFT JOIN annotation_types at ON (ra.type=at.name) " . " LEFT JOIN reports_annotations_attributes raa ON (ra.id=raa.annotation_id) "; diff --git a/engine/include/database/CDbCorpusRelation.php b/engine/include/database/CDbCorpusRelation.php index 8f281e8c..28f99854 100644 --- a/engine/include/database/CDbCorpusRelation.php +++ b/engine/include/database/CDbCorpusRelation.php @@ -163,28 +163,20 @@ static function getRelationsByRelationSetIds($relation_set_ids){ return $db->fetch_rows($sql); } - //TODO to delete - static function getRelationsBySets($report_ids, $relation_type_ids){ - global $db; - $sql = "SELECT reports_annotations.report_id as report_id, rel.id, rel.relation_type_id, rel.source_id, rel.target_id, relation_types.name " . - "FROM " . - "(SELECT * " . - "FROM relations " . - "WHERE source_id IN " . - "(SELECT id " . - "FROM reports_annotations " . - "WHERE report_id IN('" . implode("','",$report_ids) . "')) " . - "AND relation_type_id " . - "IN (".implode(",",$relation_type_ids).")) rel " . - "LEFT JOIN relation_types " . - "ON rel.relation_type_id=relation_types.id " . - "LEFT JOIN reports_annotations " . - "ON rel.source_id=reports_annotations.id "; - return $db->fetch_rows($sql); - } - - static function getRelationsBySets2($report_ids=null, $relation_set_ids=null, $relation_type_ids=null, $stage_ids=null, $user_ids=null){ + static function getRelationsBySets($report_ids=null, $relation_set_ids=null, $relation_type_ids=null, $stage_ids=null, $user_ids=null, $relation_stages=array()){ global $db; + + // if $relation_stages not set is equal $stage_ids - stages of + // relation are identical as stages of annotations + if( is_array($relation_stages) && (count($relation_stages)==0)) { + $relation_stages = $stage_ids; + } // if not set + + if (is_array($relation_stages) && (count($relation_stages)>0)) { + $relationStages = "stage IN('".implode("','",$relation_stages)."') AND"; + } else { // if $relation_stages==null default is 'final' + $relationStages = "stage = 'final' AND"; + } $sql = "SELECT reports_annotations.report_id as report_id, " . " rel.id, " . " rel.relation_type_id, " . @@ -196,7 +188,7 @@ static function getRelationsBySets2($report_ids=null, $relation_set_ids=null, $r "FROM " . "(SELECT * " . "FROM relations " . - "WHERE stage = 'final' AND source_id IN " . + "WHERE ".$relationStages." source_id IN " . "(SELECT id " . "FROM reports_annotations " . "WHERE report_id IN('0','" . implode("','",$report_ids) . "')) " . diff --git a/engine/include/export/CCclFactory.php b/engine/include/export/CclExportDocument.php similarity index 76% rename from engine/include/export/CCclFactory.php rename to engine/include/export/CclExportDocument.php index 130412b7..cd492fd0 100644 --- a/engine/include/export/CCclFactory.php +++ b/engine/include/export/CclExportDocument.php @@ -6,22 +6,26 @@ * See LICENCE */ -class CclFactory{ +class CclExportDocument extends CclDocument { /** * $report --- tablica asocjacyjna z atrybutami dokumentu (jak z tabeli reports) * $tokens --- tablica asocjacyjna z wartościami 'from', 'to' i 'eos' * $tags --- * function creates ccl document using 'eos' token attributes to match end of sentence - * see: createFromReportAndTokensSentence + * see: createFromReportAndTokensSentence, was createFromReportAndTokens() */ - function createFromReportAndTokens(&$report, &$tokens, &$tags){ + public function __construct(&$report, &$tokens, &$tags){ $fileName = str_pad($report['id'],8,'0',STR_PAD_LEFT); - $ccl = new CclDocument(); - $ccl->setFileName($fileName); - $ccl->setSubcorpus(preg_replace("/[^\p{L}|\p{N}]+/u","_",$report['name'])); - $ccl->setReport($report); + $this->setFileName($fileName); + $this->setSubcorpus( + // SW ?? there are not 'name' column in DB table reports + isset($report['name']) + ? preg_replace("/[^\p{L}|\p{N}]+/u","_",$report['name']) + : "" + ); + $this->setReport($report); $chunkList = explode('<\\chunk>', $report['content']); @@ -34,7 +38,10 @@ function createFromReportAndTokens(&$report, &$tokens, &$tags){ $chunk = str_replace(">","> ",$chunk); preg_match_all($pattern, $chunk, $matches); $type = "p"; - if (is_array($matches) && array_key_exists(1, $matches)) + if (is_array($matches) + && array_key_exists(1, $matches) + && array_key_exists(0,$matches[1]) + ) $type = $matches[1][0]; $tmpStr = trim(preg_replace("/\s\s+/"," ",custom_html_entity_decode(strip_tags($chunk)))); $tmpStr2 = preg_replace("/\n+|\r+|\s+/","",$tmpStr); @@ -93,7 +100,7 @@ function createFromReportAndTokens(&$report, &$tokens, &$tags){ } $s->addToken($t); - $ccl->addToken($t); + $this->addToken($t); if ( $token['eos'] ){ $c->addSentence($s); $s = new CclSentence(); @@ -108,37 +115,36 @@ function createFromReportAndTokens(&$report, &$tokens, &$tags){ else $sentenceIndex--; } - $ccl->addChunk($c); + $this->addChunk($c); } - return $ccl; - } + } // __construct() - function setAnnotationLemmas(&$ccl, &$annotation_lemmas){ + protected function setAnnotationLemmas($annotation_lemmas){ if (empty($annotation_lemmas)){ return false; } foreach($annotation_lemmas as $lemma){ - $ccl->setAnnotationLemma($lemma); + $this->setAnnotationLemma($lemma); } } - function setAnnotationProperties(&$ccl, &$annotation_properties){ + protected function setAnnotationProperties($annotation_properties){ if (empty($annotation_properties)){ return false; } foreach($annotation_properties as $property){ - $ccl->setAnnotationProperty($property); + $this->setAnnotationProperty($property); } } /** * */ - function setAnnotationsAndRelations(&$ccl, &$annotations, &$relations){ + protected function setAnnotationsAndRelations(&$annotations, &$relations){ if (empty($annotations)) return false; $annotationsById = array(); $continuousAnnotationIds = array(); @@ -157,7 +163,7 @@ function setAnnotationsAndRelations(&$ccl, &$annotations, &$relations){ foreach ($annotations as &$annotation){ if ( !in_array($annotation['id'], $continuousAnnotationIds)){ - $ccl->setAnnotation($annotation); + $this->setAnnotation($annotation); } else { $continuousAnnotations[$annotation['id']] =& $annotation; } @@ -169,7 +175,7 @@ function setAnnotationsAndRelations(&$ccl, &$annotations, &$relations){ $target_id = $cRelation['target_id']; if (array_key_exists($source_id, $annotationsById) && array_key_exists($target_id, $annotationsById)){ - $ccl->setContinuousAnnotation2( + $this->setContinuousAnnotation2( $continuousAnnotations[$source_id], $continuousAnnotations[$target_id]); } else if (array_key_exists($source_id, $annotationsById) || @@ -179,7 +185,7 @@ function setAnnotationsAndRelations(&$ccl, &$annotations, &$relations){ $e->setFunctionName("setAnnotationsAndRelations"); $e->addObject("relation", $cRelation); $e->addComment("008 no source or target annotation in a continuous relation"); - $ccl->addError($e); + $this->addError($e); } } @@ -188,7 +194,7 @@ function setAnnotationsAndRelations(&$ccl, &$annotations, &$relations){ $target_id = $nRelation['target_id']; if (array_key_exists($source_id, $annotationsById) && array_key_exists($target_id, $annotationsById)){ - $ccl->setRelation( + $this->setRelation( $annotationsById[$nRelation['source_id']], $annotationsById[$nRelation['target_id']], $nRelation); @@ -198,10 +204,21 @@ function setAnnotationsAndRelations(&$ccl, &$annotations, &$relations){ $e->setFunctionName("setAnnotationsAndRelations"); $e->addObject("relation", $nRelation); $e->addComment("009 no source or target annotation in a normal relation"); - $ccl->addError($e); + $this->addError($e); } } return true; } -} + public function setCclProperties(&$annotations, &$relations, $lemmas, $attributes ) { + + $this->setAnnotationsAndRelations($annotations, $relations); + // Lemmas will be added only if annotations are too + if(is_array($annotations) && (count($annotations)>0)) { + $this->setAnnotationLemmas($lemmas); + } + $this->setAnnotationProperties($attributes); + + } // setCclProperties() + +} // CclExportDocument class diff --git a/engine/include/export/ConllAndJsonFactory.php b/engine/include/export/ConllAndJsonFactory.php index 68c9a4cf..6cbd7038 100644 --- a/engine/include/export/ConllAndJsonFactory.php +++ b/engine/include/export/ConllAndJsonFactory.php @@ -1,10 +1,44 @@ => [ 'idx'=>.., 'lemma'=>.. ] + **/ + protected function makeLemmaCache(array $lemma) { + + $lemmaCache = array(); + for($i=0;$i$i, + 'lemma'=> isset($lemma[$i]['lemma']) + ? $lemma[$i]['lemma'] + : null // lemma record with no 'lemma' field + ); + } + } + return $lemmaCache; + + } // makeLemmaCache() + + protected function makeConllAndJsonExportData($ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id, $lemmas) { + + // create index for $lemmas + $lemmas_by_annotation_id = $this->makeLemmaCache($lemmas); + // add only lemmas pointed by extractor to proper annotations + foreach($annotations as &$ann) { + if(array_key_exists($ann['id'],$lemmas_by_annotation_id)) { + $ann['lemma']=$lemmas_by_annotation_id[$ann['id']]['lemma']; + } + } + /** * Create a cache for 'token from' to boost processing */ @@ -90,7 +124,7 @@ function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $json_sentence = []; $id = 0; foreach ($sentence->tokens as $token) { - $original_id = $tokens_ids[$it++]; + $original_id = isset($tokens_ids[$it]) ? $tokens_ids[$it++] : null; $ann_tag = []; $ann_id = []; $rel_id = []; @@ -102,7 +136,7 @@ function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $iob = $annotations_from_cache["iob"]; $annotation = $annotations_by_id[$annotations_from_cache_id]; - $ann_tag[] = $iob . $annotation['name']; + $ann_tag[] = $iob . $annotation['type']; $ann_id[] = $annotation['id']; if (array_key_exists($annotation['id'], $relations_cache)) { @@ -117,11 +151,12 @@ function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, } } $token_id = $id++; + $ctag = isset($token->lexemes[0]->ctag) ? $token->lexemes[0]->ctag :''; $json_sentence[] = array( "order_id" => $token->id, "token_id" => $token_id, "orth" => $token->orth, - "ctag" => $token->lexemes[0]->ctag, + "ctag" => $ctag, "from" => $token->from, "to" => $token->to, "annotations" => $ann_id, @@ -138,7 +173,7 @@ function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $array_to_check += ["_"]; } } - $conll .= $token->id . "\t" . $token_id . "\t" . $token->orth . "\t" . $token->lexemes[0]->ctag . "\t" . $token->from . "\t" . + $conll .= $token->id . "\t" . $token_id . "\t" . $token->orth . "\t" . $ctag . "\t" . $token->from . "\t" . $token->to . "\t" . join(":", $ann_tag) . "\t" . join(":", $ann_id) . "\t" . join(":", $rel_id) . "\t" . join(":", $rel_target_id) . "\n"; @@ -149,13 +184,17 @@ function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $json_builder["chunks"][] = $json_sentences; } - $handle = fopen($file_path_without_ext . ".conll", "w"); - fwrite($handle, $conll); - fclose($handle); + return array($conll,$json_builder); + + } // makeConllAndJsonExportData() + + public function exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id, $lemmas) + { + list($conll,$json_builder) = $this->makeConllAndJsonExportData($ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id, $lemmas); + $fw = new FileWriter(); + $fw->writeTextToFile($file_path_without_ext . ".conll",$conll); + $fw->writeJSONToFile($file_path_without_ext . ".json",$json_builder); - $handle = fopen($file_path_without_ext . ".json", "w"); - fwrite($handle, json_encode($json_builder, JSON_PRETTY_PRINT + JSON_UNESCAPED_UNICODE)); - fclose($handle); - } + } // exportToConllAndJson() -} \ No newline at end of file +} // ConllAndJsonFactory class diff --git a/engine/include/export/CorpusExporter.php b/engine/include/export/CorpusExporter.php index 4bcd64a4..f1a6e641 100644 --- a/engine/include/export/CorpusExporter.php +++ b/engine/include/export/CorpusExporter.php @@ -9,7 +9,7 @@ * */ class CorpusExporter{ - private $export_errors = array(); + protected $export_errors = array(); /** * Returns array given as param, without all items with value null @@ -32,6 +32,24 @@ public static function arrayRemoveNullElements(array $arr) { return $arr; } // arrayRemoveNullElements() + /** + * Parameter array should be list of subarrays with field => + * We remove all fileds with =='lemma' + * + * @param $ann - list of associative arrays + * + * @returns - array given w/o 'lemma' fields + */ + private function RemoveLemmaFieldFromAnnotationsList(array $anns) { + + foreach($anns as &$ann) { + if(array_key_exists('lemma',$ann)) + unset($ann['lemma']); + } + return $anns; + + } // RemoveLemmaFieldFromAnnotationsList() + /** * Funkcja parsuje opis ekstraktora danych * @@ -61,7 +79,11 @@ public static function arrayRemoveNullElements(array $arr) { */ protected function parse_extractor($description){ $extractors = array(); - $parts = explode(":", $description); + try { + $parts = explode(":", $description); + } catch(Exception $ex){ + throw new Exception("Niepoprawny opis ekstraktora "); + } // catch() if ( count($parts) !== 2 ){ throw new Exception("Niepoprawny opis ekstraktora " . $description); } @@ -109,6 +131,7 @@ protected function parse_extractor($description){ $params['attributes_annotation_subset_ids'] = null; $params['relation_set_ids'] = null; $params['stages'] = null; + $params['relation_stages'] = array(); // internally expanded foreach ( explode(";", $parts[1]) as $part ){ $name_value = explode("#", $part); @@ -126,11 +149,24 @@ protected function parse_extractor($description){ } } + // hint for selecting annotation in stage final and relation + // in stage agreement + if( is_array($params["stages"])) { + foreach($params["stages"] as &$stage) { + if($stage=='relationagreement') { + $stage = 'final'; // for annotations + $params["relation_stages"] = array('agreement'); // for relations + } // if 'relationagreement' + } // foreach "stages" + } // is_array('stages') + $extractor["params"] = $params; $extractor["extractor"] = function($report_id, $params, &$elements){ // $params -- annotations_set_ids, $stages $annotations = DbAnnotation::getReportAnnotations($report_id, $params["user_ids"], $params["annotation_set_ids"], $params["annotation_subset_ids"], null, $params["stages"]); + // we don't want lemma field in full annotation records + $annotations = $this->RemoveLemmaFieldFromAnnotationsList($annotations); if ( is_array($annotations) ) { $elements['annotations'] = array_merge($elements['annotations'], $annotations); } @@ -163,7 +199,7 @@ protected function parse_extractor($description){ } if(is_array($params['relation_set_ids']) && count($params['relation_set_ids'])>0) { // add custom relation - $relations = DbCorpusRelation::getRelationsBySets2(array($report_id), $params['relation_set_ids'], null, $params["stages"],$params["user_ids"]); + $relations = DbCorpusRelation::getRelationsBySets(array($report_id), $params['relation_set_ids'], null, $params["stages"],$params["user_ids"],$params["relation_stages"]); if ( is_array($relations) ) { $elements['relations'] = array_merge($elements['relations'], $relations); } @@ -188,7 +224,7 @@ protected function parse_extractor($description){ $extractor["params"] = explode(",", $parts[1]); $extractor["extractor"] = function($report_id, $params, &$elements){ // $params -- set of annotation_set_id - $relations = DbCorpusRelation::getRelationsBySets2(array($report_id), $params); + $relations = DbCorpusRelation::getRelationsBySets(array($report_id), $params); if ( is_array($relations) ) { $elements['relations'] = array_merge($elements['relations'], $relations); } @@ -299,7 +335,8 @@ private function log_error($file_name, $line_no, $report_id, $message, $error_ty break; //Problem z utworzeniem CCL case 2: - $this->export_errors[$error_type]['details']['names'][$error_params['name']] = 1; + if(isset($error_params['name'])) + $this->export_errors[$error_type]['details']['names'][$error_params['name']] = 1; $this->export_errors[$error_type]['details']['error'][$error_params['error']] = 1; break; //Brak anotacji źródłowej dla relacji @@ -343,7 +380,7 @@ private function makeAssocArray($arr, $key, $disamb_only=false){ return $ret; } - private function getReportTagsByTokens($report_id, $tokens_ids, $disamb_only=true, $tagging='tagger'){ + protected function getReportTagsByTokens($report_id, $tokens_ids, $disamb_only=true, $tagging='tagger'){ $tags = array(); $tags_by_tokens = array(); @@ -435,6 +472,269 @@ private function getReportTagsByTokens($report_id, $tokens_ids, $disamb_only=tru return $tags_by_tokens; } + protected function getFlagsByReportId($report_id) { + + return DbReportFlag::getReportFlags($report_id); + + } // getFlagsByReportId() + + protected function getTokenByReportId($report_id){ + + return DbToken::getTokenByReportId($report_id, null, true); + + } // getTokenByReportId() + + protected function getReportById($report_id){ + + return DbReport::getReportById($report_id); + + } // getReportById() + + protected function getReportExtById($report_id){ + + return DbReport::getReportExtById($report_id); + + } // getReportExtById() + + protected function getFormatName($format_id) { + + return DbReport::formatName($format_id); + + } // getFormatName() + + protected function exportReportContent($report,$file_path_without_ext) { + + try { + // getHtmlStr() need $report['format'] field, which isn't + // exists in `reports` DB now. We must create it from + // $reports['format_id']. Its not elegant here, but works... + if(!isset($report['format'])){ + $report['format'] = + isset($report['format_id']) && $report['format_id'] + ? $this->getFormatName($report['format_id']) + : 'xml' ; // default for default format_id=1 + } + $html = ReportContent::getHtmlStr($report); + } catch(Exception $ex){ + $errorMsg = "Problem z eksportem zawartości HTML dokumentu"; + $exceptionMsg = $ex->getMessage(); + $error_params = array( + 'message' => $errorMsg, + 'error' => $exceptionMsg + ); + $this->log_error(__FILE__, __LINE__, $report["id"], + $errorMsg.": ".$exceptionMsg, 8, $error_params); + return False; + } // catch() + $content = $html->getContent(); + file_put_contents($file_path_without_ext .".txt", $content); + return True; + + } // exportReportContent() + + protected function updateLists($flags,$reportFileName,&$lists) { + + /* Przypisanie dokumentu do list */ + foreach ( $lists as $ix=>$list){ + foreach ( $list['flags'] as $flag){ + $flag_name = $flag["flag_name"]; + $flag_ids = $flag["flag_ids"]; + if ( isset($flags[$flag_name]) && in_array($flags[$flag_name], $flag_ids) ){ + $lists[$ix]["document_names"][$reportFileName.".xml"] = 1; + } + } + } + // returns changes in $lists array from params + + } // updateLists() + + protected function createIniFile($report,$subcorpora,$file_path_without_ext) { + + $ext = $this->getReportExtById($report["id"]); + + $basic = array("id", "date", "title", "source", "author", "tokenization", "subcorpus"); + $lines = array(); + $lines[] = "[document]"; + $report["subcorpus"] = isset($subcorpora[$report['subcorpus_id']]) ? $subcorpora[$report['subcorpus_id']] : ""; + + foreach ($basic as $name){ + $lines[] = sprintf("%s = %s", $name, $report[$name]); + } + if ( count($ext) > 0 ){ + $lines[] = ""; + $lines[] = "[metadata]"; + foreach ($ext as $key=>$val){ + if ($key != "id"){ + $key = preg_replace("/[^\p{L}|\p{N}]+/u", "_", $key); + $lines[] = sprintf("%s = %s", $key, $val); + } + } + } + file_put_contents($file_path_without_ext.".ini", implode("\n", $lines)); + + } // createIniFile() + + protected function checkIfAnnotationForLemmaExists($report_id,$lemmas,$annotations_by_id) { + + $allLemmasCorrect = True; + foreach ($lemmas as $an){ + $anid = intval($an['id']); + if ( !isset($annotations_by_id[$anid]) ){ + $error_params = array( + 'message' => "Brak warstwy anotacji dla lematu.", + 'group_id' => $an['group_id'], + 'lemma' => $an['name'] + ); + $this->log_error(__FILE__, __LINE__, $report_id, "brak anotacji $anid dla lematu ({$an["name"]}) -- brakuje warstwy anotacji?", 6, $error_params); + $allLemmasCorrect = False; + } + } + return $allLemmasCorrect; + + } // checkIfAnnotationForLemmaExists() + + protected function checkIfAnnotationForRelationExists($report_id,$relations,$annotations_by_id) { + /* Sprawdzenie, anotacji źródłowych i docelowych dla relacji */ + $allRelationsCorrect = True; + foreach ( $relations as $rel ){ + $source_id = $rel["source_id"]; + $target_id = $rel["target_id"]; + if ( !isset($annotations_by_id[$source_id]) ){ + $error_params = array( + 'message' => "Brak anotacji źródłowej dla relacji.", + 'source_id' => $source_id, + 'relation' => $rel["name"] + ); + $this->log_error(__FILE__, __LINE__, $report_id, "brak anotacji źródłowej o identyfikatorze $source_id ({$rel["name"]}) -- brakuje warstwy anotacji?", 4, $error_params); + $allRelationsCorrect = False; + } + if ( !isset($annotations_by_id[$target_id]) ){ + $error_params = array( + 'message' => "Brak anotacji docelowej dla relacji.", + 'target_id' => $target_id, + 'relation' => $rel["name"] + ); + $this->log_error(__FILE__, __LINE__, $report_id, "brak anotacji źródłowej o identyfikatorze $target_id ({$rel["name"]}) -- brakuje warsty anotacji?", 5, $error_params); + $allRelationsCorrect = False; + } + } + return $allRelationsCorrect; + + } // checkIfAnnotationForRelationExists() + + protected function sortUniqueAnnotationsById($report_id,$annotations) { + + /* Usunięcie zduplikowanych anotacji */ + $annotations_by_id = array(); + foreach ($annotations as $an){ + $anid = isset($an['id']) ? intval($an['id']) : 0; + if ( $anid > 0 ){ + $annotations_by_id[$anid] = $an; + } + else{ + $error_params = array( + 'message' => "Brak identyfikatora anotacji." + ); + $this->log_error(__FILE__, __LINE__, $report_id, "brak identyfikatora anotacji", 3, $error_params); + } + } + return $annotations_by_id; + + } // sortUniqueAnnotationsById() + + protected function dispatchElements($elements) { + + $annotations = array(); + $relations = array(); + $lemmas = array(); + $attributes = array(); + if ( isset($elements["annotations"]) && count($elements["annotations"]) ){ + $annotations = $elements["annotations"]; + } + if ( isset($elements["relations"]) && count($elements["relations"]) ){ + $relations = $elements["relations"]; + } + if ( isset($elements["lemmas"]) && count($elements["lemmas"]) ){ + $lemmas = $elements["lemmas"]; + } + + if ( isset($elements["attributes"]) && count($elements["attributes"]) ){ + $attributes = $elements["attributes"]; + } + return [$annotations,$relations,$lemmas,$attributes]; + + } // dispatchElements() + + protected function callCclCreator($report,$tokens,$tags_by_tokens) { + + $ccl = new CclExportDocument($report, $tokens, $tags_by_tokens); + return $ccl; + + } // callCclCreator() + + protected function generateCcl($report,$tokens,$tags_by_tokens) { + + try{ + $ccl = $this->callCclCreator($report, $tokens, $tags_by_tokens); + } + catch(Exception $ex){ + $error = $ex->getMessage(); + $error_params = array( + 'message' => "Problem z utworzeniem CCL", + 'error' => $error + ); + $this->log_error(__FILE__, __LINE__, $report["id"], "Problem z utworzeniem ccl: " . $error, 2, $error_params); + return False; // error is collected + } + return $ccl; // all ok + + } // generateCcl() + + protected function updateExtractorStats($extractorName,$extractor_stats,$extractor_elements) { + + // update $extractor_stats table, for index $extractorName + // with counter from $extractor_elements results + // Returns updated stats table + $name = $extractorName; + if ( !isset($extractor_stats[$name]) ){ + $extractor_stats[$name] = array(); + } + foreach ( $extractor_elements as $type=>$items ){ + if ( !isset($extractor_stats[$name][$type]) ){ + $extractor_stats[$name][$type] = count($items); + } else { + $extractor_stats[$name][$type] += count($items); + } + } + return $extractor_stats; + + } // updateExtractorStats() + + protected function runExtractor($flags,$report_id,$extractor,&$elements,&$extractor_stats) { + + // Wykonaj extraktor w zależności od ustalonej flagi + $func = $extractor["extractor"]; + $params = $extractor["params"]; + $flag_name = $extractor["flag_name"]; + $flag_ids = $extractor["flag_ids"]; + if ( isset($flags[$flag_name]) && in_array($flags[$flag_name], $flag_ids) ){ + $extractor_elements = array(); + foreach (array_keys($elements) as $key){ + $extractor_elements[$key] = array(); + } + + $func($report_id, $params, $extractor_elements); + + foreach (array_keys($extractor_elements) as $key){ + $elements[$key] = array_merge($elements[$key], $extractor_elements[$key]); + } + + // Zapisz statystyki + $extractor_stats = $this->updateExtractorStats($extractor["name"],$extractor_stats,$extractor_elements); + } // if flags is set + + } // runExtractorFunction() + /** * Eksport dokumentu o wskazanym identyfikatorze * @param $report_id Identyfikator dokumentu do eksportu @@ -444,190 +744,72 @@ private function getReportTagsByTokens($report_id, $tokens_ids, $disamb_only=tru * @param $tagging_method String tagging method from ['tagger', 'final', 'final_or_tagger', 'user:{id}'] */ protected function export_document($report_id, $extractors, $disamb_only, &$extractor_stats, &$lists, $output_folder, $subcorpora, $tagging_method){ - $flags = DbReportFlag::getReportFlags($report_id); + $flags = $this->getFlagsByReportId($report_id); $elements = array("annotations"=>array(), "relations"=>array(), "lemmas"=>array(), "attributes"=>array()); - // Wykonaj extraktor w zależności od ustalonej flagi + // Wykonaj extraktory w zależności od ustalonej flagi foreach ( $extractors as $extractor ){ - $func = $extractor["extractor"]; - $params = $extractor["params"]; - $flag_name = $extractor["flag_name"]; - $flag_ids = $extractor["flag_ids"]; - if ( isset($flags[$flag_name]) && in_array($flags[$flag_name], $flag_ids) ){ - $extractor_elements = array(); - foreach (array_keys($elements) as $key){ - $extractor_elements[$key] = array(); - } - $func($report_id, $params, $extractor_elements); - foreach (array_keys($extractor_elements) as $key){ - $elements[$key] = array_merge($elements[$key], $extractor_elements[$key]); - } - // Zapisz statystyki - $name = $extractor["name"]; - if ( !isset($extractor_stats[$name]) ){ - $extractor_stats[$name] = array(); - } - foreach ( $extractor_elements as $type=>$items ){ - if ( !isset($extractor_stats[$name][$type]) ){ - $extractor_stats[$name][$type] = count($items); - } - else{ - $extractor_stats[$name][$type] += count($items); - } - } - } + $this->runExtractor($flags,$report_id,$extractor,$elements,$extractor_stats); } - $tokens = DbToken::getTokenByReportId($report_id, null, true); + $tokens = $this->getTokenByReportId($report_id); $tokens_ids = array_column($tokens, 'token_id'); $tags_by_tokens = $this->getReportTagsByTokens($report_id, $tokens_ids, $disamb_only, $tagging_method); - $report = DbReport::getReportById($report_id); - try{ - $ccl = CclFactory::createFromReportAndTokens($report, $tokens, $tags_by_tokens); - } - catch(Exception $ex){ - $error = $ex->getMessage(); - $error_params = array( - 'message' => "Problem z utworzeniem CCL", - 'error' => $error - ); - $this->log_error(__FILE__, __LINE__, $report_id, "Problem z utworzeniem ccl: " . $error, 2, $error_params); - return; - } - $annotations = array(); - $relations = array(); - $lemmas = array(); - $attributes = array(); - if ( isset($elements["annotations"]) && count($elements["annotations"]) ){ - $annotations = $elements["annotations"]; - } - if ( isset($elements["relations"]) && count($elements["relations"]) ){ - $relations = $elements["relations"]; - } - if ( isset($elements["lemmas"]) && count($elements["lemmas"]) ){ - $lemmas = $elements["lemmas"]; - } + $report = $this->getReportById($report_id); - if ( isset($elements["attributes"]) && count($elements["attributes"]) ){ - $attributes = $elements["attributes"]; - } + $ccl = $this->generateCcl($report,$tokens,$tags_by_tokens); + if($ccl===False) { return; } + + list($annotations,$relations,$lemmas,$attributes) = $this->dispatchElements($elements); /* Usunięcie zduplikowanych anotacji */ - $annotations_by_id = array(); - foreach ($annotations as $an){ - $anid = intval($an['id']); - if ( $anid > 0 ){ - $annotations_by_id[$anid] = $an; - } - else{ - $error_params = array( - 'message' => "Brak identyfikatora anotacji." - ); - $this->log_error(__FILE__, __LINE__, $report_id, "brak identyfikatora anotacji", 3, $error_params); - } - } + $annotations_by_id = $this->sortUniqueAnnotationsById($report_id,$annotations); $annotations = array_values($annotations_by_id); /* Sprawdzenie, anotacji źródłowych i docelowych dla relacji */ - foreach ( $relations as $rel ){ - $source_id = $rel["source_id"]; - $target_id = $rel["target_id"]; - if ( !isset($annotations_by_id[$source_id]) ){ - $error_params = array( - 'message' => "Brak anotacji źródłowej dla relacji.", - 'source_id' => $source_id, - 'relation' => $rel["name"] - ); - $this->log_error(__FILE__, __LINE__, $report_id, "brak anotacji źródłowej o identyfikatorze $source_id ({$rel["name"]}) -- brakuje warstwy anotacji?", 4, $error_params); - } - if ( !isset($annotations_by_id[$target_id]) ){ - $error_params = array( - 'message' => "Brak anotacji docelowej dla relacji.", - 'target_id' => $target_id, - 'relation' => $rel["name"] - ); - $this->log_error(__FILE__, __LINE__, $report_id, "brak anotacji źródłowej o identyfikatorze $target_id ({$rel["name"]}) -- brakuje warsty anotacji?", 5, $error_params); - } - } + $this->checkIfAnnotationForRelationExists($report_id,$relations,$annotations_by_id); /* Sprawdzenie lematów */ - foreach ($lemmas as $an){ - $anid = intval($an['id']); - if ( !isset($annotations_by_id[$anid]) ){ - $error_params = array( - 'message' => "Brak warstwy anotacji dla lematu.", - 'group_id' => $an['group_id'], - 'lemma' => $an['name'] - ); - $this->log_error(__FILE__, __LINE__, $report_id, "brak anotacji $anid dla lematu ({$an["name"]}) -- brakuje warstwy anotacji?", 6, $error_params); - } - } + $this->checkIfAnnotationForLemmaExists($report_id,$lemmas,$annotations_by_id); $file_path_without_ext = $output_folder . "/" . $ccl->getFileName(); /* Wygeneruj CONLL i JSON */ - ConllAndJsonFactory::exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id); + (new ConllAndJsonFactory())->exportToConllAndJson($file_path_without_ext, $ccl, $tokens, $relations, $annotations, $tokens_ids, $annotations_by_id, $lemmas); /* Wygeneruj xml i rel.xml */ - CclFactory::setAnnotationsAndRelations($ccl, $annotations, $relations); - CclFactory::setAnnotationLemmas($ccl, $lemmas); - CclFactory::setAnnotationProperties($ccl, $attributes); - CclWriter::write($ccl, $output_folder . "/" . $ccl->getFileName() . ".xml", CclWriter::$CCL); - CclWriter::write($ccl, $output_folder . "/" . $ccl->getFileName() . ".rel.xml", CclWriter::$REL); + (new XmlFactory())->exportToXmlAndRelxml($file_path_without_ext,$ccl,$annotations,$relations,$lemmas,$attributes); /* Eksport metadanych */ - $report = DbReport::getReportById($report_id); - $ext = DbReport::getReportExtById($report_id); + $this->createIniFile($report,$subcorpora,$file_path_without_ext); - $basic = array("id", "date", "title", "source", "author", "tokenization", "subcorpus"); - $lines = array(); - $lines[] = "[document]"; - $report["subcorpus"] = $subcorpora[$report['subcorpus_id']]; + /* Przypisanie dokumentu do list */ + $this->updateLists($flags,$ccl->getFileName(),$lists); + $this->exportReportContent($report,$file_path_without_ext); - foreach ($basic as $name){ - $lines[] = sprintf("%s = %s", $name, $report[$name]); - } - if ( count($ext) > 0 ){ - $lines[] = ""; - $lines[] = "[metadata]"; - foreach ($ext as $key=>$val){ - if ($key != "id"){ - $key = preg_replace("/[^\p{L}|\p{N}]+/u", "_", $key); - $lines[] = sprintf("%s = %s", $key, $val); - } - } - } - file_put_contents($output_folder . "/" . $ccl->getFileName() . ".ini", implode("\n", $lines)); + } // export_document() - /* Przypisanie dokumentu do list */ - foreach ( $lists as $ix=>$list){ - foreach ( $list['flags'] as $flag){ - $flag_name = $flag["flag_name"]; - $flag_ids = $flag["flag_ids"]; - if ( isset($flags[$flag_name]) && in_array($flags[$flag_name], $flag_ids) ){ - $lists[$ix]["document_names"][$ccl->getFileName() . ".xml"] = 1; - } - } - } - try { - $html = ReportContent::getHtmlStr($report); - } catch(Exception $ex){ - $errorMsg = "Problem z eksportem zawartości HTML dokumentu"; - $exceptionMsg = $ex->getMessage(); - $error_params = array( - 'message' => $errorMsg, - 'error' => $exceptionMsg - ); - $this->log_error(__FILE__, __LINE__, $report_id, - $errorMsg.": ".$exceptionMsg, 8, $error_params); - return; - } // catch() - $content = $html->getContent(); - file_put_contents($output_folder . "/" . $ccl->getFileName() . ".txt", $content); + protected function getSubcorporaList() { - } + /* Przygotuj listę podkorpusów w postaci tablicy id=>nazwa*/ + $subcorpora_assoc = DbCorpus::getSubcorpora(); + $subcorpora = array(); + foreach ( $subcorpora_assoc as $sub ){ + $subcorpora[$sub['subcorpus_id']] = $sub['name']; + } + return $subcorpora; + + } // getSubcorporaList() + + protected function writeConsoleMessage($msg) { + + $isCLI = (php_sapi_name() == 'cli'); + if($isCLI) + echo($msg); + + } // writeConsoleMessage() /** * Wykonuje eksport korpusu zgodnie z określonymi parametrami (selektory, ekstraktory i indeksy). @@ -644,13 +826,8 @@ public function exportToCcl($output_folder, $selectors_description, $extractors_ mkdir("$output_folder/documents", 0777, true); } - /* Przygotuj listę podkorpusów w postaci tablicy id=>nazwa*/ - $subcorpora_assoc = DbCorpus::getSubcorpora(); - $subcorpora = array(); - foreach ( $subcorpora_assoc as $sub ){ - $subcorpora[$sub['subcorpus_id']] = $sub['name']; - } + $subcorpora = $this->getSubcorporaList(); $extractors = array(); foreach ( $extractors_description as $extractor ){ @@ -670,7 +847,7 @@ public function exportToCcl($output_folder, $selectors_description, $extractors_ } $document_ids = array_keys($document_ids); - echo "Liczba dokumentów do eksportu: " . count($document_ids) . "\n"; + $this->writeConsoleMessage("Liczba dokumentów do eksportu: " . count($document_ids) . "\n"); $extractor_stats = array(); $number_of_docs = count($document_ids); @@ -684,11 +861,11 @@ public function exportToCcl($output_folder, $selectors_description, $extractors_ if($percent_done > $progress){ $progress = $percent_done; DbExport::updateExportProgress($export_id, $progress); - echo intval($progress) . "%" . "\n"; + $this->writeConsoleMessage(intval($progress) . "%" . "\n"); } } foreach ($lists as $list){ - echo sprintf("%4d %s\n", count(array_keys($list["document_names"])), $list["name"]); + $this->writeConsoleMessage(sprintf("%4d %s\n", count(array_keys($list["document_names"])), $list["name"])); $lines = array(); foreach ( array_keys($list["document_names"]) as $document_name ){ $lines[] = "./documents/" . $document_name; @@ -705,7 +882,7 @@ public function exportToCcl($output_folder, $selectors_description, $extractors_ $types[$type] = 1; } } - echo "\n"; + $this->writeConsoleMessage("\n"); $stats_str = str_repeat(" ", $max_len_name); foreach ( array_keys($types) as $type ){ diff --git a/engine/include/export/ExportManager.php b/engine/include/export/ExportManager.php deleted file mode 100644 index 80a27532..00000000 --- a/engine/include/export/ExportManager.php +++ /dev/null @@ -1,644 +0,0 @@ -7, - "software_nam"=>7, - "event_nam"=>6, - "road_nam"=>5, - "facility_nam"=>4, - "company_nam"=>3, - "astronomical_nam"=>3, - "person_nam"=>2, - "city_nam"=>1); - var $cclDocuments = array(); - //input parameters - var $db = null; //instance of Database - var $corpus_ids = null; //array, value: id - var $subcorpus_ids = null; //array, value: id - var $document_ids = null; //array, value: id - var $flags = null; //array, key: flag name; value: array of flag values - var $annotation_layers = null; //array, value: id - var $annotation_names = null; //array, value: type name - var $folder = null; //string - var $relation_set_ids = null; - var $relation_type_ids = null; - var $index_flags = null; //array, value: corpora_flags.corpora_flag_id or corpora_flags.short - var $index_flag_ids = array(); //array, value: corpora_flags.corpora_flag_id - var $index_flag_paths = array(); //array, key: corpora_flags.short(lowercase, "_" instead of " "), value: array of strings (paths) - var $report_flag_ids = array(); //array, key: report_id; value: array (value: corpora_flags.short) - - var $split_documents = false; - var $separate_relations = false; - - var $iob_file_name = false; - - var $report_ids = array(); //array, value: id - var $reports = array(); //array, key: report id; value: report - var $metadata = array(); - var $tokens = array(); //array, key: report id; value: token - var $tags = array(); //array, key: report id; value: array (key: token_id, value: tag) - var $annotations = array(); //array, key: report id; value: annotation - var $relations = array(); - var $annotation_lemmas = array(); - var $annotation_properties = array(); - - var $verbose = false; - var $no_disamb = false; - - function setCorpusIds($corpus_ids){ - $this->corpus_ids = $corpus_ids; - } - - function setSubcorpusIds($subcorpus_ids){ - $this->subcorpus_ids = $subcorpus_ids; - } - - function setDocumentIds($document_ids){ - $this->document_ids = $document_ids; - } - - function setFlags($flags){ - $this->flags = $flags; - } - - function setAnnotationLayers($annotation_layers){ - $this->annotation_layers = $annotation_layers; - } - - function setAnnotationNames($annotation_names){ - $this->annotation_names = $annotation_names; - } - - function setRelationSetIds($relation_set_ids){ - $this->relation_set_ids = $relation_set_ids; - } - - function setRelationTypeIds($relation_type_ids){ - $this->relation_type_ids = $relation_type_ids; - } - - function setIndexFlags($index_flags){ - if ($index_flags){ - $this->index_flags = array(); - foreach($index_flags as $item){ - $this->index_flags[] = mb_strtolower($item); - } - } - } - - function setDb($db){ - assert('$db instanceof Database'); - $this->db = $db; - } - - function setFolder($folder){ - $this->folder = $folder; - } - - function setSplit($split_documents){ - $this->split_documents = $split_documents; - } - - function setSeparateRelations($separate_relations){ - $this->separate_relations = $separate_relations; - } - - function setIob($iob_file_name){ - $this->iob_file_name = $iob_file_name; - } - - function log($text){ - if ( $this->verbose ) - echo date("\r[H:i:s]") . " $text\n"; - } - - function setVerbose($verbose){ - $this->verbose = $verbose; - } - - function setNoDisamb($no_disamb){ - $this->no_disamb = $no_disamb; - } - - /** - * Wczytuje dokumenty do eksportu na podstawie ustawionych filtrów. - */ - function readDocuments(){ - $reports = DbReport::getReports($this->corpus_ids, $this->subcorpus_ids, - $this->document_ids, $this->flags); - - foreach ($reports as &$r) - $this->reports[$r['id']] = &$r; - $this->report_ids = array_keys($this->reports); - $this->log(sprintf("Number of documents to export: %d", count($this->report_ids))); - if ($this->index_flags){ - $this->getIndexFlags(); - if ($this->no_content){ - $this->writeRawIndexes(); - } - } - } - - function getIndexFlags(){ - echo date("[H:i:s] ") . " - get index flags\n"; - $index_flags = DbCorporaFlag::getCorporaFlagIds($this->index_flags); - - //check if all given flags exist in database - $flag_errors = array(); - $flag_ids = array(); - $flag_shorts_orig = array(); - if (empty($index_flags)){ - $e = new CclError(); - $e->setClassName("CclSetFactory"); - $e->setFunctionName("acquireData"); - $e->addObject("message", "flag error"); - $e->addComment("015 no given flag was found"); - $flag_errors[] = $e; - } - else { - $flag_shorts = array(); - foreach ($index_flags as $item){ - $flag_ids[] = $item['corpora_flag_id']; - $flag_lower = mb_strtolower($item['short']);; - if (!in_array($flag_lower, $flag_shorts)){ - $flag_shorts[] = $flag_lower; - } - } - foreach ($this->index_flags as $item){ - if (! (in_array($item, $flag_ids) || in_array($item, $flag_shorts)) ){ - $e = new CclError(); - $e->setClassName("CclSetFactory"); - $e->setFunctionName("acquireData"); - $e->addObject("message", "flag error"); - $e->addComment("016 flag \"$item\" not found"); - $flag_errors[] = $e; - } - } - } - if ($flag_errors){ - foreach ($flag_errors as $flag_error){ - print (string)$flag_error . "\n"; - } - exit("EXIT: flag error\n"); - } - else { - $this->index_flag_ids = $flag_ids; - - $report_flag_ids = DBReportFlag::getReportFlagData($this->report_ids, $flag_ids); - foreach ($report_flag_ids as $item){ - $report_id = $item['report_id']; - $short = $item['short']; - if (empty($this->report_flag_ids[$report_id])) - $this->report_flag_ids[$report_id] = array(); - $this->report_flag_ids[$report_id][] = mb_strtolower(str_replace(" ","_",$short)); - } - } - } - - /** - * Przygotowuje indeksy do zapisu. - */ - function processIndexes(){ - foreach ($this->report_flag_ids as $report_id=>$flag_shorts){ - $path = ""; - if ($this->split_documents) - $relativePath = preg_replace("/[^\p{L}|\p{N}]+/u","_",$this->reports[$report_id]['name']) . "/"; - foreach ($flag_shorts as $short){ - $path = $relativePath . str_pad($report_id,8,'0',STR_PAD_LEFT) . ".xml"; - if (empty($this->index_flag_paths[$short])) - $this->index_flag_paths[$short] = array(); - $this->index_flag_paths[$short][] = $path; - } - } - } - - /** - * Zapisuje indeksy dla flag. - */ - function writeIndexes(){ - $subfolder = $this->folder . "/"; - foreach($this->index_flag_paths as $index_name=>$paths){ - $filename = "index_" . $index_name . ".txt"; - $handle = fopen($subfolder . $filename, "w"); - sort($paths); - foreach ($paths as $path) - fwrite($handle, $path . "\n"); - fclose($handle); - if ( $this->verbose ) - echo sprintf(" - index: %4d in %s\n", count($paths), $filename); - } - } - - /** - * Wczytuje dane o treści dokumentów, tokenizacji, anotacjach i relacjach - * z bazy danych. - */ - function readContent(){ - $this->log("Reading content ..."); - $this->log(" a) reading tokens ..."); - $tokens = DbToken::getTokensByReportIds($this->report_ids); - foreach ($tokens as &$token){ - $report_id = $token['report_id']; - if (!array_key_exists($report_id, $this->tokens)) - $this->tokens[$report_id] = array(); - $this->tokens[$report_id][] = &$token; - } - - $this->log(" b) reading tags ..."); - $tags = DbTag::getTagsByReportIds($this->report_ids); - - $this->log(" c) assigning tags to tokens ..."); - foreach ($tags as $tag){ - $report_id = $tag['report_id']; - $token_id = $tag['token_id']; - if ( !isset($this->tags[$report_id]) ) - $this->tags[$report_id] = array(); - if ( !isset($this->tags[$report_id][$token_id]) ) - $this->tags[$report_id][$token_id] = array(); - $this->tags[$report_id][$token_id][] = $tag; - } - - /** Reorganize tags */ - foreach ($this->tags as $report_id=>$tokens){ - foreach ($tokens as $token_id=>$tags){ - $ign = null; - $other = array(); - foreach ($tags as $i_tag=>$tag){ - if ($this->no_disamb) - $this->tags[$report_id][$token_id][$i_tag]['disamb'] = 0; - - if ($tag['ctag'] == "ign") - $ign = $tag; - else - $other[] = $tag; - } - /* Jeżeli jedną z interpretacji jest ign, to usuń pozostałe. */ - if ($this->no_disamb && $ign) - $this->tags[$report_id][$token_id] = array($ign); - elseif ($ign) - $this->tags[$report_id][$token_id] = array_merge(array($ign), $other); - } - } - - $this->log(" d) reading annotations ..."); - $annotations = DbAnnotation::getAnnotationsBySets($this->report_ids, - $this->annotation_layers, $this->annotation_names); - foreach ($annotations as &$annotation){ - $report_id = $annotation['report_id']; - if (!array_key_exists($report_id, $this->annotations)) - $this->annotations[$report_id] = array(); - $this->annotations[$report_id][] = &$annotation; - } - - $this->log(" e) reading relations ..."); - $relations = DbCorpusRelation::getRelationsBySets2($this->report_ids, - $this->relation_set_ids, $this->relation_type_ids); - foreach ($relations as &$relation){ - $report_id = $relation['report_id']; - if (!array_key_exists($report_id, $this->relations)){ - $this->relations[$report_id] = array(); - } - $this->relations[$report_id][] = &$relation; - } - - $this->log(" f) reading annotation lemmas ..."); - //$this->annotation_lemmas = DbReportAnnotationLemma::getLemmasByReportsIds($this->report_ids); - $this->annotation_lemmas = DbReportAnnotationLemma::getLemmasBySets2($this->report_ids, $this->annotation_layers, $this->annotation_names); - - $this->log(" f) reading annotation attributes ..."); - $this->annotation_properties = DbReportAnnotationLemma::getPropertiesBySets2($this->report_ids, $this->annotation_layers, $this->annotation_names); - - - $this->log("Reading content is done."); - } - - /** - * Read documents metadata from the database. - */ - function readMetadata(){ - $corpora = array(); - $cnt=0; - foreach ($this->report_ids as $report_id){ - $cnt ++; - $report = $this->reports[$report_id]; - $corpora[$report['corpora']][] = $report['id']; - } - foreach ($corpora as $corpus_id => $report_ids){ - $corpus = DbCorpus::getCorpusById($corpus_id); - $ext = $corpus['ext']; - if ($ext){ - $exts = DbReport::getReportExtByIds($report_ids, $ext); - foreach ($exts as $ext){ - $this->metadata[$ext['id']] = $ext; - } - } - } - } - - /** - * Read documents segmentation, annotation and relations from databse. - */ - function processContent(){ - //$this->log("Processing content ..."); - $allReports = count($this->report_ids); - $cnt = 0; - foreach ($this->report_ids as $report_id){ - $cnt ++; - $report = $this->reports[$report_id]; - - $tokens = array(); - $tags = array(); - $annotations = array(); - $relations = array(); - $annotation_lemmas = array(); - $annotation_properties = array(); - - if (array_key_exists($report_id, $this->tokens)) - $tokens = &$this->tokens[$report_id]; - - if (array_key_exists($report_id, $this->tags)) - $tags = &$this->tags[$report_id]; - - if (array_key_exists($report_id, $this->annotations)) - $annotations = &$this->annotations[$report_id]; - - if (array_key_exists($report_id, $this->relations)) - $relations = &$this->relations[$report_id]; - - if (array_key_exists($report_id, $this->annotation_lemmas)) - $annotation_lemmas = $this->annotation_lemmas[$report_id]; - if (array_key_exists($report_id, $this->annotation_properties)) - $annotation_properties = $this->annotation_properties[$report_id]; - - try{ - $ccl = CclFactory::createFromReportAndTokens($report, $tokens, $tags); - - if (count($tokens)==0){ - $e = new CclError(); - $e->setClassName("CclSetFactory"); - $e->setFunctionName("create"); - $e->addObject("report", $report); - $e->addComment("010 no tokenization in report"); - $ccl->addError($e); - } - else { - $flags = DbReportFlag::getReportFlags($report_id); - $annotations = $this->filterAnnotationsByFlags($report_id, $flags, $annotations); - $relations = $this->filterRelationsByFlags($report_id, $flags, $relations); - CclFactory::setAnnotationsAndRelations($ccl, $annotations, $relations); - CclFactory::setAnnotationLemmas($ccl, $annotation_lemmas); - CclFactory::setAnnotationProperties($ccl, $annotation_properties); - } - - if (count($tags)==0){ - $e = new CclError(); - $e->setClassName("CclSetFactory"); - $e->setFunctionName("create"); - $e->addObject("report", $report); - $e->addComment("011 no tags in report"); - $ccl->addError($e); - } - - $this->cclDocuments[$report_id] = $ccl; - } - catch(Exception $ex){ - print "!!!!! FIX ME report_id = $report_id\n"; - } - } - //$this->log("Processing content ... done"); - } - - /** - * Write documents tokens, annotations and relations to files. - */ - function writeContent(){ - //$this->log("Writing content ... "); - if ($this->iob_file_name) - $this->_writeIob(); - else - $this->_writeCcl(); - //$this->log("Writing content ... done"); - } - - /** - * - */ - function _writeCcl(){ - $subfolder = $this->folder . "/"; - $relativePath = ""; - $failed = array(); - if (!is_dir($subfolder)) mkdir($subfolder, 0777); - foreach ($this->cclDocuments as $cclDocument){ - if ($this->split_documents){ - $relativePath = $cclDocument->getSubcorpus() . "/"; - $subfolder = $this->folder . "/" . $relativePath; - if (!is_dir($subfolder)) mkdir($subfolder, 0777); - } - - if (!$cclDocument->hasErrors()){ - if ($this->separate_relations){ - CclWriter::write($cclDocument, $subfolder . $cclDocument->getFileName() . ".xml", CclWriter::$CCL); - CclWriter::write($cclDocument, $subfolder . $cclDocument->getFileName() . ".rel.xml", CclWriter::$REL); - } - else - CclWriter::write($cclDocument, $subfolder . $cclDocument->getFileName() . ".xml", CclWriter::$CCLREL); - if ($this->index_flags){ - $report = $cclDocument->getReport(); - $report_id = $report['id']; - if (!empty($this->report_flag_ids[$report_id])){ - foreach ($this->report_flag_ids[$report_id] as $short){ - if (empty($this->index_flag_paths[$short])) - $this->index_flag_paths[$short] = array(); - $this->index_flag_paths[$short][] = $relativePath . $cclDocument->getFileName() . ".xml"; - } - } - } - } - else { - echo "ERROR in " . $cclDocument->getFileName() . " \n"; - $failed[] = $cclDocument->getFileName(); - $errors = $cclDocument->getErrors(); - foreach ($errors as $error){ - print (string)$error . "\n"; - } - } - } - - if ( count($failed) ){ - $this->log("[ERROR] Following documents were not saved because of errors:"); - foreach ($failed as $f) - $this->log(" - $f"); - } - } - - /** - * Write tokens and annotations to a single IOB file. - */ - function _writeIob(){ - $subfolder = $this->folder . "/"; - if (!is_dir($subfolder)) mkdir($subfolder, 0777); - $filename = $subfolder . $this->iob_file_name; - $writer = new IobWriter($filename, $this->channelPriority); - $writer->writeAll($this->cclDocuments); - $writer->close(); - $writer->printStats(); - } - - /** - * Save documents metadata to files. - */ - function writeMetadata(){ - $this->log("Writing medatada ..."); - $subfolder = $this->folder . "/"; - foreach ($this->reports as $r){ - $basic = array("id", "date", "title", "source", "author", "tokenization", "name:subcorpus"); - $lines = array(); - $lines[] = "[document]"; - - foreach ($basic as $b=>$br){ - $parts = split(":", $br); - $name = $parts[0]; - $name_target = $parts[1] ? $parts[1] : $name; - $lines[] = sprintf("%s = %s", $name_target, $r[$name]); - } - - if (isset($this->metadata[$r['id']])){ - $lines[] = ""; - $lines[] = "[metadata]"; - foreach ($this->metadata[$r['id']] as $key=>$val) - if ($key != "id"){ - $key = preg_replace("/[^\p{L}|\p{N}]+/u", "_", $key); - $lines[] = sprintf("%s = %s", $key, $val); - } - } - - if ($this->split_documents){ - $subfolder = $this->folder . "/" . - preg_replace("/[^\p{L}|\p{N}]+/u", "_", $r['name']) . "/"; - } - - $filename = $subfolder . str_pad($r['id'], 8, "0", STR_PAD_LEFT) . ".ini"; - $f = fopen($filename, "w"); - fwrite($f, implode("\n", $lines)); - fclose($f); - } - } - - /** - * Removes annotations according to flags. If there is a flag - * for fiven layer of annotations, the flag for document must be set to 3 or 4. - * In other case the annotation is discarded. - * - * @param $flags --- array of document flags and values. - * @param $annotations --- array of annotations. - */ - function filterAnnotationsByFlags($report_id, $flags, $annotations){ - $annotatons_filtered = array(); - $skipped = array(); - foreach ($annotations as $an){ - $group_id = intval($an['group_id']); - $keep = false; - - switch ( $group_id ) { - case 1: - $keep = isset($flags[FLAG_NAMES]) - && $this->flagReady($flags[FLAG_NAMES]); - break; - - case 2: - $keep = isset($flags[FLAG_WSD]) - && $this->flagReady($flags[FLAG_WSD]); - break; - - case 7: - $keep = isset($flags[FLAG_CHUNKS]) - && $this->flagReady($flags[FLAG_CHUNKS]); - break; - - default: - $keep = true; - break; - } - - if ($keep){ - $annotatons_filtered[] = $an; - } - else{ - $skipped[$an['name']] = 1; - } - } - - if (count($skipped) && $this->verbose){ - echo sprintf(">> [id=%d] Skipped annotations: %s\n", - $report_id, implode(", ", array_keys($skipped))); - } - - return $annotatons_filtered; - } - - /** - * - */ - function filterRelationsByFlags($report_id, $flags, $relations){ - $relations_filtered = array(); - $skipped = array(); - foreach ($relations as $rel){ - $group_id = intval($rel['relation_set_id']); - $keep = false; - - switch ( $group_id ) { - case 1: /* Chunks relations */ - $keep = isset($flags[FLAG_CHUNKS_REL]) - && $this->flagReady($flags[FLAG_CHUNKS_REL]); - break; - - case 2: /* Names relations */ - $keep = isset($flags[FLAG_NAMES_REL]) - && $this->flagReady($flags[FLAG_NAMES_REL]); - break; - - case 3: /* Coreference */ - $keep = isset($flags[FLAG_COREF]) - && $this->flagReady($flags[FLAG_COREF]); - break; - - default: - $keep = true; - break; - } - - if ($keep){ - $relations_filtered[] = $rel; - } - else{ - $skipped[$rel['name']] = 1; - } - } - - if (count($skipped) && $this->verbose){ - echo sprintf(">> [id=%d] Skipped relations: %s\n", - $report_id, implode(", ", array_keys($skipped))); - } - - return $relations_filtered; - } - - /** - * - */ - function flagReady($value){ - return in_array(intval($value), array(3,4)); - } -} - - - - -?> diff --git a/engine/include/export/FileWriter.php b/engine/include/export/FileWriter.php new file mode 100644 index 00000000..44c6cda0 --- /dev/null +++ b/engine/include/export/FileWriter.php @@ -0,0 +1,27 @@ +writeTextToFile($fileName,$textContent); + + } // writeJSONToFile() + +} // FileWriter class diff --git a/engine/include/export/XmlFactory.php b/engine/include/export/XmlFactory.php new file mode 100644 index 00000000..9d08818e --- /dev/null +++ b/engine/include/export/XmlFactory.php @@ -0,0 +1,16 @@ +setCclProperties($annotations,$relations,$lemmas,$attributes); + // export from $ccl to files + $writer = new CclWriter(); + $writer->write($ccl, $filePathWithoutExt.".xml", CclWriter::$CCL); + $writer->write($ccl, $filePathWithoutExt.".rel.xml", CclWriter::$REL); + + } // exportToXmlAndRelxml() + +} // XmlFactory class diff --git a/engine/include/structs/CclChunk.php b/engine/include/structs/CclChunk.php new file mode 100644 index 00000000..4bd9be47 --- /dev/null +++ b/engine/include/structs/CclChunk.php @@ -0,0 +1,63 @@ +setSentenceIndexInTokens(count($this->sentences)); + $this->sentences[] = $sentence; + } + + function setType($type){ + $this->type = $type; + } + + function setId($id){ + $this->id = $id; + } + + function getSentences(){ + return $this->sentences; + } + + function getId(){ + return $this->id; + } + + function getType(){ + return $this->type; + } + + public function setChunkIndexInTokens($chunkIndex) { + foreach($this->sentences as $sentence) { + $sentence->setChunkIndexInTokens($chunkIndex); + } + } // setParentIndexesInTokens + + public function getSentenceByIndex($sentenceIndex){ + // sentenceIndex may be digit 0 + if( is_numeric($sentenceIndex) and ($sentenceIndex < count($this->sentences)) ) { + return $this->sentences[$sentenceIndex]; + } else { + return null; + } + } // getSentenceByIndex + +} // CclChunk class + +?> diff --git a/engine/include/structs/CclStruct.php b/engine/include/structs/CclDocument.php similarity index 56% rename from engine/include/structs/CclStruct.php rename to engine/include/structs/CclDocument.php index 11d95472..f519d1fc 100644 --- a/engine/include/structs/CclStruct.php +++ b/engine/include/structs/CclDocument.php @@ -97,7 +97,6 @@ function getRelations(){ } function setAnnotationLemma($annotation_lemma){ - $type = $annotation_lemma['type']; if ( !isset($this->char2token[$annotation_lemma['from']])){ $e = new CclError(); @@ -123,6 +122,8 @@ function setAnnotationLemma($annotation_lemma){ $token = & $this->tokens[$i]; if (! $token->setAnnotationLemma($annotation_lemma)){ + // 'dead code' - actually CclToken->setAnnotationLemma() + // returns always true. For invalid data too. $e = new CclError(); $e->setClassName("CclDocument"); $e->setFunctionName("setAnnotationLemma"); @@ -134,7 +135,6 @@ function setAnnotationLemma($annotation_lemma){ } function setAnnotationProperty($annotation_property){ - $type = $annotation_property['type']; if ( !isset($this->char2token[$annotation_property['from']])){ $e = new CclError(); @@ -205,8 +205,8 @@ function setAnnotation($annotation){ } $found = true; } - if ( $annotation['value'] ){ - $prop_name = sprintf("sense:%s", $annotation['name']); + if ( isset($annotation['value']) && $annotation['value'] ){ + $prop_name = sprintf("sense:%s", $annotation['type']); $token->prop[$prop_name] = $annotation['value']; } if (! $token->setAnnotation($annotation,$this->getSentenceByToken($token)->channels)){ @@ -410,483 +410,7 @@ protected function getSentenceByToken($token){ return null; } // getSentenceByToken -} -class CclChunk{ - var $id; // optional - var $type; //required - var $sentences = array(); - - function addSentence($sentence){ - assert('$sentence instanceof CclSentence'); - $sentence->setSentenceIndexInTokens(count($this->sentences)); - $this->sentences[] = $sentence; - } - - function setType($type){ - $this->type = $type; - } - - function setId($id){ - $this->id = $id; - } - - function getSentences(){ - return $this->sentences; - } - - function getId(){ - return $this->id; - } - - function getType(){ - return $this->type; - } - - public function setChunkIndexInTokens($chunkIndex) { - foreach($this->sentences as $sentence) { - $sentence->setChunkIndexInTokens($chunkIndex); - } - } // setParentIndexesInTokens - - public function getSentenceByIndex($sentenceIndex){ - // sentenceIndex may be digit 0 - if( is_numeric($sentenceIndex) and ($sentenceIndex < count($this->sentences)) ) { - return $this->sentences[$sentenceIndex]; - } else { - return null; - } - } // getSentenceByIndex - -} - -class CclSentence{ - var $id; // optional - var $tokens = array(); - var $channels = array(); //key: annotation_type, value: last channel number - - - function setId($id){ - $this->id = $id; - } - - function addToken($token){ - assert('$token instanceof CclToken'); - $this->tokens[] = $token; - } - - function getTokens(){ - return $this->tokens; - } - - function getId(){ - return $this->id; - } - - function setChannel($type, $value){ - $this->channels[$type] = $value; - } - - function incChannel($type){ - if (!array_key_exists($type, $this->channels)) - $this->channels[$type]=1; - else $this->channels[$type]++; - } - - function fillChannel($type){ - foreach ($this->tokens as $token){ - $token->fillChannel($type); - } - } - - function getChannel($type){ - if ($type == null) return 0; - if (!array_key_exists($type, $this->channels)) - return 0; - else return $this->channels[$type]; - } - - public function setSentenceIndexInTokens($sentenceIndex) { - foreach($this->tokens as $token) { - $token->setParentSentenceIndex($sentenceIndex); - } - } // setSentenceIndexInTokens - - public function setChunkIndexInTokens($chunkIndex) { - foreach($this->tokens as $token) { - $token->setParentChunkIndex($chunkIndex); - } - } // setChunkIndexInTokens - -} - -class CclToken{ - var $id = null; - var $orth = null; - // If token is preceded by a white space - var $ns = false; - var $lexemes = array(); - var $from = null; - var $to = null; - private $parentSentenceIndex = null; - // parent sentence index in chunk sentences[] array - private $parentChunkIndex = null; - // parent chunk index in document chunks[] array - var $channels = array(); //same as in sentence, but with unique according number - var $prop = null; - - function setOrth($orth){ - $this->orth = $orth; - } - - function setNs($ns){ - $this->ns = $ns; - } - - function setId($id){ - $this->id = $id; - } - - function setFrom($from){ - $this->from = $from; - } - - function setTo($to){ - $this->to = $to; - } - - function setAnnotationLemma($annotation_lemma){ - $this->prop[$annotation_lemma["type"].":lemma"] = $annotation_lemma["lemma"]; - return true; - } - - function setAnnotationProperty($annotation_property){ - $this->prop[$annotation_property["type"].":".$annotation_property["name"]] = $annotation_property["value"]; - return true; - } - - function setAnnotation($annotation,$parentChannels = null){ - - $type = $annotation['type']; - if ($type=="sense"){ - /* - * Caution! Now WSD annotations are not part of any relations - * and all instances (even having more than 1 name in db) can - * be renumbered in 'sense' channel, e.g. - * [metrów] as wsd_m got number 6, but in db this instance - * was described also as wsd_metr (#3767), so there will be next - * assignment of channel number from parent sentence, which will be 7. - */ - - //if more than 1 annotation with the same name length covers one token (#3767): - if ($this->prop && (count($this->prop) == count($annotation['value'])) ){ - return false; - } - - else if (!$this->prop || (count($this->prop) < count($annotation['value'])) ){ - $this->prop = $annotation['value']; - } - - $this->channels[$type] = $annotation['id']; - } - else { - if (array_key_exists($type, $this->channels) && $this->channels[$type]!=0 ){ - return false; - } - - if (is_array($parentChannels) && !array_key_exists($type, $parentChannels) ){ - return false; - } - - $this->channels[$type] = $annotation['id']; - } - - return true; - } - - function setContinuousAnnotation2($type,$parentChannels = null){ - - // $parentChannels may be null or sth - if(!is_array($parentChannels)) - return false; - //annotation might exist in more than one sentence - if (!array_key_exists($type, $parentChannels) ) - return false; - $this->channels[$type] = $parentChannels[$type]; - return true; - } - - function fillChannel($type){ - if (!array_key_exists($type, $this->channels)) - $this->channels[$type]=0; - } - - function addLexeme($lexeme){ - $this->lexemes[] = $lexeme; - } - - function getOrth(){ - return $this->orth; - } - - function getNs(){ - return $this->ns; - } - - function getLexemes(){ - return $this->lexemes; - } - - function getChannels(){ - return $this->channels; - } - - function getChannel($type){ - if (!array_key_exists($type, $this->channels)) - return 0; - return $this->channels[$type]; - } - - function getId(){ - return $this->id; - } - - function getFrom(){ - return $this->from; - } - - function getTo(){ - return $this->to; - } - - function isIn($annotation){ - return ($this->from >= $annotation['from'] && $this->to <= $annotation['to']); - } - - /** - * Return base for first disamb lexem. - * If no dismb lexems is found then the base of a first lexem is returned. - */ - function getBase(){ - foreach ($this->lexemes as $lexem){ - if ($lexem->getDisamb()){ - return $lexem->getBase(); - } - } - if ( count($this->lexemes) > 0){ - return $this->lexemes[0]->getBase(); - } - return null; - } - - public function setParentSentenceIndex($parentSentenceIndex) { - $this->parentSentenceIndex = $parentSentenceIndex; - } // setParentSentenceIndex - - public function getParentSentenceIndex() { - return $this->parentSentenceIndex; - } // getParentSentenceIndex - - public function setParentChunkIndex($parentChunkIndex) { - $this->parentChunkIndex = $parentChunkIndex; - } // setParentChunkIndex - - public function getParentChunkIndex() { - return $this->parentChunkIndex; - } // getParentChunkIndex - -} - -class CclLexeme{ - var $disamb = null; - var $base = null; - var $ctag = null; - - function setDisamb($disamb){ - $this->disamb = $disamb; - } - - function setBase($base){ - $this->base = $base; - } - - function setCtag($ctag){ - $this->ctag = $ctag; - } - - function getDisamb(){ - return $this->disamb; - } - - function getBase(){ - return $this->base; - } - - function getCtag(){ - return $this->ctag; - } - -} - -class CclRelation{ - var $name = null; - var $set = null; - var $fromSentence = null; - var $fromChannel = null; - var $toSentence = null; - var $toChannel = null; - var $fromType = null; - var $toType = null; - - function getName(){ - return $this->name; - } - - function getSet(){ - return $this->set; - } - - function getFromSentence(){ - return $this->fromSentence; - } - - function getToSentence(){ - return $this->toSentence; - } - - function getFromChannel(){ - return $this->fromChannel; - } - - function getToChannel(){ - return $this->toChannel; - } - - function getFromType(){ - return $this->fromType; - } - - function getToType(){ - return $this->toType; - } - - function setName($name){ - $this->name = $name; - } - - function setSet($set){ - $this->set = $set; - } - - function setFromSentence($fromSentence){ - $this->fromSentence = $fromSentence; - } - - function setToSentence($toSentence){ - $this->toSentence = $toSentence; - } - - function setFromChannel($fromChannel){ - $this->fromChannel = $fromChannel; - } - - function setToChannel($toChannel){ - $this->toChannel = $toChannel; - } - - function setFromType($fromType){ - $this->fromType = $fromType; - } - - function setToType($toType){ - $this->toType = $toType; - } - -} - -class CclError{ - var $className = null; - var $functionName = null; - var $objects = array(); - var $comments = array(); - - - function setClassName($className){ - $this->className = $className; - } - - function setFunctionName($functionName){ - $this->functionName = $functionName; - } - - function addObject($key, $value){ - $this->objects[$key] = $value; - } - - function addComment($value){ - $this->comments[] = $value; - } - - - function getClassName(){ - return $this->className; - } - - function getFunctionName(){ - return $this->functionName; - } - - function getObjects(){ - return $this->objects; - } - - function getComments(){ - return $this->comments; - } - - - function __toString(){ - $str = "---------------------ERROR-------------------------\n"; - $str .= "class: {$this->className}\n"; - $str .= "function: {$this->functionName}\n"; - $str .= "comments: \n"; - foreach ($this->comments as $comment) - $str .= " $comment\n"; - $str .= "objects: \n"; - - foreach ($this->objects as $key=>$obj){ - if ($key=="token"){ - $str .= " Token:\n"; - $str .= " Orth: {$obj->getOrth()}\n"; - $str .= " From: {$obj->getFrom()}\n"; - $str .= " To : {$obj->getTo()}\n"; - } - elseif (strpos($key, "annotation") === 0){ - $str .= " Annotation:\n"; - $str .= " Key : $key \n"; - $str .= " Type: {$obj['type']}\n"; - $str .= " From: {$obj['from']}\n"; - $str .= " To : {$obj['to']}\n"; - $str .= " Text: {$obj['text']}\n"; - } - elseif ($key=="relation"){ - $str .= " Relation:\n"; - $str .= " Source id: {$obj['source_id']}\n"; - $str .= " Target id: {$obj['target_id']}\n"; - } - elseif ($key=="message"){ - $str .= "message: $obj"; - } - else { - $str .= " $key\n"; - $str .= " build your own user-friendly dump\n"; - } - } - return $str; - } - - - -} +} // CclDocument ?> diff --git a/engine/include/structs/CclError.php b/engine/include/structs/CclError.php new file mode 100644 index 00000000..fac4713b --- /dev/null +++ b/engine/include/structs/CclError.php @@ -0,0 +1,92 @@ +className = $className; + } + + function setFunctionName($functionName){ + $this->functionName = $functionName; + } + + function addObject($key, $value){ + $this->objects[$key] = $value; + } + + function addComment($value){ + $this->comments[] = $value; + } + + + function getClassName(){ + return $this->className; + } + + function getFunctionName(){ + return $this->functionName; + } + + function getObjects(){ + return $this->objects; + } + + function getComments(){ + return $this->comments; + } + + + function __toString(){ + $str = "---------------------ERROR-------------------------\n"; + $str .= "class: {$this->className}\n"; + $str .= "function: {$this->functionName}\n"; + $str .= "comments: \n"; + foreach ($this->comments as $comment) + $str .= " $comment\n"; + $str .= "objects: \n"; + + foreach ($this->objects as $key=>$obj){ + if ($key=="token"){ + $str .= " Token:\n"; + $str .= " Orth: {$obj->getOrth()}\n"; + $str .= " From: {$obj->getFrom()}\n"; + $str .= " To : {$obj->getTo()}\n"; + } + elseif (strpos($key, "annotation") === 0){ + $str .= " Annotation:\n"; + $str .= " Key : $key \n"; + $str .= " Type: {$obj['type']}\n"; + $str .= " From: {$obj['from']}\n"; + $str .= " To : {$obj['to']}\n"; + $str .= " Text: {$obj['text']}\n"; + } + elseif ($key=="relation"){ + $str .= " Relation:\n"; + $str .= " Source id: {$obj['source_id']}\n"; + $str .= " Target id: {$obj['target_id']}\n"; + } + elseif ($key=="message"){ + $str .= "message: $obj"; + } + else { + $str .= " $key\n"; + $str .= " build your own user-friendly dump\n"; + } + } + return $str; + } + +} // CclError class + +?> diff --git a/engine/include/structs/CclLexeme.php b/engine/include/structs/CclLexeme.php new file mode 100644 index 00000000..95c5f5b2 --- /dev/null +++ b/engine/include/structs/CclLexeme.php @@ -0,0 +1,40 @@ +disamb = $disamb; + } + + function setBase($base){ + $this->base = $base; + } + + function setCtag($ctag){ + $this->ctag = $ctag; + } + + function getDisamb(){ + return $this->disamb; + } + + function getBase(){ + return $this->base; + } + + function getCtag(){ + return $this->ctag; + } + +} // CclLexeme class + +?> diff --git a/engine/include/structs/CclRelation.php b/engine/include/structs/CclRelation.php new file mode 100644 index 00000000..e587c9b9 --- /dev/null +++ b/engine/include/structs/CclRelation.php @@ -0,0 +1,85 @@ +name; + } + + function getSet(){ + return $this->set; + } + + function getFromSentence(){ + return $this->fromSentence; + } + + function getToSentence(){ + return $this->toSentence; + } + + function getFromChannel(){ + return $this->fromChannel; + } + + function getToChannel(){ + return $this->toChannel; + } + + function getFromType(){ + return $this->fromType; + } + + function getToType(){ + return $this->toType; + } + + function setName($name){ + $this->name = $name; + } + + function setSet($set){ + $this->set = $set; + } + + function setFromSentence($fromSentence){ + $this->fromSentence = $fromSentence; + } + + function setToSentence($toSentence){ + $this->toSentence = $toSentence; + } + + function setFromChannel($fromChannel){ + $this->fromChannel = $fromChannel; + } + + function setToChannel($toChannel){ + $this->toChannel = $toChannel; + } + + function setFromType($fromType){ + $this->fromType = $fromType; + } + + function setToType($toType){ + $this->toType = $toType; + } + +} // CclRelation class + +?> diff --git a/engine/include/structs/CclSentence.php b/engine/include/structs/CclSentence.php new file mode 100644 index 00000000..cfce19db --- /dev/null +++ b/engine/include/structs/CclSentence.php @@ -0,0 +1,75 @@ +id = $id; + } + + function addToken($token){ + assert('$token instanceof CclToken'); + $this->tokens[] = $token; + } + + function getTokens(){ + return $this->tokens; + } + + function getId(){ + return $this->id; + } + + function setChannel($type, $value){ + $this->channels[$type] = $value; + } + + function incChannel($type){ + if (!array_key_exists($type, $this->channels)) + $this->channels[$type]=1; + else $this->channels[$type]++; + } + + function fillChannel($type){ + foreach ($this->tokens as $token){ + $token->fillChannel($type); + } + } + + function getChannel($type){ + if ($type == null) return 0; + if (!array_key_exists($type, $this->channels)) + return 0; + else return $this->channels[$type]; + } + + public function setSentenceIndexInTokens($sentenceIndex) { + foreach($this->tokens as $token) { + $token->setParentSentenceIndex($sentenceIndex); + } + } // setSentenceIndexInTokens + + public function setChunkIndexInTokens($chunkIndex) { + foreach($this->tokens as $token) { + $token->setParentChunkIndex($chunkIndex); + } + } // setChunkIndexInTokens + +} // CclSentence class + +?> diff --git a/engine/include/structs/CclStruct2.php b/engine/include/structs/CclStruct2.php deleted file mode 100644 index ffe213d0..00000000 --- a/engine/include/structs/CclStruct2.php +++ /dev/null @@ -1,124 +0,0 @@ -name = $name; - $this->value = $value; - } - - function getXml(){ - return " name}\">{$this->value}\n"; - } -} - -class CclLexem { - public $disamb = null; - public $base = null; - public $ctag = null; - function __construct($disamb, $base, $ctag){ - $this->disamb = $disamb; - $this->base = htmlspecialchars($base); - $this->ctag = $ctag; - } - - function getXml(){ - //return ""; - $xml = $this->disamb ? " \n" : " \n"; - $xml .= " {$this->base}\n"; - $xml .= " {$this->ctag}\n"; - return $xml . " \n"; - } -} - -class CclToken { - public $orth = null; - public $lexemes = null; - public $channels = null; - public $ns = null; - - function __construct($orth){ - $this->orth = htmlspecialchars($orth); - $this->lexemes = array(); - $this->channels = array(); - $this->ns = false; - } - - function getXml($channelTypes){ - $xml = " \n"; - $xml .= " {$this->orth}\n"; - foreach ($this->lexemes as $lexeme) - $xml .= $lexeme->getXml(); - - foreach ($channelTypes as $annType) - $xml .= $this->channels[$annType]->getXml(); - if ($this->ns) return $xml . " \n \n"; - return $xml . " \n"; - - } -} - -class CclSentence { - public $tokens = null; - public $channelTypes = null; - public $id = null; - - function __construct($id){ - $this->tokens = array(); - $this->channelTypes = array(); - $this->id = $id; - } - - function getXml(){ - $usedTypes = array_keys($this->channelTypes); - $xml = " id}\">\n"; - foreach ($this->tokens as $token) - $xml .= $token->getXml($usedTypes); - return $xml . " \n"; - } -} - -class CclChunk { - public $id = null; - public $sentences = null; - function __construct($id){ - $this->id = $id; - $this->sentences = array(); - } - - function getXml(){ - $xml = " id}\">\n"; - foreach ($this->sentences as $sentence) - $xml .= $sentence->getXml(); - return $xml . " \n"; - } -} - -class CclDocument { - public $chunks = null; - - function __construct(){ - $this->chunks = array(); - } - - function getXml(){ - $xml = "\n"; - foreach ($this->chunks as $chunk) - $xml .= $chunk->getXml(); - return $xml . "\n"; - - } - -} - - - -?> diff --git a/engine/include/structs/CclToken.php b/engine/include/structs/CclToken.php new file mode 100644 index 00000000..d8e97712 --- /dev/null +++ b/engine/include/structs/CclToken.php @@ -0,0 +1,189 @@ +orth = $orth; + } + + function setNs($ns){ + $this->ns = $ns; + } + + function setId($id){ + $this->id = $id; + } + + function setFrom($from){ + $this->from = $from; + } + + function setTo($to){ + $this->to = $to; + } + + function setAnnotationLemma($annotation_lemma){ + $this->prop[$annotation_lemma["type"].":lemma"] = $annotation_lemma["lemma"]; + return true; + } + + function setAnnotationProperty($annotation_property){ + $this->prop[$annotation_property["type"].":".$annotation_property["name"]] = $annotation_property["value"]; + return true; + } + + public function setAnnotation($annotation,$parentChannels = null){ + + $type = $annotation['type']; + if ($type=="sense"){ + /* + * Caution! Now WSD annotations are not part of any relations + * and all instances (even having more than 1 name in db) can + * be renumbered in 'sense' channel, e.g. + * [metrów] as wsd_m got number 6, but in db this instance + * was described also as wsd_metr (#3767), so there will be next + * assignment of channel number from parent sentence, which will be 7. + */ + + //if more than 1 annotation with the same name length covers one token (#3767): + if ($this->prop && (count($this->prop) == count($annotation['value'])) ){ + return false; + } + + else if (!$this->prop || (count($this->prop) < count($annotation['value'])) ){ + $this->prop = $annotation['value']; + } + } + else { + if (array_key_exists($type, $this->channels) && $this->channels[$type]!=0 ){ + return false; + } + + if (is_array($parentChannels) && !array_key_exists($type, $parentChannels) ){ + return false; + } + } + // add to typed channel and return true if not exited earlier + $this->channels[$type] = $annotation['id']; + return true; + } // setAnnotation() + + function setContinuousAnnotation2($type,$parentChannels = null){ + + // $parentChannels may be null or sth + if(!is_array($parentChannels)) + return false; + //annotation might exist in more than one sentence + if (!array_key_exists($type, $parentChannels) ) + return false; + $this->channels[$type] = $parentChannels[$type]; + return true; + } + + function fillChannel($type){ + if (!array_key_exists($type, $this->channels)) + $this->channels[$type]=0; + } + + function addLexeme($lexeme){ + $this->lexemes[] = $lexeme; + } + + function getOrth(){ + return $this->orth; + } + + function getNs(){ + return $this->ns; + } + + function getLexemes(){ + return $this->lexemes; + } + + function getChannels(){ + return $this->channels; + } + + function getChannel($type){ + if (!array_key_exists($type, $this->channels)) + return 0; + return $this->channels[$type]; + } + + function getId(){ + return $this->id; + } + + function getFrom(){ + return $this->from; + } + + function getTo(){ + return $this->to; + } + + function isIn($annotation){ + return ($this->from >= $annotation['from'] && $this->to <= $annotation['to']); + } + + /** + * Return base for first disamb lexem. + * If no dismb lexems is found then the base of a first lexem is returned. + */ + function getBase(){ + foreach ($this->lexemes as $lexem){ + if ($lexem->getDisamb()){ + return $lexem->getBase(); + } + } + if ( count($this->lexemes) > 0){ + return $this->lexemes[0]->getBase(); + } + return null; + } + + public function setParentSentenceIndex($parentSentenceIndex) { + $this->parentSentenceIndex = $parentSentenceIndex; + } // setParentSentenceIndex + + public function getParentSentenceIndex() { + return $this->parentSentenceIndex; + } // getParentSentenceIndex + + public function setParentChunkIndex($parentChunkIndex) { + $this->parentChunkIndex = $parentChunkIndex; + } // setParentChunkIndex + + public function getParentChunkIndex() { + return $this->parentChunkIndex; + } // getParentChunkIndex + +} // CclToken class + +?> diff --git a/engine/include/writers/CCclWriter.php b/engine/include/writers/CCclWriter.php index 42329bf5..7a8d8083 100644 --- a/engine/include/writers/CCclWriter.php +++ b/engine/include/writers/CCclWriter.php @@ -11,8 +11,26 @@ class CclWriter{ public static $CCL = 2; public static $REL = 3; + protected function formatPropToXML($propTable) { + + $xml = ""; // for no data + if (($propTable) && is_array($propTable)) { + foreach ($propTable as $key=>$val) { + if (strpos($val, ';;') !== FALSE){ + $values = explode(";;", $val); + $xml .= sprintf(" %s\n", htmlspecialchars(str_replace("lemma", "lval", $key)), htmlspecialchars($values[0])); + $xml .= sprintf(" %s\n", htmlspecialchars(str_replace("lemma", "val", $key)), htmlspecialchars($values[1])); + } else { + $xml .= sprintf(" %s\n", htmlspecialchars($key), htmlspecialchars($val)); + } + } // foreach + } // if is_array + return $xml; + + } // formatPropToXML() - static function write($ccl, $filename, $mode){ + private function makeXmlData($ccl,$mode) { + $xml = "\n"; $xml .= "\n"; @@ -38,17 +56,7 @@ static function write($ccl, $filename, $mode){ } foreach ($channels as $type=>$number) $xml .= " {$number}\n"; - if ($token->prop){ - foreach ($token->prop as $key=>$val){ - if (strpos($val, ';;') !== FALSE){ - $values = explode(";;", $val); - $xml .= sprintf(" %s\n", htmlspecialchars(str_replace("lemma", "lval", $key)), htmlspecialchars($values[0])); - $xml .= sprintf(" %s\n", htmlspecialchars(str_replace("lemma", "val", $key)), htmlspecialchars($values[1])); - } - else - $xml .= sprintf(" %s\n", htmlspecialchars($key), htmlspecialchars($val)); - } - } + $xml.= $this->formatPropToXML($token->prop); $xml .= $token->ns ? " \n \n" : " \n"; } $xml .= " \n"; @@ -73,13 +81,17 @@ static function write($ccl, $filename, $mode){ } if ($mode==self::$CCL || $mode==self::$CCLREL) $xml .= "\n"; - $handle = fopen($filename, "w"); - fwrite($handle, $xml); - fclose($handle); - } - - + + return $xml; + + } // makeXmlData() + + public function write($ccl, $filename, $mode){ + + (new FileWriter()) -> writeTextToFile($filename,$this->makeXmlData($ccl,$mode)); + + } // write() -} +} // CclWriter class ?> diff --git a/engine/templates/page_corpus_export.tpl b/engine/templates/page_corpus_export.tpl index 2c3fb395..a928d070 100644 --- a/engine/templates/page_corpus_export.tpl +++ b/engine/templates/page_corpus_export.tpl @@ -109,6 +109,7 @@