diff --git a/config.example.py b/config.example.py
index 256149d..6080917 100644
--- a/config.example.py
+++ b/config.example.py
@@ -3,13 +3,15 @@ from pathlib import Path
 # Add your botpassword and login here:
+from typing import List
+
 username = ""
 password = ""
 # Global settings
+loglevel = logging.WARNING
 wiki_user = "User:Username" # Change this to your username
-list_of_allowed_aliases = [] # Add elements like this ["API"]
-logging.basicConfig(level=logging.WARNING)
+list_of_allowed_aliases: List[str] = [] # Add elements like this ["API"]
 version = "0.2" # Don't touch this.
 wd_prefix = "http://www.wikidata.org/entity/"
 endpoint = "https://query.wikidata.org/sparql"
diff --git a/diagrams/classes.puml b/diagrams/classes.puml
new file mode 100644
index 0000000..3bf280d
--- /dev/null
+++ b/diagrams/classes.puml
@@ -0,0 +1,169 @@
+@startuml
+'https://plantuml.com/class-diagram
+
+abstract class BaseModel
+
+package wikimedia {
+    enum WikimediaLanguageCode {
+        BASQUE
+        BENGALI
+        BOKMÅL
+        CZECH
+        DANISH
+        ENGLISH
+        ESTONIAN
+        FRENCH
+        GERMAN
+        HEBREW
+        LATIN
+        MALAYALAM
+        RUSSIAN
+        SWEDISH
+    }
+    enum WikimediaLanguageQID {
+        BASQUE = "Q8752"
+        BENGALI = "Q9610"
+        BOKMÅL = "Q25167"
+        CZECH = "Q9056"
+        DANISH = "Q9035"
+        ENGLISH = "Q1860"
+        ESTONIAN = "Q9072"
+        FRENCH = "Q150"
+        GERMAN = "Q188"
+        HEBREW = "Q9288"
+        LATIN = "Q397"
+        MALAYALAM = "Q36236"
+        RUSSIAN = "Q7737"
+        SWEDISH = "Q9027"
+    }
+    package wikidata {
+        class Entity {
+            id: Optional[str]
+            label: str
+            upload_one_statement_to_wikidata()
+            url()
+        }
+        class EntityID{
+            letter: WikidataNamespaceLetters
+            rest: str
+            __init__()
+            __str__()
+        }
+        class ForeignID{
+            __init__()
+        }
+        class SparqlItem{
+            item: Value
+            itemLabel: Value
+            validate_qid_and_copy_label()
+        }
+        class Item{
+            label: Optional[str] = None
+            description: Optional[str] = None
+            aliases: Optional[List[str]] = None
+            __init__()
+            __str__()
+            parse_json()
+            parse_from_wdqs_json()
+            fetch_label_and_description_and_aliases()
+        }
+        enum WikidataGrammaticalFeature {
+            ACTIVE_VOICE
+            DEFINITE
+            GENITIVE_CASE
+            IMPERATIVE
+            INDEFINITE
+            INFINITIVE
+            NOMINATIVE_CASE
+            PASSIVE_VOICE
+            PLURAL
+            PRESENT_TENSE
+            PRETERITE
+            SIMPLE_PRESENT
+            SINGULAR
+            SUPINE
+            THIRD_PERSON_SINGULAR
+        }
+        enum WikidataLexicalCategory {
+            ADJECTIVE
+            ADVERB
+            AFFIX
+            NOUN
+            PROPER_NOUN
+            VERB
+        }
+        enum WikidataNamespaceLetters {
+            ITEM
+            LEXEME
+            PROPERTY
+        }
+    }
+}
+package items {
+    abstract class Items
+    class AcademicJournalItems {
+        fetch_based_on_label()
+    }
+    class RiksdagenDocumentItems {
+        +list
+        +fetch_based_on_label()
+    }
+
+    class ScholarlyArticleItems {
+        +list
+        +fetch_based_on_label()
+    }
+    class ThesisItems {
+        list
+        fetch_based_on_label()
+    }
+}
+class Suggestion {
+    item: Item = None
+    search_strings: List[str] = None
+    task: Task = None
+    args: argparse.Namespace = None
+    __init__()
+    __str__()
+    add_to_items()
+    extract_search_strings()
+    search_urls()
+}
+
+class Task {
+    best_practice_information: Union[str, None] = None
+    id: TaskIds = None
+    label: str = None
+    language_code: SupportedLanguageCode = None
+    number_of_queries_per_search_string = 1
+    __init__()
+    __str__()
+}
+
+class BatchJobs {
+job_count
+jobs: List[BatchJob]
+print_running_jobs()
+run_jobs()
+}
+
+class BatchJob {
+    +suggestion: Suggestion
+    +items: Items
+    run()
+}
+
+Items <|-- AcademicJournalItems
+Items <|-- RiksdagenDocumentItems
+Items <|-- ScholarlyArticleItems
+Items <|-- ThesisItems
+BaseModel <|-- Entity
+BaseModel <|-- Task
+BaseModel <|-- Suggestion
+BaseModel <|-- BatchJob
+BaseModel <|-- BatchJobs
+BaseModel <|-- Items
+Entity <|-- Item
+Item <|-- SparqlItem
+
+@enduml
\ No newline at end of file
diff --git a/diagrams/sequence_sparql.puml b/diagrams/sequence_sparql.puml
new file mode 100644
index 0000000..3f94e25
--- /dev/null
+++ b/diagrams/sequence_sparql.puml
@@ -0,0 +1,50 @@
+@startuml
+'https://plantuml.com/sequence-diagram
+
+autonumber
+actor User
+'cloud Wikidata
+User -> ItemSubjector : start script
+alt "arguments: sparql && limit"
+    ItemSubjector -> Wikidata : fetch subjects
+    Wikidata -> ItemSubjector : response
+    loop "for each item in list"
+        alt "below limit"
+            ItemSubjector -> Wikidata : fetch details about the item
+            Wikidata -> ItemSubjector : response
+            ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details
+            Wikidata -> ItemSubjector : response
+            ItemSubjector -> User : present max 50 items
+            ItemSubjector -> User : ask for approval of batch
+            ItemSubjector -> User : show count of batches and matches in the job list in memory
+        end
+        alt "above limit"
+            ItemSubjector -> User : ask before continuing
+        end
+    end
+    alt "user chooses not to continue"
+        ItemSubjector -> Wikidata : Upload main subjects to all matches
+    end
+end
+alt "arguments: sparql && limit && prepare-jobs"
+    ItemSubjector -> Wikidata : fetch subjects
+    Wikidata -> ItemSubjector : response
+    loop "for each item in list"
+        alt "below limit"
+            ItemSubjector -> Wikidata : fetch details about the item
+            Wikidata -> ItemSubjector : response
+            ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details
+            Wikidata -> ItemSubjector : response
+            ItemSubjector -> User : present max 50 items
+            ItemSubjector -> User : ask for approval of batch
+            ItemSubjector -> User : show count of batches and matches in the job list in memory
+        end
+        alt "above limit"
+            ItemSubjector -> User : ask before continuing
+        end
+    end
+    alt "user chooses not to continue"
+        ItemSubjector -> Wikidata : save to job list on disk
+    end
+end
+@enduml
\ No newline at end of file
diff --git a/fetch_main_subjects.py b/fetch_main_subjects.py
deleted file mode 100644
index 27e83cd..0000000
--- a/fetch_main_subjects.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import logging
-import random
-
-from wikibaseintegrator import wbi_config
-from wikibaseintegrator.wbi_helpers import execute_sparql_query
-
-import config
-from src import console
-from src.helpers.cleaning import strip_prefix
-from src.helpers.pickle import add_to_main_subject_pickle
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-wbi_config.config["USER_AGENT_DEFAULT"] = config.user_agent
-console.print("Fetching 100,000 main subjects")
-console.input("Press enter to continue")
-subjects = []
-# This offset ensures that we don't get
-# the same subset of subjects every time we run it
-randomizing_offset: int = random.randint(1, 500000)
-console.print(f"Random offset used: {randomizing_offset} for this run")
-for i in range(0+randomizing_offset, 100000+randomizing_offset, 10000):
-    print(i)
-    # title: Get main subjects used at least once on scholarly articles
-    results = execute_sparql_query(f"""
-SELECT ?subject
-WHERE
-{{
-{{
-SELECT DISTINCT ?subject WHERE {{
-  hint:Query hint:optimizer "None".
-  ?item wdt:P31 wd:Q13442814;
-        wdt:P921 ?subject.
-}}
-offset {i}
-limit 10000
-}}
-MINUS{{
-?item wdt:P31 wd:Q8054. # protein
-}}
-MINUS{{
-?item wdt:P279 wd:Q8054. # protein
-}}
-MINUS{{
-?item wdt:P31 wd:Q7187. # gene
-}}
-MINUS{{
-?item wdt:P279 wd:Q7187. # gene
-}}
-}}
-    """)
-    if len(results) == 0:
-        raise ValueError("No main subjects found")
-    else:
-        # print("adding lexemes to list")
-        # pprint(results.keys())
-        # pprint(results["results"].keys())
-        # pprint(len(results["results"]["bindings"]))
-        for result in results["results"]["bindings"]:
-            # print(result)
-            subjects.append(strip_prefix(result["subject"]["value"]))
-        # exit(0)
-console.print(f"{len(subjects)} fetched")
-console.print("Filtering out duplicates")
-subjects_without_duplicates = set()
-for subject in subjects:
-    subjects_without_duplicates.add(subject)
-console.print(f"Saving {len(subjects_without_duplicates)} "
-              f"to pickle '{config.main_subjects_pickle_file_path}' (overwriting)")
-add_to_main_subject_pickle(subjects)
-console.print("Done")
\ No newline at end of file
diff --git a/itemsubjector.py b/itemsubjector.py
index 344ed19..182c802 100644
--- a/itemsubjector.py
+++ b/itemsubjector.py
@@ -1,6 +1,6 @@
 import logging
-from src import *
+import src
 logging.basicConfig(level=logging.DEBUG)
-main()
\ No newline at end of file
+src.main()
diff --git a/requirements.txt b/requirements.txt
index bef4108..404a2cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 console-menu
 git+git://github.com/LeMyst/WikibaseIntegrator@v0.12.0.dev5#egg=wikibaseintegrator
 rich~=10.9.0
-SPARQLWrapper~=1.8.5
\ No newline at end of file
+SPARQLWrapper~=1.8.5
+pydantic
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
index 7223475..6b1a561 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1,30 +1,32 @@
 import argparse
 import logging
-from typing import List
-from wikibaseintegrator import wbi_login, wbi_config
-from wikibaseintegrator.wbi_helpers import execute_sparql_query
+import pandas as pd  # type: ignore
+from wikibaseintegrator import wbi_login, wbi_config  # type: ignore
+from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
 import config
 from src.helpers.argparse_setup import setup_argparse_and_return_args
 from src.helpers.cleaning import strip_prefix
-from src.helpers.console import console, print_found_items_table, ask_add_to_job_queue, print_running_jobs, \
+from src.helpers.console import console, print_found_items_table, ask_add_to_job_queue, \
     ask_yes_no_question, print_finished, \
     print_keep_an_eye_on_wdqs_lag, print_best_practice, print_job_statistics, ask_discard_existing_job_pickle
 from src.helpers.enums import TaskIds
-from src.helpers.jobs import process_qid_into_job, process_user_supplied_qids_into_batch_jobs, run_jobs, \
+from src.helpers.jobs import process_qid_into_job, process_user_supplied_qids_into_batch_jobs, \
     handle_job_preparation_or_run_directly_if_any_jobs, get_validated_main_subjects_as_jobs
 from src.helpers.menus import select_task
 from src.helpers.migration import migrate_pickle_detection
 from src.helpers.pickle import parse_job_pickle, remove_job_pickle, add_to_job_pickle, check_if_pickle_exists, \
-    parse_main_subjects_pickle, get_hash_of_job_pickle
+    get_hash_of_job_pickle
 from src.models.batch_job import BatchJob
-from src.models.quickstatements import QuickStatementsCommandVersion1
+from src.models.batch_jobs import BatchJobs
 from src.models.suggestion import Suggestion
 from src.models.task import Task
-from src.models.wikidata import Item, EntityID
+from src.models.wikimedia.wikidata.entiyt_id import EntityId
 from src.tasks import tasks
+logging.basicConfig(level=config.loglevel)
+
+
 def login():
     with console.status("Logging in with WikibaseIntegrator..."):
@@ -38,30 +40,12 @@ def login():
     wbi_config.config["USER_AGENT_DEFAULT"] = config.user_agent
-def match_existing_main_subjects(args: argparse.Namespace = None,
-                                 jobs: List[BatchJob] = None):
-    if jobs is None:
-        raise ValueError("jobs was None")
-    if not isinstance(jobs, List):
-        raise ValueError("jobs was not a list")
-    with console.status("Reading the main subjects file into memory"):
-        main_subjects = parse_main_subjects_pickle()
-    # raise Exception("debug exit")
-    jobs = get_validated_main_subjects_as_jobs(args=args,
-                                               main_subjects=main_subjects,
-                                               jobs=jobs)
-    handle_job_preparation_or_run_directly_if_any_jobs(args=args, jobs=jobs)
-
-
-def match_main_subjects_from_sparql(args: argparse.Namespace = None,
-                                    jobs: List[BatchJob] = None):
+def match_main_subjects_from_sparql(args: argparse.Namespace = None):
     """Collect subjects via SPARQL and call get_validated_main_subjects()
     If we get any validated jobs we handle them"""
     logger = logging.getLogger(__name__)
-    if jobs is None:
-        raise ValueError("jobs was None")
-    if not isinstance(jobs, List):
-        raise ValueError("jobs was not a list")
+    if args is None or args.sparql is None:
+        raise ValueError("args.sparql was None")
     if "P1889" not in args.sparql:
         console.print("Your SPARQL did not contain P1889 (different from). "
                       "Please include 'MINUS {?item wdt:P1889 [].}' "
@@ -78,12 +62,9 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None,
             main_subjects.append(item_json["item"]["value"])
     if len(main_subjects) > 0:
         console.print(f"Got {len(main_subjects)} results")
-        jobs = get_validated_main_subjects_as_jobs(
-            args=args,
-            main_subjects=main_subjects,
-            jobs=jobs
-        )
-        handle_job_preparation_or_run_directly_if_any_jobs(args=args, jobs=jobs)
+        batchjobs = get_validated_main_subjects_as_jobs(args=args,
+                                                        main_subjects=main_subjects)
+        handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=batchjobs)
     else:
         console.print("Got 0 results. Try another query or debug it using --debug")
@@ -91,16 +72,15 @@ def export_jobs_to_dataframe():
     logger = logging.getLogger(__name__)
     logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame")
-    jobs = parse_job_pickle()
-    if jobs is not None:
-        number_of_jobs = len(jobs)
-        if jobs is not None and number_of_jobs > 0:
-            logger.info(f"Found {number_of_jobs} jobs")
+    batchjobs = parse_job_pickle()
+    if batchjobs is not None:
+        if batchjobs is not None and batchjobs.job_count > 0:
+            logger.info(f"Found {batchjobs.job_count} jobs")
             df = pd.DataFrame()
             count = 1
-            for job in jobs:
+            for job in batchjobs.jobs:
                 count += 1
-                logger.info(f"Working on job {count}/{number_of_jobs}")
+                logger.info(f"Working on job {count}/{batchjobs.job_count}")
                 job_df = pd.DataFrame()
                 for item in job.items.list:
                     job_df = job_df.append(pd.DataFrame(data=[dict(
@@ -117,36 +97,11 @@
     else:
         console.print("No jobs found. Create a job list first by using '--prepare-jobs'")
-def export_jobs_to_quickstatements():
-    logger = logging.getLogger(__name__)
-    logger.info("Exporting jobs to QuickStatements V1 commands. One file for each job.")
-    jobs = parse_job_pickle()
-    if jobs is not None and len(jobs) > 0:
-        for job in jobs:
-            # Convert all items
-            lines = []
-            for item in job.items.list:
-                line = QuickStatementsCommandVersion1(
-                    target=EntityID(item.id),
-                    property=EntityID("P921"),
-                    value=EntityID(job.suggestion.item.id),
-                )
-                lines.append(line)
-            logger.debug(f"Got {len(lines)} QS lines to export")
-            filename = (f"quickstatements-export-"
-                        f"{job.suggestion.item.id}-"
-                        f"{job.suggestion.item.label}.csv")
-            with open(filename, "w") as file:
-                for line in lines:
-                    file.write(f"{str(line)}\n")
-            console.print(f"Wrote to {filename} in the current directory")
-
 def main():
     """This is the main function that makes everything else happen"""
     logger = logging.getLogger(__name__)
     migrate_pickle_detection()
-    jobs: List[BatchJob] = []
     args = setup_argparse_and_return_args()
     # console.print(args.list)
     if args.remove_prepared_jobs is True:
@@ -156,33 +111,22 @@ def main():
     if args.prepare_jobs is True:
         logger.info("Preparing jobs")
         if check_if_pickle_exists(config.job_pickle_file_path):
-            if not ask_discard_existing_job_pickle():
-                # the default is yes
-                # to avoid running batches multiple times by
-                # mistake (which does not harm Wikidata, but waste
-                # precious computing resources which we want to avoid.)
-                jobs = parse_job_pickle(silent=True)
-                if len(jobs) > 0:
-                    console.print(f"Found and loaded {len(jobs)} "
-                                  f"jobs with a total of "
-                                  f"{sum(len(job.items.list) for job in jobs)} items")
-            remove_job_pickle(silent=True)
+            if ask_discard_existing_job_pickle():
+                remove_job_pickle(silent=True)
+            else:
+                console.print("Quitting.")
     if args.run_prepared_jobs is True:
         logger.info("Running prepared jobs")
-        jobs = parse_job_pickle()
-        if jobs is not None and len(jobs) > 0:
+        batchjobs = parse_job_pickle()
+        if batchjobs is not None and len(batchjobs.jobs) > 0:
             file_hash = get_hash_of_job_pickle()
-            run_jobs(jobs)
+            batchjobs.run_jobs()
             # Remove the pickle afterwards
             remove_job_pickle(hash=file_hash)
-    if args.export_job_list_to_quickstatements:
-        export_jobs_to_quickstatements()
     elif args.export_jobs_to_dataframe:
         export_jobs_to_dataframe()
-    elif args.match_existing_main_subjects is True:
-        match_existing_main_subjects(args=args, jobs=jobs)
     elif args.sparql:
-        match_main_subjects_from_sparql(args=args, jobs=jobs)
+        match_main_subjects_from_sparql(args=args)
     else:
         # if not args.run_prepared_jobs:
         if args.add is None:
@@ -191,8 +135,10 @@ def main():
             task: Task = select_task()
             if task is None:
                 raise ValueError("Got no task")
+            jobs = []
             jobs.extend(process_user_supplied_qids_into_batch_jobs(args=args, task=task))
-            handle_job_preparation_or_run_directly_if_any_jobs(args=args, jobs=jobs)
+            batchjobs = BatchJobs(jobs=jobs)
+            handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=batchjobs)
 if __name__ == "__main__":
diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py
index 3b9cfb0..2d24c52 100644
--- a/src/helpers/argparse_setup.py
+++ b/src/helpers/argparse_setup.py
@@ -98,12 +98,6 @@ def setup_argparse_and_return_args():
         type=int,
         help='When working on SPARQL queries of e.g. galaxies, match more until this many matches are in the job list'
     )
-    parser.add_argument(
-        '--export-job-list-to-quickstatements', '-qs',
-        action='store_true',
-        help='Export the prepared job list to QuickStatements.',
-        default=False
-    )
     parser.add_argument(
         '--export-jobs-to-dataframe',
         action='store_true',
diff --git a/src/helpers/cleaning.py b/src/helpers/cleaning.py
index b15ffec..12de0a9 100644
--- a/src/helpers/cleaning.py
+++ b/src/helpers/cleaning.py
@@ -5,17 +5,17 @@ def strip_bad_chars(string):
     # https://stackoverflow.com/questions/3411771/best-way-to-replace-multiple-characters-in-a-string
     return (
         string
-        # Needed for matching backslashes e.g. "Dmel\CG5330" on Q29717230
-        .replace("\\", "\\\\")
-        # Needed for when labels contain apostrophe
-        .replace("'", "\\'")
-        .replace(",", "")
-        .replace(":", "")
-        .replace(";", "")
-        .replace("(", "")
-        .replace(")", "")
-        .replace("[", "")
-        .replace("]", "")
+            # Needed for matching backslashes e.g. "Dmel\CG5330" on Q29717230
+            .replace("\\", "\\\\")
+            # Needed for when labels contain apostrophe
+            .replace("'", "\\'")
+            .replace(",", "")
+            .replace(":", "")
+            .replace(";", "")
+            .replace("(", "")
+            .replace(")", "")
+            .replace("[", "")
+            .replace("]", "")
     )
@@ -30,4 +30,4 @@ def strip_prefix(qid):
     if "http://www.wikidata.org/entity/" in qid:
         qid = qid[31:]
     # logger.debug(f"qid:{qid}")
-    return qid
\ No newline at end of file
+    return qid
diff --git a/src/helpers/console.py b/src/helpers/console.py
index 6a19f23..8d11e89 100644
--- a/src/helpers/console.py
+++ b/src/helpers/console.py
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import argparse
-from typing import List
+from typing import List, TYPE_CHECKING
 from urllib.parse import quote
 from rich.console import Console
@@ -7,8 +9,11 @@
 from src.helpers.cleaning import clean_rich_formatting
 from src.models.batch_job import BatchJob
-from src.models.task import Task
-from src.models.wikidata import Items
+from src.models.batch_jobs import BatchJobs
+
+if TYPE_CHECKING:
+    from src.models.items import Items
+    from src.models.task import Task
 console = Console()
@@ -74,6 +79,8 @@ def print_found_items_table(args: argparse.Namespace = None,
         raise ValueError("args was None")
     if items is None:
         raise ValueError("items was None")
+    if items.list is None:
+        raise ValueError("items.list was None")
     table = Table(title="Matched items found")
     if len(items.list) < 1000:
         list_to_show = items.list[0:50]
@@ -89,6 +96,8 @@
     if args.show_item_urls:
         table.add_column(f"Wikidata URL")
     for item in list_to_show:
+        if item.label is None:
+            raise ValueError("item.label was None")
         if args.show_item_urls:
             label = clean_rich_formatting(item.label)
             table.add_row(label, item.url())
@@ -98,37 +107,41 @@
 def ask_add_to_job_queue(job: BatchJob = None):
+    if job is None:
+        raise ValueError("job was None")
+    if job.suggestion.item is None:
+        raise ValueError("job.suggestion.item was None")
+    if job.suggestion.item.label is None:
+        raise ValueError("job.suggestion.item.label was None")
+    if job.suggestion.item.description is None:
+        raise ValueError("job.suggestion.item.description was None")
+    if job.items.list is None:
+        raise ValueError("job.items.list was None")
     return ask_yes_no_question(f"Do you want to add this job for "
                                f"[magenta]{job.suggestion.item.label}: "
                                f"{job.suggestion.item.description}[/magenta] with "
                                f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})")
-def print_running_jobs(jobs: List[BatchJob] = None):
-    if jobs is None:
-        raise ValueError("jobs was None")
-    console.print(f"Running {len(jobs)} job(s) with a total of "
-                  f"{sum(len(job.items.list) for job in jobs)} items "
-                  f"non-interactively now. You can take a "
-                  f"coffee break and lean back :)")
-
-
 def print_finished():
     console.print("All jobs finished successfully")
-def print_job_statistics(jobs: List[BatchJob] = None):
-    if jobs is None:
+def print_job_statistics(batchjobs: BatchJobs = None):
+    if batchjobs is None:
         raise ValueError("jobs was None")
-    if len(jobs) == 0:
+    if batchjobs.jobs is None:
+        raise ValueError("batchjobs.jobs was None")
+    if not isinstance(batchjobs.jobs, list):
+        raise ValueError("jobs was not a list")
+    if len(batchjobs.jobs) == 0:
         console.print("The jobs list is empty")
     else:
-        console.print(f"The jobs list now contain a total of {len(jobs)} "
+        console.print(f"The jobs list now contains a total of {len(batchjobs.jobs)} "
                       f"jobs with a total of "
-                      f"{sum(len(job.items.list) for job in jobs)} items")
+                      f"{sum(len(job.items.list) for job in batchjobs.jobs if batchjobs.jobs is not None and job is not None)} items")
 def ask_discard_existing_job_pickle():
     return ask_yes_no_question("A prepared list of jobs already exist, "
-                               "do you want to overwrite it? "
-                               "(pressing no will append to it)")
+                               "do you want to delete it?")
diff --git a/src/helpers/enums.py b/src/helpers/enums.py
index bb7fff0..be9cd65 100644
--- a/src/helpers/enums.py
+++ b/src/helpers/enums.py
@@ -10,4 +10,4 @@ class TaskIds(Enum):
     SCHOLARLY_ARTICLES = auto()
     RIKSDAGEN_DOCUMENTS = auto()
     THESIS = auto()
-    ACADEMIC_JOURNALS = auto()
\ No newline at end of file
+    ACADEMIC_JOURNALS = auto()
diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py
index b0ed45a..761df9c 100644
--- a/src/helpers/jobs.py
+++ b/src/helpers/jobs.py
@@ -4,38 +4,42 @@
 import logging
 import random
 from datetime import datetime
-from typing import Union, List, TYPE_CHECKING
+from typing import Union, List, TYPE_CHECKING, Optional
 from src import strip_prefix, print_best_practice, console, ask_yes_no_question, \
-    TaskIds, print_found_items_table, ask_add_to_job_queue, print_keep_an_eye_on_wdqs_lag, print_running_jobs, \
-    print_finished, print_job_statistics
+    TaskIds, print_found_items_table, ask_add_to_job_queue, print_keep_an_eye_on_wdqs_lag, print_finished, \
+    print_job_statistics
 from src.helpers.menus import select_task
-from src.models.academic_journals import AcademicJournalItems
-from src.models.riksdagen_documents import RiksdagenDocumentItems
-from src.models.scholarly_articles import ScholarlyArticleItems
-from src.models.thesis import ThesisItems
-from src.tasks import tasks, Task
+from src.models.batch_jobs import BatchJobs
+from src.models.items import Items
+from src.models.items.academic_journals import AcademicJournalItems
+from src.models.items.riksdagen_documents import RiksdagenDocumentItems
+from src.models.items.scholarly_articles import ScholarlyArticleItems
+from src.models.items.thesis import ThesisItems
+from src.tasks import Task
 if TYPE_CHECKING:
     from src import Task, BatchJob
+# TODO rewrite as OOP
+logger = logging.getLogger(__name__)
+
 def process_qid_into_job(qid: str = None,
                          task: Task = None,
                          args: argparse.Namespace = None,
                          confirmation: bool = False) -> Union[BatchJob, None]:
-    # logger = logging.getLogger(__name__)
     if qid is None:
         raise ValueError("qid was None")
     if args is None:
         raise ValueError("args was None")
     if task is None:
         raise ValueError("task was None")
-    from src import Item
+    from src.models.wikimedia.wikidata.item import Item
     item = Item(
         id=strip_prefix(qid),
-        task=task
     )
+    item.fetch_label_and_description_and_aliases(task=task)
     if item.label is not None:
         console.print(f"Working on {item}")
         # generate suggestion with all we need
@@ -49,10 +53,14 @@ def process_qid_into_job(qid: str = None,
             answer = ask_yes_no_question("Do you want to continue?")
             if not answer:
                 return None
+        suggestion.extract_search_strings()
+        if suggestion.search_strings is None:
+            raise ValueError("suggestion.search_strings was None")
         with console.status(f'Fetching items with labels that have one of '
                             f'the search strings by running a total of '
                             f'{len(suggestion.search_strings) * task.number_of_queries_per_search_string} '
                             f'queries on WDQS...'):
+            items: Optional[Items] = None
             if task.id == TaskIds.SCHOLARLY_ARTICLES:
                 items = ScholarlyArticleItems()
             elif task.id == TaskIds.RIKSDAGEN_DOCUMENTS:
@@ -65,7 +73,13 @@ def process_qid_into_job(qid: str = None,
                 raise ValueError(f"{task.id} was not recognized")
             items.fetch_based_on_label(suggestion=suggestion,
                                        task=task)
+        if items.list is None:
+            raise ValueError("items.list was None")
         if len(items.list) > 0:
+            # Remove duplicates
+            logger.warning(f"{len(items.list)} before duplicate removal")
+            items.list = list(set(items.list))
+            logger.warning(f"{len(items.list)} after duplicate removal")
             # Randomize the list
             items.random_shuffle_list()
             print_found_items_table(args=args,
@@ -75,14 +89,13 @@ def process_qid_into_job(qid: str = None,
                 items=items,
                 suggestion=suggestion
             )
-            answer = ask_add_to_job_queue(job)
-            if answer:
-                return job
+            return job
         else:
             console.print("No matching items found")
             return None
     else:
         console.print(f"Label for {task.language_code} was None on {item.url()}, skipping")
+        return None
 def process_user_supplied_qids_into_batch_jobs(args: argparse.Namespace = None,
@@ -105,52 +118,33 @@ def process_user_supplied_qids_into_batch_jobs(args: argparse.Namespace = None,
     return jobs
-def run_jobs(jobs: List[BatchJob] = None):
-    if jobs is None:
-        raise ValueError("jobs was None")
-    print_keep_an_eye_on_wdqs_lag()
-    from src import login
-    login()
-    print_running_jobs(jobs)
-    count = 0
-    start_time = datetime.now()
-    for job in jobs:
-        count += 1
-        job.run(jobs=jobs, job_count=count)
-        console.print(f"runtime until now: {datetime.now() - start_time}")
-    print_finished()
-    end_time = datetime.now()
-    console.print(f'Total runtime: {end_time - start_time}')
 def handle_job_preparation_or_run_directly_if_any_jobs(args: argparse.Namespace = None,
-                                                        jobs: List[BatchJob] = None):
-    if len(jobs) > 0:
+                                                        batchjobs: BatchJobs = None):
+    if batchjobs is None:
+        raise ValueError("batchjobs was None")
+    if args is None:
+        raise ValueError("args was None")
+    if len(batchjobs.jobs) > 0:
         if args.prepare_jobs:
-            console.print(f"Adding {len(jobs)} job(s) to the jobs file")
-            for job in jobs:
+            console.print(f"Adding {len(batchjobs.jobs)} job(s) to the jobs file")
+            for job in batchjobs.jobs:
                 from src import add_to_job_pickle
                 add_to_job_pickle(job)
-            print_job_statistics(jobs=jobs)
+            print_job_statistics(batchjobs=batchjobs)
             console.print(f"You can run the jobs "
                           f"non-interactively e.g. on the Toolforge "
                           f"Kubernetes cluster using -r or --run-prepared-jobs. "
                           f"See Kubernetes_HOWTO.md for details.")
         else:
-            run_jobs(jobs)
+            batchjobs.run_jobs()
-def get_validated_main_subjects_as_jobs(
-        args: argparse.Namespace = None,
-        main_subjects: List[str] = None,
-        jobs: List[BatchJob] = None
-) -> List[BatchJob]:
+def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None,
+                                        main_subjects: List[str] = None) -> BatchJobs:
     """This function randomly picks a subject and present it for validation"""
     logger = logging.getLogger(__name__)
-    if jobs is None:
-        raise ValueError("jobs was None")
-    if not isinstance(jobs, List):
-        raise ValueError("jobs was not a list")
     if args is None:
         raise ValueError("args was None")
     if main_subjects is None:
@@ -161,6 +155,7 @@ def get_validated_main_subjects_as_jobs(
         raise ValueError("Got no task")
     if not isinstance(task, Task):
         raise ValueError("task was not a Task object")
+    batchjobs = BatchJobs(jobs=[])
     while True:
         # Check if we have any subjects left in the list
         if len(subjects_not_picked_yet) > 0:
@@ -173,13 +168,21 @@ def get_validated_main_subjects_as_jobs(
                 args=args,
                 confirmation=args.no_confirmation)
             if job is not None:
-                jobs.append(job)
-                logger.debug(f"joblist now has {len(jobs)} jobs")
-            print_job_statistics(jobs=jobs)
+                if args.no_ask_match_more_limit is None:
+                    answer = ask_add_to_job_queue(job)
+                    if answer:
+                        batchjobs.jobs.append(job)
+                else:
+                    batchjobs.jobs.append(job)
+                logger.debug(f"joblist now has {len(batchjobs.jobs)} jobs")
+            print_job_statistics(batchjobs=batchjobs)
             if len(subjects_not_picked_yet) > 0:
                 if (
                         args.no_ask_match_more_limit is None or
-                        args.no_ask_match_more_limit < sum(len(job.items.list) for job in jobs)
+                        args.no_ask_match_more_limit < sum(
+                            len(job.items.list) for job in batchjobs.jobs
+                            if job.items.list is not None
+                        )
                 ):
                     answer_was_yes = ask_yes_no_question("Match one more?")
                     if not answer_was_yes:
@@ -190,4 +193,11 @@ def get_validated_main_subjects_as_jobs(
         else:
             console.print("No more subjects in the list. Exiting.")
             break
-    return jobs
+    if args.no_ask_match_more_limit is not None:
+        batchjobs_limit = BatchJobs(jobs=[])
+        for job in batchjobs.jobs:
+            answer = ask_add_to_job_queue(job)
+            if answer:
+                batchjobs_limit.jobs.append(job)
+        return batchjobs_limit
+    return batchjobs
diff --git a/src/helpers/menus.py b/src/helpers/menus.py
index a35973e..26ee8b2 100644
--- a/src/helpers/menus.py
+++ b/src/helpers/menus.py
@@ -1,11 +1,11 @@
 import logging
 from typing import List
-from consolemenu import SelectionMenu
+from consolemenu import SelectionMenu  # type: ignore
 from src.models.suggestion import Suggestion
-from src.models.wikidata import Item
-from src.tasks import tasks, Task
+from src.models.wikimedia.wikidata.item import Item
+from src.tasks import Task
 def select_suggestion(suggestions: List[Suggestion] = None,
@@ -20,7 +20,9 @@ def select_suggestion(suggestions: List[Suggestion] = None,
     selected_suggestion = None
     if selected_index > (len(suggestions) - 1):
         logger.debug("The user choose to skip")
+        return None
     else:
+        from src.tasks import tasks
         selected_suggestion = tasks[selected_index]
         logger.debug(f"selected:{selected_index}="
                      f"{selected_suggestion}")
@@ -29,7 +31,10 @@ def select_suggestion(suggestions: List[Suggestion] = None,
 def select_task() -> Task:
     logger = logging.getLogger(__name__)
-    menu = SelectionMenu(tasks, "Select a task")
+    from src.tasks import tasks
+    labels = list([task.label for task in tasks])
+    # console.print(labels)
+    menu = SelectionMenu(labels, "Select a task")
     menu.show()
     menu.join()
     task_index = menu.selected_option
@@ -41,7 +46,6 @@ def select_task() -> Task:
                  f"{selected_task}")
     return selected_task
-
 # def select_language():
 #     logger = logging.getLogger(__name__)
 #     menu = SelectionMenu(WikimediaLanguageCode.__members__.keys(), "Select a language")
@@ -68,4 +72,4 @@ def select_task() -> Task:
 #     selected_lexical_category = category_mapping[selected_lexical_category_index]
 #     logger.debug(f"selected:{selected_lexical_category_index}="
 #                  f"{selected_lexical_category}")
-#     return selected_lexical_category
\ No newline at end of file
+#     return selected_lexical_category
diff --git a/src/helpers/migration.py b/src/helpers/migration.py
index 084b6c4..bbcbc88 100644
--- a/src/helpers/migration.py
+++ b/src/helpers/migration.py
@@ -6,9 +6,6 @@ def migrate_pickle_detection():
         if config.job_pickle_file_path is None:
             raise ValueError("the variable job_pickle_file_path in config "
                              "has to contain a string like 'pickle.dat'")
-        if config.main_subjects_pickle_file_path is None:
-            raise ValueError("The variable main_subjects_pickle_file_path"
-                             "is None, see config.example.py")
     except AttributeError:
         raise ValueError("You need to migrate the new pickle variables"
                          "in config.example.py to your config.py before "
diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py
index f2a5085..42bb448 100644
--- a/src/helpers/pickle.py
+++ b/src/helpers/pickle.py
@@ -1,12 +1,15 @@
-import os
 import hashlib
+import os
 import pickle
-from typing import List
+from typing import List, Optional
 import config
 from src.helpers.console import console
 from src.models.batch_job import BatchJob
+# TODO rewrite as OOP
+from src.models.batch_jobs import BatchJobs
+
 def add_to_job_pickle(job: BatchJob = None):
     if job is None:
@@ -16,12 +19,6 @@ def add_to_job_pickle(job: BatchJob = None):
         pickle.dump(job, file, pickle.DEFAULT_PROTOCOL)
-def add_to_main_subject_pickle(subjects: List[str] = None):
-    with open(config.main_subjects_pickle_file_path, 'wb') as file:
-        for qid in subjects:
-            pickle.dump(qid, file, pickle.DEFAULT_PROTOCOL)
-
-
 def read_from_pickle(path):
     with open(path, 'rb') as file:
         try:
@@ -38,7 +35,7 @@ def check_if_pickle_exists(path):
         return False
-def parse_job_pickle(silent: bool = False) -> List[BatchJob]:
+def parse_job_pickle(silent: bool = False) -> Optional[BatchJobs]:
     """Reads the pickle into a list of batch jobs"""
     if check_if_pickle_exists(config.job_pickle_file_path):
         jobs: List[BatchJob] = []
@@ -47,28 +44,13 @@ def parse_job_pickle(silent: bool = False) -> List[BatchJob]:
         if len(jobs) == 0:
             if not silent:
                 console.print("No prepared jobs found")
+            return None
         else:
-            return jobs
+            return BatchJobs(jobs=jobs)
     else:
         if not silent:
             console.print("No pickle file found")
-
-
-def parse_main_subjects_pickle() -> List[str]:
-    """Reads the pickle into a list of main subjects"""
-    if check_if_pickle_exists(config.main_subjects_pickle_file_path):
-        subjects = []
-        for subject in read_from_pickle(config.main_subjects_pickle_file_path):
-            subjects.append(subject)
-        if len(subjects) == 0:
-            console.print("No qids found in the pickle.")
-        else:
-            # print(f"found:{subjects}")
-            return subjects
-    else:
-        console.print("No main subjects pickle file found. "
-                      "Create it by running 'python fetch_main_subjects.py'")
-        exit(0)
+        return None
 def remove_job_pickle(silent: bool = False,
diff --git a/src/models/batch_job.py b/src/models/batch_job.py
index b96c5b4..e5cf6ec 100644
--- a/src/models/batch_job.py
+++ b/src/models/batch_job.py
@@ -1,21 +1,13 @@
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import List, TYPE_CHECKING
+from typing import List
-if TYPE_CHECKING:
-    from src.models.suggestion import Suggestion
-    from src.models.wikidata import Items
+from pydantic import BaseModel
+from src.models.items import Items
+from src.models.suggestion import Suggestion
-@dataclass
-class BatchJob:
+
+class BatchJob(BaseModel):
     """Models a batch job intended to be run non-interactively"""
     suggestion: Suggestion
     items: Items
-    def run(self, jobs: List[BatchJob], job_count: int = None):
-        if jobs is None:
-            raise ValueError("jobs was None")
-        if job_count is None:
-            raise ValueError("job count was None")
-        self.suggestion.add_to_items(items=self.items, jobs=jobs, job_count=job_count)
diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py
new file mode 100644
index 0000000..820e931
--- /dev/null
+++ b/src/models/batch_jobs.py
@@ -0,0 +1,38 @@
+from datetime import datetime
+from typing import List
+
+from pydantic import BaseModel
+
+from src.models.batch_job import BatchJob
+
+
+class BatchJobs(BaseModel):
+    jobs: List[BatchJob]
+
+    @property
+    def job_count(self):
+        return len(self.jobs)
+
+    def print_running_jobs(self):
+        if not isinstance(self.jobs, list):
+            raise ValueError("jobs is not a list")
+        from src.helpers.console import console
+        console.print(f"Running {len(self.jobs)} job(s) with a total of "
+                      f"{sum(len(job.items.list) for job in self.jobs if job.items.list is not None)} items "
+                      f"non-interactively now. You can take a "
+                      f"coffee break and lean back :)")
+
+    def run_jobs(self):
+        from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag, print_finished
+        if self.jobs is None or len(self.jobs) == 0:
+            raise ValueError("did not get what we need")
+        print_keep_an_eye_on_wdqs_lag()
+        from src import login
+        login()
+        self.print_running_jobs()
+        start_time = datetime.now()
+        for job in self.jobs:
+            job.suggestion.add_to_items(items=job.items, jobs=self.jobs, job_count=self.job_count)
+        print_finished()
+        end_time = datetime.now()
+        console.print(f'Total runtime: {end_time - start_time}')
diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py
new file mode 100644
index 0000000..1b5dd9e
--- /dev/null
+++ b/src/models/items/__init__.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+import random
+from typing import List, TYPE_CHECKING, Optional
+
+from pydantic import BaseModel
+
+from src.models.task import Task
+from src.models.wikimedia.wikidata.sparql_item import SparqlItem
+
+if TYPE_CHECKING:
+    from src.models.suggestion import Suggestion
+
+
+class Items(BaseModel):
+    list: Optional[List[SparqlItem]]
+
+    def fetch_based_on_label(self,
+                             suggestion: Suggestion = None,
+                             task: Task = None):
+        pass
+
+    def random_shuffle_list(self):
+        random.shuffle(self.list)
diff --git a/src/models/academic_journals.py b/src/models/items/academic_journals.py
similarity index 70%
rename from src/models/academic_journals.py
rename to src/models/items/academic_journals.py
index 4ecf2cb..1b14772 100644
--- a/src/models/academic_journals.py
+++ b/src/models/items/academic_journals.py
@@ -1,22 +1,14 @@
 import logging
-from wikibaseintegrator.wbi_helpers import execute_sparql_query
+from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
 import config
 from src.helpers.cleaning import strip_bad_chars
 from src.helpers.console import console
 from src.models.suggestion import Suggestion
 from src.models.task import Task
-from src.models.wikidata import Items, Item
-
-
-def process_results(results):
-    items = []
-    for item_json in results["results"]["bindings"]:
-        logging.debug(f"item_json:{item_json}")
-        item = Item(json=item_json)
-        items.append(item)
-    return items
+from src.models.wikimedia.wikidata.item import Item
+from src.models.items import Items
 class AcademicJournalItems(Items):
@@ -25,11 +17,30 @@ class AcademicJournalItems(Items):
     def fetch_based_on_label(self,
                              suggestion: Suggestion = None,
                              task: Task = None):
+        def process_results(results):
+            # TODO refactor into private method
+            items = []
+            for item_json in results["results"]["bindings"]:
+                logging.debug(f"item_json:{item_json}")
+                item = Item(json=item_json)
+                items.append(item)
+            return items
+
         # logger = logging.getLogger(__name__)
         if suggestion is None:
             raise ValueError("suggestion was None")
         if task is None:
             raise ValueError("task was None")
+        if task.language_code is None:
+            raise ValueError("task.language_code was None")
+        if suggestion.search_strings is None:
+            raise ValueError("suggestion.search_strings was None")
+        if suggestion.item is None:
+            raise ValueError("suggestion.item was None")
+        if suggestion.item.id is None:
+            raise ValueError("suggestion.item.id was None")
+        if suggestion.args is None:
+            raise ValueError("suggestion.args was None")
         # Fetch all items matching the search strings
         self.list = []
         for search_string in suggestion.search_strings:
diff --git a/src/models/riksdagen_documents.py b/src/models/items/riksdagen_documents.py
similarity index 82%
rename from src/models/riksdagen_documents.py
rename to src/models/items/riksdagen_documents.py
index d2fb8df..c9ac4be 100644
--- a/src/models/riksdagen_documents.py
+++ b/src/models/items/riksdagen_documents.py
@@ -1,12 +1,13 @@
 import logging
-from wikibaseintegrator.wbi_helpers import execute_sparql_query
+from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
 import config
 from src.helpers.console import console
+from src.models.items import Items
 from src.models.suggestion import Suggestion
 from src.models.task import Task
-from src.models.wikidata import Items, Item
+from src.models.wikimedia.wikidata.sparql_item import SparqlItem
 class RiksdagenDocumentItems(Items):
@@ -16,11 +17,19 @@ def fetch_based_on_label(self,
         # logger = logging.getLogger(__name__)
         if suggestion is None:
             raise ValueError("suggestion was None")
+        if suggestion.item is None:
+            raise ValueError("suggestion.item was None")
+        if suggestion.args is None:
+            raise ValueError("suggestion.args was None")
         if suggestion.args.limit_to_items_without_p921:
             raise Exception("Limiting to items without P921 is not "
                             "supported yet for this task.")
+        if suggestion.search_strings is None:
+            raise ValueError("suggestion.search_strings was None")
         if task is None:
             raise ValueError("task was None")
+        if task.language_code is None:
+            raise ValueError("task.language_code was None")
         # Fetch all items maching the search strings
         self.list = []
         # Include spaces around the n-gram to avoid edits like this one
@@ -53,8 +62,7 @@ def fetch_based_on_label(self,
             ''', debug=suggestion.args.debug_sparql)
             for item_json in results["results"]["bindings"]:
                 logging.debug(f"item_json:{item_json}")
-                item = Item(json=item_json,
-                            task=task)
+                item = SparqlItem(**item_json)
                 self.list.append(item)
             logging.info(f'Got {len(results["results"]["bindings"])} items from '
                          f'WDQS using the search string {search_string}')
diff --git a/src/models/items/scholarly_articles.py b/src/models/items/scholarly_articles.py
new file mode 100644
index 0000000..3001563
--- /dev/null
+++ b/src/models/items/scholarly_articles.py
@@ -0,0 +1,147 @@
+import logging
+
+from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
+
+import config
+from src.helpers.cleaning import strip_bad_chars
+from src.helpers.console import console
+from src.models.suggestion import Suggestion
+from src.models.task import Task
+from src.models.items import Items
+from src.models.wikimedia.wikidata.sparql_item import SparqlItem
+
+
+class ScholarlyArticleItems(Items):
+    """This supports both published peer reviewed articles and preprints"""
+
+    def fetch_based_on_label(self,
+                             suggestion: Suggestion = None,
+                             task: Task = None):
+        def build_query(suggestion: Suggestion = None,
+                        search_string: str = None,
+                        task: Task = None,
+                        cirrussearch_parameters: str = None):
+            # TODO refactor
+            if suggestion is None:
+                raise ValueError("suggestion was None")
+            if suggestion.item is None:
+                raise ValueError("suggestion.item was None")
+            if search_string is None:
+                raise ValueError("search_string was None")
+            if task is None:
+                raise ValueError("task was None")
+            if task.language_code is None:
+                raise ValueError("task.language_code was None")
+            if cirrussearch_parameters is None:
+                raise ValueError("cirrussearch_parameters was None")
+            # This query uses https://www.w3.org/TR/sparql11-property-paths/ to
+            # find subjects that are subclass of one another up to 3 hops away
+            # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI
+            # which has a hardcoded limit of 10,000 items so you will never get more matches than that
+            # This query use regex to match beginning, middle and end of the label of matched items
+            # The replacing lines should match the similar python replacements in cleaning.py
+            # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in
+            # SPARQL where it becomes "\\" and thus match a single backslash
+            return (f"""
+            #{config.user_agent}
+            SELECT DISTINCT ?item ?itemLabel
+            WHERE {{
+              hint:Query hint:optimizer "None".
+              BIND(STR('{cirrussearch_parameters} \"{search_string}\"') as ?search_string)
+              SERVICE wikibase:mwapi {{
+                bd:serviceParam wikibase:api "Search";
+                                wikibase:endpoint "www.wikidata.org";
+                                mwapi:srsearch ?search_string.
+                ?title wikibase:apiOutput mwapi:title.
+              }}
+              BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item)
+              ?item rdfs:label ?label.
+              BIND(REPLACE(LCASE(?label), ",", "") as ?label1)
+              BIND(REPLACE(?label1, ":", "") as ?label2)
+              BIND(REPLACE(?label2, ";", "") as ?label3)
+              BIND(REPLACE(?label3, "\\\\(", "") as ?label4)
+              BIND(REPLACE(?label4, "\\\\)", "") as ?label5)
+              BIND(REPLACE(?label5, "\\\\[", "") as ?label6)
+              BIND(REPLACE(?label6, "\\\\]", "") as ?label7)
+              BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8)
+              BIND(?label8 as ?cleaned_label)
+              FILTER(CONTAINS(?cleaned_label, ' {search_string.lower()} '@{task.language_code.value}) ||
+                     REGEX(?cleaned_label, '.* {search_string.lower()}$'@{task.language_code.value}) ||
+                     REGEX(?cleaned_label, '^{search_string.lower()} .*'@{task.language_code.value}))
+              MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }}
+              MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }}
+              MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }}
+              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
+            }}
+            """)
+
+        def process_results(results):
+            # TODO refactor
+            items = []
+            for item_json in results["results"]["bindings"]:
+                logging.debug(f"item_json:{item_json}")
+                item = SparqlItem(**item_json)
+                item.validate_qid_and_copy_label()
+                items.append(item)
+            return items
+
+        # logger = logging.getLogger(__name__)
+        if suggestion is None:
+            raise ValueError("suggestion was None")
+        if suggestion.item is None:
+            raise ValueError("suggestion.item was None")
+        if suggestion.args is None:
+            raise ValueError("suggestion.args was None")
+        if suggestion.args.limit_to_items_without_p921:
+            raise Exception("Limiting to items without P921 is not "
+                            "supported yet for this task.")
+        if suggestion.search_strings is None:
+            raise ValueError("suggestion.search_strings was None")
+        if task is None:
+            raise ValueError("task was None")
+        if task.language_code is None:
+            raise ValueError("task.language_code was None")
+        if suggestion.args.limit_to_items_without_p921:
+            console.print("Limiting to scholarly articles without P921 main subject only")
+            cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921"
+        else:
+            cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={suggestion.item.id}"
+        # Fetch all items matching the search strings
+        self.list = []
+        for search_string in suggestion.search_strings:
+            search_string = strip_bad_chars(search_string)
+            results = execute_sparql_query(
+                build_query(
+                    cirrussearch_parameters=cirrussearch_parameters,
+                    suggestion=suggestion,
+                    search_string=search_string,
+                    task=task)
+            )
+            logging.info(f'Got {len(results["results"]["bindings"])} scholarly items from '
+                         f'WDQS using the search string {search_string}')
+            self.list.extend(process_results(results))
+            # preprints
+            # We don't use CirrusSearch in this query because we can do it more easily in
+            # SPARQL on a small subgraph like this
+            # find all items that are ?item wdt:P31/wd:P279* wd:Q1266946
+            # minus the QID we want to add
+            results_preprint = execute_sparql_query(f'''
+            #{config.user_agent}
+            SELECT DISTINCT ?item ?itemLabel
+            WHERE {{
+              ?item wdt:P31/wd:P279* wd:Q580922. # preprint
+              MINUS {{
+                ?item wdt:P921 wd:{suggestion.item.id};
+              }}
+              ?item rdfs:label ?label.
+              FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) ||
+                     REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) ||
+                     REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value}))
+              MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }}
+              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
+            }}
+            ''', debug=suggestion.args.debug_sparql)
+            logging.info(f'Got {len(results["results"]["bindings"])} preprint items from '
+                         f'WDQS using the search string {search_string}')
+            self.list.extend(process_results(results_preprint))
+        console.print(f"Got a total of {len(self.list)} items")
diff --git a/src/models/thesis.py b/src/models/items/thesis.py
similarity index 85%
rename from src/models/thesis.py
rename to src/models/items/thesis.py
index 5569ae9..db205a4 100644
--- a/src/models/thesis.py
+++ b/src/models/items/thesis.py
@@ -1,13 +1,13 @@
 import logging
-from wikibaseintegrator.wbi_helpers import execute_sparql_query
+from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
 from src.helpers.console import console
+from src.models.items import Items
 from src.models.suggestion import Suggestion
 from src.models.task import Task
-from src.models.wikidata import Items, Item
-
 # There were ~16.000 thesis' in WD when this was written
+from src.models.wikimedia.wikidata.sparql_item import SparqlItem
 class ThesisItems(Items):
@@ -17,11 +17,15 @@ def fetch_based_on_label(self,
         # logger = logging.getLogger(__name__)
         if suggestion is None:
             raise ValueError("suggestion was None")
+        if suggestion.search_strings is None:
+            raise ValueError("suggestion.search_strings was None")
         if suggestion.args.limit_to_items_without_p921:
             raise Exception("Limiting to items without P921 is not "
                             "supported yet for this task.")
         if task is None:
             raise ValueError("task was None")
+        if task.language_code is None:
+            raise ValueError("task.language_code was None")
         # Fetch all items maching the search strings
         self.list = []
         for search_string in suggestion.search_strings:
@@ -54,8 +58,7 @@ def fetch_based_on_label(self,
             ''', debug=suggestion.args.debug_sparql)
             for item_json in results["results"]["bindings"]:
                 logging.debug(f"item_json:{item_json}")
-                item = Item(json=item_json,
-                            task=task)
+                item = SparqlItem(**item_json)
                 self.list.append(item)
             logging.info(f'Got {len(results["results"]["bindings"])} items from '
                          f'WDQS using the search string {search_string}')
diff --git a/src/models/quickstatements.py b/src/models/quickstatements.py
deleted file mode 100644
index e5313fa..0000000
--- a/src/models/quickstatements.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from dataclasses import dataclass
-
-from src.models.wikidata import EntityID
-
-
-@dataclass
-class QuickStatementsCommandVersion1:
-    """This models the simple line-based QS commands
-
-    For now we only support QID-values
-
-    Q1\tP1\tQ1"""
-    target: EntityID = None
-    property: EntityID = None
-    value: EntityID = None
-
-    def __str__(self):
-        return f"{self.target}\t{self.property}\t{self.value}"
diff --git a/src/models/scholarly_articles.py b/src/models/scholarly_articles.py
deleted file mode 100644
index 812ffba..0000000
--- a/src/models/scholarly_articles.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import logging
-
-from wikibaseintegrator.wbi_helpers import execute_sparql_query
-
-import config
-from src.helpers.cleaning import strip_bad_chars
-from src.helpers.console import console
-from src.models.suggestion import Suggestion
-from src.models.task import Task
-from src.models.wikidata import Items, Item
-
-
-def build_query(suggestion: Suggestion = None,
-                search_string: str = None,
-                task: Task = None,
-                cirrussearch_parameters: str = None):
-    if suggestion is None:
-        raise ValueError("suggestion was None")
-    if search_string is None:
-        raise ValueError("search_string was None")
-    if task is None:
-        raise ValueError("task was None")
-    if cirrussearch_parameters is None:
-        raise ValueError("cirrussearch_parameters was None")
-    # This query uses https://www.w3.org/TR/sparql11-property-paths/ to
-    # find subjects that are subclass of one another up to 3 hops away
-    # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI
-    # which has a hardcoded limit of 10,000 items so you will never get more matches than that
-    # This query use regex to match beginning, middle and end of the label of matched items
-    # The replacing lines should match the similar python replacements in cleaning.py
-    # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in
-    # SPARQL where it becomes "\\" and thus match a single backslash
-    return (f"""
-    #{config.user_agent}
-    SELECT DISTINCT ?item ?itemLabel
-    WHERE {{
-      hint:Query hint:optimizer "None".
-      BIND(STR('{cirrussearch_parameters} \"{search_string}\"') as ?search_string)
-      SERVICE wikibase:mwapi {{
-        bd:serviceParam wikibase:api "Search";
-                        wikibase:endpoint "www.wikidata.org";
-                        mwapi:srsearch ?search_string.
-        ?title wikibase:apiOutput mwapi:title.
-      }}
-      BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item)
-      ?item rdfs:label ?label.
-      BIND(REPLACE(LCASE(?label), ",", "") as ?label1)
-      BIND(REPLACE(?label1, ":", "") as ?label2)
-      BIND(REPLACE(?label2, ";", "") as ?label3)
-      BIND(REPLACE(?label3, "\\\\(", "") as ?label4)
-      BIND(REPLACE(?label4, "\\\\)", "") as ?label5)
-      BIND(REPLACE(?label5, "\\\\[", "") as ?label6)
-      BIND(REPLACE(?label6, "\\\\]", "") as ?label7)
-      BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8)
-      BIND(?label8 as ?cleaned_label)
-      FILTER(CONTAINS(?cleaned_label, ' {search_string.lower()} '@{task.language_code.value}) ||
-             REGEX(?cleaned_label, '.* {search_string.lower()}$'@{task.language_code.value}) ||
-             REGEX(?cleaned_label, '^{search_string.lower()} .*'@{task.language_code.value}))
-      MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }}
-      MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }}
-      MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }}
-      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
-    }}
-    """)
-
-
-def process_results(results):
-    items = []
-    for item_json in results["results"]["bindings"]:
-        logging.debug(f"item_json:{item_json}")
-        item = Item(json=item_json)
-        items.append(item)
-    return items
-
-
-class ScholarlyArticleItems(Items):
-    """This supports both published peer reviewed articles and preprints"""
-    def fetch_based_on_label(self,
-                             suggestion: Suggestion = None,
-                             task: Task = None):
-        # logger = logging.getLogger(__name__)
-        if suggestion is None:
-            raise ValueError("suggestion was None")
-        if task is None:
-            raise ValueError("task was None")
-        if suggestion.args.limit_to_items_without_p921:
-            console.print("Limiting to scholarly articles without P921 main subject only")
-            cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921"
-        else:
-            cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={suggestion.item.id}"
-        # Fetch all items matching the search strings
-        self.list = []
-        for search_string in suggestion.search_strings:
-            search_string = strip_bad_chars(search_string)
-            results = execute_sparql_query(
-                build_query(
-                    cirrussearch_parameters=cirrussearch_parameters,
-                    suggestion=suggestion,
-                    search_string=search_string,
-                    task=task)
-            )
-            logging.info(f'Got {len(results["results"]["bindings"])} scholarly items from '
-                         f'WDQS using the search string {search_string}')
-            self.list.extend(process_results(results))
-            # preprints
-            # We don't use CirrusSearch in this query because we can do it more easily in
-            # SPARQL on a small subgraph like this
-            # find all items that are ?item wdt:P31/wd:P279* wd:Q1266946
-            # minus the QID we want to add
-            results_preprint = execute_sparql_query(f'''
-            #{config.user_agent}
-            SELECT DISTINCT ?item ?itemLabel
-            WHERE {{
-              ?item wdt:P31/wd:P279* wd:Q580922. # preprint
-              MINUS {{
-                ?item wdt:P921 wd:{suggestion.item.id};
-              }}
-              ?item rdfs:label ?label.
-              FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) ||
-                     REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) ||
-                     REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value}))
-              MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }}
-              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
-            }}
-            ''', debug=suggestion.args.debug_sparql)
-            logging.info(f'Got {len(results["results"]["bindings"])} preprint items from '
-                         f'WDQS using the search string {search_string}')
-            self.list.extend(process_results(results_preprint))
-        console.print(f"Got a total of {len(self.list)} items")
diff --git a/src/models/suggestion.py b/src/models/suggestion.py
index d1cac86..c55689e 100644
--- a/src/models/suggestion.py
+++ b/src/models/suggestion.py
@@ -1,40 +1,32 @@
+from __future__ import annotations
+
 import argparse
 import logging
-from typing import List
+from typing import List, Optional, TYPE_CHECKING
 from urllib.parse import quote
-from wikibaseintegrator.datatypes import Item as ItemType
+from pydantic import BaseModel
+from wikibaseintegrator.datatypes import Item as ItemType  # type: ignore
 import config
 from src.helpers.calculations import calculate_random_editgroups_hash
 from src.helpers.cleaning import clean_rich_formatting
-from src.helpers.console import print_search_strings_table, console
-from src.helpers.enums import TaskIds
-from src.models.batch_job import BatchJob
+from src.models.items import Items
 from src.models.task import Task
-from src.models.wikidata import Item, Items
+from src.models.wikimedia.wikidata.item import Item
+if TYPE_CHECKING:
+    from src.models.batch_job import BatchJob
-class Suggestion:
-    item: Item = None
-    search_strings: List[str] = None
-    task: Task = None
-    args: argparse.Namespace = None
-    def __init__(self,
-                 item: Item = None,
-                 task: Task = None,
-                 args=None):
-        if item is None:
-            raise ValueError("item was None")
-        else:
-            self.item = item
-        if task is None:
-            raise ValueError("task was None")
-        else:
-            self.task = task
-        self.args = args
-        self.extract_search_strings()
+class Suggestion(BaseModel):
+    item: Item
+    task: Task
+    args: argparse.Namespace
+    search_strings: Optional[List[str]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
     def __str__(self):
         """Return label and description, the latter cut to 50 chars"""
@@ -61,6 +53,8 @@ def add_to_items(self,
         This function is non-interactive"""
         if items is None:
             raise ValueError("Items was None")
+        if items.list is None:
+            raise ValueError("items.list was None")
         if jobs is None:
             raise ValueError("jobs was None")
         if job_count is None:
@@ -69,6 +63,7 @@ def add_to_items(self,
         count = 0
         for target_item in items.list:
             count += 1
+            from src import console
             with console.status(f"Uploading main subject "
                                 f"[green]{clean_rich_formatting(self.item.label)}[/green] "
                                 f"to {clean_rich_formatting(target_item.label)}"):
@@ -96,20 +91,22 @@ def extract_search_strings(self):
         def clean_special_symbols(string: str):
             return string.replace("®", "").replace("™", "")
+        from src.helpers.console import console
         logger = logging.getLogger(__name__)
         if self.args is None:
             raise ValueError("args was None")
         else:
             logger.debug(f"args:{self.args}")
             if self.args.no_aliases is True:
+                from src import console
                 console.print("Alias matching is turned off")
                 no_aliases = True
             else:
                 no_aliases = False
         self.search_strings: List[str] = [clean_special_symbols(self.item.label)]
         if (
-                self.item.aliases is not None and
-                no_aliases is False
+            self.item.aliases is not None and
+            no_aliases is False
         ):
             for alias in self.item.aliases:
                 # logger.debug(f"extracting alias:{alias}")
@@ -121,10 +118,13 @@ def clean_special_symbols(string: str):
                 else:
                     self.search_strings.append(clean_special_symbols(alias))
         # logger.debug(f"search_strings:{self.search_strings}")
+        from src.helpers.console import print_search_strings_table
         print_search_strings_table(args=self.args,
                                    search_strings=self.search_strings)
     def search_urls(self) -> List[str]:
+        if self.search_strings is None:
+            raise ValueError("self.search_strings was None")
         urls = []
         for search_string in self.search_strings:
             search_term = quote(f'"{search_string}"')
diff --git a/src/models/task.py b/src/models/task.py
index 1260025..3a4858b 100644
--- a/src/models/task.py
+++ b/src/models/task.py
@@ -1,37 +1,18 @@
 from typing import Union
+from pydantic import BaseModel
+
 from src.helpers.enums import SupportedLanguageCode, TaskIds
-# console-menu does not support dataclass (yet)
-# @dataclass
-class Task:
+class Task(BaseModel):
     """This class holds the tasks presented to the user in the menu
     and related data"""
-    best_practice_information: Union[str, None] = None
-    id: TaskIds = None
-    label: str = None
-    language_code: SupportedLanguageCode = None
+    best_practice_information: Union[str, None]
+    id: TaskIds
+    label: str
+    language_code: SupportedLanguageCode
     number_of_queries_per_search_string = 1
-    def __init__(self,
-                 best_practice_information: str = None,
-                 id: TaskIds = None,
-                 label: str = None,
-                 language_code: SupportedLanguageCode = None,
-                 number_of_queries_per_search_string: int = None):
-        if id is None:
-            raise ValueError("Got no id")
-        if label is None:
-            raise ValueError("Got no label")
-        if language_code is None:
-            raise ValueError("Got no language_code")
-        self.id = id
-        self.label = label
-        self.language_code = language_code
-        self.best_practice_information = best_practice_information
-        if number_of_queries_per_search_string is not None:
-            self.number_of_queries_per_search_string = number_of_queries_per_search_string
-
     def __str__(self):
         return f"{self.label}"
diff --git a/src/models/wikidata.py b/src/models/wikidata.py
deleted file mode 100644
index f4a25ea..0000000
--- a/src/models/wikidata.py
+++ /dev/null
@@ -1,883 +0,0 @@
-"""
-Model from LexUtils
-"""
-import logging
-import random
-from enum import Enum
-from typing import List
-
-from wikibaseintegrator import wbi_config, WikibaseIntegrator
-from wikibaseintegrator.datatypes import BaseDataType
-from wikibaseintegrator.models import Alias
-from wikibaseintegrator.wbi_enums import ActionIfExists
-
-import config
-# We get the URL for the Wikibase from here
-from src.models.task import Task
-
-wbi_config.config['USER_AGENT'] = config.user_agent
-
-
-class WikidataGrammaticalFeature(Enum):
-    # Swedish
-    ACTIVE_VOICE = "Q1317831"
-    PRETERITE = "Q442485"
-    INFINITIVE = "Q179230"
-    PRESENT_TENSE = "Q192613"
-    SUPINE = "Q548470"
-    IMPERATIVE = "Q22716"
-    PASSIVE_VOICE = "Q1194697"
-    SINGULAR = "Q110786"
-    NOMINATIVE_CASE = "Q131105"
-    INDEFINITE = "Q53997857"
-    DEFINITE = "Q53997851"
-    PLURAL = "Q146786"
-    GENITIVE_CASE = "Q146233"
-    # English
-    SIMPLE_PRESENT = "Q3910936"
-    THIRD_PERSON_SINGULAR = "Q51929447"
-
-
-class WikidataLexicalCategory(Enum):
-    NOUN = "Q1084"
-    VERB = "Q24905"
-    ADVERB = "Q380057"
-    ADJECTIVE = "Q34698"
-    AFFIX = "Q62155"
-    PROPER_NOUN = "Q147276"
-
-
-class WikimediaLanguageCode(Enum):
-    DANISH = "da"
-    SWEDISH = "sv"
-    BOKMÅL = "nb"
-    ENGLISH = "en"
-    FRENCH = "fr"
-    RUSSIAN = "ru"
-    ESTONIAN = "et"
-    MALAYALAM = "ml"
-    LATIN = "la"
-    HEBREW = "he"
-    BASQUE = "eu"
-    GERMAN = "de"
-    BENGALI = "bn"
-    CZECH = "cs"
-
-
-class WikimediaLanguageQID(Enum):
-    DANISH = "Q9035"
-    SWEDISH = "Q9027"
-    BOKMÅL = "Q25167"
-    ENGLISH = "Q1860"
-    FRENCH = "Q150"
-    RUSSIAN = "Q7737"
-    ESTONIAN = "Q9072"
-    MALAYALAM = "Q36236"
-    LATIN = "Q397"
-    HEBREW = "Q9288"
-    BASQUE = "Q8752"
-    GERMAN = "Q188"
-    BENGALI = "Q9610"
-    CZECH = "Q9056"
-
-
-class WikidataNamespaceLetters(Enum):
-    PROPERTY = "P"
-    ITEM = "Q"
-    LEXEME = "L"
-    #FORM = "F"
-    #SENSE = "S"
-
-
-class EntityID:
-    letter: WikidataNamespaceLetters
-    # This can be e.g. "32698-F1" in the case of a lexeme
-    rest: str
-
-    def __init__(self,
-                 entity_id: str):
-        logger = logging.getLogger(__name__)
-        if entity_id is not None:
-            # Remove prefix if found
-            if config.wd_prefix in entity_id:
-                logger.debug("Removing prefix")
-                entity_id = entity_id.replace(config.wd_prefix, "")
-            if len(entity_id) > 1:
-                logger.info(f"entity_id:{entity_id}")
-                self.letter = WikidataNamespaceLetters(entity_id[0])
-                self.rest = entity_id[1:]
-            else:
-                raise ValueError("Entity ID was too short.")
-        else:
-            raise ValueError("Entity ID was None")
-
-    def __str__(self):
-        return f"{self.letter.value}{self.rest}"
-
-    # def extract_wdqs_json_entity_id(self, json: Dict, sparql_variable: str):
-    #     self.__init__(json[sparql_variable]["value"].replace(
-    #         config.wd_prefix, ""
-    #     ))
-
-
-class ForeignID:
-    id: str
-    property: str  # This is the property with type ExternalId
-    source_item_id: str  # This is the Q-item for the source
-
-    def __init__(self,
-                 id: str = None,
-                 property: str = None,
-                 source_item_id: str = None):
-        self.id = id
-        self.property = str(EntityID(property))
-        self.source_item_id = str(EntityID(source_item_id))
-
-
-class Form:
-    """
-    Model for a Wikibase form
-    """
-    id: str
-    representation: str
-    grammatical_features: List[WikidataGrammaticalFeature]
-    # We store these on the form because they are needed
-    # to determine if an example fits or not
-    lexeme_id: str
-    lexeme_category: str
-
-    def __init__(self, json):
-        """Parse the form json"""
-        logger = logging.getLogger(__name__)
-        try:
-            logger.info(json["lexeme"])
-            self.id = str(EntityID(json["lexeme"]["value"]))
-        except KeyError:
-            pass
-        try:
-            logger.info(json["form"])
-            self.id = str(EntityID(json["form"]["value"]))
-        except KeyError:
-            pass
-        try:
-            self.representation: str = json["form_representation"]["value"]
-        except KeyError:
-            pass
-        try:
-            self.lexeme_category: WikidataLexicalCategory = WikidataLexicalCategory(
-                str(EntityID(json["category"]["value"]))
-            )
-        except:
-            raise ValueError(f'Could not find lexical category from '
-                             f'{json["category"]["value"]}')
-        try:
-            self.grammatical_features = []
-            logger.info(json["grammatical_features"])
-            for feature in json["grammatical_features"]["value"].split(","):
-                # TODO parse features with Enum
-                feature_id = WikidataGrammaticalFeature(str(EntityID(feature)))
-                self.grammatical_features.append(feature_id)
-        except KeyError:
-            pass
-
-
-class Sense:
-    pass
-
-
-class Entity:
-    """Base entity with code that is the same for both items and lexemes"""
-    id: str
-    label: str
-
-    def upload_one_statement_to_wikidata(self,
-                                         statement: BaseDataType = None,
-                                         summary: str = None,
-                                         editgroups_hash: str = None):
-        """Upload one statement and always append
-        This mandates an editgroups hash to be supplied"""
-        logger = logging.getLogger(__name__)
-        if self.id is None:
-            raise ValueError("no id on item")
-        if statement is None:
-            raise ValueError("Statement was None")
-        if summary is None:
-            raise ValueError("summary was None")
-        if editgroups_hash is None:
-            raise ValueError("editgroup_hash was None")
-        if config.login_instance is None:
-            raise ValueError("No login instance in config.login_instance")
-        wbi = WikibaseIntegrator(login=config.login_instance)
-        item = wbi.item.get(self.id)
-        item.add_claims(
-            [statement],
- action_if_exists=ActionIfExists.APPEND) - result = item.write( - summary=f"Added {summary} with [[{config.tool_wikipage}]] " - f"([[:toolforge:editgroups/b/CB/{editgroups_hash}|details]])" - ) - logger.debug(f"result from WBI:{result}") - - def url(self): - return f"http://www.wikidata.org/entity/{self.id}" - - -# class Lexeme(Entity): -# id: str -# lemma: str -# lexical_category: WikidataLexicalCategory -# forms: List[Form] -# senses: List[Sense] -# # Needed for duplicate lookup -# language_code: WikimediaLanguageCode -# -# def __init__(self, -# id: str = None, -# lemma: str = None, -# lexical_category: str = None, -# language_code: WikimediaLanguageCode = None): -# if id is not None: -# self.id = str(EntityID(id)) -# self.lemma = lemma -# if lexical_category is None: -# raise ValueError("Lexical category was None") -# if isinstance(lexical_category, WikidataLexicalCategory): -# self.lexical_category = lexical_category -# else: -# self.lexical_category = WikidataLexicalCategory(EntityID(lexical_category)) -# if language_code is not None: -# self.language_code: WikimediaLanguageCode = language_code -# -# def create(self): -# if self.id is not None: -# raise ValueError("Lexeme already has an id, aborting") -# lexeme = wbi_core.LexemeEngine() -# -# def parse_from_wdqs_json(self, json): -# self.forms = [] -# self.senses = [] -# for variable in json: -# logging.debug(variable) -# if variable == "form": -# form = Form(variable) -# self.forms.append(form) -# if variable == "sense": -# sense = Sense(variable) -# self.senses.append(sense) -# if variable == "category": -# self.lexical_category = EntityID(wdqs.extract_wikibase_value(variable)) -# -# def url(self): -# return f"{config.wd_prefix}{self.id}" -# -# def upload_foreign_id_to_wikidata(self, -# foreign_id: ForeignID = None): -# """Upload to enrich the wonderful Wikidata <3""" -# logger = logging.getLogger(__name__) -# if foreign_id is None: -# raise Exception("Foreign id was None") -# print(f"Uploading {foreign_id.id} to {self.id}: {self.lemma}") -# statement = wbi_datatype.ExternalID( -# prop_nr=foreign_id.property, -# value=foreign_id.id, -# ) -# described_by_source = wbi_datatype.ItemID( -# prop_nr="P1343", # stated in -# value=foreign_id.source_item_id -# ) -# # TODO does this overwrite or append? -# item = wbi_core.ItemEngine( -# data=[statement, -# described_by_source], -# item_id=self.id -# ) -# # debug WBI error -# # print(item.get_json_representation()) -# result = item.write( -# config.login_instance, -# edit_summary=f"Added foreign identifier with [[{config.tool_url}]]" -# ) -# logger.debug(f"result from WBI:{result}") -# print(self.url()) -# # exit(0) -# -# def count_number_of_senses_with_P5137(self): -# """Returns an int""" -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?sense) as ?count) -# WHERE {{ -# VALUES ?l {{wd:{self.id}}}. -# ?l ontolex:sense ?sense. -# ?sense skos:definition ?gloss. -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. 
-# }}''')) -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# return count -# -# def add_usage_example( -# document_id=None, -# sentence=None, -# lid=None, -# form_id=None, -# sense_id=None, -# word=None, -# publication_date=None, -# language_style=None, -# type_of_reference=None, -# source=None, -# line=None, -# ): -# # TODO convert to use OOP -# logger = logging.getLogger(__name__) -# # Use WikibaseIntegrator aka wbi to upload the changes in one edit -# link_to_form = wbi_datatype.Form( -# prop_nr="P5830", -# value=form_id, -# is_qualifier=True -# ) -# link_to_sense = wbi_datatype.Sense( -# prop_nr="P6072", -# value=sense_id, -# is_qualifier=True -# ) -# if language_style == "formal": -# style = "Q104597585" -# else: -# if language_style == "informal": -# style = "Q901711" -# else: -# print(_("Error. Language style {} ".format(language_style) + -# "not one of (formal,informal). Please report a bug at " + -# "https://github.com/egils-consulting/LexUtils/issues")) -# sleep(config.sleep_time) -# return "error" -# logging.debug("Generating qualifier language_style " + -# f"with {style}") -# language_style_qualifier = wbi_datatype.ItemID( -# prop_nr="P6191", -# value=style, -# is_qualifier=True -# ) -# # oral or written -# if type_of_reference == "written": -# medium = "Q47461344" -# else: -# if type_of_reference == "oral": -# medium = "Q52946" -# else: -# print(_("Error. Type of reference {} ".format(type_of_reference) + -# "not one of (written,oral). Please report a bug at " + -# "https://github.com/egils-consulting/LexUtils/issues")) -# sleep(config.sleep_time) -# return "error" -# logging.debug(_("Generating qualifier type of reference " + -# "with {}".format(medium))) -# type_of_reference_qualifier = wbi_datatype.ItemID( -# prop_nr="P3865", -# value=medium, -# is_qualifier=True -# ) -# if source == "riksdagen": -# if publication_date is not None: -# publication_date = datetime.fromisoformat(publication_date) -# else: -# print(_("Publication date of document {} ".format(document_id) + -# "is missing. We have no fallback for that at the moment. 
" + -# "Abort adding usage example.")) -# return "error" -# stated_in = wbi_datatype.ItemID( -# prop_nr="P248", -# value="Q21592569", -# is_reference=True -# ) -# # TODO lookup if we have a QID for the source -# document_id = wbi_datatype.ExternalID( -# prop_nr="P8433", # Riksdagen Document ID -# value=document_id, -# is_reference=True -# ) -# reference = [ -# stated_in, -# document_id, -# wbi_datatype.Time( -# prop_nr="P813", # Fetched today -# time=datetime.utcnow().replace( -# tzinfo=timezone.utc -# ).replace( -# hour=0, -# minute=0, -# second=0, -# ).strftime("+%Y-%m-%dT%H:%M:%SZ"), -# is_reference=True, -# ), -# wbi_datatype.Time( -# prop_nr="P577", # Publication date -# time=publication_date.strftime("+%Y-%m-%dT00:00:00Z"), -# is_reference=True, -# ), -# type_of_reference_qualifier, -# ] -# elif source == "europarl": -# stated_in = wbi_datatype.ItemID( -# prop_nr="P248", -# value="Q5412081", -# is_reference=True -# ) -# reference = [ -# stated_in, -# wbi_datatype.Time( -# prop_nr="P813", # Fetched today -# time=datetime.utcnow().replace( -# tzinfo=timezone.utc -# ).replace( -# hour=0, -# minute=0, -# second=0, -# ).strftime("+%Y-%m-%dT%H:%M:%SZ"), -# is_reference=True, -# ), -# wbi_datatype.Time( -# prop_nr="P577", # Publication date -# time="+2012-05-12T00:00:00Z", -# is_reference=True, -# ), -# wbi_datatype.Url( -# prop_nr="P854", # reference url -# value="http://www.statmt.org/europarl/v7/sv-en.tgz", -# is_reference=True, -# ), -# # filename in archive -# wbi_datatype.String( -# (f"europarl-v7.{config.language_code}" + -# f"-en.{config.language_code}"), -# "P7793", -# is_reference=True, -# ), -# # line number -# wbi_datatype.String( -# str(line), -# "P7421", -# is_reference=True, -# ), -# type_of_reference_qualifier, -# ] -# elif source == "ksamsok": -# # No date is provided unfortunately, so we set it to unknown value -# stated_in = wbi_datatype.ItemID( -# prop_nr="P248", -# value="Q7654799", -# is_reference=True -# ) -# document_id = wbi_datatype.ExternalID( -# # K-Samsök URI -# prop_nr="P1260", -# value=document_id, -# is_reference=True -# ) -# reference = [ -# stated_in, -# document_id, -# wbi_datatype.Time( -# prop_nr="P813", # Fetched today -# time=datetime.utcnow().replace( -# tzinfo=timezone.utc -# ).replace( -# hour=0, -# minute=0, -# second=0, -# ).strftime("+%Y-%m-%dT%H:%M:%SZ"), -# is_reference=True, -# ), -# wbi_datatype.Time( -# # We don't know the value of the publication dates unfortunately -# prop_nr="P577", # Publication date -# time="", -# snak_type="somevalue", -# is_reference=True, -# ), -# type_of_reference_qualifier, -# ] -# else: -# raise ValueError(f"Did not recognize the source {source}") -# if reference is None: -# raise ValueError(_("No reference defined, cannot add usage example")) -# else: -# # This is the usage example statement -# claim = wbi_datatype.MonolingualText( -# sentence, -# "P5831", -# language=config.language_code, -# # Add qualifiers -# qualifiers=[ -# link_to_form, -# link_to_sense, -# language_style_qualifier, -# ], -# # Add reference -# references=[reference], -# ) -# if config.debug_json: -# logging.debug(f"claim:{claim.get_json_representation()}") -# item = wbi_core.ItemEngine( -# item_id=lid, -# ) -# # Updating appends by default in v0.11.0 -# item.update(data=[claim]) -# # if config.debug_json: -# # print(item.get_json_representation()) -# if config.login_instance is None: -# # Authenticate with WikibaseIntegrator -# print("Logging in with Wikibase Integrator") -# config.login_instance = wbi_login.Login( -# user=config.username, 
pwd=config.password -# ) -# result = item.write( -# config.login_instance, -# edit_summary=( -# _("Added usage example " + -# "with [[Wikidata:Tools/LexUtils]] v{}".format(config.version)) -# ) -# ) -# if config.debug_json: -# logging.debug(f"result from WBI:{result}") -# # TODO add handling of result from WBI and return True == Success or False -# return result -# -# def find_duplicates(self): -# """Lookup duplicates using the -# Wikidata Lexeme Forms Duplicate API""" -# url = ("https://lexeme-forms.toolforge.org/api/v1/duplicates/www/" -# f"{self.language_code.value}/{self.lemma}") -# response = requests.get(url, headers={"Accept": "application/json"}) -# if response.status_code == 204: -# return None -# elif response.status_code == 200: -# return response.json() -# else: -# raise Exception(f"Got {response.status_code}: {response.text}") -# -# -# class LexemeLanguage: -# lexemes: List[Lexeme] -# language_code: WikimediaLanguageCode -# language_qid: WikimediaLanguageQID -# senses_with_P5137_per_lexeme: float -# senses_with_P5137: int -# forms: int -# forms_with_an_example: int -# forms_without_an_example: List[Form] -# lexemes_count: int -# -# def __init__(self, language_code: str): -# self.language_code = WikimediaLanguageCode(language_code) -# self.language_qid = WikimediaLanguageQID[self.language_code.name] -# -# def fetch_forms_missing_an_example(self): -# logger = logging.getLogger(__name__) -# results = execute_sparql_query(f''' -# #title:Forms that have no example demonstrating them -# select ?lexeme ?form ?form_representation ?category -# (group_concat(distinct ?feature; separator = ",") as ?grammatical_features) -# WHERE {{ -# ?lexeme dct:language wd:{self.language_qid.value}; -# wikibase:lemma ?lemma; -# wikibase:lexicalCategory ?category; -# ontolex:lexicalForm ?form. -# ?form ontolex:representation ?form_representation; -# wikibase:grammaticalFeature ?feature. -# MINUS {{ -# ?lexeme p:P5831 ?statement. -# ?statement ps:P5831 ?example; -# pq:P6072 []; -# pq:P5830 ?form_with_example. -# }} -# }} -# group by ?lexeme ?form ?form_representation ?category -# limit 50''') -# self.forms_without_an_example = [] -# logger.info("Got the data") -# logger.info(f"data:{results.keys()}") -# try: -# #logger.info(f"data:{results['results']['bindings']}") -# for entry in results["results"]['bindings']: -# logger.info(f"data:{entry.keys()}") -# logging.info(f"lexeme_json:{entry}") -# f = Form(entry) -# self.forms_without_an_example.append(f) -# except KeyError: -# logger.error("Got no results") -# logger.info(f"Got {len(self.forms_without_an_example)} " -# f"forms from WDQS for language {self.language_code.name}") -# -# def fetch_lexemes(self): -# # TODO port to use the Lexeme class instead of heavy dataframes which we don't need -# raise Exception("This is deprecated.") -# results = execute_sparql_query(f''' -# SELECT DISTINCT -# ?entity_lid ?form ?word (?categoryLabel as ?category) -# (?grammatical_featureLabel as ?feature) ?sense ?gloss -# WHERE {{ -# ?entity_lid a ontolex:LexicalEntry; dct:language wd:{self.language_qid.value}. -# VALUES ?excluded {{ -# # exclude affixes and interfix -# wd:Q62155 # affix -# wd:Q134830 # prefix -# wd:Q102047 # suffix -# wd:Q1153504 # interfix -# }} -# MINUS {{?entity_lid wdt:P31 ?excluded.}} -# ?entity_lid wikibase:lexicalCategory ?category. -# -# # We want only lexemes with both forms and at least one sense -# ?entity_lid ontolex:lexicalForm ?form. -# ?entity_lid ontolex:sense ?sense. 
-# -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. -# ?sense skos:definition ?gloss. -# # Get only the swedish gloss, exclude otherwise -# FILTER(LANG(?gloss) = "{self.language_code.value}") -# -# # This remove all lexemes with at least one example which is not -# # ideal -# MINUS {{?entity_lid wdt:P5831 ?example.}} -# ?form wikibase:grammaticalFeature ?grammatical_feature. -# # We extract the word of the form -# ?form ontolex:representation ?word. -# SERVICE wikibase:label -# {{ bd:serviceParam wikibase:language "{self.language_code.value},en". }} -# }} -# limit {config.sparql_results_size} -# offset {config.sparql_offset} -# ''') -# self.lexemes = [] -# for lexeme_json in results: -# logging.debug(f"lexeme_json:{lexeme_json}") -# l = Lexeme.parse_wdqs_json(lexeme_json) -# self.lexemes.append(l) -# logging.info(f"Got {len(self.lexemes)} lexemes from " -# f"WDQS for language {self.language_code.name}") -# -# def count_number_of_lexemes(self): -# """Returns an int""" -# logger = logging.getLogger(__name__) -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?l) as ?count) -# WHERE {{ -# ?l dct:language wd:{self.language_qid.value}. -# }}''')) -# logger.debug(f"result:{result}") -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# return count -# -# def count_number_of_senses_with_p5137(self): -# """Returns an int""" -# logger = logging.getLogger(__name__) -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?sense) as ?count) -# WHERE {{ -# ?l dct:language wd:{self.language_qid.value}. -# ?l ontolex:sense ?sense. -# ?sense skos:definition ?gloss. -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. -# }}''')) -# logger.debug(f"result:{result}") -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# return count -# -# def count_number_of_forms_without_an_example(self): -# """Returns an int""" -# # TODO fix this to count all senses in a given language -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?form) as ?count) -# WHERE {{ -# ?l dct:language wd:{self.language_qid.value}. -# ?l ontolex:lexicalForm ?form. -# ?l ontolex:sense ?sense. -# # exclude lexemes that already have at least one example -# MINUS {{?l wdt:P5831 ?example.}} -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. 
-# }}''')) -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# self.forms_without_an_example = count -# -# def count_number_of_forms_with_examples(self): -# pass -# -# def count_number_of_forms(self): -# pass -# -# def calculate_statistics(self): -# self.lexemes_count: int = self.count_number_of_lexemes() -# self.senses_with_P5137: int = self.count_number_of_senses_with_p5137() -# self.calculate_senses_with_p5137_per_lexeme() -# -# def calculate_senses_with_p5137_per_lexeme(self): -# self.senses_with_P5137_per_lexeme = round( -# self.senses_with_P5137 / self.lexemes_count, 3 -# ) -# -# def print(self): -# print(f"{self.language_code.name} has " -# f"{self.senses_with_P5137} senses with linked QID in " -# f"total on {self.lexemes_count} lexemes " -# f"which is {self.senses_with_P5137_per_lexeme} per lexeme.") -# -# # TODO decide where to put this code -# class LexemeStatistics: -# total_lexemes: int -# -# def __init__(self): -# self.calculate_total_lexemes() -# self.rank_languages_based_on_statistics() -# -# def calculate_total_lexemes(self) -> int: -# """Calculate how many lexemes exists in Wikidata""" -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?l) as ?count) -# WHERE {{ -# ?l a ontolex:LexicalEntry. -# }}''')) -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# self.total_lexemes = count -# -# def rank_languages_based_on_statistics(self): -# logger = logging.getLogger(__name__) -# language_objects = [] -# print("Fetching data...") -# for language_code in WikimediaLanguageCode: -# logger.info(f"Working on {language_code.name}") -# language = LexemeLanguage(language_code) -# language.calculate_statistics() -# language_objects.append(language) -# sorted_by_senses_with_p5137_per_lexeme = sorted( -# language_objects, -# key=lambda language: language.senses_with_P5137_per_lexeme, -# reverse=True -# ) -# print("Languages ranked by most senses linked to items:") -# for language in sorted_by_senses_with_p5137_per_lexeme: -# language.print() -# # Generator expression -# total_lexemes_among_supported_languages: int = sum( -# language.lexemes_count for language in language_objects -# ) -# # logger.debug(f"total:{total_lexemes_among_supported_languages}") -# percent = round( -# total_lexemes_among_supported_languages * 100 / self.total_lexemes -# ) -# print(f"These languages have {total_lexemes_among_supported_languages} " -# f"lexemes out of {self.total_lexemes} in total ({percent}%)") -# -# -class Item(Entity): - """This represents an item in Wikidata - We always work on one language at a time, - so don't bother with languages here and keep to simple strings""" - id: str = None - label: str = None - description: str = None - aliases: List[str] = None - - def __init__(self, - id: str = None, - json: str = None, - label: str = None, - description: str = None, - aliases: List[str] = None, - task: Task = None): - if json is not None: - self.parse_json(json) - else: - if id is not None: - self.id = str(EntityID(id)) - if description is None and label is None and aliases is None: - logging.debug("No of description, label or aliases received") - if task is None: - raise ValueError("Got no task") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - self.fetch_label_and_description_and_aliases(task=task) - elif label is None or aliases is None: - raise ValueError("This is not supported. 
" - "Either both state the label and " - "aliases or None of them") - else: - self.label = label - self.aliases = aliases - self.description = description - - def __str__(self): - return f"{self.label}, see {self.url()}" - - def parse_json(self, json): - """Parse the WDQS json""" - logger = logging.getLogger(__name__) - try: - logger.debug(f'item_json:{json["item"]}') - self.id = str(EntityID(json["item"]["value"])) - except KeyError: - pass - try: - logger.debug(json["itemLabel"]) - self.label = (json["itemLabel"]["value"]) - except KeyError: - logger.info(f"no label found") - - def parse_from_wdqs_json(self, json): - """Parse the json into the object""" - for variable in json: - logging.debug(variable) - if variable == "item": - self.id = variable - if variable == "itemLabel": - self.label = variable - - def fetch_label_and_description_and_aliases(self, - task: Task = None): - """Fetch label and aliases in the task language from the Wikidata API""" - if task is None: - raise ValueError("task was None") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - from src.helpers.console import console - with console.status(f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."): - wbi = WikibaseIntegrator() - item = wbi.item.get(self.id) - label = item.labels.get(task.language_code.value) - if label is not None: - self.label = str(label) - description = item.descriptions.get(task.language_code.value) - if description is not None: - self.description = str(description) - aliases: List[Alias] = item.aliases.get(task.language_code.value) - # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") - if aliases is not None: - self.aliases = [] - for alias in aliases: - self.aliases.append(str(alias)) - # logging.debug(f"appended:{alias.value}") - # logging.debug(f"aliases:{self.aliases}") - - -class Items: - list: List[Item] = [] - - def fetch_based_on_label(self): - pass - - def random_shuffle_list(self): - random.shuffle(self.list) diff --git a/src/models/wikimedia/__init__.py b/src/models/wikimedia/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/wikimedia/enum.py b/src/models/wikimedia/enum.py new file mode 100644 index 0000000..ef8afeb --- /dev/null +++ b/src/models/wikimedia/enum.py @@ -0,0 +1,35 @@ +from enum import Enum + + +class WikimediaLanguageCode(Enum): + BASQUE = "eu" + BENGALI = "bn" + BOKMÅL = "nb" + CZECH = "cs" + DANISH = "da" + ENGLISH = "en" + ESTONIAN = "et" + FRENCH = "fr" + GERMAN = "de" + HEBREW = "he" + LATIN = "la" + MALAYALAM = "ml" + RUSSIAN = "ru" + SWEDISH = "sv" + + +class WikimediaLanguageQID(Enum): + BASQUE = "Q8752" + BENGALI = "Q9610" + BOKMÅL = "Q25167" + CZECH = "Q9056" + DANISH = "Q9035" + ENGLISH = "Q1860" + ESTONIAN = "Q9072" + FRENCH = "Q150" + GERMAN = "Q188" + HEBREW = "Q9288" + LATIN = "Q397" + MALAYALAM = "Q36236" + RUSSIAN = "Q7737" + SWEDISH = "Q9027" \ No newline at end of file diff --git a/src/models/wikimedia/wikidata/__init__.py b/src/models/wikimedia/wikidata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/wikimedia/wikidata/entity.py b/src/models/wikimedia/wikidata/entity.py new file mode 100644 index 0000000..89a91e9 --- /dev/null +++ b/src/models/wikimedia/wikidata/entity.py @@ -0,0 +1,57 @@ +import logging +from typing import Optional + +from pydantic import BaseModel +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator import wbi_config +from wikibaseintegrator.datatypes import 
BaseDataType # type: ignore +from wikibaseintegrator.wbi_enums import ActionIfExists # type: ignore + +import config + +wbi_config.config['USER_AGENT'] = config.user_agent + + +class Entity(BaseModel): + """Base entity with code that is the same for both items and lexemes""" + id: Optional[str] + label: Optional[str] + + def __eq__(self, other): + """This helps in removing duplicates + https://stackoverflow.com/questions/4169252/remove-duplicates-in-list-of-object-with-python""" + return self.id == other.id + + def __hash__(self): + return hash(('id', self.id)) + + def upload_one_statement_to_wikidata(self, + statement: BaseDataType = None, + summary: str = None, + editgroups_hash: str = None): + """Upload one statement and always append + This mandates an editgroups hash to be supplied""" + logger = logging.getLogger(__name__) + if self.id is None: + raise ValueError("no id on item") + if statement is None: + raise ValueError("Statement was None") + if summary is None: + raise ValueError("summary was None") + if editgroups_hash is None: + raise ValueError("editgroup_hash was None") + if config.login_instance is None: + raise ValueError("No login instance in config.login_instance") + wbi = WikibaseIntegrator(login=config.login_instance) + item = wbi.item.get(self.id) + item.add_claims( + [statement], + action_if_exists=ActionIfExists.APPEND) + result = item.write( + summary=f"Added {summary} with [[{config.tool_wikipage}]] " + f"([[:toolforge:editgroups/b/CB/{editgroups_hash}|details]])" + ) + logger.debug(f"result from WBI:{result}") + + def url(self): + return f"http://www.wikidata.org/entity/{self.id}" diff --git a/src/models/wikimedia/wikidata/entiyt_id.py b/src/models/wikimedia/wikidata/entiyt_id.py new file mode 100644 index 0000000..fe86931 --- /dev/null +++ b/src/models/wikimedia/wikidata/entiyt_id.py @@ -0,0 +1,35 @@ +import logging + +import config +from src.models.wikimedia.wikidata.enums import WikidataNamespaceLetters + +# TODO convert this to special constr type with a validator +class EntityId: + letter: WikidataNamespaceLetters + # This can be e.g. 
"32698-F1" in the case of a lexeme + rest: str + + def __init__(self, + entity_id: str): + logger = logging.getLogger(__name__) + if entity_id is not None: + # Remove prefix if found + if config.wd_prefix in entity_id: + logger.debug("Removing prefix") + entity_id = entity_id.replace(config.wd_prefix, "") + if len(entity_id) > 1: + logger.info(f"entity_id:{entity_id}") + self.letter = WikidataNamespaceLetters(entity_id[0]) + self.rest = entity_id[1:] + else: + raise ValueError("Entity ID was too short.") + else: + raise ValueError("Entity ID was None") + + def __str__(self): + return f"{self.letter.value}{self.rest}" + + # def extract_wdqs_json_entity_id(self, json: Dict, sparql_variable: str): + # self.__init__(json[sparql_variable]["value"].replace( + # config.wd_prefix, "" + # )) diff --git a/src/models/wikimedia/wikidata/enums.py b/src/models/wikimedia/wikidata/enums.py new file mode 100644 index 0000000..1e40e46 --- /dev/null +++ b/src/models/wikimedia/wikidata/enums.py @@ -0,0 +1,36 @@ +from enum import Enum + + +class WikidataGrammaticalFeature(Enum): + ACTIVE_VOICE = "Q1317831" + DEFINITE = "Q53997851" + GENITIVE_CASE = "Q146233" + IMPERATIVE = "Q22716" + INDEFINITE = "Q53997857" + INFINITIVE = "Q179230" + NOMINATIVE_CASE = "Q131105" + PASSIVE_VOICE = "Q1194697" + PLURAL = "Q146786" + PRESENT_TENSE = "Q192613" + PRETERITE = "Q442485" + SIMPLE_PRESENT = "Q3910936" + SINGULAR = "Q110786" + SUPINE = "Q548470" + THIRD_PERSON_SINGULAR = "Q51929447" + + +class WikidataLexicalCategory(Enum): + ADJECTIVE = "Q34698" + ADVERB = "Q380057" + AFFIX = "Q62155" + NOUN = "Q1084" + PROPER_NOUN = "Q147276" + VERB = "Q24905" + + +class WikidataNamespaceLetters(Enum): + ITEM = "Q" + LEXEME = "L" + PROPERTY = "P" + # FORM = "F" + # SENSE = "S" diff --git a/src/models/wikimedia/wikidata/foreign_id.py b/src/models/wikimedia/wikidata/foreign_id.py new file mode 100644 index 0000000..85e5c79 --- /dev/null +++ b/src/models/wikimedia/wikidata/foreign_id.py @@ -0,0 +1,21 @@ +from typing import Optional + +from src.models.wikimedia.wikidata.entiyt_id import EntityId + + +class ForeignID: + id: Optional[str] + property: Optional[str] # This is the property with type ExternalId + source_item_id: Optional[str] # This is the Q-item for the source + + def __init__(self, + id: Optional[str] = None, + property: Optional[str] = None, + source_item_id: Optional[str] = None): + self.id = id + if property is None: + raise ValueError("property was None") + self.property = str(EntityId(property)) + if source_item_id is None: + raise ValueError("source_item_id was None") + self.source_item_id = str(EntityId(source_item_id)) diff --git a/src/models/wikimedia/wikidata/item.py b/src/models/wikimedia/wikidata/item.py new file mode 100644 index 0000000..1e041d0 --- /dev/null +++ b/src/models/wikimedia/wikidata/item.py @@ -0,0 +1,50 @@ +from typing import List, Optional + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator import wbi_config # type: ignore +from wikibaseintegrator.models import Alias # type: ignore + +import config +from src.models.task import Task +from src.models.wikimedia.wikidata.entity import Entity + +wbi_config.config['USER_AGENT'] = config.user_agent + + +class Item(Entity): + """This represents an item in Wikidata + We always work on one language at a time, + so we don't bother with languages here and keep to simple strings""" + description: Optional[str] = None + aliases: Optional[List[str]] = None + + def __str__(self): + return f"{self.label}, see {self.url()}" + 
+ def fetch_label_and_description_and_aliases(self, + task: Task = None): + """Fetch label and aliases in the task language from the Wikidata API""" + if task is None: + raise ValueError("task was None") + if not isinstance(task, Task): + raise ValueError("task was not a Task object") + if task.language_code is None: + raise ValueError("task.language_code was None") + from src.helpers.console import console + with console.status(f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."): + wbi = WikibaseIntegrator() + item = wbi.item.get(self.id) + label = item.labels.get(task.language_code.value) + if label is not None: + self.label = str(label) + description = item.descriptions.get(task.language_code.value) + if description is not None: + self.description = str(description) + aliases: List[Alias] = item.aliases.get(task.language_code.value) + # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") + if aliases is not None: + self.aliases = [] + for alias in aliases: + self.aliases.append(str(alias)) + # logging.debug(f"appended:{alias.value}") + # logging.debug(f"aliases:{self.aliases}") diff --git a/src/models/wikimedia/wikidata/sparql_item.py b/src/models/wikimedia/wikidata/sparql_item.py new file mode 100644 index 0000000..e486a13 --- /dev/null +++ b/src/models/wikimedia/wikidata/sparql_item.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +from src.models.wikimedia.wikidata.entiyt_id import EntityId +from src.models.wikimedia.wikidata.item import Item + + +class Value(BaseModel): + value: str + + +class SparqlItem(Item): + """This class models the data we get from SPARQL""" + item: Value + itemLabel: Value + + def validate_qid_and_copy_label(self): + self.id = str(EntityId(self.item.value)) + self.label = self.itemLabel.value diff --git a/src/tasks.py b/src/tasks.py index 5b80b5e..1e1a7a9 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -4,7 +4,7 @@ # When adding a new task, also add it in the enum tasks = [ - Task( + Task(**dict( id=TaskIds.SCHOLARLY_ARTICLES, label="Add main subject to scholarly articles and preprints", language_code=SupportedLanguageCode.ENGLISH, @@ -22,14 +22,14 @@ "sub forms of screening have been matched." ), number_of_queries_per_search_string=2 - ), - Task( + )), + Task(**dict( id=TaskIds.RIKSDAGEN_DOCUMENTS, label="Add main subject to documents from Riksdagen", language_code=SupportedLanguageCode.SWEDISH, best_practice_information=None - ), - Task( + )), + Task(**dict( id=TaskIds.THESIS, label="Add main subject to thesis' and technical reports", language_code=SupportedLanguageCode.ENGLISH, @@ -46,11 +46,11 @@ "avoid the more general 'cancer screening' until all " "sub forms of screening have been matched." ), - ), - Task( + )), + Task(**dict( id=TaskIds.ACADEMIC_JOURNALS, label="Add main subject to academic journals", language_code=SupportedLanguageCode.ENGLISH, best_practice_information=None - ), + )), ]
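
The CirrusSearch filter built in ScholarlyArticleItems.fetch_based_on_label restricts the search to instances of scholarly article (P31=Q13442814) that do not yet carry the suggested main subject (P921). A minimal sketch of that string-building logic follows; the function name build_cirrussearch_parameters is illustrative and not part of the patch.

# Sketch of the CirrusSearch filter used when fetching scholarly article candidates.
def build_cirrussearch_parameters(main_subject_qid: str,
                                  limit_to_items_without_p921: bool = False) -> str:
    if limit_to_items_without_p921:
        # Exclude anything that already has some P921 (main subject) statement.
        return "haswbstatement:P31=Q13442814 -haswbstatement:P921"
    # Exclude only items that already carry this particular main subject.
    return f"haswbstatement:P31=Q13442814 -haswbstatement:P921={main_subject_qid}"


print(build_cirrussearch_parameters("Q1234"))
# haswbstatement:P31=Q13442814 -haswbstatement:P921=Q1234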
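
Suggestion is now a pydantic model that carries an argparse.Namespace, which is why its Config sets arbitrary_types_allowed. Below is a sketch of construction and of the validation that replaces the old hand-written None checks; the Namespace attributes shown are placeholders, not the full CLI surface.

import argparse

from pydantic import ValidationError

from src.models.suggestion import Suggestion
from src.models.wikimedia.wikidata.item import Item
from src.tasks import tasks

# argparse.Namespace is not a type pydantic can validate on its own;
# arbitrary_types_allowed lets it pass with a plain isinstance check.
args = argparse.Namespace(no_aliases=False, limit_to_items_without_p921=False, debug_sparql=False)
suggestion = Suggestion(item=Item(id="Q1234", label="example subject"), task=tasks[0], args=args)
# Note: extract_search_strings() is no longer invoked by an __init__, so callers run it themselves.

# The old __init__ raised ValueError("item was None"); pydantic now rejects
# a missing item with a ValidationError.
try:
    Suggestion(task=tasks[0], args=args)
except ValidationError as error:
    print(error)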
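
src/tasks.py now builds each Task as Task(**dict(...)); with Task being a pydantic BaseModel this is equivalent to passing the keyword arguments directly, and required fields are enforced by pydantic rather than by the removed __init__ checks. A small sketch using the enums from the diff:

from pydantic import ValidationError

from src.helpers.enums import SupportedLanguageCode, TaskIds
from src.models.task import Task

# Same effect as the Task(**dict(...)) calls in src/tasks.py.
task = Task(
    id=TaskIds.RIKSDAGEN_DOCUMENTS,
    label="Add main subject to documents from Riksdagen",
    language_code=SupportedLanguageCode.SWEDISH,
    best_practice_information=None,
)
print(task)  # __str__ prints the label

# Omitting a required field now raises ValidationError instead of ValueError("Got no id").
try:
    Task(label="incomplete task", best_practice_information=None)
except ValidationError as error:
    print(error)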
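
The new Entity base model defines __eq__ and __hash__ on the Wikidata id alone, so items fetched by more than one search string can be de-duplicated by QID. A sketch; the dict.fromkeys de-duplication is illustrative and not taken from the patch.

from src.models.wikimedia.wikidata.item import Item

first = Item(id="Q1", label="Universe")
second = Item(id="Q1", label="universe")  # same QID, different label casing
third = Item(id="Q2", label="Earth")

assert first == second  # equality looks only at the id
deduplicated = list(dict.fromkeys([first, second, third]))
print(len(deduplicated))  # 2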
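
SparqlItem models one row of a WDQS result; validate_qid_and_copy_label() strips the http://www.wikidata.org/entity/ prefix via EntityId and copies the label into the plain Item fields. A sketch with a hand-written binding (the dict literal is an assumed example, not output captured from the tool):

from src.models.wikimedia.wikidata.sparql_item import SparqlItem

binding = {
    "item": {"value": "http://www.wikidata.org/entity/Q13442814"},
    "itemLabel": {"value": "scholarly article"},
}

item = SparqlItem(**binding)  # nested dicts are coerced into Value models
item.validate_qid_and_copy_label()
print(item.id)     # Q13442814
print(item.label)  # scholarly article
print(item.url())  # http://www.wikidata.org/entity/Q13442814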