From 997acd3f19bc95ba455466a5e120f32ad375d4a1 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 17 Feb 2022 09:30:58 +0100 Subject: [PATCH 01/13] Add diagram sequence_sparql.puml --- diagrams/sequence_sparql.puml | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 diagrams/sequence_sparql.puml diff --git a/diagrams/sequence_sparql.puml b/diagrams/sequence_sparql.puml new file mode 100644 index 0000000..3f94e25 --- /dev/null +++ b/diagrams/sequence_sparql.puml @@ -0,0 +1,50 @@ +@startuml +'https://plantuml.com/sequence-diagram + +autonumber +actor User +'cloud Wikidata +User -> ItemSubjector : start script +alt "arguments: sparql && limit" + ItemSubjector -> Wikidata : fetch subjects + Wikidata -> ItemSubjector : response + loop "for each item in list" + alt "below limit" + ItemSubjector -> Wikidata : fetch details about the item + Wikidata -> ItemSubjector : response + ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details + Wikidata -> ItemSubjector : response + ItemSubjector -> User : present max 50 items + ItemSubjector -> User : ask for approval of batch + ItemSubjector -> User : show count of batches and matches in the job list in memory + end + alt "above limit" + ItemSubjector -> User : ask before continuing + end + end + alt "user choose not to continue" + ItemSubjector -> Wikidata : Upload main subjects to all matches + end +end +alt "arguments: sparql && limit && prepare-jobs" + ItemSubjector -> Wikidata : fetch subjects + Wikidata -> ItemSubjector : response + loop "for each item in list" + alt "below limit" + ItemSubjector -> Wikidata : fetch details about the item + Wikidata -> ItemSubjector : response + ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details + Wikidata -> ItemSubjector : response + ItemSubjector -> User : present max 50 items + ItemSubjector -> User : ask for approval of batch + ItemSubjector -> User : show count of batches and matches in the job list in memory + end + alt "above limit" + ItemSubjector -> User : ask before continuing + end + end + alt "user choose not to continue" + ItemSubjector -> Wikidata : save to job list on disk + end +end +@enduml \ No newline at end of file From 45942695bda8caa8a6c0f4c9a5bb2d7f51cf27f5 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 17 Feb 2022 09:50:33 +0100 Subject: [PATCH 02/13] Add diagram classes.puml (WIP) --- diagrams/classes.puml | 69 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 diagrams/classes.puml diff --git a/diagrams/classes.puml b/diagrams/classes.puml new file mode 100644 index 0000000..753459f --- /dev/null +++ b/diagrams/classes.puml @@ -0,0 +1,69 @@ +@startuml +'https://plantuml.com/class-diagram + +abstract class Items +Items <|-- AcademicJournalItems +Items <|-- RiksdagenDocumentItems +Items <|-- ScholarlyArticleItems +Items <|-- ThesisItems + +'package helpers { +'} +class AcademicJournalItems { +fetch_based_on_label() +} +class RiksdagenDocumentItems { ++list ++fetch_based_on_label() +} + +class ScholarlyArticleItems { ++list ++fetch_based_on_label() +} +class ThesisItems { +list +fetch_based_on_label() +} + +class Suggestion { + item: Item = None + search_strings: List[str] = None + task: Task = None + args: argparse.Namespace = None + __init__() + __str__() + add_to_items() + 
extract_search_strings() + search_urls ()) +} + +class Task { + best_practice_information: Union[str, None] = None + id: TaskIds = None + label: str = None + language_code: SupportedLanguageCode = None + number_of_queries_per_search_string = 1 + __init__() + __str__() +} + +class BatchJob { + +suggestion: Suggestion + +items: Items + run() +} +class QuickStatementsCommandVersion1 { + +target: EntityID = None + +property: EntityID = None + +value: EntityID = None + -__str__() +} + +enum TimeUnit { +DAYS +HOURS +MINUTES +} + +@enduml \ No newline at end of file From 7155e6ca179f5e3284ddd0efa6b508d3836dab79 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 17 Feb 2022 10:27:10 +0100 Subject: [PATCH 03/13] Reorganize models into one file per class. Run mypy on the codebase and fix validation errors. --- config.example.py | 4 +- fetch_main_subjects.py | 8 +- itemsubjector.py | 4 +- requirements.txt | 3 +- src/__init__.py | 10 +- src/helpers/cleaning.py | 24 +- src/helpers/console.py | 14 +- src/helpers/enums.py | 2 +- src/helpers/jobs.py | 5 +- src/helpers/menus.py | 7 +- src/helpers/pickle.py | 10 +- src/models/academic_journals.py | 25 +- src/models/batch_job.py | 2 +- src/models/quickstatements.py | 9 +- src/models/riksdagen_documents.py | 7 +- src/models/scholarly_articles.py | 134 ++--- src/models/suggestion.py | 20 +- src/models/task.py | 8 +- src/models/thesis.py | 7 +- src/models/wikidata.py | 883 ------------------------------ src/models/wikidata/__init__.py | 0 src/models/wikidata/entity.py | 47 ++ src/models/wikidata/entiyt_id.py | 35 ++ src/models/wikidata/enums.py | 72 +++ src/models/wikidata/foreign_id.py | 21 + src/models/wikidata/item.py | 103 ++++ src/models/wikidata/items.py | 14 + 27 files changed, 458 insertions(+), 1020 deletions(-) delete mode 100644 src/models/wikidata.py create mode 100644 src/models/wikidata/__init__.py create mode 100644 src/models/wikidata/entity.py create mode 100644 src/models/wikidata/entiyt_id.py create mode 100644 src/models/wikidata/enums.py create mode 100644 src/models/wikidata/foreign_id.py create mode 100644 src/models/wikidata/item.py create mode 100644 src/models/wikidata/items.py diff --git a/config.example.py b/config.example.py index 256149d..deb86c4 100644 --- a/config.example.py +++ b/config.example.py @@ -3,12 +3,14 @@ from pathlib import Path # Add your botpassword and login here: +from typing import List + username = "" password = "" # Global settings wiki_user = "User:Username" # Change this to your username -list_of_allowed_aliases = [] # Add elements like this ["API"] +list_of_allowed_aliases: List[str] = [] # Add elements like this ["API"] logging.basicConfig(level=logging.WARNING) version = "0.2" # Don't touch this. 
wd_prefix = "http://www.wikidata.org/entity/" diff --git a/fetch_main_subjects.py b/fetch_main_subjects.py index 27e83cd..14740e1 100644 --- a/fetch_main_subjects.py +++ b/fetch_main_subjects.py @@ -1,8 +1,8 @@ import logging import random -from wikibaseintegrator import wbi_config -from wikibaseintegrator.wbi_helpers import execute_sparql_query +from wikibaseintegrator import wbi_config # type: ignore +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore import config from src import console @@ -19,7 +19,7 @@ # the same subset of subjects every time we run it randomizing_offset: int = random.randint(1, 500000) console.print(f"Random offset used: {randomizing_offset} for this run") -for i in range(0+randomizing_offset, 100000+randomizing_offset, 10000): +for i in range(0 + randomizing_offset, 100000 + randomizing_offset, 10000): print(i) # title: Get main subjects used at least once on scholarly articles results = execute_sparql_query(f""" @@ -68,4 +68,4 @@ console.print(f"Saving {len(subjects_without_duplicates)} " f"to pickle '{config.main_subjects_pickle_file_path}' (overwriting)") add_to_main_subject_pickle(subjects) -console.print("Done") \ No newline at end of file +console.print("Done") diff --git a/itemsubjector.py b/itemsubjector.py index 344ed19..182c802 100644 --- a/itemsubjector.py +++ b/itemsubjector.py @@ -1,6 +1,6 @@ import logging -from src import * +import src logging.basicConfig(level=logging.DEBUG) -main() \ No newline at end of file +src.main() diff --git a/requirements.txt b/requirements.txt index bef4108..404a2cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ console-menu git+git://github.com/LeMyst/WikibaseIntegrator@v0.12.0.dev5#egg=wikibaseintegrator rich~=10.9.0 -SPARQLWrapper~=1.8.5 \ No newline at end of file +SPARQLWrapper~=1.8.5 +pydantic \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py index 7223475..5ee4549 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -2,8 +2,9 @@ import logging from typing import List -from wikibaseintegrator import wbi_login, wbi_config -from wikibaseintegrator.wbi_helpers import execute_sparql_query +import pandas as pd # type: ignore +from wikibaseintegrator import wbi_login, wbi_config # type: ignore +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore import config from src.helpers.argparse_setup import setup_argparse_and_return_args @@ -22,7 +23,7 @@ from src.models.quickstatements import QuickStatementsCommandVersion1 from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata import Item, EntityID +from src.models.wikidata.entiyt_id import EntityID from src.tasks import tasks @@ -62,6 +63,8 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None, raise ValueError("jobs was None") if not isinstance(jobs, List): raise ValueError("jobs was not a list") + if args is None or args.sparql is None: + raise ValueError("args.sparql was None") if "P1889" not in args.sparql: console.print("Your SPARQL did not contain P1889 (different from). " "Please include 'MINUS {?item wdt:P1889 [].}' " @@ -117,6 +120,7 @@ def export_jobs_to_dataframe(): else: console.print("No jobs found. Create a job list first by using '--prepare-jobs'") + def export_jobs_to_quickstatements(): logger = logging.getLogger(__name__) logger.info("Exporting jobs to QuickStatements V1 commands. 
One file for each job.") diff --git a/src/helpers/cleaning.py b/src/helpers/cleaning.py index b15ffec..12de0a9 100644 --- a/src/helpers/cleaning.py +++ b/src/helpers/cleaning.py @@ -5,17 +5,17 @@ def strip_bad_chars(string): # https://stackoverflow.com/questions/3411771/best-way-to-replace-multiple-characters-in-a-string return ( string - # Needed for matching backslashes e.g. "Dmel\CG5330" on Q29717230 - .replace("\\", "\\\\") - # Needed for when labels contain apostrophe - .replace("'", "\\'") - .replace(",", "") - .replace(":", "") - .replace(";", "") - .replace("(", "") - .replace(")", "") - .replace("[", "") - .replace("]", "") + # Needed for matching backslashes e.g. "Dmel\CG5330" on Q29717230 + .replace("\\", "\\\\") + # Needed for when labels contain apostrophe + .replace("'", "\\'") + .replace(",", "") + .replace(":", "") + .replace(";", "") + .replace("(", "") + .replace(")", "") + .replace("[", "") + .replace("]", "") ) @@ -30,4 +30,4 @@ def strip_prefix(qid): if "http://www.wikidata.org/entity/" in qid: qid = qid[31:] # logger.debug(f"qid:{qid}") - return qid \ No newline at end of file + return qid diff --git a/src/helpers/console.py b/src/helpers/console.py index 6a19f23..0c8163a 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -8,7 +8,7 @@ from src.helpers.cleaning import clean_rich_formatting from src.models.batch_job import BatchJob from src.models.task import Task -from src.models.wikidata import Items +from src.models.wikidata.items import Items console = Console() @@ -98,11 +98,13 @@ def print_found_items_table(args: argparse.Namespace = None, def ask_add_to_job_queue(job: BatchJob = None): - return ask_yes_no_question(f"Do you want to add this job for " - f"[magenta]{job.suggestion.item.label}: " - f"{job.suggestion.item.description}[/magenta] with " - f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})") - + if job is not None: + return ask_yes_no_question(f"Do you want to add this job for " + f"[magenta]{job.suggestion.item.label}: " + f"{job.suggestion.item.description}[/magenta] with " + f"{len(job.items.list)} items to the queue? 
(see {job.suggestion.item.url()})") + else: + raise ValueError("job was None") def print_running_jobs(jobs: List[BatchJob] = None): if jobs is None: diff --git a/src/helpers/enums.py b/src/helpers/enums.py index bb7fff0..be9cd65 100644 --- a/src/helpers/enums.py +++ b/src/helpers/enums.py @@ -10,4 +10,4 @@ class TaskIds(Enum): SCHOLARLY_ARTICLES = auto() RIKSDAGEN_DOCUMENTS = auto() THESIS = auto() - ACADEMIC_JOURNALS = auto() \ No newline at end of file + ACADEMIC_JOURNALS = auto() diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index b0ed45a..5e9d1cf 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -14,6 +14,7 @@ from src.models.riksdagen_documents import RiksdagenDocumentItems from src.models.scholarly_articles import ScholarlyArticleItems from src.models.thesis import ThesisItems +from src.models.wikidata.items import Items from src.tasks import tasks, Task if TYPE_CHECKING: @@ -31,7 +32,7 @@ def process_qid_into_job(qid: str = None, raise ValueError("args was None") if task is None: raise ValueError("task was None") - from src import Item + from src.models.wikidata.item import Item item = Item( id=strip_prefix(qid), task=task @@ -53,6 +54,7 @@ def process_qid_into_job(qid: str = None, f'the search strings by running a total of ' f'{len(suggestion.search_strings) * task.number_of_queries_per_search_string} ' f'queries on WDQS...'): + items: Items = None if task.id == TaskIds.SCHOLARLY_ARTICLES: items = ScholarlyArticleItems() elif task.id == TaskIds.RIKSDAGEN_DOCUMENTS: @@ -83,6 +85,7 @@ def process_qid_into_job(qid: str = None, return None else: console.print(f"Label for {task.language_code} was None on {item.url()}, skipping") + return None def process_user_supplied_qids_into_batch_jobs(args: argparse.Namespace = None, diff --git a/src/helpers/menus.py b/src/helpers/menus.py index a35973e..4d0c466 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -1,10 +1,10 @@ import logging from typing import List -from consolemenu import SelectionMenu +from consolemenu import SelectionMenu # type: ignore from src.models.suggestion import Suggestion -from src.models.wikidata import Item +from src.models.wikidata.item import Item from src.tasks import tasks, Task @@ -41,7 +41,6 @@ def select_task() -> Task: f"{selected_task}") return selected_task - # def select_language(): # logger = logging.getLogger(__name__) # menu = SelectionMenu(WikimediaLanguageCode.__members__.keys(), "Select a language") @@ -68,4 +67,4 @@ def select_task() -> Task: # selected_lexical_category = category_mapping[selected_lexical_category_index] # logger.debug(f"selected:{selected_lexical_category_index}=" # f"{selected_lexical_category}") -# return selected_lexical_category \ No newline at end of file +# return selected_lexical_category diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py index f2a5085..05ee8b9 100644 --- a/src/helpers/pickle.py +++ b/src/helpers/pickle.py @@ -1,7 +1,7 @@ -import os import hashlib +import os import pickle -from typing import List +from typing import List, Optional import config from src.helpers.console import console @@ -17,6 +17,8 @@ def add_to_job_pickle(job: BatchJob = None): def add_to_main_subject_pickle(subjects: List[str] = None): + if subjects is None: + raise ValueError("subjects was None") with open(config.main_subjects_pickle_file_path, 'wb') as file: for qid in subjects: pickle.dump(qid, file, pickle.DEFAULT_PROTOCOL) @@ -38,7 +40,7 @@ def check_if_pickle_exists(path): return False -def parse_job_pickle(silent: bool = False) -> 
List[BatchJob]: +def parse_job_pickle(silent: bool = False) -> Optional[List[BatchJob]]: """Reads the pickle into a list of batch jobs""" if check_if_pickle_exists(config.job_pickle_file_path): jobs: List[BatchJob] = [] @@ -47,11 +49,13 @@ def parse_job_pickle(silent: bool = False) -> List[BatchJob]: if len(jobs) == 0: if not silent: console.print("No prepared jobs found") + return None else: return jobs else: if not silent: console.print("No pickle file found") + return None def parse_main_subjects_pickle() -> List[str]: diff --git a/src/models/academic_journals.py b/src/models/academic_journals.py index 4ecf2cb..14d0977 100644 --- a/src/models/academic_journals.py +++ b/src/models/academic_journals.py @@ -1,22 +1,14 @@ import logging -from wikibaseintegrator.wbi_helpers import execute_sparql_query +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore import config from src.helpers.cleaning import strip_bad_chars from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata import Items, Item - - -def process_results(results): - items = [] - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = Item(json=item_json) - items.append(item) - return items +from src.models.wikidata.item import Item +from src.models.wikidata.items import Items class AcademicJournalItems(Items): @@ -25,11 +17,22 @@ class AcademicJournalItems(Items): def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): + def process_results(results): + # TODO refactor into private method + items = [] + for item_json in results["results"]["bindings"]: + logging.debug(f"item_json:{item_json}") + item = Item(json=item_json) + items.append(item) + return items + # logger = logging.getLogger(__name__) if suggestion is None: raise ValueError("suggestion was None") if task is None: raise ValueError("task was None") + if task.language_code is None: + raise ValueError("task.language_code was None") # Fetch all items matching the search strings self.list = [] for search_string in suggestion.search_strings: diff --git a/src/models/batch_job.py b/src/models/batch_job.py index b96c5b4..770d9ea 100644 --- a/src/models/batch_job.py +++ b/src/models/batch_job.py @@ -4,7 +4,7 @@ if TYPE_CHECKING: from src.models.suggestion import Suggestion - from src.models.wikidata import Items + from src.models.wikidata.items import Items @dataclass diff --git a/src/models/quickstatements.py b/src/models/quickstatements.py index e5313fa..b6f3245 100644 --- a/src/models/quickstatements.py +++ b/src/models/quickstatements.py @@ -1,6 +1,7 @@ from dataclasses import dataclass +from typing import Optional -from src.models.wikidata import EntityID +from src.models.wikidata.entiyt_id import EntityID @dataclass @@ -10,9 +11,9 @@ class QuickStatementsCommandVersion1: For now we only support QID-values Q1\tP1\tQ1""" - target: EntityID = None - property: EntityID = None - value: EntityID = None + target: Optional[EntityID] = None + property: Optional[EntityID] = None + value: Optional[EntityID] = None def __str__(self): return f"{self.target}\t{self.property}\t{self.value}" diff --git a/src/models/riksdagen_documents.py b/src/models/riksdagen_documents.py index d2fb8df..daa4d0b 100644 --- a/src/models/riksdagen_documents.py +++ b/src/models/riksdagen_documents.py @@ -1,12 +1,13 @@ import logging -from wikibaseintegrator.wbi_helpers import execute_sparql_query +from 
wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore import config from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata import Items, Item +from src.models.wikidata.item import Item +from src.models.wikidata.items import Items class RiksdagenDocumentItems(Items): @@ -21,6 +22,8 @@ def fetch_based_on_label(self, "supported yet for this task.") if task is None: raise ValueError("task was None") + if task.language_code is None: + raise ValueError("task.language_code was None") # Fetch all items maching the search strings self.list = [] # Include spaces around the n-gram to avoid edits like this one diff --git a/src/models/scholarly_articles.py b/src/models/scholarly_articles.py index 812ffba..c8672ed 100644 --- a/src/models/scholarly_articles.py +++ b/src/models/scholarly_articles.py @@ -1,83 +1,87 @@ import logging -from wikibaseintegrator.wbi_helpers import execute_sparql_query +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore import config from src.helpers.cleaning import strip_bad_chars from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata import Items, Item - - -def build_query(suggestion: Suggestion = None, - search_string: str = None, - task: Task = None, - cirrussearch_parameters: str = None): - if suggestion is None: - raise ValueError("suggestion was None") - if search_string is None: - raise ValueError("search_string was None") - if task is None: - raise ValueError("task was None") - if cirrussearch_parameters is None: - raise ValueError("cirrussearch_parameters was None") - # This query uses https://www.w3.org/TR/sparql11-property-paths/ to - # find subjects that are subclass of one another up to 3 hops away - # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI - # which has a hardcoded limit of 10,000 items so you will never get more matches than that - # This query use regex to match beginning, middle and end of the label of matched items - # The replacing lines should match the similar python replacements in cleaning.py - # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in - # SPARQL where it becomes "\\" and thus match a single backslash - return (f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - hint:Query hint:optimizer "None". - BIND(STR('{cirrussearch_parameters} \"{search_string}\"') as ?search_string) - SERVICE wikibase:mwapi {{ - bd:serviceParam wikibase:api "Search"; - wikibase:endpoint "www.wikidata.org"; - mwapi:srsearch ?search_string. - ?title wikibase:apiOutput mwapi:title. - }} - BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) - ?item rdfs:label ?label. 
- BIND(REPLACE(LCASE(?label), ",", "") as ?label1) - BIND(REPLACE(?label1, ":", "") as ?label2) - BIND(REPLACE(?label2, ";", "") as ?label3) - BIND(REPLACE(?label3, "\\\\(", "") as ?label4) - BIND(REPLACE(?label4, "\\\\)", "") as ?label5) - BIND(REPLACE(?label5, "\\\\[", "") as ?label6) - BIND(REPLACE(?label6, "\\\\]", "") as ?label7) - BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8) - BIND(?label8 as ?cleaned_label) - FILTER(CONTAINS(?cleaned_label, ' {search_string.lower()} '@{task.language_code.value}) || - REGEX(?cleaned_label, '.* {search_string.lower()}$'@{task.language_code.value}) || - REGEX(?cleaned_label, '^{search_string.lower()} .*'@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} - }} - """) - - -def process_results(results): - items = [] - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = Item(json=item_json) - items.append(item) - return items +from src.models.wikidata.item import Item +from src.models.wikidata.items import Items class ScholarlyArticleItems(Items): """This supports both published peer reviewed articles and preprints""" + def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): + def build_query(suggestion: Suggestion = None, + search_string: str = None, + task: Task = None, + cirrussearch_parameters: str = None): + # TODO refactor + if suggestion is None: + raise ValueError("suggestion was None") + if search_string is None: + raise ValueError("search_string was None") + if task is None: + raise ValueError("task was None") + if task.language_code is None: + raise ValueError("task.language_code was None") + if cirrussearch_parameters is None: + raise ValueError("cirrussearch_parameters was None") + # This query uses https://www.w3.org/TR/sparql11-property-paths/ to + # find subjects that are subclass of one another up to 3 hops away + # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI + # which has a hardcoded limit of 10,000 items so you will never get more matches than that + # This query use regex to match beginning, middle and end of the label of matched items + # The replacing lines should match the similar python replacements in cleaning.py + # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in + # SPARQL where it becomes "\\" and thus match a single backslash + return (f""" + #{config.user_agent} + SELECT DISTINCT ?item ?itemLabel + WHERE {{ + hint:Query hint:optimizer "None". + BIND(STR('{cirrussearch_parameters} \"{search_string}\"') as ?search_string) + SERVICE wikibase:mwapi {{ + bd:serviceParam wikibase:api "Search"; + wikibase:endpoint "www.wikidata.org"; + mwapi:srsearch ?search_string. + ?title wikibase:apiOutput mwapi:title. + }} + BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) + ?item rdfs:label ?label. 
+ BIND(REPLACE(LCASE(?label), ",", "") as ?label1) + BIND(REPLACE(?label1, ":", "") as ?label2) + BIND(REPLACE(?label2, ";", "") as ?label3) + BIND(REPLACE(?label3, "\\\\(", "") as ?label4) + BIND(REPLACE(?label4, "\\\\)", "") as ?label5) + BIND(REPLACE(?label5, "\\\\[", "") as ?label6) + BIND(REPLACE(?label6, "\\\\]", "") as ?label7) + BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8) + BIND(?label8 as ?cleaned_label) + FILTER(CONTAINS(?cleaned_label, ' {search_string.lower()} '@{task.language_code.value}) || + REGEX(?cleaned_label, '.* {search_string.lower()}$'@{task.language_code.value}) || + REGEX(?cleaned_label, '^{search_string.lower()} .*'@{task.language_code.value})) + MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} + MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} + MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} + }} + """) + + def process_results(results): + # TODO refactor + items = [] + for item_json in results["results"]["bindings"]: + logging.debug(f"item_json:{item_json}") + item = Item(json=item_json) + items.append(item) + return items + # logger = logging.getLogger(__name__) if suggestion is None: raise ValueError("suggestion was None") diff --git a/src/models/suggestion.py b/src/models/suggestion.py index d1cac86..b1224de 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -1,25 +1,25 @@ import argparse import logging -from typing import List +from typing import List, Optional from urllib.parse import quote -from wikibaseintegrator.datatypes import Item as ItemType +from wikibaseintegrator.datatypes import Item as ItemType # type: ignore import config from src.helpers.calculations import calculate_random_editgroups_hash from src.helpers.cleaning import clean_rich_formatting from src.helpers.console import print_search_strings_table, console -from src.helpers.enums import TaskIds from src.models.batch_job import BatchJob from src.models.task import Task -from src.models.wikidata import Item, Items +from src.models.wikidata.item import Item +from src.models.wikidata.items import Items class Suggestion: - item: Item = None - search_strings: List[str] = None - task: Task = None - args: argparse.Namespace = None + item: Optional[Item] = None + search_strings: Optional[List[str]] = None + task: Optional[Task] = None + args: Optional[argparse.Namespace] = None def __init__(self, item: Item = None, @@ -108,8 +108,8 @@ def clean_special_symbols(string: str): no_aliases = False self.search_strings: List[str] = [clean_special_symbols(self.item.label)] if ( - self.item.aliases is not None and - no_aliases is False + self.item.aliases is not None and + no_aliases is False ): for alias in self.item.aliases: # logger.debug(f"extracting alias:{alias}") diff --git a/src/models/task.py b/src/models/task.py index 1260025..7511718 100644 --- a/src/models/task.py +++ b/src/models/task.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Optional from src.helpers.enums import SupportedLanguageCode, TaskIds @@ -9,9 +9,9 @@ class Task: """This class holds the tasks presented to the user in the menu and related data""" best_practice_information: Union[str, None] = None - id: TaskIds = None - label: str = None - language_code: SupportedLanguageCode = None + id: Optional[TaskIds] = None + label: Optional[str] = None + language_code: Optional[SupportedLanguageCode] = None number_of_queries_per_search_string = 1 def 
__init__(self, diff --git a/src/models/thesis.py b/src/models/thesis.py index 5569ae9..bf23a74 100644 --- a/src/models/thesis.py +++ b/src/models/thesis.py @@ -1,11 +1,12 @@ import logging -from wikibaseintegrator.wbi_helpers import execute_sparql_query +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata import Items, Item +from src.models.wikidata.item import Item +from src.models.wikidata.items import Items # There were ~16.000 thesis' in WD when this was written @@ -22,6 +23,8 @@ def fetch_based_on_label(self, "supported yet for this task.") if task is None: raise ValueError("task was None") + if task.language_code is None: + raise ValueError("task.language_code was None") # Fetch all items maching the search strings self.list = [] for search_string in suggestion.search_strings: diff --git a/src/models/wikidata.py b/src/models/wikidata.py deleted file mode 100644 index f4a25ea..0000000 --- a/src/models/wikidata.py +++ /dev/null @@ -1,883 +0,0 @@ -""" -Model from LexUtils -""" -import logging -import random -from enum import Enum -from typing import List - -from wikibaseintegrator import wbi_config, WikibaseIntegrator -from wikibaseintegrator.datatypes import BaseDataType -from wikibaseintegrator.models import Alias -from wikibaseintegrator.wbi_enums import ActionIfExists - -import config -# We get the URL for the Wikibase from here -from src.models.task import Task - -wbi_config.config['USER_AGENT'] = config.user_agent - - -class WikidataGrammaticalFeature(Enum): - # Swedish - ACTIVE_VOICE = "Q1317831" - PRETERITE = "Q442485" - INFINITIVE = "Q179230" - PRESENT_TENSE = "Q192613" - SUPINE = "Q548470" - IMPERATIVE = "Q22716" - PASSIVE_VOICE = "Q1194697" - SINGULAR = "Q110786" - NOMINATIVE_CASE = "Q131105" - INDEFINITE = "Q53997857" - DEFINITE = "Q53997851" - PLURAL = "Q146786" - GENITIVE_CASE = "Q146233" - # English - SIMPLE_PRESENT = "Q3910936" - THIRD_PERSON_SINGULAR = "Q51929447" - - -class WikidataLexicalCategory(Enum): - NOUN = "Q1084" - VERB = "Q24905" - ADVERB = "Q380057" - ADJECTIVE = "Q34698" - AFFIX = "Q62155" - PROPER_NOUN = "Q147276" - - -class WikimediaLanguageCode(Enum): - DANISH = "da" - SWEDISH = "sv" - BOKMÅL = "nb" - ENGLISH = "en" - FRENCH = "fr" - RUSSIAN = "ru" - ESTONIAN = "et" - MALAYALAM = "ml" - LATIN = "la" - HEBREW = "he" - BASQUE = "eu" - GERMAN = "de" - BENGALI = "bn" - CZECH = "cs" - - -class WikimediaLanguageQID(Enum): - DANISH = "Q9035" - SWEDISH = "Q9027" - BOKMÅL = "Q25167" - ENGLISH = "Q1860" - FRENCH = "Q150" - RUSSIAN = "Q7737" - ESTONIAN = "Q9072" - MALAYALAM = "Q36236" - LATIN = "Q397" - HEBREW = "Q9288" - BASQUE = "Q8752" - GERMAN = "Q188" - BENGALI = "Q9610" - CZECH = "Q9056" - - -class WikidataNamespaceLetters(Enum): - PROPERTY = "P" - ITEM = "Q" - LEXEME = "L" - #FORM = "F" - #SENSE = "S" - - -class EntityID: - letter: WikidataNamespaceLetters - # This can be e.g. 
"32698-F1" in the case of a lexeme - rest: str - - def __init__(self, - entity_id: str): - logger = logging.getLogger(__name__) - if entity_id is not None: - # Remove prefix if found - if config.wd_prefix in entity_id: - logger.debug("Removing prefix") - entity_id = entity_id.replace(config.wd_prefix, "") - if len(entity_id) > 1: - logger.info(f"entity_id:{entity_id}") - self.letter = WikidataNamespaceLetters(entity_id[0]) - self.rest = entity_id[1:] - else: - raise ValueError("Entity ID was too short.") - else: - raise ValueError("Entity ID was None") - - def __str__(self): - return f"{self.letter.value}{self.rest}" - - # def extract_wdqs_json_entity_id(self, json: Dict, sparql_variable: str): - # self.__init__(json[sparql_variable]["value"].replace( - # config.wd_prefix, "" - # )) - - -class ForeignID: - id: str - property: str # This is the property with type ExternalId - source_item_id: str # This is the Q-item for the source - - def __init__(self, - id: str = None, - property: str = None, - source_item_id: str = None): - self.id = id - self.property = str(EntityID(property)) - self.source_item_id = str(EntityID(source_item_id)) - - -class Form: - """ - Model for a Wikibase form - """ - id: str - representation: str - grammatical_features: List[WikidataGrammaticalFeature] - # We store these on the form because they are needed - # to determine if an example fits or not - lexeme_id: str - lexeme_category: str - - def __init__(self, json): - """Parse the form json""" - logger = logging.getLogger(__name__) - try: - logger.info(json["lexeme"]) - self.id = str(EntityID(json["lexeme"]["value"])) - except KeyError: - pass - try: - logger.info(json["form"]) - self.id = str(EntityID(json["form"]["value"])) - except KeyError: - pass - try: - self.representation: str = json["form_representation"]["value"] - except KeyError: - pass - try: - self.lexeme_category: WikidataLexicalCategory = WikidataLexicalCategory( - str(EntityID(json["category"]["value"])) - ) - except: - raise ValueError(f'Could not find lexical category from ' - f'{json["category"]["value"]}') - try: - self.grammatical_features = [] - logger.info(json["grammatical_features"]) - for feature in json["grammatical_features"]["value"].split(","): - # TODO parse features with Enum - feature_id = WikidataGrammaticalFeature(str(EntityID(feature))) - self.grammatical_features.append(feature_id) - except KeyError: - pass - - -class Sense: - pass - - -class Entity: - """Base entity with code that is the same for both items and lexemes""" - id: str - label: str - - def upload_one_statement_to_wikidata(self, - statement: BaseDataType = None, - summary: str = None, - editgroups_hash: str = None): - """Upload one statement and always append - This mandates an editgroups hash to be supplied""" - logger = logging.getLogger(__name__) - if self.id is None: - raise ValueError("no id on item") - if statement is None: - raise ValueError("Statement was None") - if summary is None: - raise ValueError("summary was None") - if editgroups_hash is None: - raise ValueError("editgroup_hash was None") - if config.login_instance is None: - raise ValueError("No login instance in config.login_instance") - wbi = WikibaseIntegrator(login=config.login_instance) - item = wbi.item.get(self.id) - item.add_claims( - [statement], - action_if_exists=ActionIfExists.APPEND) - result = item.write( - summary=f"Added {summary} with [[{config.tool_wikipage}]] " - f"([[:toolforge:editgroups/b/CB/{editgroups_hash}|details]])" - ) - logger.debug(f"result from WBI:{result}") - - def 
url(self): - return f"http://www.wikidata.org/entity/{self.id}" - - -# class Lexeme(Entity): -# id: str -# lemma: str -# lexical_category: WikidataLexicalCategory -# forms: List[Form] -# senses: List[Sense] -# # Needed for duplicate lookup -# language_code: WikimediaLanguageCode -# -# def __init__(self, -# id: str = None, -# lemma: str = None, -# lexical_category: str = None, -# language_code: WikimediaLanguageCode = None): -# if id is not None: -# self.id = str(EntityID(id)) -# self.lemma = lemma -# if lexical_category is None: -# raise ValueError("Lexical category was None") -# if isinstance(lexical_category, WikidataLexicalCategory): -# self.lexical_category = lexical_category -# else: -# self.lexical_category = WikidataLexicalCategory(EntityID(lexical_category)) -# if language_code is not None: -# self.language_code: WikimediaLanguageCode = language_code -# -# def create(self): -# if self.id is not None: -# raise ValueError("Lexeme already has an id, aborting") -# lexeme = wbi_core.LexemeEngine() -# -# def parse_from_wdqs_json(self, json): -# self.forms = [] -# self.senses = [] -# for variable in json: -# logging.debug(variable) -# if variable == "form": -# form = Form(variable) -# self.forms.append(form) -# if variable == "sense": -# sense = Sense(variable) -# self.senses.append(sense) -# if variable == "category": -# self.lexical_category = EntityID(wdqs.extract_wikibase_value(variable)) -# -# def url(self): -# return f"{config.wd_prefix}{self.id}" -# -# def upload_foreign_id_to_wikidata(self, -# foreign_id: ForeignID = None): -# """Upload to enrich the wonderful Wikidata <3""" -# logger = logging.getLogger(__name__) -# if foreign_id is None: -# raise Exception("Foreign id was None") -# print(f"Uploading {foreign_id.id} to {self.id}: {self.lemma}") -# statement = wbi_datatype.ExternalID( -# prop_nr=foreign_id.property, -# value=foreign_id.id, -# ) -# described_by_source = wbi_datatype.ItemID( -# prop_nr="P1343", # stated in -# value=foreign_id.source_item_id -# ) -# # TODO does this overwrite or append? -# item = wbi_core.ItemEngine( -# data=[statement, -# described_by_source], -# item_id=self.id -# ) -# # debug WBI error -# # print(item.get_json_representation()) -# result = item.write( -# config.login_instance, -# edit_summary=f"Added foreign identifier with [[{config.tool_url}]]" -# ) -# logger.debug(f"result from WBI:{result}") -# print(self.url()) -# # exit(0) -# -# def count_number_of_senses_with_P5137(self): -# """Returns an int""" -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?sense) as ?count) -# WHERE {{ -# VALUES ?l {{wd:{self.id}}}. -# ?l ontolex:sense ?sense. -# ?sense skos:definition ?gloss. -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. 
-# }}''')) -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# return count -# -# def add_usage_example( -# document_id=None, -# sentence=None, -# lid=None, -# form_id=None, -# sense_id=None, -# word=None, -# publication_date=None, -# language_style=None, -# type_of_reference=None, -# source=None, -# line=None, -# ): -# # TODO convert to use OOP -# logger = logging.getLogger(__name__) -# # Use WikibaseIntegrator aka wbi to upload the changes in one edit -# link_to_form = wbi_datatype.Form( -# prop_nr="P5830", -# value=form_id, -# is_qualifier=True -# ) -# link_to_sense = wbi_datatype.Sense( -# prop_nr="P6072", -# value=sense_id, -# is_qualifier=True -# ) -# if language_style == "formal": -# style = "Q104597585" -# else: -# if language_style == "informal": -# style = "Q901711" -# else: -# print(_("Error. Language style {} ".format(language_style) + -# "not one of (formal,informal). Please report a bug at " + -# "https://github.com/egils-consulting/LexUtils/issues")) -# sleep(config.sleep_time) -# return "error" -# logging.debug("Generating qualifier language_style " + -# f"with {style}") -# language_style_qualifier = wbi_datatype.ItemID( -# prop_nr="P6191", -# value=style, -# is_qualifier=True -# ) -# # oral or written -# if type_of_reference == "written": -# medium = "Q47461344" -# else: -# if type_of_reference == "oral": -# medium = "Q52946" -# else: -# print(_("Error. Type of reference {} ".format(type_of_reference) + -# "not one of (written,oral). Please report a bug at " + -# "https://github.com/egils-consulting/LexUtils/issues")) -# sleep(config.sleep_time) -# return "error" -# logging.debug(_("Generating qualifier type of reference " + -# "with {}".format(medium))) -# type_of_reference_qualifier = wbi_datatype.ItemID( -# prop_nr="P3865", -# value=medium, -# is_qualifier=True -# ) -# if source == "riksdagen": -# if publication_date is not None: -# publication_date = datetime.fromisoformat(publication_date) -# else: -# print(_("Publication date of document {} ".format(document_id) + -# "is missing. We have no fallback for that at the moment. 
" + -# "Abort adding usage example.")) -# return "error" -# stated_in = wbi_datatype.ItemID( -# prop_nr="P248", -# value="Q21592569", -# is_reference=True -# ) -# # TODO lookup if we have a QID for the source -# document_id = wbi_datatype.ExternalID( -# prop_nr="P8433", # Riksdagen Document ID -# value=document_id, -# is_reference=True -# ) -# reference = [ -# stated_in, -# document_id, -# wbi_datatype.Time( -# prop_nr="P813", # Fetched today -# time=datetime.utcnow().replace( -# tzinfo=timezone.utc -# ).replace( -# hour=0, -# minute=0, -# second=0, -# ).strftime("+%Y-%m-%dT%H:%M:%SZ"), -# is_reference=True, -# ), -# wbi_datatype.Time( -# prop_nr="P577", # Publication date -# time=publication_date.strftime("+%Y-%m-%dT00:00:00Z"), -# is_reference=True, -# ), -# type_of_reference_qualifier, -# ] -# elif source == "europarl": -# stated_in = wbi_datatype.ItemID( -# prop_nr="P248", -# value="Q5412081", -# is_reference=True -# ) -# reference = [ -# stated_in, -# wbi_datatype.Time( -# prop_nr="P813", # Fetched today -# time=datetime.utcnow().replace( -# tzinfo=timezone.utc -# ).replace( -# hour=0, -# minute=0, -# second=0, -# ).strftime("+%Y-%m-%dT%H:%M:%SZ"), -# is_reference=True, -# ), -# wbi_datatype.Time( -# prop_nr="P577", # Publication date -# time="+2012-05-12T00:00:00Z", -# is_reference=True, -# ), -# wbi_datatype.Url( -# prop_nr="P854", # reference url -# value="http://www.statmt.org/europarl/v7/sv-en.tgz", -# is_reference=True, -# ), -# # filename in archive -# wbi_datatype.String( -# (f"europarl-v7.{config.language_code}" + -# f"-en.{config.language_code}"), -# "P7793", -# is_reference=True, -# ), -# # line number -# wbi_datatype.String( -# str(line), -# "P7421", -# is_reference=True, -# ), -# type_of_reference_qualifier, -# ] -# elif source == "ksamsok": -# # No date is provided unfortunately, so we set it to unknown value -# stated_in = wbi_datatype.ItemID( -# prop_nr="P248", -# value="Q7654799", -# is_reference=True -# ) -# document_id = wbi_datatype.ExternalID( -# # K-Samsök URI -# prop_nr="P1260", -# value=document_id, -# is_reference=True -# ) -# reference = [ -# stated_in, -# document_id, -# wbi_datatype.Time( -# prop_nr="P813", # Fetched today -# time=datetime.utcnow().replace( -# tzinfo=timezone.utc -# ).replace( -# hour=0, -# minute=0, -# second=0, -# ).strftime("+%Y-%m-%dT%H:%M:%SZ"), -# is_reference=True, -# ), -# wbi_datatype.Time( -# # We don't know the value of the publication dates unfortunately -# prop_nr="P577", # Publication date -# time="", -# snak_type="somevalue", -# is_reference=True, -# ), -# type_of_reference_qualifier, -# ] -# else: -# raise ValueError(f"Did not recognize the source {source}") -# if reference is None: -# raise ValueError(_("No reference defined, cannot add usage example")) -# else: -# # This is the usage example statement -# claim = wbi_datatype.MonolingualText( -# sentence, -# "P5831", -# language=config.language_code, -# # Add qualifiers -# qualifiers=[ -# link_to_form, -# link_to_sense, -# language_style_qualifier, -# ], -# # Add reference -# references=[reference], -# ) -# if config.debug_json: -# logging.debug(f"claim:{claim.get_json_representation()}") -# item = wbi_core.ItemEngine( -# item_id=lid, -# ) -# # Updating appends by default in v0.11.0 -# item.update(data=[claim]) -# # if config.debug_json: -# # print(item.get_json_representation()) -# if config.login_instance is None: -# # Authenticate with WikibaseIntegrator -# print("Logging in with Wikibase Integrator") -# config.login_instance = wbi_login.Login( -# user=config.username, 
pwd=config.password -# ) -# result = item.write( -# config.login_instance, -# edit_summary=( -# _("Added usage example " + -# "with [[Wikidata:Tools/LexUtils]] v{}".format(config.version)) -# ) -# ) -# if config.debug_json: -# logging.debug(f"result from WBI:{result}") -# # TODO add handling of result from WBI and return True == Success or False -# return result -# -# def find_duplicates(self): -# """Lookup duplicates using the -# Wikidata Lexeme Forms Duplicate API""" -# url = ("https://lexeme-forms.toolforge.org/api/v1/duplicates/www/" -# f"{self.language_code.value}/{self.lemma}") -# response = requests.get(url, headers={"Accept": "application/json"}) -# if response.status_code == 204: -# return None -# elif response.status_code == 200: -# return response.json() -# else: -# raise Exception(f"Got {response.status_code}: {response.text}") -# -# -# class LexemeLanguage: -# lexemes: List[Lexeme] -# language_code: WikimediaLanguageCode -# language_qid: WikimediaLanguageQID -# senses_with_P5137_per_lexeme: float -# senses_with_P5137: int -# forms: int -# forms_with_an_example: int -# forms_without_an_example: List[Form] -# lexemes_count: int -# -# def __init__(self, language_code: str): -# self.language_code = WikimediaLanguageCode(language_code) -# self.language_qid = WikimediaLanguageQID[self.language_code.name] -# -# def fetch_forms_missing_an_example(self): -# logger = logging.getLogger(__name__) -# results = execute_sparql_query(f''' -# #title:Forms that have no example demonstrating them -# select ?lexeme ?form ?form_representation ?category -# (group_concat(distinct ?feature; separator = ",") as ?grammatical_features) -# WHERE {{ -# ?lexeme dct:language wd:{self.language_qid.value}; -# wikibase:lemma ?lemma; -# wikibase:lexicalCategory ?category; -# ontolex:lexicalForm ?form. -# ?form ontolex:representation ?form_representation; -# wikibase:grammaticalFeature ?feature. -# MINUS {{ -# ?lexeme p:P5831 ?statement. -# ?statement ps:P5831 ?example; -# pq:P6072 []; -# pq:P5830 ?form_with_example. -# }} -# }} -# group by ?lexeme ?form ?form_representation ?category -# limit 50''') -# self.forms_without_an_example = [] -# logger.info("Got the data") -# logger.info(f"data:{results.keys()}") -# try: -# #logger.info(f"data:{results['results']['bindings']}") -# for entry in results["results"]['bindings']: -# logger.info(f"data:{entry.keys()}") -# logging.info(f"lexeme_json:{entry}") -# f = Form(entry) -# self.forms_without_an_example.append(f) -# except KeyError: -# logger.error("Got no results") -# logger.info(f"Got {len(self.forms_without_an_example)} " -# f"forms from WDQS for language {self.language_code.name}") -# -# def fetch_lexemes(self): -# # TODO port to use the Lexeme class instead of heavy dataframes which we don't need -# raise Exception("This is deprecated.") -# results = execute_sparql_query(f''' -# SELECT DISTINCT -# ?entity_lid ?form ?word (?categoryLabel as ?category) -# (?grammatical_featureLabel as ?feature) ?sense ?gloss -# WHERE {{ -# ?entity_lid a ontolex:LexicalEntry; dct:language wd:{self.language_qid.value}. -# VALUES ?excluded {{ -# # exclude affixes and interfix -# wd:Q62155 # affix -# wd:Q134830 # prefix -# wd:Q102047 # suffix -# wd:Q1153504 # interfix -# }} -# MINUS {{?entity_lid wdt:P31 ?excluded.}} -# ?entity_lid wikibase:lexicalCategory ?category. -# -# # We want only lexemes with both forms and at least one sense -# ?entity_lid ontolex:lexicalForm ?form. -# ?entity_lid ontolex:sense ?sense. 
-# -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. -# ?sense skos:definition ?gloss. -# # Get only the swedish gloss, exclude otherwise -# FILTER(LANG(?gloss) = "{self.language_code.value}") -# -# # This remove all lexemes with at least one example which is not -# # ideal -# MINUS {{?entity_lid wdt:P5831 ?example.}} -# ?form wikibase:grammaticalFeature ?grammatical_feature. -# # We extract the word of the form -# ?form ontolex:representation ?word. -# SERVICE wikibase:label -# {{ bd:serviceParam wikibase:language "{self.language_code.value},en". }} -# }} -# limit {config.sparql_results_size} -# offset {config.sparql_offset} -# ''') -# self.lexemes = [] -# for lexeme_json in results: -# logging.debug(f"lexeme_json:{lexeme_json}") -# l = Lexeme.parse_wdqs_json(lexeme_json) -# self.lexemes.append(l) -# logging.info(f"Got {len(self.lexemes)} lexemes from " -# f"WDQS for language {self.language_code.name}") -# -# def count_number_of_lexemes(self): -# """Returns an int""" -# logger = logging.getLogger(__name__) -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?l) as ?count) -# WHERE {{ -# ?l dct:language wd:{self.language_qid.value}. -# }}''')) -# logger.debug(f"result:{result}") -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# return count -# -# def count_number_of_senses_with_p5137(self): -# """Returns an int""" -# logger = logging.getLogger(__name__) -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?sense) as ?count) -# WHERE {{ -# ?l dct:language wd:{self.language_qid.value}. -# ?l ontolex:sense ?sense. -# ?sense skos:definition ?gloss. -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. -# }}''')) -# logger.debug(f"result:{result}") -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# return count -# -# def count_number_of_forms_without_an_example(self): -# """Returns an int""" -# # TODO fix this to count all senses in a given language -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?form) as ?count) -# WHERE {{ -# ?l dct:language wd:{self.language_qid.value}. -# ?l ontolex:lexicalForm ?form. -# ?l ontolex:sense ?sense. -# # exclude lexemes that already have at least one example -# MINUS {{?l wdt:P5831 ?example.}} -# # Exclude lexemes without a linked QID from at least one sense -# ?sense wdt:P5137 []. 
-# }}''')) -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# self.forms_without_an_example = count -# -# def count_number_of_forms_with_examples(self): -# pass -# -# def count_number_of_forms(self): -# pass -# -# def calculate_statistics(self): -# self.lexemes_count: int = self.count_number_of_lexemes() -# self.senses_with_P5137: int = self.count_number_of_senses_with_p5137() -# self.calculate_senses_with_p5137_per_lexeme() -# -# def calculate_senses_with_p5137_per_lexeme(self): -# self.senses_with_P5137_per_lexeme = round( -# self.senses_with_P5137 / self.lexemes_count, 3 -# ) -# -# def print(self): -# print(f"{self.language_code.name} has " -# f"{self.senses_with_P5137} senses with linked QID in " -# f"total on {self.lexemes_count} lexemes " -# f"which is {self.senses_with_P5137_per_lexeme} per lexeme.") -# -# # TODO decide where to put this code -# class LexemeStatistics: -# total_lexemes: int -# -# def __init__(self): -# self.calculate_total_lexemes() -# self.rank_languages_based_on_statistics() -# -# def calculate_total_lexemes(self) -> int: -# """Calculate how many lexemes exists in Wikidata""" -# result = (execute_sparql_query(f''' -# SELECT -# (COUNT(?l) as ?count) -# WHERE {{ -# ?l a ontolex:LexicalEntry. -# }}''')) -# count: int = wdqs.extract_count(result) -# logging.debug(f"count:{count}") -# self.total_lexemes = count -# -# def rank_languages_based_on_statistics(self): -# logger = logging.getLogger(__name__) -# language_objects = [] -# print("Fetching data...") -# for language_code in WikimediaLanguageCode: -# logger.info(f"Working on {language_code.name}") -# language = LexemeLanguage(language_code) -# language.calculate_statistics() -# language_objects.append(language) -# sorted_by_senses_with_p5137_per_lexeme = sorted( -# language_objects, -# key=lambda language: language.senses_with_P5137_per_lexeme, -# reverse=True -# ) -# print("Languages ranked by most senses linked to items:") -# for language in sorted_by_senses_with_p5137_per_lexeme: -# language.print() -# # Generator expression -# total_lexemes_among_supported_languages: int = sum( -# language.lexemes_count for language in language_objects -# ) -# # logger.debug(f"total:{total_lexemes_among_supported_languages}") -# percent = round( -# total_lexemes_among_supported_languages * 100 / self.total_lexemes -# ) -# print(f"These languages have {total_lexemes_among_supported_languages} " -# f"lexemes out of {self.total_lexemes} in total ({percent}%)") -# -# -class Item(Entity): - """This represents an item in Wikidata - We always work on one language at a time, - so don't bother with languages here and keep to simple strings""" - id: str = None - label: str = None - description: str = None - aliases: List[str] = None - - def __init__(self, - id: str = None, - json: str = None, - label: str = None, - description: str = None, - aliases: List[str] = None, - task: Task = None): - if json is not None: - self.parse_json(json) - else: - if id is not None: - self.id = str(EntityID(id)) - if description is None and label is None and aliases is None: - logging.debug("No of description, label or aliases received") - if task is None: - raise ValueError("Got no task") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - self.fetch_label_and_description_and_aliases(task=task) - elif label is None or aliases is None: - raise ValueError("This is not supported. 
" - "Either both state the label and " - "aliases or None of them") - else: - self.label = label - self.aliases = aliases - self.description = description - - def __str__(self): - return f"{self.label}, see {self.url()}" - - def parse_json(self, json): - """Parse the WDQS json""" - logger = logging.getLogger(__name__) - try: - logger.debug(f'item_json:{json["item"]}') - self.id = str(EntityID(json["item"]["value"])) - except KeyError: - pass - try: - logger.debug(json["itemLabel"]) - self.label = (json["itemLabel"]["value"]) - except KeyError: - logger.info(f"no label found") - - def parse_from_wdqs_json(self, json): - """Parse the json into the object""" - for variable in json: - logging.debug(variable) - if variable == "item": - self.id = variable - if variable == "itemLabel": - self.label = variable - - def fetch_label_and_description_and_aliases(self, - task: Task = None): - """Fetch label and aliases in the task language from the Wikidata API""" - if task is None: - raise ValueError("task was None") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - from src.helpers.console import console - with console.status(f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."): - wbi = WikibaseIntegrator() - item = wbi.item.get(self.id) - label = item.labels.get(task.language_code.value) - if label is not None: - self.label = str(label) - description = item.descriptions.get(task.language_code.value) - if description is not None: - self.description = str(description) - aliases: List[Alias] = item.aliases.get(task.language_code.value) - # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") - if aliases is not None: - self.aliases = [] - for alias in aliases: - self.aliases.append(str(alias)) - # logging.debug(f"appended:{alias.value}") - # logging.debug(f"aliases:{self.aliases}") - - -class Items: - list: List[Item] = [] - - def fetch_based_on_label(self): - pass - - def random_shuffle_list(self): - random.shuffle(self.list) diff --git a/src/models/wikidata/__init__.py b/src/models/wikidata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/wikidata/entity.py b/src/models/wikidata/entity.py new file mode 100644 index 0000000..d44f53a --- /dev/null +++ b/src/models/wikidata/entity.py @@ -0,0 +1,47 @@ +import logging + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator import wbi_config +from wikibaseintegrator.datatypes import BaseDataType # type: ignore +from wikibaseintegrator.wbi_enums import ActionIfExists # type: ignore + +import config + +wbi_config.config['USER_AGENT'] = config.user_agent + + +class Entity: + """Base entity with code that is the same for both items and lexemes""" + id: str + label: str + + def upload_one_statement_to_wikidata(self, + statement: BaseDataType = None, + summary: str = None, + editgroups_hash: str = None): + """Upload one statement and always append + This mandates an editgroups hash to be supplied""" + logger = logging.getLogger(__name__) + if self.id is None: + raise ValueError("no id on item") + if statement is None: + raise ValueError("Statement was None") + if summary is None: + raise ValueError("summary was None") + if editgroups_hash is None: + raise ValueError("editgroup_hash was None") + if config.login_instance is None: + raise ValueError("No login instance in config.login_instance") + wbi = WikibaseIntegrator(login=config.login_instance) + item = wbi.item.get(self.id) + item.add_claims( + [statement], + 
action_if_exists=ActionIfExists.APPEND) + result = item.write( + summary=f"Added {summary} with [[{config.tool_wikipage}]] " + f"([[:toolforge:editgroups/b/CB/{editgroups_hash}|details]])" + ) + logger.debug(f"result from WBI:{result}") + + def url(self): + return f"http://www.wikidata.org/entity/{self.id}" diff --git a/src/models/wikidata/entiyt_id.py b/src/models/wikidata/entiyt_id.py new file mode 100644 index 0000000..545ca6a --- /dev/null +++ b/src/models/wikidata/entiyt_id.py @@ -0,0 +1,35 @@ +import logging + +import config +from src.models.wikidata.enums import WikidataNamespaceLetters + + +class EntityID: + letter: WikidataNamespaceLetters + # This can be e.g. "32698-F1" in the case of a lexeme + rest: str + + def __init__(self, + entity_id: str): + logger = logging.getLogger(__name__) + if entity_id is not None: + # Remove prefix if found + if config.wd_prefix in entity_id: + logger.debug("Removing prefix") + entity_id = entity_id.replace(config.wd_prefix, "") + if len(entity_id) > 1: + logger.info(f"entity_id:{entity_id}") + self.letter = WikidataNamespaceLetters(entity_id[0]) + self.rest = entity_id[1:] + else: + raise ValueError("Entity ID was too short.") + else: + raise ValueError("Entity ID was None") + + def __str__(self): + return f"{self.letter.value}{self.rest}" + + # def extract_wdqs_json_entity_id(self, json: Dict, sparql_variable: str): + # self.__init__(json[sparql_variable]["value"].replace( + # config.wd_prefix, "" + # )) diff --git a/src/models/wikidata/enums.py b/src/models/wikidata/enums.py new file mode 100644 index 0000000..278b7e4 --- /dev/null +++ b/src/models/wikidata/enums.py @@ -0,0 +1,72 @@ +from enum import Enum + + +class WikidataGrammaticalFeature(Enum): + # Swedish + ACTIVE_VOICE = "Q1317831" + PRETERITE = "Q442485" + INFINITIVE = "Q179230" + PRESENT_TENSE = "Q192613" + SUPINE = "Q548470" + IMPERATIVE = "Q22716" + PASSIVE_VOICE = "Q1194697" + SINGULAR = "Q110786" + NOMINATIVE_CASE = "Q131105" + INDEFINITE = "Q53997857" + DEFINITE = "Q53997851" + PLURAL = "Q146786" + GENITIVE_CASE = "Q146233" + # English + SIMPLE_PRESENT = "Q3910936" + THIRD_PERSON_SINGULAR = "Q51929447" + + +class WikidataLexicalCategory(Enum): + NOUN = "Q1084" + VERB = "Q24905" + ADVERB = "Q380057" + ADJECTIVE = "Q34698" + AFFIX = "Q62155" + PROPER_NOUN = "Q147276" + + +class WikimediaLanguageCode(Enum): + DANISH = "da" + SWEDISH = "sv" + BOKMÅL = "nb" + ENGLISH = "en" + FRENCH = "fr" + RUSSIAN = "ru" + ESTONIAN = "et" + MALAYALAM = "ml" + LATIN = "la" + HEBREW = "he" + BASQUE = "eu" + GERMAN = "de" + BENGALI = "bn" + CZECH = "cs" + + +class WikimediaLanguageQID(Enum): + DANISH = "Q9035" + SWEDISH = "Q9027" + BOKMÅL = "Q25167" + ENGLISH = "Q1860" + FRENCH = "Q150" + RUSSIAN = "Q7737" + ESTONIAN = "Q9072" + MALAYALAM = "Q36236" + LATIN = "Q397" + HEBREW = "Q9288" + BASQUE = "Q8752" + GERMAN = "Q188" + BENGALI = "Q9610" + CZECH = "Q9056" + + +class WikidataNamespaceLetters(Enum): + PROPERTY = "P" + ITEM = "Q" + LEXEME = "L" + # FORM = "F" + # SENSE = "S" diff --git a/src/models/wikidata/foreign_id.py b/src/models/wikidata/foreign_id.py new file mode 100644 index 0000000..1e8abd3 --- /dev/null +++ b/src/models/wikidata/foreign_id.py @@ -0,0 +1,21 @@ +from typing import Optional + +from src.models.wikidata.entiyt_id import EntityID + + +class ForeignID: + id: Optional[str] + property: Optional[str] # This is the property with type ExternalId + source_item_id: Optional[str] # This is the Q-item for the source + + def __init__(self, + id: Optional[str] = None, + property: Optional[str] = 
None, + source_item_id: Optional[str] = None): + self.id = id + if property is None: + raise ValueError("property was None") + self.property = str(EntityID(property)) + if source_item_id is None: + raise ValueError("source_item_id was None") + self.source_item_id = str(EntityID(source_item_id)) diff --git a/src/models/wikidata/item.py b/src/models/wikidata/item.py new file mode 100644 index 0000000..dbd67ce --- /dev/null +++ b/src/models/wikidata/item.py @@ -0,0 +1,103 @@ +import logging +from typing import List, Optional + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator import wbi_config # type: ignore +from wikibaseintegrator.models import Alias # type: ignore + +import config +from src.models.task import Task +from src.models.wikidata.entity import Entity +from src.models.wikidata.entiyt_id import EntityID + +wbi_config.config['USER_AGENT'] = config.user_agent + + +class Item(Entity): + """This represents an item in Wikidata + We always work on one language at a time, + so don't bother with languages here and keep to simple strings""" + id: Optional[str] = None + label: Optional[str] = None + description: Optional[str] = None + aliases: Optional[List[str]] = None + + def __init__(self, + id: str = None, + json: str = None, + label: str = None, + description: str = None, + aliases: List[str] = None, + task: Task = None): + if json is not None: + self.parse_json(json) + else: + if id is not None: + self.id = str(EntityID(id)) + if description is None and label is None and aliases is None: + logging.debug("No of description, label or aliases received") + if task is None: + raise ValueError("Got no task") + if not isinstance(task, Task): + raise ValueError("task was not a Task object") + self.fetch_label_and_description_and_aliases(task=task) + elif label is None or aliases is None: + raise ValueError("This is not supported. 
" + "Either both state the label and " + "aliases or None of them") + else: + self.label = label + self.aliases = aliases + self.description = description + + def __str__(self): + return f"{self.label}, see {self.url()}" + + def parse_json(self, json): + """Parse the WDQS json""" + logger = logging.getLogger(__name__) + try: + logger.debug(f'item_json:{json["item"]}') + self.id = str(EntityID(json["item"]["value"])) + except KeyError: + pass + try: + logger.debug(json["itemLabel"]) + self.label = (json["itemLabel"]["value"]) + except KeyError: + logger.info(f"no label found") + + def parse_from_wdqs_json(self, json): + """Parse the json into the object""" + for variable in json: + logging.debug(variable) + if variable == "item": + self.id = variable + if variable == "itemLabel": + self.label = variable + + def fetch_label_and_description_and_aliases(self, + task: Task = None): + """Fetch label and aliases in the task language from the Wikidata API""" + if task is None: + raise ValueError("task was None") + if not isinstance(task, Task): + raise ValueError("task was not a Task object") + from src.helpers.console import console + with console.status(f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."): + wbi = WikibaseIntegrator() + item = wbi.item.get(self.id) + label = item.labels.get(task.language_code.value) + if label is not None: + self.label = str(label) + description = item.descriptions.get(task.language_code.value) + if description is not None: + self.description = str(description) + aliases: List[Alias] = item.aliases.get(task.language_code.value) + # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") + if aliases is not None: + self.aliases = [] + for alias in aliases: + self.aliases.append(str(alias)) + # logging.debug(f"appended:{alias.value}") + # logging.debug(f"aliases:{self.aliases}") diff --git a/src/models/wikidata/items.py b/src/models/wikidata/items.py new file mode 100644 index 0000000..1017a29 --- /dev/null +++ b/src/models/wikidata/items.py @@ -0,0 +1,14 @@ +import random +from typing import List + +from src.models.wikidata.item import Item + + +class Items: + list: List[Item] = [] + + def fetch_based_on_label(self): + pass + + def random_shuffle_list(self): + random.shuffle(self.list) From 96794b17a58a4f9f646798c0feccdde4c5cdb3d5 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 14:01:22 +0100 Subject: [PATCH 04/13] Reorganize models into packages. Update classes.puml. Use pydantic and inherit from BaseModel. 
--- diagrams/classes.puml | 152 ++++++++++++++---- src/__init__.py | 8 +- src/helpers/console.py | 27 +++- src/helpers/jobs.py | 31 ++-- src/helpers/menus.py | 2 +- src/helpers/pickle.py | 5 +- src/models/batch_job.py | 7 +- src/models/items/__init__.py | 19 +++ src/models/{ => items}/academic_journals.py | 12 +- src/models/{ => items}/riksdagen_documents.py | 10 +- src/models/{ => items}/scholarly_articles.py | 20 ++- src/models/{ => items}/thesis.py | 4 +- src/models/quickstatements.py | 13 +- src/models/suggestion.py | 31 ++-- src/models/task.py | 52 +++--- src/models/wikidata/item.py | 103 ------------ src/models/wikidata/items.py | 14 -- .../{wikidata => wikimedia}/__init__.py | 0 src/models/wikimedia/enum.py | 35 ++++ src/models/wikimedia/wikidata/__init__.py | 0 src/models/{ => wikimedia}/wikidata/entity.py | 8 +- .../{ => wikimedia}/wikidata/entiyt_id.py | 6 +- src/models/{ => wikimedia}/wikidata/enums.py | 62 ++----- .../{ => wikimedia}/wikidata/foreign_id.py | 6 +- src/models/wikimedia/wikidata/item.py | 50 ++++++ src/models/wikimedia/wikidata/sparql_item.py | 18 +++ 26 files changed, 405 insertions(+), 290 deletions(-) create mode 100644 src/models/items/__init__.py rename src/models/{ => items}/academic_journals.py (85%) rename src/models/{ => items}/riksdagen_documents.py (89%) rename src/models/{ => items}/scholarly_articles.py (88%) rename src/models/{ => items}/thesis.py (97%) delete mode 100644 src/models/wikidata/item.py delete mode 100644 src/models/wikidata/items.py rename src/models/{wikidata => wikimedia}/__init__.py (100%) create mode 100644 src/models/wikimedia/enum.py create mode 100644 src/models/wikimedia/wikidata/__init__.py rename src/models/{ => wikimedia}/wikidata/entity.py (93%) rename src/models/{ => wikimedia}/wikidata/entiyt_id.py (87%) rename src/models/{ => wikimedia}/wikidata/enums.py (54%) rename src/models/{ => wikimedia}/wikidata/foreign_id.py (77%) create mode 100644 src/models/wikimedia/wikidata/item.py create mode 100644 src/models/wikimedia/wikidata/sparql_item.py diff --git a/diagrams/classes.puml b/diagrams/classes.puml index 753459f..b0600a5 100644 --- a/diagrams/classes.puml +++ b/diagrams/classes.puml @@ -1,31 +1,123 @@ @startuml 'https://plantuml.com/class-diagram -abstract class Items -Items <|-- AcademicJournalItems -Items <|-- RiksdagenDocumentItems -Items <|-- ScholarlyArticleItems -Items <|-- ThesisItems +abstract class BaseModel -'package helpers { -'} -class AcademicJournalItems { -fetch_based_on_label() -} -class RiksdagenDocumentItems { -+list -+fetch_based_on_label() +package wikimedia { + enum WikimediaLanguageCode { + BASQUE + BENGALI + BOKMÅL + CZECH + DANISH + ENGLISH + ESTONIAN + FRENCH + GERMAN + HEBREW + LATIN + MALAYALAM + RUSSIAN + SWEDISH + } + enum WikimediaLanguageQID { + BASQUE = "Q8752" + BENGALI = "Q9610" + BOKMÅL = "Q25167" + CZECH = "Q9056" + DANISH = "Q9035" + ENGLISH = "Q1860" + ESTONIAN = "Q9072" + FRENCH = "Q150" + GERMAN = "Q188" + HEBREW = "Q9288" + LATIN = "Q397" + MALAYALAM = "Q36236" + RUSSIAN = "Q7737" + SWEDISH = "Q9027" + } + package wikidata { + class Entity { + id: Optional[str] + label: str + upload_one_statement_to_wikidata() + url() + } + class EntityID{ + letter: WikidataNamespaceLetters + rest: str + __init__() + __str__() + } + class ForeignID{ + __init__() + } + class SparqlItem{ + item: Value + itemLabel: Value + validate_qid_and_copy_label() + } + class Item{ + label: Optional[str] = None + description: Optional[str] = None + aliases: Optional[List[str]] = None + __init__() + __str__() + 
parse_json() + parse_from_wdqs_json() + fetch_label_and_description_and_aliases() + } + enum WikidataGrammaticalFeature { + ACTIVE_VOICE + DEFINITE + GENITIVE_CASE + IMPERATIVE + INDEFINITE + INFINITIVE + NOMINATIVE_CASE + PASSIVE_VOICE + PLURAL + PRESENT_TENSE + PRETERITE + SIMPLE_PRESENT + SINGULAR + SUPINE + THIRD_PERSON_SINGULAR + } + enum WikidataLexicalCategory { + ADJECTIVE + ADVERB + AFFIX + NOUN + PROPER_NOUN + VERB + } + enum WikidataNamespaceLetters { + ITEM + LEXEME + PROPERTY + } + } } +package items { + abstract class Items + class AcademicJournalItems { + fetch_based_on_label() + } + class RiksdagenDocumentItems { + +list + +fetch_based_on_label() + } -class ScholarlyArticleItems { -+list -+fetch_based_on_label() -} -class ThesisItems { -list -fetch_based_on_label() + class ScholarlyArticleItems { + +list + +fetch_based_on_label() + } + class ThesisItems { + list + fetch_based_on_label() + } } - class Suggestion { item: Item = None search_strings: List[str] = None @@ -59,11 +151,17 @@ class QuickStatementsCommandVersion1 { +value: EntityID = None -__str__() } - -enum TimeUnit { -DAYS -HOURS -MINUTES -} +Items <|-- AcademicJournalItems +Items <|-- RiksdagenDocumentItems +Items <|-- ScholarlyArticleItems +Items <|-- ThesisItems +BaseModel <|-- Entity +BaseModel <|-- Task +BaseModel <|-- Suggestion +BaseModel <|-- BatchJob +BaseModel <|-- QuickStatementsCommandVersion1 +BaseModel <|-- Items +Entity <|-- Item +Item <|-- SparqlItem @enduml \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py index 5ee4549..163aee3 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -23,7 +23,7 @@ from src.models.quickstatements import QuickStatementsCommandVersion1 from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata.entiyt_id import EntityID +from src.models.wikimedia.wikidata.entiyt_id import EntityId from src.tasks import tasks @@ -131,9 +131,9 @@ def export_jobs_to_quickstatements(): lines = [] for item in job.items.list: line = QuickStatementsCommandVersion1( - target=EntityID(item.id), - property=EntityID("P921"), - value=EntityID(job.suggestion.item.id), + target=EntityId(item.id), + property=EntityId("P921"), + value=EntityId(job.suggestion.item.id), ) lines.append(line) logger.debug(f"Got {len(lines)} QS lines to export") diff --git a/src/helpers/console.py b/src/helpers/console.py index 0c8163a..50efb70 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -1,5 +1,6 @@ +from __future__ import annotations import argparse -from typing import List +from typing import List, TYPE_CHECKING from urllib.parse import quote from rich.console import Console @@ -7,8 +8,10 @@ from src.helpers.cleaning import clean_rich_formatting from src.models.batch_job import BatchJob -from src.models.task import Task -from src.models.wikidata.items import Items + +if TYPE_CHECKING: + from src.models.items import Items + from src.models.task import Task console = Console() @@ -89,6 +92,8 @@ def print_found_items_table(args: argparse.Namespace = None, if args.show_item_urls: table.add_column(f"Wikidata URL") for item in list_to_show: + if item.label is None: + raise ValueError("item.label was None") if args.show_item_urls: label = clean_rich_formatting(item.label) table.add_row(label, item.url()) @@ -98,13 +103,21 @@ def print_found_items_table(args: argparse.Namespace = None, def ask_add_to_job_queue(job: BatchJob = None): - if job is not None: - return ask_yes_no_question(f"Do you want to add this job for " + if job is None: 
+ raise ValueError("job was None") + if job.suggestion.item is None: + raise ValueError("job.suggestion.item was None") + if job.suggestion.item.label is None: + raise ValueError("job.suggestion.item.label was None") + if job.suggestion.item.description is None: + raise ValueError("job.suggestion.item.description was None") + if job.items.list is None: + raise ValueError("job.items.list was None") + return ask_yes_no_question(f"Do you want to add this job for " f"[magenta]{job.suggestion.item.label}: " f"{job.suggestion.item.description}[/magenta] with " f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})") - else: - raise ValueError("job was None") + def print_running_jobs(jobs: List[BatchJob] = None): if jobs is None: diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 5e9d1cf..94ea5e5 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -4,23 +4,25 @@ import logging import random from datetime import datetime -from typing import Union, List, TYPE_CHECKING +from typing import Union, List, TYPE_CHECKING, Optional from src import strip_prefix, print_best_practice, console, ask_yes_no_question, \ TaskIds, print_found_items_table, ask_add_to_job_queue, print_keep_an_eye_on_wdqs_lag, print_running_jobs, \ print_finished, print_job_statistics from src.helpers.menus import select_task -from src.models.academic_journals import AcademicJournalItems -from src.models.riksdagen_documents import RiksdagenDocumentItems -from src.models.scholarly_articles import ScholarlyArticleItems -from src.models.thesis import ThesisItems -from src.models.wikidata.items import Items -from src.tasks import tasks, Task +from src.models.items.academic_journals import AcademicJournalItems +from src.models.items import Items +from src.models.items.riksdagen_documents import RiksdagenDocumentItems +from src.models.items.scholarly_articles import ScholarlyArticleItems +from src.models.items.thesis import ThesisItems +from src.tasks import Task if TYPE_CHECKING: from src import Task, BatchJob +# TODO rewrite as OOP + def process_qid_into_job(qid: str = None, task: Task = None, args: argparse.Namespace = None, @@ -32,11 +34,11 @@ def process_qid_into_job(qid: str = None, raise ValueError("args was None") if task is None: raise ValueError("task was None") - from src.models.wikidata.item import Item + from src.models.wikimedia.wikidata import Item item = Item( id=strip_prefix(qid), - task=task ) + item.fetch_label_and_description_and_aliases(task=task) if item.label is not None: console.print(f"Working on {item}") # generate suggestion with all we need @@ -50,11 +52,14 @@ def process_qid_into_job(qid: str = None, answer = ask_yes_no_question("Do you want to continue?") if not answer: return None + suggestion.extract_search_strings() + if suggestion.search_strings is None: + raise ValueError("suggestion.search_strings was None") with console.status(f'Fetching items with labels that have one of ' f'the search strings by running a total of ' f'{len(suggestion.search_strings) * task.number_of_queries_per_search_string} ' f'queries on WDQS...'): - items: Items = None + items: Optional[Items] = None if task.id == TaskIds.SCHOLARLY_ARTICLES: items = ScholarlyArticleItems() elif task.id == TaskIds.RIKSDAGEN_DOCUMENTS: @@ -80,6 +85,8 @@ def process_qid_into_job(qid: str = None, answer = ask_add_to_job_queue(job) if answer: return job + else: + return None else: console.print("No matching items found") return None @@ -128,6 +135,10 @@ def run_jobs(jobs: List[BatchJob] = None): def 
handle_job_preparation_or_run_directly_if_any_jobs(args: argparse.Namespace = None, jobs: List[BatchJob] = None): + if jobs is None: + raise ValueError("jobs was None") + if args is None: + raise ValueError("args was None") if len(jobs) > 0: if args.prepare_jobs: console.print(f"Adding {len(jobs)} job(s) to the jobs file") diff --git a/src/helpers/menus.py b/src/helpers/menus.py index 4d0c466..022d695 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -4,7 +4,7 @@ from consolemenu import SelectionMenu # type: ignore from src.models.suggestion import Suggestion -from src.models.wikidata.item import Item +from src.models.wikimedia.wikidata import Item from src.tasks import tasks, Task diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py index 05ee8b9..8724794 100644 --- a/src/helpers/pickle.py +++ b/src/helpers/pickle.py @@ -7,6 +7,7 @@ from src.helpers.console import console from src.models.batch_job import BatchJob +# TODO rewrite as OOP def add_to_job_pickle(job: BatchJob = None): if job is None: @@ -58,7 +59,7 @@ def parse_job_pickle(silent: bool = False) -> Optional[List[BatchJob]]: return None -def parse_main_subjects_pickle() -> List[str]: +def parse_main_subjects_pickle() -> Optional[List[str]]: """Reads the pickle into a list of main subjects""" if check_if_pickle_exists(config.main_subjects_pickle_file_path): subjects = [] @@ -66,6 +67,7 @@ def parse_main_subjects_pickle() -> List[str]: subjects.append(subject) if len(subjects) == 0: console.print("No qids found in the pickle.") + return None else: # print(f"found:{subjects}") return subjects @@ -74,7 +76,6 @@ def parse_main_subjects_pickle() -> List[str]: "Create it by running 'python fetch_main_subjects.py'") exit(0) - def remove_job_pickle(silent: bool = False, hash: str = None): if hash is None: diff --git a/src/models/batch_job.py b/src/models/batch_job.py index 770d9ea..25195f5 100644 --- a/src/models/batch_job.py +++ b/src/models/batch_job.py @@ -2,13 +2,14 @@ from dataclasses import dataclass from typing import List, TYPE_CHECKING +from pydantic import BaseModel + if TYPE_CHECKING: from src.models.suggestion import Suggestion - from src.models.wikidata.items import Items + from src.models.items import Items -@dataclass -class BatchJob: +class BatchJob(BaseModel): """Models a batch job intended to be run non-interactively""" suggestion: Suggestion items: Items diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py new file mode 100644 index 0000000..ed85493 --- /dev/null +++ b/src/models/items/__init__.py @@ -0,0 +1,19 @@ +import random +from typing import List + +from pydantic import BaseModel + +from src import Suggestion, Task +from src.models.wikimedia.wikidata.sparql_item import SparqlItem + + +class Items(BaseModel): + list: List[SparqlItem] + + def fetch_based_on_label(self, + suggestion: Suggestion = None, + task: Task = None): + pass + + def random_shuffle_list(self): + random.shuffle(self.list) \ No newline at end of file diff --git a/src/models/academic_journals.py b/src/models/items/academic_journals.py similarity index 85% rename from src/models/academic_journals.py rename to src/models/items/academic_journals.py index 14d0977..1b14772 100644 --- a/src/models/academic_journals.py +++ b/src/models/items/academic_journals.py @@ -7,8 +7,8 @@ from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata.item import Item -from src.models.wikidata.items import Items +from 
src.models.wikimedia.wikidata.item import Item +from src.models.items import Items class AcademicJournalItems(Items): @@ -33,6 +33,14 @@ def process_results(results): raise ValueError("task was None") if task.language_code is None: raise ValueError("task.language_code was None") + if suggestion.search_strings is None: + raise ValueError("suggestion.search_strings was None") + if suggestion.item is None: + raise ValueError("suggestion.item was None") + if suggestion.item.id is None: + raise ValueError("suggestion.item.id was None") + if suggestion.args is None: + raise ValueError("suggestion.args was None") # Fetch all items matching the search strings self.list = [] for search_string in suggestion.search_strings: diff --git a/src/models/riksdagen_documents.py b/src/models/items/riksdagen_documents.py similarity index 89% rename from src/models/riksdagen_documents.py rename to src/models/items/riksdagen_documents.py index daa4d0b..2d8d801 100644 --- a/src/models/riksdagen_documents.py +++ b/src/models/items/riksdagen_documents.py @@ -6,8 +6,8 @@ from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata.item import Item -from src.models.wikidata.items import Items +from src.models.wikimedia.wikidata.item import Item +from src.models.items import Items class RiksdagenDocumentItems(Items): @@ -17,9 +17,15 @@ def fetch_based_on_label(self, # logger = logging.getLogger(__name__) if suggestion is None: raise ValueError("suggestion was None") + if suggestion.item is None: + raise ValueError("suggestion.item was None") + if suggestion.args is None: + raise ValueError("suggestion.args was None") if suggestion.args.limit_to_items_without_p921: raise Exception("Limiting to items without P921 is not " "supported yet for this task.") + if suggestion.search_strings is None: + raise ValueError("suggestion.search_strings was None") if task is None: raise ValueError("task was None") if task.language_code is None: diff --git a/src/models/scholarly_articles.py b/src/models/items/scholarly_articles.py similarity index 88% rename from src/models/scholarly_articles.py rename to src/models/items/scholarly_articles.py index c8672ed..3001563 100644 --- a/src/models/scholarly_articles.py +++ b/src/models/items/scholarly_articles.py @@ -7,8 +7,8 @@ from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata.item import Item -from src.models.wikidata.items import Items +from src.models.items import Items +from src.models.wikimedia.wikidata.sparql_item import SparqlItem class ScholarlyArticleItems(Items): @@ -24,6 +24,8 @@ def build_query(suggestion: Suggestion = None, # TODO refactor if suggestion is None: raise ValueError("suggestion was None") + if suggestion.item is None: + raise ValueError("suggestion.item was None") if search_string is None: raise ValueError("search_string was None") if task is None: @@ -78,15 +80,27 @@ def process_results(results): items = [] for item_json in results["results"]["bindings"]: logging.debug(f"item_json:{item_json}") - item = Item(json=item_json) + item = SparqlItem(**item_json) + item.validate_qid_and_copy_label() items.append(item) return items # logger = logging.getLogger(__name__) if suggestion is None: raise ValueError("suggestion was None") + if suggestion.item is None: + raise ValueError("suggestion.item was None") + if suggestion.args is None: + raise ValueError("suggestion.args was None") + if 
suggestion.args.limit_to_items_without_p921: + raise Exception("Limiting to items without P921 is not " + "supported yet for this task.") + if suggestion.search_strings is None: + raise ValueError("suggestion.search_strings was None") if task is None: raise ValueError("task was None") + if task.language_code is None: + raise ValueError("task.language_code was None") if suggestion.args.limit_to_items_without_p921: console.print("Limiting to scholarly articles without P921 main subject only") cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921" diff --git a/src/models/thesis.py b/src/models/items/thesis.py similarity index 97% rename from src/models/thesis.py rename to src/models/items/thesis.py index bf23a74..8ef091f 100644 --- a/src/models/thesis.py +++ b/src/models/items/thesis.py @@ -5,8 +5,8 @@ from src.helpers.console import console from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikidata.item import Item -from src.models.wikidata.items import Items +from src.models.wikimedia.wikidata.item import Item +from src.models.items import Items # There were ~16.000 thesis' in WD when this was written diff --git a/src/models/quickstatements.py b/src/models/quickstatements.py index b6f3245..490eaf3 100644 --- a/src/models/quickstatements.py +++ b/src/models/quickstatements.py @@ -1,19 +1,20 @@ from dataclasses import dataclass from typing import Optional -from src.models.wikidata.entiyt_id import EntityID +from pydantic import BaseModel +from src.models.wikimedia.wikidata.entiyt_id import EntityId -@dataclass -class QuickStatementsCommandVersion1: + +class QuickStatementsCommandVersion1(BaseModel): """This models the simple line-based QS commands For now we only support QID-values Q1\tP1\tQ1""" - target: Optional[EntityID] = None - property: Optional[EntityID] = None - value: Optional[EntityID] = None + target: Optional[EntityId] = None + property: Optional[EntityId] = None + value: Optional[EntityId] = None def __str__(self): return f"{self.target}\t{self.property}\t{self.value}" diff --git a/src/models/suggestion.py b/src/models/suggestion.py index b1224de..a1626b1 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -3,6 +3,7 @@ from typing import List, Optional from urllib.parse import quote +from pydantic import BaseModel from wikibaseintegrator.datatypes import Item as ItemType # type: ignore import config @@ -10,31 +11,19 @@ from src.helpers.cleaning import clean_rich_formatting from src.helpers.console import print_search_strings_table, console from src.models.batch_job import BatchJob +from src.models.items import Items from src.models.task import Task -from src.models.wikidata.item import Item -from src.models.wikidata.items import Items +from src.models.wikimedia.wikidata import Item -class Suggestion: - item: Optional[Item] = None +class Suggestion(BaseModel): + item: Item + task: Task + args: argparse.Namespace search_strings: Optional[List[str]] = None - task: Optional[Task] = None - args: Optional[argparse.Namespace] = None - def __init__(self, - item: Item = None, - task: Task = None, - args=None): - if item is None: - raise ValueError("item was None") - else: - self.item = item - if task is None: - raise ValueError("task was None") - else: - self.task = task - self.args = args - self.extract_search_strings() + class Config: + arbitrary_types_allowed = True def __str__(self): """Return label and description, the latter cut to 50 chars""" @@ -125,6 +114,8 @@ def 
clean_special_symbols(string: str): search_strings=self.search_strings) def search_urls(self) -> List[str]: + if self.search_strings is None: + raise ValueError("self.search_strings was None") urls = [] for search_string in self.search_strings: search_term = quote(f'"{search_string}"') diff --git a/src/models/task.py b/src/models/task.py index 7511718..2525e81 100644 --- a/src/models/task.py +++ b/src/models/task.py @@ -1,37 +1,37 @@ -from typing import Union, Optional +from typing import Union + +from pydantic import BaseModel from src.helpers.enums import SupportedLanguageCode, TaskIds -# console-menu does not support dataclass (yet) -# @dataclass -class Task: +class Task(BaseModel): """This class holds the tasks presented to the user in the menu and related data""" - best_practice_information: Union[str, None] = None - id: Optional[TaskIds] = None - label: Optional[str] = None - language_code: Optional[SupportedLanguageCode] = None + best_practice_information: Union[str, None] + id: TaskIds + label: str + language_code: SupportedLanguageCode number_of_queries_per_search_string = 1 - def __init__(self, - best_practice_information: str = None, - id: TaskIds = None, - label: str = None, - language_code: SupportedLanguageCode = None, - number_of_queries_per_search_string: int = None): - if id is None: - raise ValueError("Got no id") - if label is None: - raise ValueError("Got no label") - if language_code is None: - raise ValueError("Got no language_code") - self.id = id - self.label = label - self.language_code = language_code - self.best_practice_information = best_practice_information - if number_of_queries_per_search_string is not None: - self.number_of_queries_per_search_string = number_of_queries_per_search_string + # def __init__(self, + # best_practice_information: str = None, + # id: TaskIds = None, + # label: str = None, + # language_code: SupportedLanguageCode = None, + # number_of_queries_per_search_string: int = None): + # if id is None: + # raise ValueError("Got no id") + # if label is None: + # raise ValueError("Got no label") + # if language_code is None: + # raise ValueError("Got no language_code") + # self.id = id + # self.label = label + # self.language_code = language_code + # self.best_practice_information = best_practice_information + # if number_of_queries_per_search_string is not None: + # self.number_of_queries_per_search_string = number_of_queries_per_search_string def __str__(self): return f"{self.label}" diff --git a/src/models/wikidata/item.py b/src/models/wikidata/item.py deleted file mode 100644 index dbd67ce..0000000 --- a/src/models/wikidata/item.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging -from typing import List, Optional - -from wikibaseintegrator import WikibaseIntegrator # type: ignore -from wikibaseintegrator import wbi_config # type: ignore -from wikibaseintegrator.models import Alias # type: ignore - -import config -from src.models.task import Task -from src.models.wikidata.entity import Entity -from src.models.wikidata.entiyt_id import EntityID - -wbi_config.config['USER_AGENT'] = config.user_agent - - -class Item(Entity): - """This represents an item in Wikidata - We always work on one language at a time, - so don't bother with languages here and keep to simple strings""" - id: Optional[str] = None - label: Optional[str] = None - description: Optional[str] = None - aliases: Optional[List[str]] = None - - def __init__(self, - id: str = None, - json: str = None, - label: str = None, - description: str = None, - aliases: List[str] = None, - 
task: Task = None): - if json is not None: - self.parse_json(json) - else: - if id is not None: - self.id = str(EntityID(id)) - if description is None and label is None and aliases is None: - logging.debug("No of description, label or aliases received") - if task is None: - raise ValueError("Got no task") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - self.fetch_label_and_description_and_aliases(task=task) - elif label is None or aliases is None: - raise ValueError("This is not supported. " - "Either both state the label and " - "aliases or None of them") - else: - self.label = label - self.aliases = aliases - self.description = description - - def __str__(self): - return f"{self.label}, see {self.url()}" - - def parse_json(self, json): - """Parse the WDQS json""" - logger = logging.getLogger(__name__) - try: - logger.debug(f'item_json:{json["item"]}') - self.id = str(EntityID(json["item"]["value"])) - except KeyError: - pass - try: - logger.debug(json["itemLabel"]) - self.label = (json["itemLabel"]["value"]) - except KeyError: - logger.info(f"no label found") - - def parse_from_wdqs_json(self, json): - """Parse the json into the object""" - for variable in json: - logging.debug(variable) - if variable == "item": - self.id = variable - if variable == "itemLabel": - self.label = variable - - def fetch_label_and_description_and_aliases(self, - task: Task = None): - """Fetch label and aliases in the task language from the Wikidata API""" - if task is None: - raise ValueError("task was None") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - from src.helpers.console import console - with console.status(f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."): - wbi = WikibaseIntegrator() - item = wbi.item.get(self.id) - label = item.labels.get(task.language_code.value) - if label is not None: - self.label = str(label) - description = item.descriptions.get(task.language_code.value) - if description is not None: - self.description = str(description) - aliases: List[Alias] = item.aliases.get(task.language_code.value) - # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") - if aliases is not None: - self.aliases = [] - for alias in aliases: - self.aliases.append(str(alias)) - # logging.debug(f"appended:{alias.value}") - # logging.debug(f"aliases:{self.aliases}") diff --git a/src/models/wikidata/items.py b/src/models/wikidata/items.py deleted file mode 100644 index 1017a29..0000000 --- a/src/models/wikidata/items.py +++ /dev/null @@ -1,14 +0,0 @@ -import random -from typing import List - -from src.models.wikidata.item import Item - - -class Items: - list: List[Item] = [] - - def fetch_based_on_label(self): - pass - - def random_shuffle_list(self): - random.shuffle(self.list) diff --git a/src/models/wikidata/__init__.py b/src/models/wikimedia/__init__.py similarity index 100% rename from src/models/wikidata/__init__.py rename to src/models/wikimedia/__init__.py diff --git a/src/models/wikimedia/enum.py b/src/models/wikimedia/enum.py new file mode 100644 index 0000000..ef8afeb --- /dev/null +++ b/src/models/wikimedia/enum.py @@ -0,0 +1,35 @@ +from enum import Enum + + +class WikimediaLanguageCode(Enum): + BASQUE = "eu" + BENGALI = "bn" + BOKMÅL = "nb" + CZECH = "cs" + DANISH = "da" + ENGLISH = "en" + ESTONIAN = "et" + FRENCH = "fr" + GERMAN = "de" + HEBREW = "he" + LATIN = "la" + MALAYALAM = "ml" + RUSSIAN = "ru" + SWEDISH = "sv" + + +class WikimediaLanguageQID(Enum): + BASQUE = "Q8752" + 
BENGALI = "Q9610" + BOKMÅL = "Q25167" + CZECH = "Q9056" + DANISH = "Q9035" + ENGLISH = "Q1860" + ESTONIAN = "Q9072" + FRENCH = "Q150" + GERMAN = "Q188" + HEBREW = "Q9288" + LATIN = "Q397" + MALAYALAM = "Q36236" + RUSSIAN = "Q7737" + SWEDISH = "Q9027" \ No newline at end of file diff --git a/src/models/wikimedia/wikidata/__init__.py b/src/models/wikimedia/wikidata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/wikidata/entity.py b/src/models/wikimedia/wikidata/entity.py similarity index 93% rename from src/models/wikidata/entity.py rename to src/models/wikimedia/wikidata/entity.py index d44f53a..bca73d6 100644 --- a/src/models/wikidata/entity.py +++ b/src/models/wikimedia/wikidata/entity.py @@ -1,5 +1,7 @@ import logging +from typing import Optional +from pydantic import BaseModel from wikibaseintegrator import WikibaseIntegrator # type: ignore from wikibaseintegrator import wbi_config from wikibaseintegrator.datatypes import BaseDataType # type: ignore @@ -10,10 +12,10 @@ wbi_config.config['USER_AGENT'] = config.user_agent -class Entity: +class Entity(BaseModel): """Base entity with code that is the same for both items and lexemes""" - id: str - label: str + id: Optional[str] + label: Optional[str] def upload_one_statement_to_wikidata(self, statement: BaseDataType = None, diff --git a/src/models/wikidata/entiyt_id.py b/src/models/wikimedia/wikidata/entiyt_id.py similarity index 87% rename from src/models/wikidata/entiyt_id.py rename to src/models/wikimedia/wikidata/entiyt_id.py index 545ca6a..fe86931 100644 --- a/src/models/wikidata/entiyt_id.py +++ b/src/models/wikimedia/wikidata/entiyt_id.py @@ -1,10 +1,10 @@ import logging import config -from src.models.wikidata.enums import WikidataNamespaceLetters +from src.models.wikimedia.wikidata.enums import WikidataNamespaceLetters - -class EntityID: +# TODO convert this to special constr type with a validator +class EntityId: letter: WikidataNamespaceLetters # This can be e.g. 
"32698-F1" in the case of a lexeme rest: str diff --git a/src/models/wikidata/enums.py b/src/models/wikimedia/wikidata/enums.py similarity index 54% rename from src/models/wikidata/enums.py rename to src/models/wikimedia/wikidata/enums.py index 278b7e4..1e40e46 100644 --- a/src/models/wikidata/enums.py +++ b/src/models/wikimedia/wikidata/enums.py @@ -2,71 +2,35 @@ class WikidataGrammaticalFeature(Enum): - # Swedish ACTIVE_VOICE = "Q1317831" - PRETERITE = "Q442485" - INFINITIVE = "Q179230" - PRESENT_TENSE = "Q192613" - SUPINE = "Q548470" + DEFINITE = "Q53997851" + GENITIVE_CASE = "Q146233" IMPERATIVE = "Q22716" - PASSIVE_VOICE = "Q1194697" - SINGULAR = "Q110786" - NOMINATIVE_CASE = "Q131105" INDEFINITE = "Q53997857" - DEFINITE = "Q53997851" + INFINITIVE = "Q179230" + NOMINATIVE_CASE = "Q131105" + PASSIVE_VOICE = "Q1194697" PLURAL = "Q146786" - GENITIVE_CASE = "Q146233" - # English + PRESENT_TENSE = "Q192613" + PRETERITE = "Q442485" SIMPLE_PRESENT = "Q3910936" + SINGULAR = "Q110786" + SUPINE = "Q548470" THIRD_PERSON_SINGULAR = "Q51929447" class WikidataLexicalCategory(Enum): - NOUN = "Q1084" - VERB = "Q24905" - ADVERB = "Q380057" ADJECTIVE = "Q34698" + ADVERB = "Q380057" AFFIX = "Q62155" + NOUN = "Q1084" PROPER_NOUN = "Q147276" - - -class WikimediaLanguageCode(Enum): - DANISH = "da" - SWEDISH = "sv" - BOKMÅL = "nb" - ENGLISH = "en" - FRENCH = "fr" - RUSSIAN = "ru" - ESTONIAN = "et" - MALAYALAM = "ml" - LATIN = "la" - HEBREW = "he" - BASQUE = "eu" - GERMAN = "de" - BENGALI = "bn" - CZECH = "cs" - - -class WikimediaLanguageQID(Enum): - DANISH = "Q9035" - SWEDISH = "Q9027" - BOKMÅL = "Q25167" - ENGLISH = "Q1860" - FRENCH = "Q150" - RUSSIAN = "Q7737" - ESTONIAN = "Q9072" - MALAYALAM = "Q36236" - LATIN = "Q397" - HEBREW = "Q9288" - BASQUE = "Q8752" - GERMAN = "Q188" - BENGALI = "Q9610" - CZECH = "Q9056" + VERB = "Q24905" class WikidataNamespaceLetters(Enum): - PROPERTY = "P" ITEM = "Q" LEXEME = "L" + PROPERTY = "P" # FORM = "F" # SENSE = "S" diff --git a/src/models/wikidata/foreign_id.py b/src/models/wikimedia/wikidata/foreign_id.py similarity index 77% rename from src/models/wikidata/foreign_id.py rename to src/models/wikimedia/wikidata/foreign_id.py index 1e8abd3..85e5c79 100644 --- a/src/models/wikidata/foreign_id.py +++ b/src/models/wikimedia/wikidata/foreign_id.py @@ -1,6 +1,6 @@ from typing import Optional -from src.models.wikidata.entiyt_id import EntityID +from src.models.wikimedia.wikidata.entiyt_id import EntityId class ForeignID: @@ -15,7 +15,7 @@ def __init__(self, self.id = id if property is None: raise ValueError("property was None") - self.property = str(EntityID(property)) + self.property = str(EntityId(property)) if source_item_id is None: raise ValueError("source_item_id was None") - self.source_item_id = str(EntityID(source_item_id)) + self.source_item_id = str(EntityId(source_item_id)) diff --git a/src/models/wikimedia/wikidata/item.py b/src/models/wikimedia/wikidata/item.py new file mode 100644 index 0000000..1e041d0 --- /dev/null +++ b/src/models/wikimedia/wikidata/item.py @@ -0,0 +1,50 @@ +from typing import List, Optional + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator import wbi_config # type: ignore +from wikibaseintegrator.models import Alias # type: ignore + +import config +from src.models.task import Task +from src.models.wikimedia.wikidata.entity import Entity + +wbi_config.config['USER_AGENT'] = config.user_agent + + +class Item(Entity): + """This represents an item in Wikidata + We always work on one language at a time, 
+ so we don't bother with languages here and keep to simple strings""" + description: Optional[str] = None + aliases: Optional[List[str]] = None + + def __str__(self): + return f"{self.label}, see {self.url()}" + + def fetch_label_and_description_and_aliases(self, + task: Task = None): + """Fetch label and aliases in the task language from the Wikidata API""" + if task is None: + raise ValueError("task was None") + if not isinstance(task, Task): + raise ValueError("task was not a Task object") + if task.language_code is None: + raise ValueError("task.language_code was None") + from src.helpers.console import console + with console.status(f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."): + wbi = WikibaseIntegrator() + item = wbi.item.get(self.id) + label = item.labels.get(task.language_code.value) + if label is not None: + self.label = str(label) + description = item.descriptions.get(task.language_code.value) + if description is not None: + self.description = str(description) + aliases: List[Alias] = item.aliases.get(task.language_code.value) + # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") + if aliases is not None: + self.aliases = [] + for alias in aliases: + self.aliases.append(str(alias)) + # logging.debug(f"appended:{alias.value}") + # logging.debug(f"aliases:{self.aliases}") diff --git a/src/models/wikimedia/wikidata/sparql_item.py b/src/models/wikimedia/wikidata/sparql_item.py new file mode 100644 index 0000000..e486a13 --- /dev/null +++ b/src/models/wikimedia/wikidata/sparql_item.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel + +from src.models.wikimedia.wikidata.entiyt_id import EntityId +from src.models.wikimedia.wikidata.item import Item + + +class Value(BaseModel): + value: str + + +class SparqlItem(Item): + """This class models the data we get from SPARQL""" + item: Value + itemLabel: Value + + def validate_qid_and_copy_label(self): + self.id = str(EntityId(self.item.value)) + self.label = self.itemLabel.value From 524e544d1636feb768af45b6c4c89a1584342bc4 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 14:18:30 +0100 Subject: [PATCH 05/13] Fix mypy issues --- src/helpers/console.py | 6 +++++- src/helpers/jobs.py | 11 ++++++++--- src/helpers/menus.py | 3 ++- src/models/items/__init__.py | 13 +++++++++---- src/models/items/riksdagen_documents.py | 7 +++---- src/models/items/thesis.py | 2 ++ src/models/quickstatements.py | 3 +++ src/models/suggestion.py | 4 +++- 8 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/helpers/console.py b/src/helpers/console.py index 50efb70..79e6f1c 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -77,6 +77,8 @@ def print_found_items_table(args: argparse.Namespace = None, raise ValueError("args was None") if items is None: raise ValueError("items was None") + if items.list is None: + raise ValueError("items.list was None") table = Table(title="Matched items found") if len(items.list) < 1000: list_to_show = items.list[0:50] @@ -122,8 +124,10 @@ def ask_add_to_job_queue(job: BatchJob = None): def print_running_jobs(jobs: List[BatchJob] = None): if jobs is None: raise ValueError("jobs was None") + if not isinstance(jobs, list): + raise ValueError("jobs is not a list") console.print(f"Running {len(jobs)} job(s) with a total of " - f"{sum(len(job.items.list) for job in jobs)} items " + f"{sum(len(job.items.list) for job in jobs if job.items.list is not None)} items " f"non-interactively now. 
You can take a " f"coffee break and lean back :)") diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 94ea5e5..5e1a345 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -10,8 +10,8 @@ TaskIds, print_found_items_table, ask_add_to_job_queue, print_keep_an_eye_on_wdqs_lag, print_running_jobs, \ print_finished, print_job_statistics from src.helpers.menus import select_task -from src.models.items.academic_journals import AcademicJournalItems from src.models.items import Items +from src.models.items.academic_journals import AcademicJournalItems from src.models.items.riksdagen_documents import RiksdagenDocumentItems from src.models.items.scholarly_articles import ScholarlyArticleItems from src.models.items.thesis import ThesisItems @@ -34,7 +34,7 @@ def process_qid_into_job(qid: str = None, raise ValueError("args was None") if task is None: raise ValueError("task was None") - from src.models.wikimedia.wikidata import Item + from src.models.wikimedia.wikidata.item import Item item = Item( id=strip_prefix(qid), ) @@ -72,6 +72,8 @@ def process_qid_into_job(qid: str = None, raise ValueError(f"{task.id} was not recognized") items.fetch_based_on_label(suggestion=suggestion, task=task) + if items.list is None: + raise ValueError("items.list was None") if len(items.list) > 0: # Randomize the list items.random_shuffle_list() @@ -193,7 +195,10 @@ def get_validated_main_subjects_as_jobs( if len(subjects_not_picked_yet) > 0: if ( args.no_ask_match_more_limit is None or - args.no_ask_match_more_limit < sum(len(job.items.list) for job in jobs) + args.no_ask_match_more_limit < sum( + len(job.items.list) for job in jobs + if job.items.list is not None + ) ): answer_was_yes = ask_yes_no_question("Match one more?") if not answer_was_yes: diff --git a/src/helpers/menus.py b/src/helpers/menus.py index 022d695..2e5a21e 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -4,7 +4,7 @@ from consolemenu import SelectionMenu # type: ignore from src.models.suggestion import Suggestion -from src.models.wikimedia.wikidata import Item +from src.models.wikimedia.wikidata.item import Item from src.tasks import tasks, Task @@ -20,6 +20,7 @@ def select_suggestion(suggestions: List[Suggestion] = None, selected_suggestion = None if selected_index > (len(suggestions) - 1): logger.debug("The user choose to skip") + return None else: selected_suggestion = tasks[selected_index] logger.debug(f"selected:{selected_index}=" diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py index ed85493..1b5dd9e 100644 --- a/src/models/items/__init__.py +++ b/src/models/items/__init__.py @@ -1,14 +1,19 @@ +from __future__ import annotations + import random -from typing import List +from typing import List, TYPE_CHECKING, Optional from pydantic import BaseModel -from src import Suggestion, Task +from src.models.task import Task from src.models.wikimedia.wikidata.sparql_item import SparqlItem +if TYPE_CHECKING: + from src.models.suggestion import Suggestion + class Items(BaseModel): - list: List[SparqlItem] + list: Optional[List[SparqlItem]] def fetch_based_on_label(self, suggestion: Suggestion = None, @@ -16,4 +21,4 @@ def fetch_based_on_label(self, pass def random_shuffle_list(self): - random.shuffle(self.list) \ No newline at end of file + random.shuffle(self.list) diff --git a/src/models/items/riksdagen_documents.py b/src/models/items/riksdagen_documents.py index 2d8d801..c9ac4be 100644 --- a/src/models/items/riksdagen_documents.py +++ b/src/models/items/riksdagen_documents.py @@ -4,10 +4,10 @@ 
import config from src.helpers.console import console +from src.models.items import Items from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikimedia.wikidata.item import Item -from src.models.items import Items +from src.models.wikimedia.wikidata.sparql_item import SparqlItem class RiksdagenDocumentItems(Items): @@ -62,8 +62,7 @@ def fetch_based_on_label(self, ''', debug=suggestion.args.debug_sparql) for item_json in results["results"]["bindings"]: logging.debug(f"item_json:{item_json}") - item = Item(json=item_json, - task=task) + item = SparqlItem(**item_json) self.list.append(item) logging.info(f'Got {len(results["results"]["bindings"])} items from ' f'WDQS using the search string {search_string}') diff --git a/src/models/items/thesis.py b/src/models/items/thesis.py index 8ef091f..77d1a98 100644 --- a/src/models/items/thesis.py +++ b/src/models/items/thesis.py @@ -18,6 +18,8 @@ def fetch_based_on_label(self, # logger = logging.getLogger(__name__) if suggestion is None: raise ValueError("suggestion was None") + if suggestion.search_strings is None: + raise ValueError("suggestion.search_strings was None") if suggestion.args.limit_to_items_without_p921: raise Exception("Limiting to items without P921 is not " "supported yet for this task.") diff --git a/src/models/quickstatements.py b/src/models/quickstatements.py index 490eaf3..d4daa54 100644 --- a/src/models/quickstatements.py +++ b/src/models/quickstatements.py @@ -16,5 +16,8 @@ class QuickStatementsCommandVersion1(BaseModel): property: Optional[EntityId] = None value: Optional[EntityId] = None + class Config: + arbitrary_types_allowed = True + def __str__(self): return f"{self.target}\t{self.property}\t{self.value}" diff --git a/src/models/suggestion.py b/src/models/suggestion.py index a1626b1..95680d9 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -13,7 +13,7 @@ from src.models.batch_job import BatchJob from src.models.items import Items from src.models.task import Task -from src.models.wikimedia.wikidata import Item +from src.models.wikimedia.wikidata.item import Item class Suggestion(BaseModel): @@ -50,6 +50,8 @@ def add_to_items(self, This function is non-interactive""" if items is None: raise ValueError("Items was None") + if items.list is None: + raise ValueError("items.list was None") if jobs is None: raise ValueError("jobs was None") if job_count is None: From e8be03cab8b784830db2ce92044fbe133ae39e5c Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 16:04:27 +0100 Subject: [PATCH 06/13] Fix mypy issues and circular imports New class BatchJobs and use it everywhere --- diagrams/classes.puml | 9 +++++ src/__init__.py | 51 ++++++++++++----------------- src/helpers/console.py | 31 ++++++++---------- src/helpers/jobs.py | 67 ++++++++++++++++++-------------------- src/helpers/menus.py | 8 +++-- src/helpers/pickle.py | 6 ++-- src/models/batch_job.py | 15 ++------- src/models/batch_jobs.py | 26 +++++++++++++++ src/models/items/thesis.py | 8 ++--- src/models/suggestion.py | 13 ++++++-- src/models/task.py | 19 ----------- src/tasks.py | 16 ++++----- 12 files changed, 135 insertions(+), 134 deletions(-) create mode 100644 src/models/batch_jobs.py diff --git a/diagrams/classes.puml b/diagrams/classes.puml index b0600a5..d958ac7 100644 --- a/diagrams/classes.puml +++ b/diagrams/classes.puml @@ -140,11 +140,19 @@ class Task { __str__() } +class BatchJobs { +job_count +jobs: List[BatchJob] 
+print_running_jobs() +run_jobs() +} + class BatchJob { +suggestion: Suggestion +items: Items run() } + class QuickStatementsCommandVersion1 { +target: EntityID = None +property: EntityID = None @@ -159,6 +167,7 @@ BaseModel <|-- Entity BaseModel <|-- Task BaseModel <|-- Suggestion BaseModel <|-- BatchJob +BaseModel <|-- BatchJobs BaseModel <|-- QuickStatementsCommandVersion1 BaseModel <|-- Items Entity <|-- Item diff --git a/src/__init__.py b/src/__init__.py index 163aee3..077f9e1 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -9,7 +9,7 @@ import config from src.helpers.argparse_setup import setup_argparse_and_return_args from src.helpers.cleaning import strip_prefix -from src.helpers.console import console, print_found_items_table, ask_add_to_job_queue, print_running_jobs, \ +from src.helpers.console import console, print_found_items_table, ask_add_to_job_queue, \ ask_yes_no_question, print_finished, \ print_keep_an_eye_on_wdqs_lag, print_best_practice, print_job_statistics, ask_discard_existing_job_pickle from src.helpers.enums import TaskIds @@ -20,6 +20,7 @@ from src.helpers.pickle import parse_job_pickle, remove_job_pickle, add_to_job_pickle, check_if_pickle_exists, \ parse_main_subjects_pickle, get_hash_of_job_pickle from src.models.batch_job import BatchJob +from src.models.batch_jobs import BatchJobs from src.models.quickstatements import QuickStatementsCommandVersion1 from src.models.suggestion import Suggestion from src.models.task import Task @@ -48,21 +49,14 @@ def match_existing_main_subjects(args: argparse.Namespace = None, with console.status("Reading the main subjects file into memory"): main_subjects = parse_main_subjects_pickle() # raise Exception("debug exit") - jobs = get_validated_main_subjects_as_jobs(args=args, - main_subjects=main_subjects, - jobs=jobs) - handle_job_preparation_or_run_directly_if_any_jobs(args=args, jobs=jobs) + jobs = get_validated_main_subjects_as_jobs(args=args, main_subjects=main_subjects, batchjobs=jobs) + handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=jobs) -def match_main_subjects_from_sparql(args: argparse.Namespace = None, - jobs: List[BatchJob] = None): +def match_main_subjects_from_sparql(args: argparse.Namespace = None): """Collect subjects via SPARQL and call get_validated_main_subjects() If we get any validated jobs we handle them""" logger = logging.getLogger(__name__) - if jobs is None: - raise ValueError("jobs was None") - if not isinstance(jobs, List): - raise ValueError("jobs was not a list") if args is None or args.sparql is None: raise ValueError("args.sparql was None") if "P1889" not in args.sparql: @@ -81,12 +75,9 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None, main_subjects.append(item_json["item"]["value"]) if len(main_subjects) > 0: console.print(f"Got {len(main_subjects)} results") - jobs = get_validated_main_subjects_as_jobs( - args=args, - main_subjects=main_subjects, - jobs=jobs - ) - handle_job_preparation_or_run_directly_if_any_jobs(args=args, jobs=jobs) + batchjobs = get_validated_main_subjects_as_jobs(args=args, + main_subjects=main_subjects) + handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=batchjobs) else: console.print("Got 0 results. 
Try another query or debug it using --debug") @@ -150,8 +141,8 @@ def main(): """This is the main function that makes everything else happen""" logger = logging.getLogger(__name__) migrate_pickle_detection() - jobs: List[BatchJob] = [] args = setup_argparse_and_return_args() + batchjobs = None # console.print(args.list) if args.remove_prepared_jobs is True: remove_job_pickle() @@ -165,28 +156,24 @@ def main(): # to avoid running batches multiple times by # mistake (which does not harm Wikidata, but waste # precious computing resources which we want to avoid.) - jobs = parse_job_pickle(silent=True) - if len(jobs) > 0: - console.print(f"Found and loaded {len(jobs)} " + batchjobs = parse_job_pickle(silent=True) + if len(batchjobs.jobs) > 0: + console.print(f"Found and loaded {len(batchjobs.jobs)} " f"jobs with a total of " - f"{sum(len(job.items.list) for job in jobs)} items") + f"{sum(len(job.items.list) for job in batchjobs.jobs)} items") remove_job_pickle(silent=True) if args.run_prepared_jobs is True: logger.info("Running prepared jobs") - jobs = parse_job_pickle() - if jobs is not None and len(jobs) > 0: + batchjobs = parse_job_pickle() + if batchjobs is not None and len(batchjobs.jobs) > 0: file_hash = get_hash_of_job_pickle() - run_jobs(jobs) + run_jobs(batchjobs=batchjobs) # Remove the pickle afterwards remove_job_pickle(hash=file_hash) - if args.export_job_list_to_quickstatements: - export_jobs_to_quickstatements() elif args.export_jobs_to_dataframe: export_jobs_to_dataframe() - elif args.match_existing_main_subjects is True: - match_existing_main_subjects(args=args, jobs=jobs) elif args.sparql: - match_main_subjects_from_sparql(args=args, jobs=jobs) + match_main_subjects_from_sparql(args=args) else: # if not args.run_prepared_jobs: if args.add is None: @@ -195,8 +182,10 @@ def main(): task: Task = select_task() if task is None: raise ValueError("Got no task") + jobs = [] jobs.extend(process_user_supplied_qids_into_batch_jobs(args=args, task=task)) - handle_job_preparation_or_run_directly_if_any_jobs(args=args, jobs=jobs) + batchjobs = BatchJobs(jobs=jobs) + handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=batchjobs) if __name__ == "__main__": diff --git a/src/helpers/console.py b/src/helpers/console.py index 79e6f1c..4af7b35 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -1,4 +1,5 @@ from __future__ import annotations + import argparse from typing import List, TYPE_CHECKING from urllib.parse import quote @@ -8,6 +9,7 @@ from src.helpers.cleaning import clean_rich_formatting from src.models.batch_job import BatchJob +from src.models.batch_jobs import BatchJobs if TYPE_CHECKING: from src.models.items import Items @@ -116,35 +118,30 @@ def ask_add_to_job_queue(job: BatchJob = None): if job.items.list is None: raise ValueError("job.items.list was None") return ask_yes_no_question(f"Do you want to add this job for " - f"[magenta]{job.suggestion.item.label}: " - f"{job.suggestion.item.description}[/magenta] with " - f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})") + f"[magenta]{job.suggestion.item.label}: " + f"{job.suggestion.item.description}[/magenta] with " + f"{len(job.items.list)} items to the queue? 
(see {job.suggestion.item.url()})") -def print_running_jobs(jobs: List[BatchJob] = None): - if jobs is None: - raise ValueError("jobs was None") - if not isinstance(jobs, list): - raise ValueError("jobs is not a list") - console.print(f"Running {len(jobs)} job(s) with a total of " - f"{sum(len(job.items.list) for job in jobs if job.items.list is not None)} items " - f"non-interactively now. You can take a " - f"coffee break and lean back :)") def print_finished(): console.print("All jobs finished successfully") -def print_job_statistics(jobs: List[BatchJob] = None): - if jobs is None: +def print_job_statistics(batchjobs: BatchJobs = None): + if batchjobs is None: raise ValueError("jobs was None") - if len(jobs) == 0: + if batchjobs.jobs is None: + raise ValueError("batchjobs.jobs was None") + if not isinstance(batchjobs.jobs, list): + raise ValueError("jobs was not a list") + if len(batchjobs.jobs) == 0: console.print("The jobs list is empty") else: - console.print(f"The jobs list now contain a total of {len(jobs)} " + console.print(f"The jobs list now contain a total of {len(batchjobs.jobs)} " f"jobs with a total of " - f"{sum(len(job.items.list) for job in jobs)} items") + f"{sum(len(job.items.list) for job in batchjobs.jobs)} items") def ask_discard_existing_job_pickle(): diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 5e1a345..62362fd 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -7,9 +7,10 @@ from typing import Union, List, TYPE_CHECKING, Optional from src import strip_prefix, print_best_practice, console, ask_yes_no_question, \ - TaskIds, print_found_items_table, ask_add_to_job_queue, print_keep_an_eye_on_wdqs_lag, print_running_jobs, \ - print_finished, print_job_statistics + TaskIds, print_found_items_table, ask_add_to_job_queue, print_keep_an_eye_on_wdqs_lag, print_finished, \ + print_job_statistics from src.helpers.menus import select_task +from src.models.batch_jobs import BatchJobs from src.models.items import Items from src.models.items.academic_journals import AcademicJournalItems from src.models.items.riksdagen_documents import RiksdagenDocumentItems @@ -20,14 +21,14 @@ if TYPE_CHECKING: from src import Task, BatchJob - # TODO rewrite as OOP +logger = logging.getLogger(__name__) + def process_qid_into_job(qid: str = None, task: Task = None, args: argparse.Namespace = None, confirmation: bool = False) -> Union[BatchJob, None]: - # logger = logging.getLogger(__name__) if qid is None: raise ValueError("qid was None") if args is None: @@ -117,55 +118,51 @@ def process_user_supplied_qids_into_batch_jobs(args: argparse.Namespace = None, return jobs -def run_jobs(jobs: List[BatchJob] = None): - if jobs is None: - raise ValueError("jobs was None") +def run_jobs(batchjobs: BatchJobs = None): + if batchjobs is None: + raise ValueError("batchjobs was None") + if not isinstance(batchjobs, BatchJobs): + raise ValueError("batchjobs was not a BatchJobs object") print_keep_an_eye_on_wdqs_lag() from src import login login() - print_running_jobs(jobs) - count = 0 + batchjobs.print_running_jobs() start_time = datetime.now() - for job in jobs: - count += 1 - job.run(jobs=jobs, job_count=count) - console.print(f"runtime until now: {datetime.now() - start_time}") + batchjobs.run_jobs() print_finished() end_time = datetime.now() console.print(f'Total runtime: {end_time - start_time}') def handle_job_preparation_or_run_directly_if_any_jobs(args: argparse.Namespace = None, - jobs: List[BatchJob] = None): - if jobs is None: - raise ValueError("jobs was None") + batchjobs: 
BatchJobs = None): + if batchjobs is None: + raise ValueError("batchjobs was None") if args is None: raise ValueError("args was None") - if len(jobs) > 0: + if len(batchjobs.jobs) > 0: if args.prepare_jobs: - console.print(f"Adding {len(jobs)} job(s) to the jobs file") - for job in jobs: + console.print(f"Adding {len(batchjobs.jobs)} job(s) to the jobs file") + for job in batchjobs.jobs: from src import add_to_job_pickle add_to_job_pickle(job) - print_job_statistics(jobs=jobs) + print_job_statistics(batchjobs=batchjobs) console.print(f"You can run the jobs " f"non-interactively e.g. on the Toolforge " f"Kubernetes cluster using -r or --run-prepared-jobs. " f"See Kubernetes_HOWTO.md for details.") else: - run_jobs(jobs) + run_jobs(batchjobs=batchjobs) -def get_validated_main_subjects_as_jobs( - args: argparse.Namespace = None, - main_subjects: List[str] = None, - jobs: List[BatchJob] = None -) -> List[BatchJob]: +def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, + main_subjects: List[str] = None, + batchjobs: List[BatchJob] = None) -> List[BatchJob]: """This function randomly picks a subject and present it for validation""" logger = logging.getLogger(__name__) - if jobs is None: + if batchjobs is None: raise ValueError("jobs was None") - if not isinstance(jobs, List): + if not isinstance(batchjobs, List): raise ValueError("jobs was not a list") if args is None: raise ValueError("args was None") @@ -189,16 +186,16 @@ def get_validated_main_subjects_as_jobs( args=args, confirmation=args.no_confirmation) if job is not None: - jobs.append(job) - logger.debug(f"joblist now has {len(jobs)} jobs") - print_job_statistics(jobs=jobs) + batchjobs.append(job) + logger.debug(f"joblist now has {len(batchjobs)} jobs") + print_job_statistics(batchjobs=batchjobs) if len(subjects_not_picked_yet) > 0: if ( args.no_ask_match_more_limit is None or args.no_ask_match_more_limit < sum( - len(job.items.list) for job in jobs - if job.items.list is not None - ) + len(job.items.list) for job in batchjobs + if job.items.list is not None + ) ): answer_was_yes = ask_yes_no_question("Match one more?") if not answer_was_yes: @@ -209,4 +206,4 @@ def get_validated_main_subjects_as_jobs( else: console.print("No more subjects in the list. 
Exiting.") break - return jobs + return batchjobs diff --git a/src/helpers/menus.py b/src/helpers/menus.py index 2e5a21e..26ee8b2 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -5,7 +5,7 @@ from src.models.suggestion import Suggestion from src.models.wikimedia.wikidata.item import Item -from src.tasks import tasks, Task +from src.tasks import Task def select_suggestion(suggestions: List[Suggestion] = None, @@ -22,6 +22,7 @@ def select_suggestion(suggestions: List[Suggestion] = None, logger.debug("The user choose to skip") return None else: + from src.tasks import tasks selected_suggestion = tasks[selected_index] logger.debug(f"selected:{selected_index}=" f"{selected_suggestion}") @@ -30,7 +31,10 @@ def select_suggestion(suggestions: List[Suggestion] = None, def select_task() -> Task: logger = logging.getLogger(__name__) - menu = SelectionMenu(tasks, "Select a task") + from src.tasks import tasks + labels = list([task.label for task in tasks]) + # console.print(labels) + menu = SelectionMenu(labels, "Select a task") menu.show() menu.join() task_index = menu.selected_option diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py index 8724794..ed379c6 100644 --- a/src/helpers/pickle.py +++ b/src/helpers/pickle.py @@ -8,6 +8,8 @@ from src.models.batch_job import BatchJob # TODO rewrite as OOP +from src.models.batch_jobs import BatchJobs + def add_to_job_pickle(job: BatchJob = None): if job is None: @@ -41,7 +43,7 @@ def check_if_pickle_exists(path): return False -def parse_job_pickle(silent: bool = False) -> Optional[List[BatchJob]]: +def parse_job_pickle(silent: bool = False) -> Optional[BatchJobs]: """Reads the pickle into a list of batch jobs""" if check_if_pickle_exists(config.job_pickle_file_path): jobs: List[BatchJob] = [] @@ -52,7 +54,7 @@ def parse_job_pickle(silent: bool = False) -> Optional[List[BatchJob]]: console.print("No prepared jobs found") return None else: - return jobs + return BatchJobs(jobs=jobs) else: if not silent: console.print("No pickle file found") diff --git a/src/models/batch_job.py b/src/models/batch_job.py index 25195f5..e5cf6ec 100644 --- a/src/models/batch_job.py +++ b/src/models/batch_job.py @@ -1,12 +1,9 @@ -from __future__ import annotations -from dataclasses import dataclass -from typing import List, TYPE_CHECKING +from typing import List from pydantic import BaseModel -if TYPE_CHECKING: - from src.models.suggestion import Suggestion - from src.models.items import Items +from src.models.items import Items +from src.models.suggestion import Suggestion class BatchJob(BaseModel): @@ -14,9 +11,3 @@ class BatchJob(BaseModel): suggestion: Suggestion items: Items - def run(self, jobs: List[BatchJob], job_count: int = None): - if jobs is None: - raise ValueError("jobs was None") - if job_count is None: - raise ValueError("job count was None") - self.suggestion.add_to_items(items=self.items, jobs=jobs, job_count=job_count) diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py new file mode 100644 index 0000000..76295ed --- /dev/null +++ b/src/models/batch_jobs.py @@ -0,0 +1,26 @@ +from typing import List + +from pydantic import BaseModel + +from src.models.batch_job import BatchJob + + +class BatchJobs(BaseModel): + jobs: List[BatchJob] + + @property + def job_count(self): + return len(self.jobs) + + def print_running_jobs(self): + if not isinstance(self.jobs, list): + raise ValueError("jobs is not a list") + from src.helpers.console import console + console.print(f"Running {len(self.jobs)} job(s) with a total of " + 
f"{sum(len(job.items.list) for job in self.jobs if job.items.list is not None)} items " + f"non-interactively now. You can take a " + f"coffee break and lean back :)") + + def run_jobs(self): + for job in self.jobs: + job.suggestion.add_to_items(items=job.items, jobs=self.jobs, job_count=self.job_count) diff --git a/src/models/items/thesis.py b/src/models/items/thesis.py index 77d1a98..db205a4 100644 --- a/src/models/items/thesis.py +++ b/src/models/items/thesis.py @@ -3,12 +3,11 @@ from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore from src.helpers.console import console +from src.models.items import Items from src.models.suggestion import Suggestion from src.models.task import Task -from src.models.wikimedia.wikidata.item import Item -from src.models.items import Items - # There were ~16.000 thesis' in WD when this was written +from src.models.wikimedia.wikidata.sparql_item import SparqlItem class ThesisItems(Items): @@ -59,8 +58,7 @@ def fetch_based_on_label(self, ''', debug=suggestion.args.debug_sparql) for item_json in results["results"]["bindings"]: logging.debug(f"item_json:{item_json}") - item = Item(json=item_json, - task=task) + item = SparqlItem(**item_json) self.list.append(item) logging.info(f'Got {len(results["results"]["bindings"])} items from ' f'WDQS using the search string {search_string}') diff --git a/src/models/suggestion.py b/src/models/suggestion.py index 95680d9..c55689e 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import argparse import logging -from typing import List, Optional +from typing import List, Optional, TYPE_CHECKING from urllib.parse import quote from pydantic import BaseModel @@ -9,12 +11,13 @@ import config from src.helpers.calculations import calculate_random_editgroups_hash from src.helpers.cleaning import clean_rich_formatting -from src.helpers.console import print_search_strings_table, console -from src.models.batch_job import BatchJob from src.models.items import Items from src.models.task import Task from src.models.wikimedia.wikidata.item import Item +if TYPE_CHECKING: + from src.models.batch_job import BatchJob + class Suggestion(BaseModel): item: Item @@ -60,6 +63,7 @@ def add_to_items(self, count = 0 for target_item in items.list: count += 1 + from src import console with console.status(f"Uploading main subject " f"[green]{clean_rich_formatting(self.item.label)}[/green] " f"to {clean_rich_formatting(target_item.label)}"): @@ -87,12 +91,14 @@ def extract_search_strings(self): def clean_special_symbols(string: str): return string.replace("®", "").replace("™", "") + from src.helpers.console import console logger = logging.getLogger(__name__) if self.args is None: raise ValueError("args was None") else: logger.debug(f"args:{self.args}") if self.args.no_aliases is True: + from src import console console.print("Alias matching is turned off") no_aliases = True else: @@ -112,6 +118,7 @@ def clean_special_symbols(string: str): else: self.search_strings.append(clean_special_symbols(alias)) # logger.debug(f"search_strings:{self.search_strings}") + from src.helpers.console import print_search_strings_table print_search_strings_table(args=self.args, search_strings=self.search_strings) diff --git a/src/models/task.py b/src/models/task.py index 2525e81..3a4858b 100644 --- a/src/models/task.py +++ b/src/models/task.py @@ -14,24 +14,5 @@ class Task(BaseModel): language_code: SupportedLanguageCode number_of_queries_per_search_string = 1 - # def 
__init__(self, - # best_practice_information: str = None, - # id: TaskIds = None, - # label: str = None, - # language_code: SupportedLanguageCode = None, - # number_of_queries_per_search_string: int = None): - # if id is None: - # raise ValueError("Got no id") - # if label is None: - # raise ValueError("Got no label") - # if language_code is None: - # raise ValueError("Got no language_code") - # self.id = id - # self.label = label - # self.language_code = language_code - # self.best_practice_information = best_practice_information - # if number_of_queries_per_search_string is not None: - # self.number_of_queries_per_search_string = number_of_queries_per_search_string - def __str__(self): return f"{self.label}" diff --git a/src/tasks.py b/src/tasks.py index 5b80b5e..1e1a7a9 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -4,7 +4,7 @@ # When adding a new task, also add it in the enum tasks = [ - Task( + Task(**dict( id=TaskIds.SCHOLARLY_ARTICLES, label="Add main subject to scholarly articles and preprints", language_code=SupportedLanguageCode.ENGLISH, @@ -22,14 +22,14 @@ "sub forms of screening have been matched." ), number_of_queries_per_search_string=2 - ), - Task( + )), + Task(**dict( id=TaskIds.RIKSDAGEN_DOCUMENTS, label="Add main subject to documents from Riksdagen", language_code=SupportedLanguageCode.SWEDISH, best_practice_information=None - ), - Task( + )), + Task(**dict( id=TaskIds.THESIS, label="Add main subject to thesis' and technical reports", language_code=SupportedLanguageCode.ENGLISH, @@ -46,11 +46,11 @@ "avoid the more general 'cancer screening' until all " "sub forms of screening have been matched." ), - ), - Task( + )), + Task(**dict( id=TaskIds.ACADEMIC_JOURNALS, label="Add main subject to academic journals", language_code=SupportedLanguageCode.ENGLISH, best_practice_information=None - ), + )), ] From 7e11574dcd4ecbf06ffb876201b615a9e363e781 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 16:11:01 +0100 Subject: [PATCH 07/13] Fix handling of batchjobs --- src/helpers/jobs.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 62362fd..de33415 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -156,14 +156,9 @@ def handle_job_preparation_or_run_directly_if_any_jobs(args: argparse.Namespace def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, - main_subjects: List[str] = None, - batchjobs: List[BatchJob] = None) -> List[BatchJob]: + main_subjects: List[str] = None) -> BatchJobs: """This function randomly picks a subject and present it for validation""" logger = logging.getLogger(__name__) - if batchjobs is None: - raise ValueError("jobs was None") - if not isinstance(batchjobs, List): - raise ValueError("jobs was not a list") if args is None: raise ValueError("args was None") if main_subjects is None: @@ -174,6 +169,7 @@ def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, raise ValueError("Got no task") if not isinstance(task, Task): raise ValueError("task was not a Task object") + batchjobs = BatchJobs(jobs=[]) while True: # Check if we have any subjects left in the list if len(subjects_not_picked_yet) > 0: @@ -186,14 +182,14 @@ def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, args=args, confirmation=args.no_confirmation) if job is not None: - batchjobs.append(job) - logger.debug(f"joblist now has {len(batchjobs)} jobs") + batchjobs.jobs.append(job) + 
logger.debug(f"joblist now has {len(batchjobs.jobs)} jobs") print_job_statistics(batchjobs=batchjobs) if len(subjects_not_picked_yet) > 0: if ( args.no_ask_match_more_limit is None or args.no_ask_match_more_limit < sum( - len(job.items.list) for job in batchjobs + len(job.items.list) for job in batchjobs.jobs if job.items.list is not None ) ): From 1b415239e44c6cf128b926e06e22ae27344d8a43 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 16:29:46 +0100 Subject: [PATCH 08/13] Remove quickstatements code and fix export_jobs_to_dataframe --- config.example.py | 2 +- src/__init__.py | 44 +++++++---------------------------- src/helpers/argparse_setup.py | 6 ----- src/models/quickstatements.py | 23 ------------------ 4 files changed, 10 insertions(+), 65 deletions(-) delete mode 100644 src/models/quickstatements.py diff --git a/config.example.py b/config.example.py index deb86c4..6080917 100644 --- a/config.example.py +++ b/config.example.py @@ -9,9 +9,9 @@ password = "" # Global settings +loglevel = logging.WARNING wiki_user = "User:Username" # Change this to your username list_of_allowed_aliases: List[str] = [] # Add elements like this ["API"] -logging.basicConfig(level=logging.WARNING) version = "0.2" # Don't touch this. wd_prefix = "http://www.wikidata.org/entity/" endpoint = "https://query.wikidata.org/sparql" diff --git a/src/__init__.py b/src/__init__.py index 077f9e1..d2d5961 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -21,12 +21,13 @@ parse_main_subjects_pickle, get_hash_of_job_pickle from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs -from src.models.quickstatements import QuickStatementsCommandVersion1 from src.models.suggestion import Suggestion from src.models.task import Task from src.models.wikimedia.wikidata.entiyt_id import EntityId from src.tasks import tasks +logging.basicConfig(level=config.loglevel) + def login(): with console.status("Logging in with WikibaseIntegrator..."): @@ -76,7 +77,7 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None): if len(main_subjects) > 0: console.print(f"Got {len(main_subjects)} results") batchjobs = get_validated_main_subjects_as_jobs(args=args, - main_subjects=main_subjects) + main_subjects=main_subjects) handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=batchjobs) else: console.print("Got 0 results. Try another query or debug it using --debug") @@ -85,16 +86,15 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None): def export_jobs_to_dataframe(): logger = logging.getLogger(__name__) logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame") - jobs = parse_job_pickle() - if jobs is not None: - number_of_jobs = len(jobs) - if jobs is not None and number_of_jobs > 0: - logger.info(f"Found {number_of_jobs} jobs") + batchjobs = parse_job_pickle() + if batchjobs is not None: + if batchjobs is not None and batchjobs.job_count > 0: + logger.info(f"Found {batchjobs.job_count} jobs") df = pd.DataFrame() count = 1 - for job in jobs: + for job in batchjobs.jobs: count += 1 - logger.info(f"Working on job {count}/{number_of_jobs}") + logger.info(f"Working on job {count}/{batchjobs.job_count}") job_df = pd.DataFrame() for item in job.items.list: job_df = job_df.append(pd.DataFrame(data=[dict( @@ -112,37 +112,11 @@ def export_jobs_to_dataframe(): console.print("No jobs found. 
Create a job list first by using '--prepare-jobs'") -def export_jobs_to_quickstatements(): - logger = logging.getLogger(__name__) - logger.info("Exporting jobs to QuickStatements V1 commands. One file for each job.") - jobs = parse_job_pickle() - if jobs is not None and len(jobs) > 0: - for job in jobs: - # Convert all items - lines = [] - for item in job.items.list: - line = QuickStatementsCommandVersion1( - target=EntityId(item.id), - property=EntityId("P921"), - value=EntityId(job.suggestion.item.id), - ) - lines.append(line) - logger.debug(f"Got {len(lines)} QS lines to export") - filename = (f"quickstatements-export-" - f"{job.suggestion.item.id}-" - f"{job.suggestion.item.label}.csv") - with open(filename, "w") as file: - for line in lines: - file.write(f"{str(line)}\n") - console.print(f"Wrote to {filename} in the current directory") - - def main(): """This is the main function that makes everything else happen""" logger = logging.getLogger(__name__) migrate_pickle_detection() args = setup_argparse_and_return_args() - batchjobs = None # console.print(args.list) if args.remove_prepared_jobs is True: remove_job_pickle() diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py index 3b9cfb0..2d24c52 100644 --- a/src/helpers/argparse_setup.py +++ b/src/helpers/argparse_setup.py @@ -98,12 +98,6 @@ def setup_argparse_and_return_args(): type=int, help='When working on SPARQL queries of e.g. galaxies, match more until this many matches are in the job list' ) - parser.add_argument( - '--export-job-list-to-quickstatements', '-qs', - action='store_true', - help='Export the prepared job list to QuickStatements.', - default=False - ) parser.add_argument( '--export-jobs-to-dataframe', action='store_true', diff --git a/src/models/quickstatements.py b/src/models/quickstatements.py deleted file mode 100644 index d4daa54..0000000 --- a/src/models/quickstatements.py +++ /dev/null @@ -1,23 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - -from pydantic import BaseModel - -from src.models.wikimedia.wikidata.entiyt_id import EntityId - - -class QuickStatementsCommandVersion1(BaseModel): - """This models the simple line-based QS commands - - For now we only support QID-values - - Q1\tP1\tQ1""" - target: Optional[EntityId] = None - property: Optional[EntityId] = None - value: Optional[EntityId] = None - - class Config: - arbitrary_types_allowed = True - - def __str__(self): - return f"{self.target}\t{self.property}\t{self.value}" From e7ca270a600fb4b72f174faf84f6454d222d30d1 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 16:30:30 +0100 Subject: [PATCH 09/13] Update classes.puml --- diagrams/classes.puml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/diagrams/classes.puml b/diagrams/classes.puml index d958ac7..3bf280d 100644 --- a/diagrams/classes.puml +++ b/diagrams/classes.puml @@ -153,12 +153,6 @@ class BatchJob { run() } -class QuickStatementsCommandVersion1 { - +target: EntityID = None - +property: EntityID = None - +value: EntityID = None - -__str__() -} Items <|-- AcademicJournalItems Items <|-- RiksdagenDocumentItems Items <|-- ScholarlyArticleItems @@ -168,7 +162,6 @@ BaseModel <|-- Task BaseModel <|-- Suggestion BaseModel <|-- BatchJob BaseModel <|-- BatchJobs -BaseModel <|-- QuickStatementsCommandVersion1 BaseModel <|-- Items Entity <|-- Item Item <|-- SparqlItem From e17326247599abeb0363f84c0eafa81799b91e05 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn 
<68460690+dpriskorn@users.noreply.github.com> Date: Sat, 19 Feb 2022 18:09:15 +0100 Subject: [PATCH 10/13] Drop fetching existing main_subjects and matching based on those. --- fetch_main_subjects.py | 71 ---------------------------------------- src/__init__.py | 35 ++++---------------- src/helpers/console.py | 7 ++-- src/helpers/jobs.py | 16 +-------- src/helpers/migration.py | 3 -- src/helpers/pickle.py | 25 -------------- src/models/batch_jobs.py | 12 +++++++ 7 files changed, 22 insertions(+), 147 deletions(-) delete mode 100644 fetch_main_subjects.py diff --git a/fetch_main_subjects.py b/fetch_main_subjects.py deleted file mode 100644 index 14740e1..0000000 --- a/fetch_main_subjects.py +++ /dev/null @@ -1,71 +0,0 @@ -import logging -import random - -from wikibaseintegrator import wbi_config # type: ignore -from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore - -import config -from src import console -from src.helpers.cleaning import strip_prefix -from src.helpers.pickle import add_to_main_subject_pickle - -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger(__name__) -wbi_config.config["USER_AGENT_DEFAULT"] = config.user_agent -console.print("Fetching 100,000 main subjects") -console.input("Press enter to continue") -subjects = [] -# This offset ensures that we don't get -# the same subset of subjects every time we run it -randomizing_offset: int = random.randint(1, 500000) -console.print(f"Random offset used: {randomizing_offset} for this run") -for i in range(0 + randomizing_offset, 100000 + randomizing_offset, 10000): - print(i) - # title: Get main subjects used at least once on scholarly articles - results = execute_sparql_query(f""" -SELECT ?subject -WHERE -{{ -{{ -SELECT DISTINCT ?subject WHERE {{ - hint:Query hint:optimizer "None". - ?item wdt:P31 wd:Q13442814; - wdt:P921 ?subject. -}} -offset {i} -limit 10000 -}} -MINUS{{ -?item wdt:P31 wd:Q8054. # protein -}} -MINUS{{ -?item wdt:P279 wd:Q8054. # protein -}} -MINUS{{ -?item wdt:P31 wd:Q7187. # gene -}} -MINUS{{ -?item wdt:P279 wd:Q7187. 
# gene -}} -}} - """) - if len(results) == 0: - raise ValueError("No main subjects found") - else: - # print("adding lexemes to list") - # pprint(results.keys()) - # pprint(results["results"].keys()) - # pprint(len(results["results"]["bindings"])) - for result in results["results"]["bindings"]: - # print(result) - subjects.append(strip_prefix(result["subject"]["value"])) - # exit(0) -console.print(f"{len(subjects)} fetched") -console.print("Filtering out duplicates") -subjects_without_duplicates = set() -for subject in subjects: - subjects_without_duplicates.add(subject) -console.print(f"Saving {len(subjects_without_duplicates)} " - f"to pickle '{config.main_subjects_pickle_file_path}' (overwriting)") -add_to_main_subject_pickle(subjects) -console.print("Done") diff --git a/src/__init__.py b/src/__init__.py index d2d5961..6b1a561 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,6 +1,5 @@ import argparse import logging -from typing import List import pandas as pd # type: ignore from wikibaseintegrator import wbi_login, wbi_config # type: ignore @@ -13,12 +12,12 @@ ask_yes_no_question, print_finished, \ print_keep_an_eye_on_wdqs_lag, print_best_practice, print_job_statistics, ask_discard_existing_job_pickle from src.helpers.enums import TaskIds -from src.helpers.jobs import process_qid_into_job, process_user_supplied_qids_into_batch_jobs, run_jobs, \ +from src.helpers.jobs import process_qid_into_job, process_user_supplied_qids_into_batch_jobs, \ handle_job_preparation_or_run_directly_if_any_jobs, get_validated_main_subjects_as_jobs from src.helpers.menus import select_task from src.helpers.migration import migrate_pickle_detection from src.helpers.pickle import parse_job_pickle, remove_job_pickle, add_to_job_pickle, check_if_pickle_exists, \ - parse_main_subjects_pickle, get_hash_of_job_pickle + get_hash_of_job_pickle from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs from src.models.suggestion import Suggestion @@ -41,19 +40,6 @@ def login(): wbi_config.config["USER_AGENT_DEFAULT"] = config.user_agent -def match_existing_main_subjects(args: argparse.Namespace = None, - jobs: List[BatchJob] = None): - if jobs is None: - raise ValueError("jobs was None") - if not isinstance(jobs, List): - raise ValueError("jobs was not a list") - with console.status("Reading the main subjects file into memory"): - main_subjects = parse_main_subjects_pickle() - # raise Exception("debug exit") - jobs = get_validated_main_subjects_as_jobs(args=args, main_subjects=main_subjects, batchjobs=jobs) - handle_job_preparation_or_run_directly_if_any_jobs(args=args, batchjobs=jobs) - - def match_main_subjects_from_sparql(args: argparse.Namespace = None): """Collect subjects via SPARQL and call get_validated_main_subjects() If we get any validated jobs we handle them""" @@ -125,23 +111,16 @@ def main(): if args.prepare_jobs is True: logger.info("Preparing jobs") if check_if_pickle_exists(config.job_pickle_file_path): - if not ask_discard_existing_job_pickle(): - # the default is yes - # to avoid running batches multiple times by - # mistake (which does not harm Wikidata, but waste - # precious computing resources which we want to avoid.) 
- batchjobs = parse_job_pickle(silent=True) - if len(batchjobs.jobs) > 0: - console.print(f"Found and loaded {len(batchjobs.jobs)} " - f"jobs with a total of " - f"{sum(len(job.items.list) for job in batchjobs.jobs)} items") - remove_job_pickle(silent=True) + if ask_discard_existing_job_pickle(): + remove_job_pickle(silent=True) + else: + console.print("Quitting.") if args.run_prepared_jobs is True: logger.info("Running prepared jobs") batchjobs = parse_job_pickle() if batchjobs is not None and len(batchjobs.jobs) > 0: file_hash = get_hash_of_job_pickle() - run_jobs(batchjobs=batchjobs) + batchjobs.run_jobs() # Remove the pickle afterwards remove_job_pickle(hash=file_hash) elif args.export_jobs_to_dataframe: diff --git a/src/helpers/console.py b/src/helpers/console.py index 4af7b35..8d11e89 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -123,8 +123,6 @@ def ask_add_to_job_queue(job: BatchJob = None): f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})") - - def print_finished(): console.print("All jobs finished successfully") @@ -141,10 +139,9 @@ def print_job_statistics(batchjobs: BatchJobs = None): else: console.print(f"The jobs list now contain a total of {len(batchjobs.jobs)} " f"jobs with a total of " - f"{sum(len(job.items.list) for job in batchjobs.jobs)} items") + f"{sum(len(job.items.list) for job in batchjobs.jobs if batchjobs.jobs is not None and job is not None)} items") def ask_discard_existing_job_pickle(): return ask_yes_no_question("A prepared list of jobs already exist, " - "do you want to overwrite it? " - "(pressing no will append to it)") + "do you want to delete it?") diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index de33415..1ed67ec 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -118,20 +118,6 @@ def process_user_supplied_qids_into_batch_jobs(args: argparse.Namespace = None, return jobs -def run_jobs(batchjobs: BatchJobs = None): - if batchjobs is None: - raise ValueError("batchjobs was None") - if not isinstance(batchjobs, BatchJobs): - raise ValueError("batchjobs was not a BatchJobs object") - print_keep_an_eye_on_wdqs_lag() - from src import login - login() - batchjobs.print_running_jobs() - start_time = datetime.now() - batchjobs.run_jobs() - print_finished() - end_time = datetime.now() - console.print(f'Total runtime: {end_time - start_time}') def handle_job_preparation_or_run_directly_if_any_jobs(args: argparse.Namespace = None, @@ -152,7 +138,7 @@ def handle_job_preparation_or_run_directly_if_any_jobs(args: argparse.Namespace f"Kubernetes cluster using -r or --run-prepared-jobs. 
" f"See Kubernetes_HOWTO.md for details.") else: - run_jobs(batchjobs=batchjobs) + batchjobs.run_jobs() def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, diff --git a/src/helpers/migration.py b/src/helpers/migration.py index 084b6c4..bbcbc88 100644 --- a/src/helpers/migration.py +++ b/src/helpers/migration.py @@ -6,9 +6,6 @@ def migrate_pickle_detection(): if config.job_pickle_file_path is None: raise ValueError("the variable job_pickle_file_path in config " "has to contain a string like 'pickle.dat'") - if config.main_subjects_pickle_file_path is None: - raise ValueError("The variable main_subjects_pickle_file_path" - "is None, see config.example.py") except AttributeError: raise ValueError("You need to migrate the new pickle variables" "in config.example.py to your config.py before " diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py index ed379c6..42bb448 100644 --- a/src/helpers/pickle.py +++ b/src/helpers/pickle.py @@ -19,14 +19,6 @@ def add_to_job_pickle(job: BatchJob = None): pickle.dump(job, file, pickle.DEFAULT_PROTOCOL) -def add_to_main_subject_pickle(subjects: List[str] = None): - if subjects is None: - raise ValueError("subjects was None") - with open(config.main_subjects_pickle_file_path, 'wb') as file: - for qid in subjects: - pickle.dump(qid, file, pickle.DEFAULT_PROTOCOL) - - def read_from_pickle(path): with open(path, 'rb') as file: try: @@ -61,23 +53,6 @@ def parse_job_pickle(silent: bool = False) -> Optional[BatchJobs]: return None -def parse_main_subjects_pickle() -> Optional[List[str]]: - """Reads the pickle into a list of main subjects""" - if check_if_pickle_exists(config.main_subjects_pickle_file_path): - subjects = [] - for subject in read_from_pickle(config.main_subjects_pickle_file_path): - subjects.append(subject) - if len(subjects) == 0: - console.print("No qids found in the pickle.") - return None - else: - # print(f"found:{subjects}") - return subjects - else: - console.print("No main subjects pickle file found. " - "Create it by running 'python fetch_main_subjects.py'") - exit(0) - def remove_job_pickle(silent: bool = False, hash: str = None): if hash is None: diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py index 76295ed..820e931 100644 --- a/src/models/batch_jobs.py +++ b/src/models/batch_jobs.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List from pydantic import BaseModel @@ -22,5 +23,16 @@ def print_running_jobs(self): f"coffee break and lean back :)") def run_jobs(self): + from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag, print_finished + if self.jobs is None or len(self.jobs) == 0: + raise ValueError("did not get what we need") + print_keep_an_eye_on_wdqs_lag() + from src import login + login() + self.print_running_jobs() + start_time = datetime.now() for job in self.jobs: job.suggestion.add_to_items(items=job.items, jobs=self.jobs, job_count=self.job_count) + print_finished() + end_time = datetime.now() + console.print(f'Total runtime: {end_time - start_time}') From bf75a262ca84a3d4632e3109006c0b53d4707b81 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Wed, 23 Feb 2022 23:55:08 +0100 Subject: [PATCH 11/13] Remove duplicates from the list of items after fetching from SPARQL. 
---
 src/helpers/jobs.py                     | 4 ++++
 src/models/wikimedia/wikidata/entity.py | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py
index 1ed67ec..7385866 100644
--- a/src/helpers/jobs.py
+++ b/src/helpers/jobs.py
@@ -76,6 +76,10 @@ def process_qid_into_job(qid: str = None,
         if items.list is None:
             raise ValueError("items.list was None")
         if len(items.list) > 0:
+            # Remove duplicates
+            logger.info(f"{len(items.list)} before duplicate removal")
+            items.list = list(set(items.list))
+            logger.info(f"{len(items.list)} after duplicate removal")
             # Randomize the list
             items.random_shuffle_list()
             print_found_items_table(args=args,

diff --git a/src/models/wikimedia/wikidata/entity.py b/src/models/wikimedia/wikidata/entity.py
index bca73d6..89a91e9 100644
--- a/src/models/wikimedia/wikidata/entity.py
+++ b/src/models/wikimedia/wikidata/entity.py
@@ -17,6 +17,14 @@ class Entity(BaseModel):
     id: Optional[str]
     label: Optional[str]

+    def __eq__(self, other):
+        """This helps in removing duplicates
+        https://stackoverflow.com/questions/4169252/remove-duplicates-in-list-of-object-with-python"""
+        return self.id == other.id
+
+    def __hash__(self):
+        return hash(('id', self.id))
+
     def upload_one_statement_to_wikidata(self,
                                          statement: BaseDataType = None,
                                          summary: str = None,

From 92632e34f8ee1fcfabd4aec68a1a7ee55601fed0 Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Thu, 24 Feb 2022 01:27:05 +0100
Subject: [PATCH 13/13] Implement fetching up to limit and asking in the end
 only.

This saves time for the user and they can do other things while
ItemSubjector fetches candidates to be reviewed.

jobs.py:
process_qid_into_job(): Remove asking
get_validated_main_subjects_as_jobs(): Ask here instead depending on
whether limit is set or not.
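
The control-flow change is easiest to see in isolation. Below is a
simplified, runnable model of the two approval modes; collect_jobs(),
approve() and the plain-string jobs are hypothetical stand-ins for
get_validated_main_subjects_as_jobs(), ask_add_to_job_queue() and real
BatchJob objects, and the real limit counts items across all jobs
rather than the number of jobs:

    from typing import Callable, List, Optional


    def collect_jobs(candidates: List[str],
                     limit: Optional[int],
                     approve: Callable[[str], bool]) -> List[str]:
        jobs: List[str] = []
        for job in candidates:
            if limit is None:
                # Interactive mode: approve each batch as soon as it is fetched
                if approve(job):
                    jobs.append(job)
            else:
                # Limit mode: collect quietly up to the limit
                jobs.append(job)
                if len(jobs) >= limit:
                    break
        if limit is not None:
            # One review pass at the end, so fetching never blocks on input
            jobs = [job for job in jobs if approve(job)]
        return jobs


    # With a limit set, nothing is asked until fetching is done
    picked = collect_jobs(["Q1", "Q2", "Q3"], limit=2, approve=lambda job: True)
    assert picked == ["Q1", "Q2"]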
--- src/helpers/jobs.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 7385866..761df9c 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -77,9 +77,9 @@ def process_qid_into_job(qid: str = None, raise ValueError("items.list was None") if len(items.list) > 0: # Remove duplicates - logger.info(f"{len(items.list)} before duplicate removal") + logger.warning(f"{len(items.list)} before duplicate removal") items.list = list(set(items.list)) - logger.info(f"{len(items.list)} after duplicate removal") + logger.warning(f"{len(items.list)} after duplicate removal") # Randomize the list items.random_shuffle_list() print_found_items_table(args=args, @@ -89,11 +89,7 @@ def process_qid_into_job(qid: str = None, items=items, suggestion=suggestion ) - answer = ask_add_to_job_queue(job) - if answer: - return job - else: - return None + return job else: console.print("No matching items found") return None @@ -172,7 +168,12 @@ def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, args=args, confirmation=args.no_confirmation) if job is not None: - batchjobs.jobs.append(job) + if args.no_ask_match_more_limit is None: + answer = ask_add_to_job_queue(job) + if answer: + batchjobs.jobs.append(job) + else: + batchjobs.jobs.append(job) logger.debug(f"joblist now has {len(batchjobs.jobs)} jobs") print_job_statistics(batchjobs=batchjobs) if len(subjects_not_picked_yet) > 0: @@ -192,4 +193,11 @@ def get_validated_main_subjects_as_jobs(args: argparse.Namespace = None, else: console.print("No more subjects in the list. Exiting.") break + if args.no_ask_match_more_limit is not None: + batchjobs_limit = BatchJobs(jobs=[]) + for job in batchjobs.jobs: + answer = ask_add_to_job_queue(job) + if answer: + batchjobs_limit.jobs.append(job) + return batchjobs_limit return batchjobs
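
Taken together, the series leaves BatchJobs as the single owner of the
run loop. A trimmed, runnable sketch of the resulting shape follows;
the real BatchJob carries a Suggestion and an Items collection, but
here it is reduced to a label and an item count, and the actual upload
(plus the login and WDQS-lag warning the real method performs first)
is replaced by a print:

    from datetime import datetime
    from typing import List

    from pydantic import BaseModel


    class BatchJob(BaseModel):
        label: str
        item_count: int


    class BatchJobs(BaseModel):
        jobs: List[BatchJob]

        @property
        def job_count(self) -> int:
            return len(self.jobs)

        def run_jobs(self) -> None:
            # Mirror only the timing bookkeeping of the real method
            start_time = datetime.now()
            for job in self.jobs:
                print(f"uploading main subject to {job.item_count} items "
                      f"for {job.label}")
            print(f"Total runtime: {datetime.now() - start_time}")


    batchjobs = BatchJobs(jobs=[BatchJob(label="example subject", item_count=50)])
    assert batchjobs.job_count == 1
    batchjobs.run_jobs()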