From e004903b73c69ed761f9aa5d3d0abaf8c04bc062 Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Mon, 3 Oct 2022 10:16:40 +0200
Subject: [PATCH 01/37] Major rewrite. Separate out into better-defined
 classes.

This is the first step towards making the tool more testable with small and
simple methods.

8 new files and 22 modified
---
 .github/workflows/lint_python.yml           |   2 +-
 src/__init__.py                             |  38 +--
 src/helpers/argparse_setup.py               |  18 +-
 src/helpers/cleaning.py                     |  30 --
 src/helpers/cli_messages.py                 | 108 +++++++
 src/helpers/console.py                      | 145 +---------
 src/helpers/jobs.py                         | 191 ++++---------
 src/helpers/menus.py                        |  18 +-
 src/helpers/pickle.py                       |   8 +-
 src/helpers/questions.py                    |  48 ++++
 src/models/batch_job.py                     |   6 +-
 src/models/batch_jobs.py                    |  10 +-
 src/models/items/__init__.py                |  31 +-
 src/models/items/academic_journals.py       | 144 +++++-----
 src/models/items/riksdagen_documents.py     | 144 +++++-----
 src/models/items/scholarly_articles.py      | 169 ++---------
 src/models/items/thesis.py                  | 138 ++++-----
 src/models/suggestion.py                    | 178 +-----------
 src/models/wikimedia/wikidata/entity.py     |   3 +-
 src/models/wikimedia/wikidata/entiyt_id.py  |   2 +-
 src/models/wikimedia/wikidata/foreign_id.py |   2 +-
 src/models/wikimedia/wikidata/item.py       |  53 ----
 .../wikimedia/wikidata/item/__init__.py     |  70 +++++
 .../wikimedia/wikidata/item/main_subject.py | 266 ++++++++++++++++++
 .../wikidata/item/scholarly_article.py      |   5 +
 .../wikimedia/wikidata/query/__init__.py    |  67 +++++
 .../wikimedia/wikidata/query/article.py     |   8 +
 .../wikidata/query/preprint_article.py      |  27 ++
 .../wikidata/query/published_article.py     |  82 ++++++
 src/tasks.py                                |  76 ++---
 tests/test___init__.py                      |   3 +
 tests/test_suggestion.py                    |  92 +++---
 32 files changed, 1144 insertions(+), 1038 deletions(-)
 create mode 100644 src/helpers/cli_messages.py
 create mode 100644 src/helpers/questions.py
 delete mode 100644 src/models/wikimedia/wikidata/item.py
 create mode 100644 src/models/wikimedia/wikidata/item/__init__.py
 create mode 100644 src/models/wikimedia/wikidata/item/main_subject.py
 create mode 100644 src/models/wikimedia/wikidata/item/scholarly_article.py
 create mode 100644 src/models/wikimedia/wikidata/query/__init__.py
 create mode 100644 src/models/wikimedia/wikidata/query/article.py
 create mode 100644 src/models/wikimedia/wikidata/query/preprint_article.py
 create mode 100644 src/models/wikimedia/wikidata/query/published_article.py
 create mode 100644 tests/test___init__.py

diff --git a/.github/workflows/lint_python.yml b/.github/workflows/lint_python.yml
index 69fc6c5..370cd51 100644
--- a/.github/workflows/lint_python.yml
+++ b/.github/workflows/lint_python.yml
@@ -17,7 +17,7 @@ jobs:
     - run: poetry install --with=dev
     - run: poetry run bandit --recursive --skip B301,B105,B403,B311,B101,B324 src  # B101 is assert statements
     - run: poetry run black --check .
-    - run: poetry run codespell  # --ignore-words-list="" --skip="*.css,*.js,*.lock"
+    - run: poetry run codespell  # --ignore-words-sparql_items="" --skip="*.css,*.js,*.lock"
     - run: poetry run flake8 --ignore=C408,C416,E203,F401,F541,R501,R502,R503,R504,W503 --max-complexity=21 --max-line-length=162 --show-source --statistics .
     - run: poetry run isort --check-only --profile black .
diff --git a/src/__init__.py b/src/__init__.py index 51fd1a7..31cca38 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -8,23 +8,25 @@ import config from src.helpers.argparse_setup import setup_argparse_and_return_args -from src.helpers.cleaning import strip_prefix from src.helpers.console import ( - ask_add_to_job_queue, - ask_discard_existing_job_pickle, - ask_yes_no_question, console, + print_keep_an_eye_on_wdqs_lag, +) +from src.helpers.cli_messages import ( print_best_practice, - print_finished, print_found_items_table, + print_finished, print_job_statistics, - print_keep_an_eye_on_wdqs_lag, +) +from src.helpers.questions import ( + ask_add_to_job_queue, + ask_discard_existing_job_pickle, + ask_yes_no_question, ) from src.helpers.enums import TaskIds from src.helpers.jobs import ( get_validated_main_subjects_as_jobs, handle_job_preparation_or_run_directly_if_any_jobs, - process_qid_into_job, process_user_supplied_qids_into_batch_jobs, ) from src.helpers.menus import select_task @@ -57,7 +59,7 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None): if "P1889" not in args.sparql: console.print( "Your SPARQL did not contain P1889 (different from). " - "Please include 'MINUS {?item wdt:P1889 [].}' " + "Please include 'MINUS {?main_subject_item wdt:P1889 [].}' " "in your WHERE clause to avoid false positives." ) exit(0) @@ -70,7 +72,7 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None): ) for item_json in results["results"]["bindings"]: logging.debug(f"item_json:{item_json}") - main_subjects.append(item_json["item"]["value"]) + main_subjects.append(item_json["main_subject_item"]["value"]) if len(main_subjects) > 0: console.print(f"Got {len(main_subjects)} results") batchjobs = get_validated_main_subjects_as_jobs( @@ -87,8 +89,8 @@ def export_jobs_to_dataframe(): logger = logging.getLogger(__name__) logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame") batchjobs = parse_job_pickle() - if batchjobs is not None: - if batchjobs is not None and batchjobs.job_count > 0: + if batchjobs: + if batchjobs and batchjobs.job_count > 0: logger.info(f"Found {batchjobs.job_count} jobs") df = pd.DataFrame() count = 1 @@ -96,7 +98,7 @@ def export_jobs_to_dataframe(): count += 1 logger.info(f"Working on job {count}/{batchjobs.job_count}") job_df = pd.DataFrame() - for item in job.items.list: + for item in job.main_subject_item.items.sparql_items: job_df = job_df.append( pd.DataFrame( data=[ @@ -109,14 +111,16 @@ def export_jobs_to_dataframe(): ) ) df = df.append(job_df) - logger.debug(f"Added {len(job.items.list)} items to the dataframe") + logger.debug( + f"Added {len(job.main_subject_item.items.sparql_items)} items to the dataframe" + ) logger.debug(f"Exporting {len(df)} rows to pickle") pickle_filename = "dataframe.pkl.gz" df.to_pickle(pickle_filename) console.print(f"Wrote to {pickle_filename} in the current directory") else: console.print( - "No jobs found. Create a job list first by using '--prepare-jobs'" + "No jobs found. 
Create a job list first with '--prepare-jobs'"
             )
 
     def run(self):
@@ -124,10 +128,10 @@ def run(self):
         logger = logging.getLogger(__name__)
         migrate_pickle_detection()
         args = setup_argparse_and_return_args()
         # console.print(args.list)
         if args.remove_prepared_jobs is True:
             remove_job_pickle()
-            console.print("Removed the job list.")
+            console.print("Removed the prepared job list.")
             # exit(0)
         if args.prepare_jobs is True:
             logger.info("Preparing jobs")
@@ -139,7 +143,7 @@ def run(self):
         if args.run_prepared_jobs is True:
             logger.info("Running prepared jobs")
             batchjobs = parse_job_pickle()
-            if batchjobs is not None and len(batchjobs.jobs) > 0:
+            if batchjobs and len(batchjobs.jobs) > 0:
                 file_hash = get_hash_of_job_pickle()
                 batchjobs.run_jobs()
                 # Remove the pickle afterwards
diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py
index 328a693..8b794ab 100644
--- a/src/helpers/argparse_setup.py
+++ b/src/helpers/argparse_setup.py
@@ -6,16 +6,16 @@ def setup_argparse_and_return_args():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         description="""
 ItemSubjector enables working main subject statements on items based on a
 heuristic matching the subject with the title of the item.
 
 Example adding one Qid:
 '$ itemsubjector.py -a Q1234'
 
 Example adding one Qid and prepare a job list to be run non-interactively later:
 '$ itemsubjector.py -a Q1234 -p'
 
 Example working on all diseases:
-'$ itemsubjector.py --sparql "SELECT ?item WHERE {?item wdt:P31 wd:Q12136. MINUS {?item wdt:P1889 [].}}"'
+'$ itemsubjector.py --sparql "SELECT ?main_subject_item WHERE {?main_subject_item wdt:P31 wd:Q12136. MINUS {?main_subject_item wdt:P1889 [].}}"'
 """,
     )
     parser.add_argument(
@@ -64,7 +64,7 @@ def setup_argparse_and_return_args():
         "--match-existing-main-subjects",
         action="store_true",
         help=(
-            "Match from list of 136.000 already used "
+            "Match from a list of 136,000 already used "
             "main subjects on other scientific articles"
         ),
     )
@@ -90,8 +90,8 @@ def setup_argparse_and_return_args():
         "--sparql",
         nargs="?",
         help="Work on main subject items returned by this SPARQL query.\n"
-        'Note: "?item" has to be selected for it to work, see the example above.\n'
-        "Note: MINUS {?item wdt:P1889 [].} must be present in the query to avoid false positives.",
+        'Note: "?main_subject_item" has to be selected for it to work, see the example above.\n'
+        "Note: MINUS {?main_subject_item wdt:P1889 [].} must be present in the query to avoid false positives.",
     )
     parser.add_argument(
         "--debug-sparql",
         action="store_true",
         help=(
@@ -104,12 +104,12 @@ def setup_argparse_and_return_args():
         "--limit",
         nargs="?",
         type=int,
-        help="When working on SPARQL queries of e.g. galaxies, match more until this many matches are in the job list",
+        help="When working on SPARQL queries of e.g. galaxies, keep matching until this many matches are in the job list",
     )
     parser.add_argument(
         "--export-jobs-to-dataframe",
         action="store_true",
         help="Export the prepared job list to a Pandas DataFrame.",
         default=False,
     )
     return parser.parse_args()
diff --git a/src/helpers/cleaning.py b/src/helpers/cleaning.py
index 840850e..44a5631 100644
--- a/src/helpers/cleaning.py
+++ b/src/helpers/cleaning.py
@@ -1,33 +1,3 @@
-def strip_bad_chars(string):
-    # Note this has to match the cleaning done in the sparql query
-    # We lowercase and remove common symbols
-    # We replace like this to save CPU cycles see
-    # https://stackoverflow.com/questions/3411771/best-way-to-replace-multiple-characters-in-a-string
-    return (
-        string
-        # Needed for matching backslashes e.g. "Dmel\CG5330" on Q29717230
-        .replace("\\", "\\\\")
-        # Needed for when labels contain apostrophe
-        .replace("'", "\\'")
-        .replace(",", "")
-        .replace(":", "")
-        .replace(";", "")
-        .replace("(", "")
-        .replace(")", "")
-        .replace("[", "")
-        .replace("]", "")
-    )
-
-
 def clean_rich_formatting(label):
     # Fix rich parse bug with "[/TSUP]" and "[/ITAL]"
     return label.replace("[/", "['/")
-
-
-def strip_prefix(qid):
-    if "https://www.wikidata.org/wiki/" in qid:
-        qid = qid[30:]
-    if "http://www.wikidata.org/entity/" in qid:
-        qid = qid[31:]
-    # logger.debug(f"qid:{qid}")
-    return qid
diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py
new file mode 100644
index 0000000..3aa285c
--- /dev/null
+++ b/src/helpers/cli_messages.py
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+import argparse
+from typing import Set
+from urllib.parse import quote
+
+from rich.table import Table
+
+from src.models.task import Task
+from src.helpers.console import console
+from src.models.batch_jobs import BatchJobs
+from src.helpers.cleaning import clean_rich_formatting
+from src.helpers.console import press_enter_to_continue
+from src.models.items import Items
+
+
+def print_best_practice(task: Task):
+    if task.best_practice_information:
+        console.print(task.best_practice_information)
+        press_enter_to_continue()
+
+
+def print_search_strings_table(
+    args: argparse.Namespace = None, search_strings: Set[str] = None
+):
+    if args is None:
+        raise ValueError("args was None")
+    if search_strings is None:
+        raise ValueError("search strings was None")
+    table = Table(title="Search strings")
+    table.add_column(f"Extracted the following {len(search_strings)} search strings")
+    if args.show_search_urls:
+        table.add_column(f"Wikidata search URL")
+    for string in search_strings:
+        if args.show_search_urls:
+            table.add_row(
+                string, f"https://www.wikidata.org/w/index.php?search={quote(string)}"
+            )
+        else:
+            table.add_row(string)
+    console.print(table)
+
+
+def print_found_items_table(args: argparse.Namespace = None, items: Items = None):
+    if args is None:
+        raise ValueError("args was None")
+    if items is None:
+        raise ValueError("items was None")
+    if items.sparql_items is None:
+        raise ValueError("items.sparql_items was None")
+    table = Table(title="Matched items found")
+    if len(items.sparql_items) < 1000:
+        list_to_show = items.sparql_items[0:50]
+    else:
+        # Show 1 sample for each 20 items in the list
+        list_to_show = items.sparql_items[0 : int(len(items.sparql_items) / 20)]
+    if len(items.sparql_items) > 4000:
+        console.print(
+            "[red]Warning: This is a very large batch, please proceed with caution[/red]"
+        )
+        press_enter_to_continue()
+    table.add_column(
f"Showing a random subset of {len(list_to_show)} " + f"items, please review as many as possible for false " + f"positives and reject the batch if you find any." + ) + if args.show_item_urls: + table.add_column(f"Wikidata URL") + for item in list_to_show: + if item.label is None: + raise ValueError("main_subject_item.label was None") + if args.show_item_urls: + label = clean_rich_formatting(item.label) + table.add_row(label, item.url) + else: + table.add_row(item.label) + console.print(table) + + +def print_finished(): + console.print("All jobs finished successfully") + + +def print_job_statistics(batchjobs: BatchJobs = None): + if not batchjobs: + raise ValueError("jobs was None") + if not batchjobs.jobs: + raise ValueError("batchjobs.jobs was None") + if not isinstance(batchjobs.jobs, list): + raise ValueError("jobs was not a sparql_items") + if not len(batchjobs.jobs): + console.print("The jobs sparql_items is empty") + else: + total_number_of_queries = sum([job.number_of_queries for job in batchjobs.jobs]) + total_number_of_items = sum( + len(job.main_subject_item.items.sparql_items) + for job in batchjobs.jobs + if batchjobs.jobs + and job + and job.main_subject_item.items + and job.main_subject_item.items.sparql_items + ) + console.print( + f"The jobs sparql_items now contain a total of {len(batchjobs.jobs)} " # type: ignore + f"jobs with a total of " + f"{total_number_of_items} items found from " + f"{total_number_of_queries} queries" + ) diff --git a/src/helpers/console.py b/src/helpers/console.py index febb26d..de8fe73 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -1,37 +1,15 @@ from __future__ import annotations -import argparse -from typing import TYPE_CHECKING, Set -from urllib.parse import quote +from typing import TYPE_CHECKING from rich.console import Console -from rich.table import Table - -from src.helpers.cleaning import clean_rich_formatting -from src.models.batch_job import BatchJob -from src.models.batch_jobs import BatchJobs if TYPE_CHECKING: - from src.models.items import Items - from src.models.task import Task + pass console = Console() -def ask_yes_no_question(message: str): - # https://www.quora.com/ - # I%E2%80%99m-new-to-Python-how-can-I-write-a-yes-no-question - # this will loop forever - while True: - answer = console.input(message + " [Y/Enter/n]: ") - if len(answer) == 0 or answer[0].lower() in ("y", "n"): - if len(answer) == 0: - return True - else: - # the == operator just returns a boolean, - return answer[0].lower() == "y" - - def print_keep_an_eye_on_wdqs_lag(): console.print( "Please keep an eye on the lag of the WDQS cluster here and avoid " @@ -49,122 +27,3 @@ def print_keep_an_eye_on_wdqs_lag(): def press_enter_to_continue(): console.input("Press Enter to continue.") - - -def print_best_practice(task: Task): - if task.best_practice_information is not None: - console.print(task.best_practice_information) - press_enter_to_continue() - - -def print_search_strings_table( - args: argparse.Namespace = None, search_strings: Set[str] = None -): - if args is None: - raise ValueError("args was None") - if search_strings is None: - raise ValueError("search strings was None") - table = Table(title="Search strings") - table.add_column(f"Extracted the following {len(search_strings)} search strings") - if args.show_search_urls: - table.add_column(f"Wikidata search URL") - for string in search_strings: - if args.show_search_urls: - table.add_row( - string, f"https://www.wikidata.org/w/index.php?search={quote(string)}" - ) - else: - 
table.add_row(string) - console.print(table) - - -def print_found_items_table(args: argparse.Namespace = None, items: Items = None): - if args is None: - raise ValueError("args was None") - if items is None: - raise ValueError("items was None") - if items.list is None: - raise ValueError("items.list was None") - table = Table(title="Matched items found") - if len(items.list) < 1000: - list_to_show = items.list[0:50] - else: - # Show 1 sample for each 20 items in the list - list_to_show = items.list[0 : int(len(items.list) / 20)] - if len(items.list) > 4000: - console.print( - "[red]Warning: This is a very large batch, please proceed with caution[/red]" - ) - press_enter_to_continue() - table.add_column( - f"Showing a random subset of {len(list_to_show)} " - f"items, please review as many as possible for false " - f"positives and reject the batch if you find any." - ) - if args.show_item_urls: - table.add_column(f"Wikidata URL") - for item in list_to_show: - if item.label is None: - raise ValueError("item.label was None") - if args.show_item_urls: - label = clean_rich_formatting(item.label) - table.add_row(label, item.url()) - else: - table.add_row(item.label) - console.print(table) - - -def ask_add_to_job_queue(job: BatchJob = None): - if job is None: - raise ValueError("job was None") - if job.suggestion.item is None: - raise ValueError("job.suggestion.item was None") - if job.suggestion.item.label is None: - raise ValueError("job.suggestion.item.label was None") - if job.suggestion.item.description is None: - job.suggestion.item.description = "" - if job.items.list is None: - raise ValueError("job.items.list was None") - return ask_yes_no_question( - f"Do you want to add this job for " - f"[magenta]{job.suggestion.item.label}: " - f"{job.suggestion.item.description}[/magenta] with " - f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})" - ) - - -def print_finished(): - console.print("All jobs finished successfully") - - -def print_job_statistics(batchjobs: BatchJobs = None): - if batchjobs is None: - raise ValueError("jobs was None") - if batchjobs.jobs is None: - raise ValueError("batchjobs.jobs was None") - if not isinstance(batchjobs.jobs, list): - raise ValueError("jobs was not a list") - if len(batchjobs.jobs) == 0: - console.print("The jobs list is empty") - else: - total_number_of_queries = sum([job.number_of_queries for job in batchjobs.jobs]) - total_number_of_items = sum( - len(job.items.list) - for job in batchjobs.jobs - if batchjobs.jobs is not None - and job is not None - and job.items is not None - and job.items.list is not None - ) - console.print( - f"The jobs list now contain a total of {len(batchjobs.jobs)} " # type: ignore - f"jobs with a total of " - f"{total_number_of_items} items found from " - f"{total_number_of_queries} queries" - ) - - -def ask_discard_existing_job_pickle(): - return ask_yes_no_question( - "A prepared list of jobs already exist, " "do you want to delete it?" 
- ) diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index a79e1d5..34c282e 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -3,128 +3,44 @@ import argparse import logging import random -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List import config -from src import ( - TaskIds, - ask_add_to_job_queue, - ask_yes_no_question, - console, +from src.helpers.cli_messages import ( print_best_practice, print_job_statistics, - strip_prefix, ) +from src.helpers.console import console from src.helpers.menus import select_task +from src.helpers.questions import ( + ask_add_to_job_queue, + ask_yes_no_question, +) +from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs -from src.models.items import Items -from src.models.items.academic_journals import AcademicJournalItems -from src.models.items.riksdagen_documents import RiksdagenDocumentItems -from src.models.items.scholarly_articles import ScholarlyArticleItems -from src.models.items.thesis import ThesisItems +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem from src.tasks import Task -if TYPE_CHECKING: - from src import BatchJob - # TODO rewrite as OOP logger = logging.getLogger(__name__) -def process_qid_into_job( - qid: str = None, - task: Task = None, - args: argparse.Namespace = None, - confirmation: bool = False, -) -> Union[BatchJob, None]: - if qid is None: - raise ValueError("qid was None") - if args is None: - raise ValueError("args was None") - if task is None: - raise ValueError("task was None") - from src.models.wikimedia.wikidata.item import Item - - item = Item( - id=strip_prefix(qid), - ) - item.fetch_label_and_description_and_aliases(task=task) - if item.label is not None: - console.print(f"Working on {item}") - # generate suggestion with all we need - from src import Suggestion - - suggestion = Suggestion(item=item, task=task, args=args) - if confirmation: - answer = ask_yes_no_question("Do you want to continue?") - if not answer: - return None - suggestion.extract_search_strings() - if config.loglevel == logging.INFO: - suggestion.print_search_strings() - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - number_of_queries = ( - len(suggestion.search_strings) * task.number_of_queries_per_search_string - ) - with console.status( - f"Fetching items with labels that have one of " - f"the search strings by running a total of " - f"{number_of_queries} " - f"queries on WDQS..." 
-        ):
-            items: Optional[Items] = None
-            if task.id == TaskIds.SCHOLARLY_ARTICLES:
-                items = ScholarlyArticleItems()
-            elif task.id == TaskIds.RIKSDAGEN_DOCUMENTS:
-                items = RiksdagenDocumentItems()
-            elif task.id == TaskIds.THESIS:
-                items = ThesisItems()
-            elif task.id == TaskIds.ACADEMIC_JOURNALS:
-                items = AcademicJournalItems()
-            else:
-                raise ValueError(f"{task.id} was not recognized")
-            items.fetch_based_on_label(suggestion=suggestion, task=task)
-            if items.list is None:
-                raise ValueError("items.list was None")
-            if len(items.list) > 0:
-                # Remove duplicates
-                logger.debug(f"{len(items.list)} before duplicate removal")
-                items.list = list(set(items.list))
-                logger.debug(f"{len(items.list)} after duplicate removal")
-                # Randomize the list
-                items.random_shuffle_list()
-                from src import BatchJob
-
-                job = BatchJob(
-                    items=items, number_of_queries=number_of_queries, suggestion=suggestion
-                )
-                return job
-            else:
-                console.print("No matching items found")
-                return None
-        else:
-            console.print(
-                f"Label for {task.language_code} was None on {item.url()}, skipping"
-            )
-            return None
-
-
 def process_user_supplied_qids_into_batch_jobs(
     args: argparse.Namespace = None, task: Task = None
 ) -> List[BatchJob]:
     """Given a list of QIDs, we go through
     them and return a list of jobs"""
     # logger = logging.getLogger(__name__)
-    if args is None:
+    if not args:
         raise ValueError("args was None")
-    if task is None:
+    if not task:
         raise ValueError("task was None")
     print_best_practice(task)
     jobs = []
     for qid in args.add:
-        job = process_qid_into_job(qid=qid, task=task, args=args)
-        if job is not None:
+        main_subject_item = MainSubjectItem(qid=qid, args=args, task=task)
+        job = main_subject_item.fetch_items_and_get_job()
+        if job:
             jobs.append(job)
     return jobs
@@ -158,12 +74,12 @@ def get_validated_main_subjects_as_jobs(
     args: argparse.Namespace = None, main_subjects: List[str] = None
 ) -> BatchJobs:
     """This function randomly picks a subject and add it to the
     list of jobs if it had any matches and the user approved it"""
     if args is None:
         raise ValueError("args was None")
     if main_subjects is None:
         raise ValueError("main subjects was None")
-    subjects_not_picked_yet = main_subjects
+    qid_subjects_not_picked_yet = main_subjects
     task: Task = select_task()
     if task is None:
         raise ValueError("Got no task")
@@ -171,66 +87,67 @@ def get_validated_main_subjects_as_jobs(
         raise ValueError("task was not a Task object")
     batchjobs = BatchJobs(jobs=[])
     while True:
         # Check if we have any subjects left in the list
-        if len(subjects_not_picked_yet) > 0:
+        if len(qid_subjects_not_picked_yet):
             console.print(f"Picking a random main subject")
-            qid = random.choice(subjects_not_picked_yet)
-            subjects_not_picked_yet.remove(qid)
-            job = process_qid_into_job(
-                qid=qid,
-                task=task,
-                args=args,
-                confirmation=args.no_confirmation,
+            qid = random.choice(qid_subjects_not_picked_yet)
+            qid_subjects_not_picked_yet.remove(qid)
+            main_subject_item = MainSubjectItem(
+                qid=qid, args=args, task=task, confirmation=args.no_confirmation
             )
-            if job is not None:
+            job = main_subject_item.fetch_items_and_get_job()
+            if job:
                 # Here we check if the user has enabled no ask more limit.
                 if args.no_ask_match_more_limit is None:
                     logger.debug("No ask more was None")
-                    job.items.print_items_list(args=args)
-                    job.suggestion.print_search_strings()
-                    answer = ask_add_to_job_queue(job)
-                    if answer:
-                        batchjobs.jobs.append(job)
+                    if job.main_subject_item.items:
+                        job.main_subject_item.items.print_items_list(args=args)
+                    job.main_subject_item.print_search_strings()
+                    answer = ask_add_to_job_queue(job)
+                    if answer:
+                        batchjobs.jobs.append(job)
                 else:
                     batchjobs.jobs.append(job)
             logger.debug(f"joblist now has {len(batchjobs.jobs)} jobs")
             print_job_statistics(batchjobs=batchjobs)
-            if len(subjects_not_picked_yet) > 0:
+            if len(qid_subjects_not_picked_yet) > 0:
                 if (
                     args.no_ask_match_more_limit is None
                     or args.no_ask_match_more_limit
                     < sum(
-                        len(job.items.list)
+                        len(job.main_subject_item.items.sparql_items)
                         for job in batchjobs.jobs
-                        if job.items.list is not None
+                        if job.main_subject_item.items
+                        and job.main_subject_item.items.sparql_items
                     )
                 ):
                     answer_was_yes = ask_yes_no_question("Match one more?")
                     if not answer_was_yes:
                         break
             else:
                 console.print("No more subjects in the list.")
                 break
         else:
             console.print("No more subjects in the list. Exiting.")
             break
-    if args.no_ask_match_more_limit is not None:
+    if args.no_ask_match_more_limit:
         batchjobs_limit = BatchJobs(jobs=[])
         for job in batchjobs.jobs:
-            job.items.print_items_list(args=args)
-            job.suggestion.print_search_strings()
-            if (
-                config.automatically_approve_jobs_with_less_than_fifty_matches
-                and job.items.number_of_items < 50
-            ):
-                console.print(
-                    f"This job with {job.items.number_of_items} matching items was automatically approved",
-                    style="green",
-                )
-                batchjobs_limit.jobs.append(job)
-            else:
-                answer = ask_add_to_job_queue(job)
-                if answer:
+            if job.main_subject_item.items:
+                job.main_subject_item.items.print_items_list(args=args)
+                job.main_subject_item.print_search_strings()
+                if (
+                    config.automatically_approve_jobs_with_less_than_fifty_matches
+                    and job.main_subject_item.items.number_of_sparql_items < 50
+                ):
+                    console.print(
+                        f"This job with {job.main_subject_item.items.number_of_sparql_items} matching items was automatically approved",
+                        style="green",
+                    )
                     batchjobs_limit.jobs.append(job)
+                else:
+                    answer = ask_add_to_job_queue(job)
+                    if answer:
+                        batchjobs_limit.jobs.append(job)
         return batchjobs_limit
     return batchjobs
diff --git a/src/helpers/menus.py b/src/helpers/menus.py
index 49c2290..97f7c7e 100644
--- a/src/helpers/menus.py
+++ b/src/helpers/menus.py
@@ -3,13 +3,15 @@
 
 from consolemenu import SelectionMenu  # type: ignore
 
-from src.models.suggestion import Suggestion
 from src.models.wikimedia.wikidata.item import Item
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
 from src.tasks import Task
 
+logger = logging.getLogger(__name__)
 
-def select_suggestion(suggestions: List[Suggestion] = None, item: Item = None):
-    if item is None or suggestions is None:
+
+def select_suggestion(suggestions: List[MainSubjectItem], item: Item):
+    if not item or not item.qid or not suggestions:
         raise ValueError("Did not get what we need")
     logger = logging.getLogger(__name__)
     menu = SelectionMenu(
@@ -31,7 +33,7 @@ def select_suggestion(suggestions: List[MainSubjectItem], item: Item):
 
 
 def select_task() -> Task:
-    logger = logging.getLogger(__name__)
+    # TODO use questionary here?
 from src.tasks import tasks
 
     labels = [task.label for task in tasks]
@@ -55,8 +57,8 @@ def select_task() -> Task:
 #     menu.join()
 #     selected_language_index = menu.selected_option
 #     mapping = {}
-#     for index, item in enumerate(WikimediaLanguageCode):
-#         mapping[index] = item
+#     for index, main_subject_item in enumerate(WikimediaLanguageCode):
+#         mapping[index] = main_subject_item
 #     selected_language = mapping[selected_language_index]
 #     logger.debug(f"selected:{selected_language_index}="
 #                  f"{selected_language}")
@@ -69,8 +71,8 @@ def select_task() -> Task:
 #     menu.join()
 #     selected_lexical_category_index = menu.selected_option
 #     category_mapping = {}
-#     for index, item in enumerate(WikidataLexicalCategory):
-#         category_mapping[index] = item
+#     for index, main_subject_item in enumerate(WikidataLexicalCategory):
+#         category_mapping[index] = main_subject_item
 #     selected_lexical_category = category_mapping[selected_lexical_category_index]
 #     logger.debug(f"selected:{selected_lexical_category_index}="
 #                  f"{selected_lexical_category}")
diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py
index 738d87f..5661075 100644
--- a/src/helpers/pickle.py
+++ b/src/helpers/pickle.py
@@ -36,7 +36,7 @@ def check_if_pickle_exists(path):
 
 
 def parse_job_pickle(silent: bool = False) -> Optional[BatchJobs]:
-    """Reads the pickle into a list of batch jobs"""
+    """Read the job pickle and return the batch jobs"""
     if check_if_pickle_exists(config.job_pickle_file_path):
         jobs: List[BatchJob] = []
         for job in read_from_pickle(config.job_pickle_file_path):
@@ -58,16 +58,16 @@ def remove_job_pickle(silent: bool = False, hash: str = None):
     if os.path.exists(config.job_pickle_file_path):
         os.remove(config.job_pickle_file_path)
         if not silent:
             console.print("The job list file was removed")
     if os.path.exists(config.job_pickle_file_path):
         hash_now = get_hash_of_job_pickle()
         if hash == hash_now:
             os.remove(config.job_pickle_file_path)
             if not silent:
                 console.print("The job list file was removed")
         else:
             console.print(
-                "Job list file not deleted because the contents "
+                "Job list file not deleted because the content "
                 "has changed since this batch of jobs was started."
             )
     else:
diff --git a/src/helpers/questions.py b/src/helpers/questions.py
new file mode 100644
index 0000000..8cf67f2
--- /dev/null
+++ b/src/helpers/questions.py
@@ -0,0 +1,48 @@
+# from __future__ import annotations
+
+from src.helpers.console import console
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from src.models.batch_jobs import BatchJob
+
+
+def ask_add_to_job_queue(job: "BatchJob" = None):
+    if not job:
+        raise ValueError("job was None")
+    if not job.main_subject_item:
+        raise ValueError("job.main_subject_item was None")
+    if not job.main_subject_item.label:
+        raise ValueError("job.main_subject_item.label was None")
+    if not job.main_subject_item.description:
+        job.main_subject_item.description = ""
+    if not job.main_subject_item.items:
+        raise ValueError("items was None")
+    if not job.main_subject_item.items.sparql_items:
+        raise ValueError("sparql_items was None")
+    return ask_yes_no_question(
+        f"Do you want to add this job for "
+        f"[magenta]{job.main_subject_item.label}: "
+        f"{job.main_subject_item.description}[/magenta] with "
+        f"{len(job.main_subject_item.items.sparql_items)} items to the queue? (see {job.main_subject_item.url})"
+    )
+
+
+def ask_discard_existing_job_pickle():
+    return ask_yes_no_question(
+        "A prepared list of jobs already exists, do you want to delete it?"
+    )
+
+
+def ask_yes_no_question(message: str):
+    # https://www.quora.com/
+    # I%E2%80%99m-new-to-Python-how-can-I-write-a-yes-no-question
+    # this will loop forever
+    while True:
+        answer = console.input(message + " [Y/Enter/n]: ")
+        if len(answer) == 0 or answer[0].lower() in ("y", "n"):
+            if len(answer) == 0:
+                return True
+            else:
+                # the == operator returns a boolean
+                return answer[0].lower() == "y"
diff --git a/src/models/batch_job.py b/src/models/batch_job.py
index 0822a88..40142d1 100644
--- a/src/models/batch_job.py
+++ b/src/models/batch_job.py
@@ -1,12 +1,10 @@
 from pydantic import BaseModel
 
-from src.models.items import Items
-from src.models.suggestion import Suggestion
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
 
 
 class BatchJob(BaseModel):
     """Models a batch job intended to be run non-interactively"""
 
-    suggestion: Suggestion
-    items: Items
+    main_subject_item: MainSubjectItem
     number_of_queries: int
diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py
index 90f7dc3..ab89d0e 100644
--- a/src/models/batch_jobs.py
+++ b/src/models/batch_jobs.py
@@ -17,12 +17,12 @@ def job_count(self):
 
     def print_running_jobs(self):
         if not isinstance(self.jobs, list):
             raise ValueError("jobs is not a list")
         from src.helpers.console import console
 
         console.print(
             f"Running {len(self.jobs)} job(s) with a total of "
-            f"{sum(len(job.items.list) for job in self.jobs if job.items.list is not None)} items "
+            f"{sum(len(job.main_subject_item.items.sparql_items) for job in self.jobs if job.main_subject_item.items and job.main_subject_item.items.sparql_items)} items "
             f"non-interactively now. 
You can take a " f"coffee break and lean back :)" ) @@ -30,9 +30,9 @@ def print_running_jobs(self): def run_jobs(self): from src.helpers.console import ( console, - print_finished, print_keep_an_eye_on_wdqs_lag, ) + from src import print_finished if self.jobs is None or len(self.jobs) == 0: raise ValueError("did not get what we need") @@ -42,9 +42,7 @@ def run_jobs(self): self.print_running_jobs() start_time = datetime.now() for job in self.jobs: - job.suggestion.add_to_items( - items=job.items, jobs=self.jobs, job_count=self.job_count - ) + job.main_subject_item.add_to_items(jobs=self.jobs, job_count=self.job_count) print_finished() end_time = datetime.now() console.print(f"Total runtime: {end_time - start_time}") diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py index b59e27a..a1bb75e 100644 --- a/src/models/items/__init__.py +++ b/src/models/items/__init__.py @@ -1,32 +1,43 @@ from __future__ import annotations import argparse +import logging import random -from typing import TYPE_CHECKING, List, Optional +from typing import List, Any from pydantic import BaseModel -from src.models.task import Task from src.models.wikimedia.wikidata.sparql_item import SparqlItem -if TYPE_CHECKING: - from src.models.suggestion import Suggestion +# if TYPE_CHECKING: +# from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem + +logger = logging.getLogger(__name__) class Items(BaseModel): - list: Optional[List[SparqlItem]] + # pydantic forwardref error + main_subject_item: Any # type MainSubjectItem + sparql_items: List[SparqlItem] = [] @property - def number_of_items(self): - return len(self.list) + def number_of_sparql_items(self): + return len(self.sparql_items) - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): + def fetch_based_on_label(self): pass - def random_shuffle_list(self): - random.shuffle(self.list) + def random_shuffle_items(self): + random.shuffle(self.sparql_items) def print_items_list(self, args: argparse.Namespace): from src import print_found_items_table print_found_items_table(args=args, items=self) + + def remove_duplicates(self): + if self.sparql_items is None: + raise ValueError("items.sparql_items was None") + logger.debug(f"{len(self.sparql_items)} before duplicate removal") + self.sparql_items = list(set(self.sparql_items)) + logger.debug(f"{len(self.sparql_items)} after duplicate removal") diff --git a/src/models/items/academic_journals.py b/src/models/items/academic_journals.py index d155850..526c2b8 100644 --- a/src/models/items/academic_journals.py +++ b/src/models/items/academic_journals.py @@ -1,72 +1,72 @@ -import logging - -from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore - -import config -from src.helpers.cleaning import strip_bad_chars -from src.helpers.console import console -from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem - - -class AcademicJournalItems(Items): - """This supports both published peer reviewed articles and preprints""" - - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - def process_results(results): - # TODO refactor into private method - items = [] - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - items.append(item) - return items - - # logger = logging.getLogger(__name__) - if suggestion is None: - raise 
ValueError("suggestion was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if suggestion.item.id is None: - raise ValueError("suggestion.item.id was None") - if suggestion.args is None: - raise ValueError("suggestion.args was None") - # Fetch all items matching the search strings - self.list = [] - for search_string in suggestion.search_strings: - search_string = strip_bad_chars(search_string) - results = execute_sparql_query( - f""" - #{config.user_agent} - SELECT ?item ?itemLabel - WHERE - {{ - ?item wdt:P31 wd:Q737498. - minus {{?item wdt:P921 wd:{suggestion.item.id}.}} - ?item rdfs:label ?label. - # We lowercase the label first and search for the - # string in both the beginning, middle and end of the label - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} - }} - """, - ) - logging.info( - f'Got {len(results["results"]["bindings"])} academic journal items from ' - f"WDQS using the search string {search_string}" - ) - self.list.extend(process_results(results)) - console.print(f"Got a total of {len(self.list)} items") +# import logging +# +# from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore +# +# import config +# from src.helpers.cleaning import strip_bad_chars +# from src.helpers.console import console +# from src.models.items import Items +# from src.models.suggestion import Suggestion +# from src.models.task import Task +# from src.models.wikimedia.wikidata.sparql_item import SparqlItem +# +# +# class AcademicJournalItems(Items): +# """This supports both published peer reviewed articles and preprints""" +# +# def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): +# def process_results(results): +# # TODO refactor into private method +# items = [] +# for item_json in results["results"]["bindings"]: +# logging.debug(f"item_json:{item_json}") +# item = SparqlItem(**item_json) +# items.append(item) +# return items +# +# # logger = logging.getLogger(__name__) +# if suggestion is None: +# raise ValueError("suggestion was None") +# if task is None: +# raise ValueError("task was None") +# if task.language_code is None: +# raise ValueError("task.language_code was None") +# if suggestion.search_strings is None: +# raise ValueError("suggestion.search_strings was None") +# if suggestion.main_subject_item is None: +# raise ValueError("suggestion.main_subject_item was None") +# if suggestion.main_subject_item.id is None: +# raise ValueError("suggestion.main_subject_item.id was None") +# if suggestion.args is None: +# raise ValueError("suggestion.args was None") +# # Fetch all items matching the search strings +# self.list = [] +# for search_string in suggestion.search_strings: +# search_string = strip_bad_chars(search_string) +# results = execute_sparql_query( +# f""" +# #{config.user_agent} 
+# SELECT ?main_subject_item ?itemLabel +# WHERE +# {{ +# ?main_subject_item wdt:P31 wd:Q737498. +# minus {{?main_subject_item wdt:P921 wd:{suggestion.main_subject_item.id}.}} +# ?main_subject_item rdfs:label ?label. +# # We lowercase the label first and search for the +# # string in both the beginning, middle and end of the label +# FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || +# REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || +# REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) +# MINUS {{?main_subject_item wdt:P921/wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# MINUS {{?main_subject_item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# MINUS {{?main_subject_item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} +# }} +# """, +# ) +# logging.info( +# f'Got {len(results["results"]["bindings"])} academic journal items from ' +# f"WDQS using the search string {search_string}" +# ) +# self.list.extend(process_results(results)) +# console.print(f"Got a total of {len(self.list)} items") diff --git a/src/models/items/riksdagen_documents.py b/src/models/items/riksdagen_documents.py index c85ab07..51585ca 100644 --- a/src/models/items/riksdagen_documents.py +++ b/src/models/items/riksdagen_documents.py @@ -1,72 +1,72 @@ -import logging - -from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore - -import config -from src.helpers.console import console -from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem - - -class RiksdagenDocumentItems(Items): - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - # logger = logging.getLogger(__name__) - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if suggestion.args is None: - raise ValueError("suggestion.args was None") - if suggestion.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." - ) - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - # Fetch all items matching the search strings - self.list = [] - # Include spaces around the n-gram to avoid edits like this one - # https://www.wikidata.org/w/index.php?title=Q40671507&diff=1497186802&oldid=1496945583 - # Lowercase is not needed here as Elastic matches anyway - for search_string in suggestion.search_strings: - results = execute_sparql_query( - f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - hint:Query hint:optimizer "None". - SERVICE wikibase:mwapi {{ - bd:serviceParam wikibase:api "Search"; - wikibase:endpoint "www.wikidata.org"; - mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921={suggestion.item.id} "{search_string}"' . - ?title wikibase:apiOutput mwapi:title. - }} - BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) - ?item rdfs:label ?label. 
- # We lowercase the label first and search for the - # string in both the beginning, middle and end of the label - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - # remove more specific forms of the main subject also - # Thanks to Jan Ainali for this improvement :) - MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "sv". }} - }} - """, - ) - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - self.list.append(item) - logging.info( - f'Got {len(results["results"]["bindings"])} items from ' - f"WDQS using the search string {search_string}" - ) - console.print(f"Got a total of {len(self.list)} items") +# import logging +# +# from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore +# +# import config +# from src.helpers.console import console +# from src.models.items import Items +# from src.models.suggestion import Suggestion +# from src.models.task import Task +# from src.models.wikimedia.wikidata.sparql_item import SparqlItem +# +# +# class RiksdagenDocumentItems(Items): +# def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): +# # logger = logging.getLogger(__name__) +# if suggestion is None: +# raise ValueError("suggestion was None") +# if suggestion.main_subject_item is None: +# raise ValueError("suggestion.main_subject_item was None") +# if suggestion.args is None: +# raise ValueError("suggestion.args was None") +# if suggestion.args.limit_to_items_without_p921: +# raise Exception( +# "Limiting to items without P921 is not " "supported yet for this task." +# ) +# if suggestion.search_strings is None: +# raise ValueError("suggestion.search_strings was None") +# if task is None: +# raise ValueError("task was None") +# if task.language_code is None: +# raise ValueError("task.language_code was None") +# # Fetch all items matching the search strings +# self.list = [] +# # Include spaces around the n-gram to avoid edits like this one +# # https://www.wikidata.org/w/index.php?title=Q40671507&diff=1497186802&oldid=1496945583 +# # Lowercase is not needed here as Elastic matches anyway +# for search_string in suggestion.search_strings: +# results = execute_sparql_query( +# f""" +# #{config.user_agent} +# SELECT DISTINCT ?main_subject_item ?itemLabel +# WHERE {{ +# hint:Query hint:optimizer "None". +# SERVICE wikibase:mwapi {{ +# bd:serviceParam wikibase:api "Search"; +# wikibase:endpoint "www.wikidata.org"; +# mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921={suggestion.main_subject_item.id} "{search_string}"' . +# ?title wikibase:apiOutput mwapi:title. +# }} +# BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?main_subject_item) +# ?main_subject_item rdfs:label ?label. 
+# # We lowercase the label first and search for the +# # string in both the beginning, middle and end of the label +# FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || +# REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || +# REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) +# # remove more specific forms of the main subject also +# # Thanks to Jan Ainali for this improvement :) +# MINUS {{?main_subject_item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# SERVICE wikibase:label {{ bd:serviceParam wikibase:language "sv". }} +# }} +# """, +# ) +# for item_json in results["results"]["bindings"]: +# logging.debug(f"item_json:{item_json}") +# item = SparqlItem(**item_json) +# self.list.append(item) +# logging.info( +# f'Got {len(results["results"]["bindings"])} items from ' +# f"WDQS using the search string {search_string}" +# ) +# console.print(f"Got a total of {len(self.list)} items") diff --git a/src/models/items/scholarly_articles.py b/src/models/items/scholarly_articles.py index 73995c9..3f2c0eb 100644 --- a/src/models/items/scholarly_articles.py +++ b/src/models/items/scholarly_articles.py @@ -1,14 +1,12 @@ import logging +from typing import Dict from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore -import config -from src.helpers.cleaning import strip_bad_chars from src.helpers.console import console from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem +from src.models.wikimedia.wikidata.query.preprint_article import PreprintArticleQuery +from src.models.wikimedia.wikidata.query.published_article import PublishedArticleQuery logger = logging.getLogger(__name__) @@ -16,148 +14,37 @@ class ScholarlyArticleItems(Items): """This supports both published peer reviewed articles and preprints""" - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - def build_query( - suggestion: Suggestion = None, - search_string: str = None, - task: Task = None, - cirrussearch_parameters: str = None, - ): - # TODO refactor - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if search_string is None: - raise ValueError("search_string was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - if cirrussearch_parameters is None: - raise ValueError("cirrussearch_parameters was None") - # This query uses https://www.w3.org/TR/sparql11-property-paths/ to - # find subjects that are subclass of one another up to 3 hops away - # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI - # which has a hardcoded limit of 10,000 items so you will never get more matches than that - # This query use regex to match beginning, middle and end of the label of matched items - # The replacing lines should match the similar python replacements in cleaning.py - # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in - # SPARQL where it becomes "\\" and thus match a single backslash - return f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - hint:Query hint:optimizer "None". 
- BIND(STR('{cirrussearch_parameters} \"{search_string}\"') as ?search_string) - SERVICE wikibase:mwapi {{ - bd:serviceParam wikibase:api "Search"; - wikibase:endpoint "www.wikidata.org"; - mwapi:srsearch ?search_string. - ?title wikibase:apiOutput mwapi:title. - }} - BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) - ?item rdfs:label ?label. - BIND(REPLACE(LCASE(?label), ",", "") as ?label1) - BIND(REPLACE(?label1, ":", "") as ?label2) - BIND(REPLACE(?label2, ";", "") as ?label3) - BIND(REPLACE(?label3, "\\\\(", "") as ?label4) - BIND(REPLACE(?label4, "\\\\)", "") as ?label5) - BIND(REPLACE(?label5, "\\\\[", "") as ?label6) - BIND(REPLACE(?label6, "\\\\]", "") as ?label7) - BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8) - BIND(?label8 as ?cleaned_label) - FILTER(CONTAINS(?cleaned_label, ' {search_string.lower()} '@{task.language_code.value}) || - REGEX(?cleaned_label, '.* {search_string.lower()}$'@{task.language_code.value}) || - REGEX(?cleaned_label, '^{search_string.lower()} .*'@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} - }} - """ + cirrussearch_parameters: str = "" + query: str = "" + results: Dict = {} - def process_results(results): - # TODO refactor - items = [] - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - item.validate_qid_and_copy_label() - if not item.is_in_blocklist(): - items.append(item) - else: - logger.info(f"{item.label} found in blocklist, skipping") - return items + def fetch_based_on_label(self): + self.execute_queries() + self.print_total_items() - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if suggestion.args is None: - raise ValueError("suggestion.args was None") - if suggestion.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." 
- ) - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - if suggestion.args.limit_to_items_without_p921: - console.print( - "Limiting to scholarly articles without P921 main subject only" - ) - cirrussearch_parameters = ( - f"haswbstatement:P31=Q13442814 -haswbstatement:P921" - ) - else: - cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={suggestion.item.id}" + def execute_queries(self): # Fetch all items matching the search strings - self.list = [] - for search_string in suggestion.search_strings: - search_string = strip_bad_chars(search_string) - results = execute_sparql_query( - build_query( - cirrussearch_parameters=cirrussearch_parameters, - suggestion=suggestion, - search_string=search_string, - task=task, - ) + for search_string in self.main_subject_item.search_strings: + published_article_query = PublishedArticleQuery( + search_string=search_string, + main_subject_item=self.main_subject_item, + cirrussearch_parameters=self.cirrussearch_parameters, ) - logging.info( - f'Got {len(results["results"]["bindings"])} scholarly items from ' - f"WDQS using the search string {search_string}" - ) - self.list.extend(process_results(results)) + published_article_query.get_results() + # https://pythonexamples.org/python-append-list-to-another-list/ + self.sparql_items.extend(published_article_query.items) + published_article_query.print_number_of_results() # preprints # We don't use CirrusSearch in this query because we can do it more easily in # SPARQL on a small subgraph like this - # find all items that are ?item wdt:P31/wd:P279* wd:Q1266946 + # find all items that are ?main_subject_item wdt:P31/wd:P279* wd:Q1266946 # minus the Qid we want to add - results_preprint = execute_sparql_query( - f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - ?item wdt:P31/wd:P279* wd:Q580922. # preprint - MINUS {{ - ?item wdt:P921 wd:{suggestion.item.id}; - }} - ?item rdfs:label ?label. - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} - }} - """, - ) - logging.info( - f'Got {len(results["results"]["bindings"])} preprint items from ' - f"WDQS using the search string {search_string}" + preprint_query = PreprintArticleQuery( + search_string=search_string, main_subject_item=self.main_subject_item ) - self.list.extend(process_results(results_preprint)) - console.print(f"Got a total of {len(self.list)} items") + preprint_query.get_results() + preprint_query.print_number_of_results() + self.sparql_items.extend(preprint_query.items) + + def print_total_items(self): + console.print(f"Got a total of {len(self.sparql_items)} items") diff --git a/src/models/items/thesis.py b/src/models/items/thesis.py index 6d256fa..2aa9d93 100644 --- a/src/models/items/thesis.py +++ b/src/models/items/thesis.py @@ -1,69 +1,69 @@ -import logging - -from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore - -from src.helpers.console import console -from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task - -# There were ~16.000 thesis' in WD when this was written -from src.models.wikimedia.wikidata.sparql_item import SparqlItem - - -class ThesisItems(Items): - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - # logger = logging.getLogger(__name__) - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if suggestion.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." - ) - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - # Fetch all items matching the search strings - self.list = [] - for search_string in suggestion.search_strings: - # We don't use CirrusSearch in this query because we can do it more easily in - # SPARQL on a small subgraph like this - # find all items that are ?item wdt:P31/wd:P279* wd:Q1266946 - # minus the Qid we want to add - results = execute_sparql_query( - f""" - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - {{ - ?item wdt:P31/wd:P279* wd:Q1266946. # thesis - }} UNION - {{ - ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation - }} UNION - {{ - ?item wdt:P31/wd:P279* wd:Q3099732. # technical report - }} - MINUS {{ - ?item wdt:P921 wd:{suggestion.item.id}; - }} - ?item rdfs:label ?label. - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} - }} - """, - ) - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - self.list.append(item) - logging.info( - f'Got {len(results["results"]["bindings"])} items from ' - f"WDQS using the search string {search_string}" - ) - console.print(f"Got a total of {len(self.list)} items") +# import logging +# +# from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore +# +# from src.helpers.console import console +# from src.models.items import Items +# from src.models.suggestion import Suggestion +# from src.models.task import Task +# +# # There were ~16.000 thesis' in WD when this was written +# from src.models.wikimedia.wikidata.sparql_item import SparqlItem +# +# +# class ThesisItems(Items): +# def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): +# # logger = logging.getLogger(__name__) +# if suggestion is None: +# raise ValueError("suggestion was None") +# if suggestion.search_strings is None: +# raise ValueError("suggestion.search_strings was None") +# if suggestion.args.limit_to_items_without_p921: +# raise Exception( +# "Limiting to items without P921 is not " "supported yet for this task." +# ) +# if task is None: +# raise ValueError("task was None") +# if task.language_code is None: +# raise ValueError("task.language_code was None") +# # Fetch all items matching the search strings +# self.list = [] +# for search_string in suggestion.search_strings: +# # We don't use CirrusSearch in this query because we can do it more easily in +# # SPARQL on a small subgraph like this +# # find all items that are ?main_subject_item wdt:P31/wd:P279* wd:Q1266946 +# # minus the Qid we want to add +# results = execute_sparql_query( +# f""" +# SELECT DISTINCT ?item ?itemLabel +# WHERE {{ +# {{ +# ?item wdt:P31/wd:P279* wd:Q1266946. # thesis +# }} UNION +# {{ +# ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation +# }} UNION +# {{ +# ?item wdt:P31/wd:P279* wd:Q3099732. # technical report +# }} +# MINUS {{ +# ?item wdt:P921 wd:{suggestion.main_subject_item.id}; +# }} +# ?item rdfs:label ?label. +# FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || +# REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || +# REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) +# MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} +# }} +# """, +# ) +# for item_json in results["results"]["bindings"]: +# logging.debug(f"item_json:{item_json}") +# item = SparqlItem(**item_json) +# self.list.append(item) +# logging.info( +# f'Got {len(results["results"]["bindings"])} items from ' +# f"WDQS using the search string {search_string}" +# ) +# console.print(f"Got a total of {len(self.list)} items") diff --git a/src/models/suggestion.py b/src/models/suggestion.py index 07510a8..cd1a393 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -1,9 +1,7 @@ from __future__ import annotations -import argparse import logging -from typing import TYPE_CHECKING, List, Optional, Set -from urllib.parse import quote +from typing import TYPE_CHECKING from pydantic import BaseModel from wikibaseintegrator import WikibaseIntegrator # type: ignore @@ -11,181 +9,11 @@ from wikibaseintegrator.models import Claim # type: ignore from wikibaseintegrator.wbi_helpers import search_entities # type: ignore -import config -import config.items -from src.helpers.calculations import calculate_random_editgroups_hash -from src.helpers.cleaning import clean_rich_formatting -from src.models.items import Items -from src.models.task import Task -from src.models.wikimedia.wikidata.enums import Property, Qid -from src.models.wikimedia.wikidata.item import Item - if TYPE_CHECKING: - from src.models.batch_job import BatchJob + pass logger = logging.getLogger(__name__) class Suggestion(BaseModel): - item: Item - task: Task - args: argparse.Namespace - search_strings: Optional[Set[str]] = None - - class Config: - arbitrary_types_allowed = True - - def __alias_appears_in_label_of_a_qid__(self, alias: str) -> bool: - if alias is None: - raise ValueError("alias was none") - results = search_entities(alias, dict_result=True) - for result in results: - if result["label"] == alias: - qid = result["id"] - logger.info(f"Found {alias} as label in {qid}") - # verify that it is not a scientific article - return self.__is_not_scientific_article__(qid=qid) - return False - - @staticmethod - def __is_not_scientific_article__(qid: str): - """Looks up the QID in Wikidata to check whether it is a scholarly article or not. - We negate the result""" - if qid is None: - raise ValueError("qid was None") - wbi = WikibaseIntegrator() - item = wbi.item.get(qid) - claims: List[Claim] = item.claims - for claim in claims: - if claim.mainsnak.property_number == Property.INSTANCE_OF.value: - qid = claim.mainsnak.datavalue["value"]["id"] - logger.info(f"Found P31 with value {qid}") - from src.helpers.console import console - - # console.print(claim.mainsnak) - if qid == Qid.SCHOLARLY_ARTICLE.value: - logger.debug("__is_not_scientific_article__:returning false now") - return False - else: - return True - - def __str__(self): - """Return label and description, the latter cut to 50 chars""" - if self.item is not None: - string = ( - f"label: [bold]{clean_rich_formatting(self.item.label)}[/bold]\n" - f"aliases: {', '.join(self.item.aliases)}\n" - f"description: {self.item.description[:70]}\n" - f"{self.item.url()}\n" - ) - for url in self.search_urls(): - string = string + f"{url}\n" - return string - - def add_to_items( - self, items: Items = None, jobs: List[BatchJob] = None, job_count: int = None - ): - """Add a suggested Qid as main subject on all items that - have a label that matches one of the search strings for this Qid - We calculate a new edit group hash each time this function is - called so similar edits are grouped and easily be undone. 
- - This function is non-interactive""" - if items is None: - raise ValueError("Items was None") - if items.list is None: - raise ValueError("items.list was None") - if jobs is None: - raise ValueError("jobs was None") - if job_count is None: - raise ValueError("job count was None") - editgroups_hash: str = calculate_random_editgroups_hash() - count = 0 - for target_item in items.list: - count += 1 - from src import console - - with console.status( - f"Uploading main subject " - f"[green]{clean_rich_formatting(self.item.label)}[/green] " - f"to {clean_rich_formatting(target_item.label)}" - ): - main_subject_property = "P921" - reference = ItemType( - "Q69652283", # inferred from title - prop_nr="P887", # based on heuristic - ) - statement = ItemType( - self.item.id, prop_nr=main_subject_property, references=[reference] - ) - target_item.upload_one_statement_to_wikidata( - statement=statement, - summary=f"[[Property:{main_subject_property}]]: [[{self.item.id}]]", - editgroups_hash=editgroups_hash, - ) - console.print( - f"(job {job_count}/{len(jobs)})(item {count}/{len(items.list)}) " - f"Added '{clean_rich_formatting(self.item.label)}' to " - f"{clean_rich_formatting(target_item.label)}: {target_item.url()}" - ) - # input("Press enter to continue") - - def extract_search_strings(self): - def clean_special_symbols(string: str): - return string.replace("®", "").replace("™", "").replace('"', "") - - from src.helpers.console import console - - logger = logging.getLogger(__name__) - if self.args is None: - raise ValueError("args was None") - else: - logger.debug(f"args:{self.args}") - if self.args.no_aliases is True: - console.print("Alias matching is turned off") - no_aliases = True - elif self.item.id in config.items.no_alias_for_scholarly_items: - logger.info( - f"Alias matching is turned off for this item: {self.item.label}" - ) - no_aliases = True - else: - no_aliases = False - if self.item.label is None: - raise ValueError("self.item.label was None") - self.search_strings: Set[str] = set() - self.search_strings.add(clean_special_symbols(self.item.label)) - if self.item.aliases is not None and no_aliases is False: - for alias in self.item.aliases: - # logger.debug(f"extracting alias:{alias}") - if len(alias) < 5 and alias not in config.items.list_of_allowed_aliases: - console.print( - f"Skipping short alias '{alias}' to avoid false positives", - style="#FF8000", - ) - elif self.__alias_appears_in_label_of_a_qid__(alias=alias): - console.print( - f"Skipped '{alias}' because it appears " - f"in a label of at least one Qid that is not a scholarly article", - style="#FF8000", - ) - elif alias in config.items.list_of_allowed_aliases: - console.print(f"Found {alias} in the allow list") - self.search_strings.add(clean_special_symbols(alias)) - else: - self.search_strings.add(clean_special_symbols(alias)) - - def print_search_strings(self): - # logger.debug(f"search_strings:{self.search_strings}") - from src.helpers.console import print_search_strings_table - - print_search_strings_table(args=self.args, search_strings=self.search_strings) - - def search_urls(self) -> List[str]: - if self.search_strings is None: - raise ValueError("self.search_strings was None") - urls = [] - for search_string in self.search_strings: - search_term = quote(f'"{search_string}"') - urls.append(f"https://www.wikidata.org/w/index.php?search={search_term}") - return urls + pass diff --git a/src/models/wikimedia/wikidata/entity.py b/src/models/wikimedia/wikidata/entity.py index ba3c82c..3449c6d 100644 --- 
a/src/models/wikimedia/wikidata/entity.py
+++ b/src/models/wikimedia/wikidata/entity.py
@@ -37,7 +37,7 @@ def upload_one_statement_to_wikidata(
         This mandates an editgroups hash to be supplied"""
         logger = logging.getLogger(__name__)
         if self.id is None:
-            raise ValueError("no id on item")
+            raise ValueError("no id on this entity")
         if statement is None:
             raise ValueError("Statement was None")
         if summary is None:
@@ -58,5 +58,6 @@ def upload_one_statement_to_wikidata(
                 logger.error(f"Got error from the API: {e}")
         # logger.debug(f"result from WBI:{result}")
 
+    @property
     def url(self):
         return f"http://www.wikidata.org/entity/{self.id}"
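With url turned into a @property above, call sites drop the parentheses. A minimal
sketch of the difference (illustrative only, using a QID that appears in the tests
later in this series):

    item = Item(id="Q407541")
    item.url()  # before this patch: url was a method
    item.url    # after: "http://www.wikidata.org/entity/Q407541"
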
diff --git a/src/models/wikimedia/wikidata/entiyt_id.py b/src/models/wikimedia/wikidata/entiyt_id.py
index 9a0dfd4..29d0992 100644
--- a/src/models/wikimedia/wikidata/entiyt_id.py
+++ b/src/models/wikimedia/wikidata/entiyt_id.py
@@ -13,7 +13,7 @@ class EntityId:
     rest: str
 
     def __init__(self, entity_id: str):
-        if entity_id is not None:
+        if entity_id:
             # Remove prefix if found
             if config.wd_prefix in entity_id:
                 logger.debug("Removing prefix")
diff --git a/src/models/wikimedia/wikidata/foreign_id.py b/src/models/wikimedia/wikidata/foreign_id.py
index de1c9bb..f370c95 100644
--- a/src/models/wikimedia/wikidata/foreign_id.py
+++ b/src/models/wikimedia/wikidata/foreign_id.py
@@ -6,7 +6,7 @@ class ForeignID:
     id: Optional[str]
     property: Optional[str]  # This is the property with type ExternalId
-    source_item_id: Optional[str]  # This is the Q-item for the source
+    source_item_id: Optional[str]  # This is the QID of the source item
 
     def __init__(
         self,
diff --git a/src/models/wikimedia/wikidata/item.py b/src/models/wikimedia/wikidata/item.py
deleted file mode 100644
index 3a68362..0000000
--- a/src/models/wikimedia/wikidata/item.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from typing import List, Optional
-
-from wikibaseintegrator import WikibaseIntegrator  # type: ignore
-from wikibaseintegrator import wbi_config  # type: ignore
-from wikibaseintegrator.models import Alias  # type: ignore
-
-import config
-from src.models.task import Task
-from src.models.wikimedia.wikidata.entity import Entity
-
-wbi_config.config["USER_AGENT"] = config.user_agent
-
-
-class Item(Entity):
-    """This represents an item in Wikidata
-    We always work on one language at a time,
-    so we don't bother with languages here and keep to simple strings"""
-
-    description: Optional[str] = None
-    aliases: Optional[List[str]] = None
-
-    def __str__(self):
-        return f"{self.label}, see {self.url()}"
-
-    def fetch_label_and_description_and_aliases(self, task: Task = None):
-        """Fetch label and aliases in the task language from the Wikidata API"""
-        if task is None:
-            raise ValueError("task was None")
-        if not isinstance(task, Task):
-            raise ValueError("task was not a Task object")
-        if task.language_code is None:
-            raise ValueError("task.language_code was None")
-        from src.helpers.console import console
-
-        with console.status(
-            f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..."
-        ):
-            wbi = WikibaseIntegrator()
-            item = wbi.item.get(self.id)
-            label = item.labels.get(task.language_code.value)
-            if label is not None:
-                self.label = str(label)
-            description = item.descriptions.get(task.language_code.value)
-            if description is not None:
-                self.description = str(description)
-            aliases: List[Alias] = item.aliases.get(task.language_code.value)
-            # logging.debug(f"aliases from wbi:{item.aliases.get('en')}")
-            if aliases is not None:
-                self.aliases = []
-                for alias in aliases:
-                    self.aliases.append(str(alias))
-                    # logging.debug(f"appended:{alias.value}")
-        # logging.debug(f"aliases:{self.aliases}")
diff --git a/src/models/wikimedia/wikidata/item/__init__.py b/src/models/wikimedia/wikidata/item/__init__.py
new file mode 100644
index 0000000..da3ecbc
--- /dev/null
+++ b/src/models/wikimedia/wikidata/item/__init__.py
@@ -0,0 +1,70 @@
+import argparse
+from typing import List, Optional
+
+from wikibaseintegrator import WikibaseIntegrator  # type: ignore
+from wikibaseintegrator import wbi_config  # type: ignore
+from wikibaseintegrator.models import Alias  # type: ignore
+
+import config
+from src.models.task import Task
+from src.models.wikimedia.wikidata.entity import Entity
+
+wbi_config.config["USER_AGENT"] = config.user_agent
+
+
+class Item(Entity):
+    """This represents an item in Wikidata.
+    We always work on one language at a time,
+    so we don't bother with languages here and keep to simple strings"""
+
+    aliases: Optional[List[str]] = None
+    args: Optional[argparse.Namespace] = None
+    confirmation: bool = False
+    description: Optional[str] = None
+    qid: str = ""
+    task: Optional[Task] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __str__(self):
+        return f"{self.label}, see {self.url}"
+
+    def __fetch_label_and_description_and_aliases__(self, task: Task = None):
+        """Fetch label and aliases in the task language from the Wikidata API"""
+        if not self.task:
+            raise ValueError("self.task was None")
+        if not isinstance(self.task, Task):
+            raise ValueError("self.task was not a Task object")
+        if self.task.language_code is None:
+            raise ValueError("self.task.language_code was None")
+        from src.helpers.console import console
+
+        with console.status(
+            f"Fetching {self.task.language_code.name.title()} label and aliases from the Wikidata API..."
+ ): + wbi = WikibaseIntegrator() + if not self.id: + id = self.qid + item = wbi.item.get(id) + label = item.labels.get(self.task.language_code.value) + if label: + self.label = str(label) + description = item.descriptions.get(self.task.language_code.value) + if description: + self.description = str(description) + aliases: List[Alias] = item.aliases.get(self.task.language_code.value) + # logging.debug(f"aliases from wbi:{main_subject_item.aliases.get('en')}") + if aliases: + self.aliases = [] + for alias in aliases: + self.aliases.append(str(alias)) + # logging.debug(f"appended:{alias.value}") + # logging.debug(f"aliases:{self.aliases}") + + def __strip_qid_prefix__(self): + if "https://www.wikidata.org/wiki/" in self.qid: + self.qid = self.qid[30:] + if "http://www.wikidata.org/entity/" in self.qid: + self.qid = self.qid[31:] + # logger.debug(f"qid:{qid}") diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py new file mode 100644 index 0000000..c031f98 --- /dev/null +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -0,0 +1,266 @@ +import logging +from typing import Set, List, Optional, TYPE_CHECKING +from urllib.parse import quote + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator.datatypes import Item as ItemType # type: ignore +from wikibaseintegrator.models import Claim # type: ignore +from wikibaseintegrator.wbi_helpers import search_entities # type: ignore + +import config +import config.items +from src.helpers.calculations import calculate_random_editgroups_hash +from src.helpers.cleaning import clean_rich_formatting +from src.helpers.console import console +from src.helpers.questions import ask_yes_no_question +from src.models.items import Items +from src.models.items.scholarly_articles import ScholarlyArticleItems +from src.models.wikimedia.wikidata.enums import Property, Qid +from src.models.wikimedia.wikidata.item import Item +from src.tasks import TaskIds + +if TYPE_CHECKING: + from src.models.batch_job import BatchJob + +logger = logging.getLogger(__name__) + + +class MainSubjectItem(Item): + search_strings: Set[str] = set() + items: Optional[Items] = None + number_of_queries: int = 0 + + class Config: + arbitrary_types_allowed = True + + def __alias_appears_in_label_of_a_qid__(self, alias: str) -> bool: + if not alias: + raise ValueError("alias was none") + results = search_entities(alias, dict_result=True) + for result in results: + if result["label"] == alias: + qid = result["id"] + logger.info(f"Found {alias} as label in {qid}") + # verify that it is not a scientific article + return self.__is_not_scientific_article__(qid=qid) + return False + + @staticmethod + def __is_not_scientific_article__(qid: str): + """Looks up the QID in Wikidata to check whether it is a scholarly article or not. 
+        We negate the result"""
+        # TODO avoid negating here
+        if not qid:
+            raise ValueError("qid was None")
+        wbi = WikibaseIntegrator()
+        item = wbi.item.get(qid)
+        claims: List[Claim] = item.claims
+        for claim in claims:
+            if claim.mainsnak.property_number == Property.INSTANCE_OF.value:
+                qid = claim.mainsnak.datavalue["value"]["id"]
+                logger.info(f"Found P31 with value {qid}")
+                from src.helpers.console import console
+
+                # console.print(claim.mainsnak)
+                if qid == Qid.SCHOLARLY_ARTICLE.value:
+                    logger.debug("__is_not_scientific_article__:returning false now")
+                    return False
+                else:
+                    return True
+
+    def __str__(self):
+        """Return label and description, the latter cut to 70 chars"""
+        string = (
+            f"label: [bold]{clean_rich_formatting(self.label)}[/bold]\n"
+            f"aliases: {', '.join(self.aliases)}\n"
+            f"description: {self.description[:70]}\n"
+            f"{self.url}\n"
+        )
+        for url in self.search_urls():
+            string = string + f"{url}\n"
+        return string
+
+    def add_to_items(
+        self, jobs: List["BatchJob"] = None, job_count: int = None
+    ):
+        """Add a suggested Qid as main subject on all items that
+        have a label that matches one of the search strings for this Qid
+        We calculate a new edit group hash each time this function is
+        called so similar edits are grouped and can easily be undone.
+
+        This function is non-interactive"""
+        if not self.items:
+            raise ValueError("Items was None")
+        if not self.items.sparql_items:
+            raise ValueError("items.sparql_items was None")
+        if not jobs:
+            raise ValueError("jobs was None")
+        if not job_count:
+            raise ValueError("job count was None")
+        editgroups_hash: str = calculate_random_editgroups_hash()
+        count = 0
+        for target_item in self.items.sparql_items:
+            count += 1
+            from src import console
+
+            if not target_item.label:
+                target_item.label = "item with missing label"
+            with console.status(
+                f"Uploading main subject "
+                f"[green]{clean_rich_formatting(self.label)}[/green] "
+                f"to {clean_rich_formatting(target_item.label)} ({target_item.id})"
+            ):
+                main_subject_property = "P921"
+                reference = ItemType(
+                    "Q69652283",  # inferred from title
+                    prop_nr="P887",  # based on heuristic
+                )
+                statement = ItemType(
+                    self.id,
+                    prop_nr=main_subject_property,
+                    references=[reference],
+                )
+                target_item.upload_one_statement_to_wikidata(
+                    statement=statement,
+                    summary=f"[[Property:{main_subject_property}]]: [[{self.id}]]",
+                    editgroups_hash=editgroups_hash,
+                )
+            console.print(
+                f"(job {job_count}/{len(jobs)})(item {count}/{self.items.number_of_sparql_items}) "
+                f"Added '{clean_rich_formatting(self.label)}' to "
+                f"{clean_rich_formatting(target_item.label)}: {target_item.url}"
+            )
+            # input("Press enter to continue")
+
+    @staticmethod
+    def __clean_special_symbols__(string: str):
+        return string.replace("®", "").replace("™", "").replace('"', "")
+
+    def __extract_search_strings__(self):
+        if not self.args:
+            raise ValueError("args was None")
+        else:
+            logger.debug(f"args:{self.args}")
+        if self.args.no_aliases is True:
+            console.print("Alias matching is turned off")
+            no_aliases = True
+        elif self.id in config.items.no_alias_for_scholarly_items:
+            logger.info(
+                f"Alias matching is turned off for this item: {self.label}"
+            )
+            no_aliases = True
+        else:
+            no_aliases = False
+        if not self.label:
+            raise ValueError("self.label was None")
+        self.search_strings: Set[str] = set()
+        self.search_strings.add(self.__clean_special_symbols__(self.label))
+        if self.aliases and no_aliases is False:
+            for alias in self.aliases:
+                # logger.debug(f"extracting alias:{alias}")
+                if len(alias) < 5 and alias not in config.items.list_of_allowed_aliases:
+                    console.print(
+                        f"Skipping short alias '{alias}' to avoid false positives",
+                        style="#FF8000",
+                    )
+                elif self.__alias_appears_in_label_of_a_qid__(alias=alias):
+                    console.print(
+                        f"Skipped '{alias}' because it appears "
+                        f"in a label of at least one Qid that is not a scholarly article",
+                        style="#FF8000",
+                    )
+                elif alias in config.items.list_of_allowed_aliases:
+                    console.print(f"Found {alias} in the allow list")
+                    self.search_strings.add(self.__clean_special_symbols__(alias))
+                else:
+                    self.search_strings.add(self.__clean_special_symbols__(alias))
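+
+    # A worked example of the extraction above (illustrative, not part of the
+    # original patch): for fentanyl (Q407541) with label "fentanyl" and no
+    # accepted aliases, search_strings ends up as {"fentanyl"}. Since
+    # execute_queries() runs one published-article query and one preprint query
+    # per search string, that single string later translates into 1 * 2 = 2
+    # WDQS queries in __count_number_of_queries__() below.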
logger.debug(f"extracting alias:{alias}") + if len(alias) < 5 and alias not in config.items.list_of_allowed_aliases: + console.print( + f"Skipping short alias '{alias}' to avoid false positives", + style="#FF8000", + ) + elif self.__alias_appears_in_label_of_a_qid__(alias=alias): + console.print( + f"Skipped '{alias}' because it appears " + f"in a label of at least one Qid that is not a scholarly article", + style="#FF8000", + ) + elif alias in config.items.list_of_allowed_aliases: + console.print(f"Found {alias} in the allow sparql_items") + self.search_strings.add(self.__clean_special_symbols__(alias)) + else: + self.search_strings.add(self.__clean_special_symbols__(alias)) + + def print_search_strings(self): + # logger.debug(f"search_strings:{self.search_strings}") + from src.helpers.cli_messages import print_search_strings_table + + print_search_strings_table(args=self.args, search_strings=self.search_strings) + + def search_urls(self) -> List[str]: + if not self.search_strings: + raise ValueError("self.search_strings was None") + urls = [] + for search_string in self.search_strings: + search_term = quote(f'"{search_string}"') + urls.append(f"https://www.wikidata.org/w/index.php?search={search_term}") + return urls + + def __prepare_before_fetching_items__(self): + self.__extract_search_strings__() + self.__check_we_got_what_we_need__() + if config.loglevel in [logging.INFO, logging.DEBUG]: + self.print_search_strings() + self.__count_number_of_queries__() + self.__instantiate_the_right_class_for_this_task__() + + def __parse_into_job__(self): + if self.items.number_of_sparql_items: + self.items.remove_duplicates() + self.items.random_shuffle_items() + from src import BatchJob + + job = BatchJob( + number_of_queries=self.number_of_queries, + main_subject_item=self, + ) + return job + else: + console.print("No matching items found") + return None + + def __count_number_of_queries__(self): + self.number_of_queries = ( + len(self.search_strings) * self.task.number_of_queries_per_search_string + ) + + def __check_we_got_what_we_need__(self): + if not self.search_strings: + raise ValueError("search_strings was None") + if not self.task: + raise ValueError("task was None") + + def __instantiate_the_right_class_for_this_task__(self): + if self.task.id == TaskIds.SCHOLARLY_ARTICLES: + self.items = ScholarlyArticleItems(main_subject_item=self) + # elif self.task.id == TaskIds.RIKSDAGEN_DOCUMENTS: + # items = RiksdagenDocumentItems(main_subject_item=self) + # elif self.task.id == TaskIds.THESIS: + # items = ThesisItems(main_subject_item=self) + # elif self.task.id == TaskIds.ACADEMIC_JOURNALS: + # items = AcademicJournalItems(main_subject_item=self) + else: + raise ValueError(f"{self.task.id} was not recognized") + + def fetch_items_and_get_job(self) -> Optional["BatchJob"]: + """This method handles all the work needed to return a job""" + self.__strip_qid_prefix__() + self.__fetch_label_and_description_and_aliases__() + if self.label: + console.print(f"Working on {self.label}") + if self.confirmation: + answer = ask_yes_no_question("Do you want to continue?") + if not answer: + return None + self.__prepare_before_fetching_items__() + if self.items: + with console.status( + f"Fetching items with labels that have one of " + f"the search strings by running a total of " + f"{self.number_of_queries} " + f"queries on WDQS..." 
+ ): + self.items.fetch_based_on_label() + return self.__parse_into_job__() + else: + raise ValueError("items was None") + else: + if self.task: + console.print( + f"Label for {self.task.language_code} was None on {self.url}, skipping" + ) + return None \ No newline at end of file diff --git a/src/models/wikimedia/wikidata/item/scholarly_article.py b/src/models/wikimedia/wikidata/item/scholarly_article.py new file mode 100644 index 0000000..f84666c --- /dev/null +++ b/src/models/wikimedia/wikidata/item/scholarly_article.py @@ -0,0 +1,5 @@ +from src.models.wikimedia.wikidata.item import Item + + +class ScholarlyArticleItem(Item): + pass diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py new file mode 100644 index 0000000..615a9b2 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/__init__.py @@ -0,0 +1,67 @@ +import logging +from typing import Dict, List + +from pydantic import BaseModel +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore + +from src.models.wikimedia.wikidata.item import Item +from src.models.wikimedia.wikidata.sparql_item import SparqlItem + +logger = logging.getLogger(__name__) + + +class Query(BaseModel): + results: Dict = {} + search_string = "" + query_string = "" + items: List[Item] = [] + + def parse_results(self) -> None: + # console.print(self.results) + for item_json in self.results["results"]["bindings"]: + logging.debug(f"item_json:{item_json}") + item = SparqlItem(**item_json) + item.validate_qid_and_copy_label() + if not item.is_in_blocklist(): + self.items.append(item) + else: + logger.info(f"{item.label} found in blocklist, skipping") + + def strip_bad_chars(self): + # Note this has to match the cleaning done in the sparql query + # We lowercase and remove common symbols + # We replace like this to save CPU cycles see + # https://stackoverflow.com/questions/3411771/best-way-to-replace-multiple-characters-in-a-string + self.search_string = ( + self.search_string + # Needed for matching backslashes e.g. 
"Dmel\CG5330" on Q29717230 + .replace("\\", "\\\\") + # Needed for when labels contain apostrophe + .replace("'", "\\'") + .replace(",", "") + .replace(":", "") + .replace(";", "") + .replace("(", "") + .replace(")", "") + .replace("[", "") + .replace("]", "") + ) + + def execute(self): + self.results = execute_sparql_query(self.query_string) + + def get_results(self): + """Do everything needed to get the results""" + self.strip_bad_chars() + self.build_query() + self.execute() + self.parse_results() + + def build_query(self): + pass + + def print_number_of_results(self): + logging.info( + f"Got {len(self.items)} items from " + f"WDQS using the search string {self.search_string}" + ) diff --git a/src/models/wikimedia/wikidata/query/article.py b/src/models/wikimedia/wikidata/query/article.py new file mode 100644 index 0000000..91e14f5 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/article.py @@ -0,0 +1,8 @@ +from typing import Any + +from src.models.wikimedia.wikidata.query import Query + + +class ArticleQuery(Query): + # any here because of pydantic error + main_subject_item: Any diff --git a/src/models/wikimedia/wikidata/query/preprint_article.py b/src/models/wikimedia/wikidata/query/preprint_article.py new file mode 100644 index 0000000..f2ef93b --- /dev/null +++ b/src/models/wikimedia/wikidata/query/preprint_article.py @@ -0,0 +1,27 @@ +import config +from src.models.wikimedia.wikidata.query.article import ArticleQuery + + +class PreprintArticleQuery(ArticleQuery): + def build_query(self): + self.query_string = f""" + #{config.user_agent} + SELECT DISTINCT ?item ?itemLabel + WHERE {{ + ?item wdt:P31/wd:P279* wd:Q580922. # preprint + MINUS {{ + ?item wdt:P921 wd:{self.main_subject_item.id}; + }} + ?item rdfs:label ?label. + FILTER(CONTAINS( + LCASE(?label), " {self.search_string.lower()} " + @{self.main_subject_item.task.language_code.value}) || + REGEX(LCASE(?label), ".* {self.search_string.lower()}$" + @{self.main_subject_item.task.language_code.value}) || + REGEX(LCASE(?label), "^{self.search_string.lower()} .*" + @{self.main_subject_item.task.language_code.value}) + ) + MINUS {{?item wdt:P921/wdt:P279 wd:{self.main_subject_item.id}. }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} + }} + """ diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py new file mode 100644 index 0000000..5128ea9 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/published_article.py @@ -0,0 +1,82 @@ +import config +from src import console +from src.models.wikimedia.wikidata.query.article import ArticleQuery + + +class PublishedArticleQuery(ArticleQuery): + cirrussearch_parameters: str = "" + + def check_we_got_everything_we_need(self): + if not self.main_subject_item: + raise ValueError("suggestion was None") + if not self.main_subject_item: + raise ValueError("suggestion.main_subject_item was None") + if not self.main_subject_item.args: + raise ValueError("suggestion.args was None") + if self.main_subject_item.args.limit_to_items_without_p921: + raise Exception( + "Limiting to items without P921 is not " "supported yet for this task." 
+            )
+        if self.main_subject_item.task is None:
+            raise ValueError("task was None")
+        if self.main_subject_item.task.language_code is None:
+            raise ValueError("task.language_code was None")
+        if self.main_subject_item.args.limit_to_items_without_p921:
+            console.print(
+                "Limiting to scholarly articles without P921 main subject only"
+            )
+            cirrussearch_parameters = (
+                f"haswbstatement:P31=Q13442814 -haswbstatement:P921"
+            )
+        else:
+            cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={self.main_subject_item.id}"
+        if self.main_subject_item.task is None:
+            raise ValueError("task was None")
+        if self.main_subject_item.task.language_code is None:
+            raise ValueError("task.language_code was None")
+        if cirrussearch_parameters is None:
+            raise ValueError("cirrussearch_parameters was None")
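+
+    # A worked example of the parameters above (illustrative, not part of the
+    # original patch): for the main subject fentanyl (Q407541) and the search
+    # string "fentanyl", the query built below sends the CirrusSearch string
+    #   haswbstatement:P31=Q13442814 -haswbstatement:P921=Q407541 "fentanyl"
+    # to the MWAPI, i.e. scholarly articles that do not already have this main
+    # subject and whose text matches the label.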
+
+    def build_query(
+        self,
+    ):
+        # This query uses https://www.w3.org/TR/sparql11-property-paths/ to
+        # find subjects that are subclass of one another up to 3 hops away
+        # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI
+        # which has a hardcoded limit of 10,000 items so you will never get more matches than that
+        # This query uses regex to match beginning, middle and end of the label of matched items
+        # The replacing lines should match the similar python replacements in cleaning.py
+        # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in
+        # SPARQL where it becomes "\\" and thus matches a single backslash
+        self.query_string = f"""
+        #{config.user_agent}
+        SELECT DISTINCT ?item ?itemLabel
+        WHERE {{
+            hint:Query hint:optimizer "None".
+            BIND(STR('{self.cirrussearch_parameters} \"{self.search_string}\"') as ?search_string)
+            SERVICE wikibase:mwapi {{
+                bd:serviceParam wikibase:api "Search";
+                wikibase:endpoint "www.wikidata.org";
+                mwapi:srsearch ?search_string.
+                ?title wikibase:apiOutput mwapi:title.
+            }}
+            BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item)
+            ?item rdfs:label ?label.
+            BIND(REPLACE(LCASE(?label), ",", "") as ?label1)
+            BIND(REPLACE(?label1, ":", "") as ?label2)
+            BIND(REPLACE(?label2, ";", "") as ?label3)
+            BIND(REPLACE(?label3, "\\\\(", "") as ?label4)
+            BIND(REPLACE(?label4, "\\\\)", "") as ?label5)
+            BIND(REPLACE(?label5, "\\\\[", "") as ?label6)
+            BIND(REPLACE(?label6, "\\\\]", "") as ?label7)
+            BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8)
+            BIND(?label8 as ?cleaned_label)
+            FILTER(CONTAINS(?cleaned_label, ' {self.search_string.lower()} '@{self.main_subject_item.task.language_code.value}) ||
+                REGEX(?cleaned_label, '.* {self.search_string.lower()}$'@{self.main_subject_item.task.language_code.value}) ||
+                REGEX(?cleaned_label, '^{self.search_string.lower()} .*'@{self.main_subject_item.task.language_code.value}))
+            MINUS {{?item wdt:P921/wdt:P279 wd:{self.main_subject_item.id}. }}
+            MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{self.main_subject_item.id}. }}
+            MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{self.main_subject_item.id}. }}
+            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
+        }}
+        """
diff --git a/src/tasks.py b/src/tasks.py
index 24d6f06..051cb91 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -16,8 +16,8 @@
                 "E.g. when searching for 'cancer screening' in Wikidata "
                 "we find 'gastric cancer screening' in labels of "
                 "scientific articles but there is "
-                "perhaps no item for this yet.\n"
-                "In this case it is preferred to first create that item "
+                "perhaps no Wikidata item for this yet.\n"
+                "In that case it is preferred to first create that item "
                 "(done in Q108532542 and add that as main subject and "
                 "avoid the more general 'cancer screening' until all "
                 "sub forms of screening have been matched."
@@ -25,40 +25,40 @@
             number_of_queries_per_search_string=2,
         )
     ),
-    Task(
-        **dict(
-            id=TaskIds.RIKSDAGEN_DOCUMENTS,
-            label="Add main subject to documents from Riksdagen",
-            language_code=SupportedLanguageCode.SWEDISH,
-            best_practice_information=None,
-        )
-    ),
-    Task(
-        **dict(
-            id=TaskIds.THESIS,
-            label="Add main subject to thesis' and technical reports",
-            language_code=SupportedLanguageCode.ENGLISH,
-            best_practice_information=(
-                "When adding Qid main subjects please try to first "
-                "educate yourself about the subarea of science a little "
-                "and find/create items as specific as possible.\n"
-                "E.g. when searching for 'cancer screening' in Wikidata "
-                "we find 'gastric cancer screening' in labels of "
-                "scientific articles but there is "
-                "perhaps no item for this yet.\n"
-                "In this case it is preferred to first create that item "
-                "(done in Q108532542 and add that as main subject and "
-                "avoid the more general 'cancer screening' until all "
-                "sub forms of screening have been matched."
-            ),
-        )
-    ),
-    Task(
-        **dict(
-            id=TaskIds.ACADEMIC_JOURNALS,
-            label="Add main subject to academic journals",
-            language_code=SupportedLanguageCode.ENGLISH,
-            best_practice_information=None,
-        )
-    ),
+    # Task(
+    #     **dict(
+    #         id=TaskIds.RIKSDAGEN_DOCUMENTS,
+    #         label="Add main subject to documents from Riksdagen",
+    #         language_code=SupportedLanguageCode.SWEDISH,
+    #         best_practice_information=None,
+    #     )
+    # ),
+    # Task(
+    #     **dict(
+    #         id=TaskIds.THESIS,
+    #         label="Add main subject to thesis' and technical reports",
+    #         language_code=SupportedLanguageCode.ENGLISH,
+    #         best_practice_information=(
+    #             "When adding Qid main subjects please try to first "
+    #             "educate yourself about the subarea of science a little "
+    #             "and find/create items as specific as possible.\n"
+    #             "E.g. when searching for 'cancer screening' in Wikidata "
+    #             "we find 'gastric cancer screening' in labels of "
+    #             "scientific articles but there is "
+    #             "perhaps no Wikidata item for this yet.\n"
+    #             "In that case it is preferred to first create that item "
+    #             "(done in Q108532542 and add that as main subject and "
+    #             "avoid the more general 'cancer screening' until all "
+    #             "sub forms of screening have been matched."
+ # ), + # ) + # ), + # Task( + # **dict( + # id=TaskIds.ACADEMIC_JOURNALS, + # label="Add main subject to academic journals", + # language_code=SupportedLanguageCode.ENGLISH, + # best_practice_information=None, + # ) + # ), ] diff --git a/tests/test___init__.py b/tests/test___init__.py new file mode 100644 index 0000000..1630bdb --- /dev/null +++ b/tests/test___init__.py @@ -0,0 +1,3 @@ +class TestQuery: + def test_parse_results(self): + assert False diff --git a/tests/test_suggestion.py b/tests/test_suggestion.py index d585f50..67c0a35 100644 --- a/tests/test_suggestion.py +++ b/tests/test_suggestion.py @@ -1,46 +1,46 @@ -import argparse -from unittest import TestCase - -from src.models.suggestion import Suggestion -from src.models.wikimedia.wikidata.sparql_item import SparqlItem, Value -from src.tasks import tasks - - -class TestSuggestion(TestCase): - def test_extract_search_strings(self): - item = SparqlItem( - item=Value(value="Q407541"), itemLabel=Value(value="fentanyl") - ) - item.validate_qid_and_copy_label() - suggestion = Suggestion( - item=item, - task=tasks[0], - args=argparse.Namespace( - no_aliases=dict(no_aliases=False), - show_search_urls=dict(show_search_urls=False), - ), - ) - suggestion.extract_search_strings() - # suggestion.print_search_strings() - if not len(suggestion.search_strings) == 1: - self.fail() - - def test_extract_search_strings_with_problematic_alias(self): - """This has a problematic alias "thrush" which is also a bird""" - item = SparqlItem( - item=Value(value="Q273510"), itemLabel=Value(value="candidadis") - ) - item.validate_qid_and_copy_label() - item.fetch_label_and_description_and_aliases(task=tasks[0]) - suggestion = Suggestion( - item=item, - task=tasks[0], - args=argparse.Namespace( - no_aliases=dict(no_aliases=False), - show_search_urls=dict(show_search_urls=False), - ), - ) - suggestion.extract_search_strings() - suggestion.print_search_strings() - print(len(suggestion.search_strings)) - assert len(suggestion.search_strings) == 10 +# import argparse +# from unittest import TestCase +# +# from src.models.suggestion import Suggestion +# from src.models.wikimedia.wikidata.sparql_item import SparqlItem, Value +# from src.tasks import tasks +# +# +# class TestSuggestion(TestCase): +# def test_extract_search_strings(self): +# item = SparqlItem( +# item=Value(value="Q407541"), itemLabel=Value(value="fentanyl") +# ) +# item.validate_qid_and_copy_label() +# suggestion = Suggestion( +# main_subject_item=item, +# task=tasks[0], +# args=argparse.Namespace( +# no_aliases=dict(no_aliases=False), +# show_search_urls=dict(show_search_urls=False), +# ), +# ) +# suggestion.__extract_search_strings__() +# # suggestion.print_search_strings() +# if not len(suggestion.search_strings) == 1: +# self.fail() +# +# def test_extract_search_strings_with_problematic_alias(self): +# """This has a problematic alias "thrush" which is also a bird""" +# item = SparqlItem( +# item=Value(value="Q273510"), itemLabel=Value(value="candidadis") +# ) +# item.validate_qid_and_copy_label() +# item.__fetch_label_and_description_and_aliases__(task=tasks[0]) +# suggestion = Suggestion( +# main_subject_item=item, +# task=tasks[0], +# args=argparse.Namespace( +# no_aliases=dict(no_aliases=False), +# show_search_urls=dict(show_search_urls=False), +# ), +# ) +# suggestion.__extract_search_strings__() +# suggestion.print_search_strings() +# print(len(suggestion.search_strings)) +# assert len(suggestion.search_strings) == 10 From 9791d948ffb19b6d9112e2cd8f7332398321f343 Mon Sep 17 00:00:00 2001 
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:44:13 +0200 Subject: [PATCH 02/37] cli_messages.py: Fix attribute error --- src/helpers/cli_messages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py index 3aa285c..4f38cf7 100644 --- a/src/helpers/cli_messages.py +++ b/src/helpers/cli_messages.py @@ -64,12 +64,12 @@ def print_found_items_table(args: argparse.Namespace = None, items: Items = None f"items, please review as many as possible for false " f"positives and reject the batch if you find any." ) - if args.show_item_urls: + if getattr(args, "show_item_urls", False): table.add_column(f"Wikidata URL") for item in list_to_show: if item.label is None: raise ValueError("main_subject_item.label was None") - if args.show_item_urls: + if getattr(args, "show_item_urls", False): label = clean_rich_formatting(item.label) table.add_row(label, item.url) else: From 3087258610db6954a467bca35c31dbed51d51535 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:44:25 +0200 Subject: [PATCH 03/37] cli_messages.py: Fix generator warning --- src/helpers/cli_messages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py index 4f38cf7..4a90295 100644 --- a/src/helpers/cli_messages.py +++ b/src/helpers/cli_messages.py @@ -93,7 +93,7 @@ def print_job_statistics(batchjobs: BatchJobs = None): else: total_number_of_queries = sum([job.number_of_queries for job in batchjobs.jobs]) total_number_of_items = sum( - len(job.main_subject_item.items.sparql_items) + job.main_subject_item.items.number_of_sparql_items for job in batchjobs.jobs if batchjobs.jobs and job From 8c86c819cabc4d28c9be7d5fcb26abb5a2828c6f Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:44:43 +0200 Subject: [PATCH 04/37] jobs.py: Fix typing --- src/helpers/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 34c282e..d82b798 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -71,7 +71,7 @@ def handle_job_preparation_or_run_directly_if_any_jobs( def get_validated_main_subjects_as_jobs( - args: argparse.Namespace = None, main_subjects: List[str] = None + args: argparse.Namespace, main_subjects: List[str] ) -> BatchJobs: """This function randomly picks a subject and add it to the sparql_items of jobs if it had any matches and the user approved it""" From 7135d7564f32df9b784ef0643ceef428be530c60 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:45:38 +0200 Subject: [PATCH 05/37] __init__.py: run(): Simplify logic --- src/__init__.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 31cca38..df71cb5 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -157,14 +157,7 @@ def run(self): if args.add is None: console.print("Got no arguments or QIDs. 
Try '--help' for help.") exit(0) - task: Task = select_task() - if task is None: - raise ValueError("Got no task") - jobs = [] - jobs.extend( - process_user_supplied_qids_into_batch_jobs(args=args, task=task) - ) - batchjobs = BatchJobs(jobs=jobs) + batchjobs = get_validated_main_subjects_as_jobs(args=args, main_subjects=args.add) handle_job_preparation_or_run_directly_if_any_jobs( args=args, batchjobs=batchjobs ) From 8a8e01214972598c4cf1e2cbf050a6d32fd8ac89 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:50:54 +0200 Subject: [PATCH 06/37] pre-commit fixes --- src/__init__.py | 21 +++++++++---------- src/helpers/cli_messages.py | 7 +++---- src/helpers/jobs.py | 10 ++------- src/helpers/questions.py | 3 ++- src/models/batch_jobs.py | 12 ++++++----- src/models/items/__init__.py | 2 +- .../wikimedia/wikidata/item/main_subject.py | 10 +++------ .../wikidata/query/published_article.py | 2 +- tests/{test___init__.py => test_query.py} | 2 +- 9 files changed, 30 insertions(+), 39 deletions(-) rename tests/{test___init__.py => test_query.py} (70%) diff --git a/src/__init__.py b/src/__init__.py index df71cb5..050b440 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -8,21 +8,13 @@ import config from src.helpers.argparse_setup import setup_argparse_and_return_args -from src.helpers.console import ( - console, - print_keep_an_eye_on_wdqs_lag, -) from src.helpers.cli_messages import ( print_best_practice, - print_found_items_table, print_finished, + print_found_items_table, print_job_statistics, ) -from src.helpers.questions import ( - ask_add_to_job_queue, - ask_discard_existing_job_pickle, - ask_yes_no_question, -) +from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag from src.helpers.enums import TaskIds from src.helpers.jobs import ( get_validated_main_subjects_as_jobs, @@ -38,6 +30,11 @@ parse_job_pickle, remove_job_pickle, ) +from src.helpers.questions import ( + ask_add_to_job_queue, + ask_discard_existing_job_pickle, + ask_yes_no_question, +) from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs from src.models.suggestion import Suggestion @@ -157,7 +154,9 @@ def run(self): if args.add is None: console.print("Got no arguments or QIDs. 
Try '--help' for help.") exit(0) - batchjobs = get_validated_main_subjects_as_jobs(args=args, main_subjects=args.add) + batchjobs = get_validated_main_subjects_as_jobs( + args=args, main_subjects=args.add + ) handle_job_preparation_or_run_directly_if_any_jobs( args=args, batchjobs=batchjobs ) diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py index 4a90295..b97132b 100644 --- a/src/helpers/cli_messages.py +++ b/src/helpers/cli_messages.py @@ -6,12 +6,11 @@ from rich.table import Table -from src.models.task import Task -from src.helpers.console import console -from src.models.batch_jobs import BatchJobs from src.helpers.cleaning import clean_rich_formatting -from src.helpers.console import press_enter_to_continue +from src.helpers.console import console, press_enter_to_continue +from src.models.batch_jobs import BatchJobs from src.models.items import Items +from src.models.task import Task def print_best_practice(task: Task): diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index d82b798..df0b46a 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -6,16 +6,10 @@ from typing import List import config -from src.helpers.cli_messages import ( - print_best_practice, - print_job_statistics, -) +from src.helpers.cli_messages import print_best_practice, print_job_statistics from src.helpers.console import console from src.helpers.menus import select_task -from src.helpers.questions import ( - ask_add_to_job_queue, - ask_yes_no_question, -) +from src.helpers.questions import ask_add_to_job_queue, ask_yes_no_question from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem diff --git a/src/helpers/questions.py b/src/helpers/questions.py index 8cf67f2..aa768f9 100644 --- a/src/helpers/questions.py +++ b/src/helpers/questions.py @@ -1,8 +1,9 @@ # from __future__ import annotations -from src.helpers.console import console from typing import TYPE_CHECKING +from src.helpers.console import console + if TYPE_CHECKING: from src.models.batch_jobs import BatchJob diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py index ab89d0e..f5e27cc 100644 --- a/src/models/batch_jobs.py +++ b/src/models/batch_jobs.py @@ -20,19 +20,21 @@ def print_running_jobs(self): raise ValueError("jobs is not a sparql_items") from src.helpers.console import console + number_of_items = sum( + job.main_subject_item.items.number_of_sparql_items + for job in self.jobs + if job.main_subject_item.items and job.main_subject_item.items.sparql_items + ) console.print( f"Running {len(self.jobs)} job(s) with a total of " - f"{sum(len(job.main_subject_item.items.sparql_items) for job in self.jobs if job.main_subject_item.items and job.main_subject_item.items.sparql_items)} items " + f"{number_of_items} items " f"non-interactively now. 
You can take a "
             f"coffee break and lean back :)"
         )
 
     def run_jobs(self):
-        from src.helpers.console import (
-            console,
-            print_keep_an_eye_on_wdqs_lag,
-        )
         from src import print_finished
+        from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag
 
         if self.jobs is None or len(self.jobs) == 0:
             raise ValueError("did not get what we need")
diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py
index a1bb75e..1aeca09 100644
--- a/src/models/items/__init__.py
+++ b/src/models/items/__init__.py
@@ -3,7 +3,7 @@
 import argparse
 import logging
 import random
-from typing import List, Any
+from typing import Any, List
 
 from pydantic import BaseModel
 
diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py
index c031f98..6dfe7ab 100644
--- a/src/models/wikimedia/wikidata/item/main_subject.py
+++ b/src/models/wikimedia/wikidata/item/main_subject.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Set, List, Optional, TYPE_CHECKING
+from typing import TYPE_CHECKING, List, Optional, Set
 from urllib.parse import quote
 
 from wikibaseintegrator import WikibaseIntegrator  # type: ignore
@@ -80,9 +80,7 @@ def __str__(self):
             string = string + f"{url}\n"
         return string
 
-    def add_to_items(
-        self, jobs: List["BatchJob"] = None, job_count: int = None
-    ):
+    def add_to_items(self, jobs: List["BatchJob"] = None, job_count: int = None):
         """Add a suggested Qid as main subject on all items that
         have a label that matches one of the search strings for this Qid
         We calculate a new edit group hash each time this function is
@@ -101,8 +99,6 @@ def add_to_items(
         count = 0
         for target_item in self.items.sparql_items:
             count += 1
-            from src import console
-
             if not target_item.label:
                 target_item.label = "item with missing label"
             with console.status(
@@ -263,4 +259,4 @@ def fetch_items_and_get_job(self) -> Optional["BatchJob"]:
             console.print(
                 f"Label for {self.task.language_code} was None on {self.url}, skipping"
             )
-        return None
\ No newline at end of file
+        return None
diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py
index 5128ea9..d4a9d98 100644
--- a/src/models/wikimedia/wikidata/query/published_article.py
+++ b/src/models/wikimedia/wikidata/query/published_article.py
@@ -1,5 +1,5 @@
 import config
-from src import console
+from src.helpers.console import console
 from src.models.wikimedia.wikidata.query.article import ArticleQuery
 
diff --git a/tests/test___init__.py b/tests/test_query.py
similarity index 70%
rename from tests/test___init__.py
rename to tests/test_query.py
index 1630bdb..3ae7bb3 100644
--- a/tests/test___init__.py
+++ b/tests/test_query.py
@@ -1,3 +1,3 @@
 class TestQuery:
     def test_parse_results(self):
-        assert False
+        pass

From b06a478c9de79fecb9725b702f4cc124afec7d61 Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Mon, 3 Oct 2022 11:53:33 +0200
Subject: [PATCH 07/37] Make methods private

---
 src/models/items/academic_journals.py            |  4 ++--
 src/models/wikimedia/wikidata/query/__init__.py  | 16 ++++++++--------
 .../wikimedia/wikidata/query/preprint_article.py |  2 +-
 .../wikidata/query/published_article.py          |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)
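The renames in this patch make Query a classic template method: get_results()
drives the private steps in a fixed order, and each concrete query only supplies
its own builder. A minimal sketch of that flow (illustrative only; the pydantic
fields and the real WDQS calls from this series are stubbed out):

    class Query:
        """Sketch of the template-method flow in query/__init__.py."""

        def __init__(self, search_string: str):
            self.search_string = search_string
            self.query_string = ""
            self.results: dict = {}
            self.items: list = []

        def get_results(self):
            # Fixed skeleton; subclasses override __build_query__() only.
            self.__strip_bad_chars__()
            self.__build_query__()
            self.__execute__()
            self.__parse_results__()

        def __strip_bad_chars__(self):
            # Stands in for the full replace() chain in the real method.
            self.search_string = self.search_string.replace(",", "")

        def __build_query__(self):
            pass  # overridden per query type, e.g. by PreprintArticleQuery

        def __execute__(self):
            self.results = {}  # the real class calls execute_sparql_query() here

        def __parse_results__(self):
            self.items = []  # the real class builds SparqlItem objects here


    class PreprintArticleQuery(Query):
        def __build_query__(self):
            self.query_string = "SELECT ..."  # the preprint SPARQL from patch 01

Calling PreprintArticleQuery("fentanyl").get_results() then runs all four steps
without the subclass having to repeat the pipeline. Note that names with both
leading and trailing double underscores are not name-mangled, so the overrides
dispatch normally.
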
diff --git a/src/models/items/academic_journals.py b/src/models/items/academic_journals.py
index 526c2b8..5e0e449 100644
--- a/src/models/items/academic_journals.py
+++ b/src/models/items/academic_journals.py
@@ -3,7 +3,7 @@
 # from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
 #
 # import config
-# from src.helpers.cleaning import strip_bad_chars
+# from src.helpers.cleaning import __strip_bad_chars__
 # from src.helpers.console import console
 # from src.models.items import Items
 # from src.models.suggestion import Suggestion
@@ -42,7 +42,7 @@
 #         # Fetch all items matching the search strings
 #         self.list = []
 #         for search_string in suggestion.search_strings:
-#             search_string = strip_bad_chars(search_string)
+#             search_string = __strip_bad_chars__(search_string)
 #             results = execute_sparql_query(
 #                 f"""
 #                 #{config.user_agent}
diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py
index 615a9b2..e6fc4be 100644
--- a/src/models/wikimedia/wikidata/query/__init__.py
+++ b/src/models/wikimedia/wikidata/query/__init__.py
@@ -16,7 +16,7 @@ class Query(BaseModel):
     query_string = ""
     items: List[Item] = []
 
-    def parse_results(self) -> None:
+    def __parse_results__(self) -> None:
         # console.print(self.results)
         for item_json in self.results["results"]["bindings"]:
             logging.debug(f"item_json:{item_json}")
@@ -27,7 +27,7 @@ def parse_results(self) -> None:
         else:
             logger.info(f"{item.label} found in blocklist, skipping")
 
-    def strip_bad_chars(self):
+    def __strip_bad_chars__(self):
         # Note this has to match the cleaning done in the sparql query
         # We lowercase and remove common symbols
         # We replace like this to save CPU cycles see
@@ -47,17 +47,17 @@ def strip_bad_chars(self):
             .replace("]", "")
         )
 
-    def execute(self):
+    def __execute__(self):
         self.results = execute_sparql_query(self.query_string)
 
     def get_results(self):
         """Do everything needed to get the results"""
-        self.strip_bad_chars()
-        self.build_query()
-        self.execute()
-        self.parse_results()
+        self.__strip_bad_chars__()
+        self.__build_query__()
+        self.__execute__()
+        self.__parse_results__()
 
-    def build_query(self):
+    def __build_query__(self):
         pass
 
     def print_number_of_results(self):
diff --git a/src/models/wikimedia/wikidata/query/preprint_article.py b/src/models/wikimedia/wikidata/query/preprint_article.py
index f2ef93b..a517162 100644
--- a/src/models/wikimedia/wikidata/query/preprint_article.py
+++ b/src/models/wikimedia/wikidata/query/preprint_article.py
@@ -3,7 +3,7 @@
 
 
 class PreprintArticleQuery(ArticleQuery):
-    def build_query(self):
+    def __build_query__(self):
         self.query_string = f"""
             #{config.user_agent}
             SELECT DISTINCT ?item ?itemLabel
diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py
index d4a9d98..d259164 100644
--- a/src/models/wikimedia/wikidata/query/published_article.py
+++ b/src/models/wikimedia/wikidata/query/published_article.py
@@ -37,7 +37,7 @@ def check_we_got_everything_we_need(self):
         if cirrussearch_parameters is None:
             raise ValueError("cirrussearch_parameters was None")
 
-    def build_query(
+    def __build_query__(
         self,
     ):

From 92a8737a04965179c5898c9d792eb0862134343e Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Mon, 3 Oct 2022 11:58:24 +0200
Subject: [PATCH 08/37] Delete unused empty class

---
 src/models/wikimedia/wikidata/item/scholarly_article.py | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 src/models/wikimedia/wikidata/item/scholarly_article.py

diff --git a/src/models/wikimedia/wikidata/item/scholarly_article.py
b/src/models/wikimedia/wikidata/item/scholarly_article.py deleted file mode 100644 index f84666c..0000000 --- a/src/models/wikimedia/wikidata/item/scholarly_article.py +++ /dev/null @@ -1,5 +0,0 @@ -from src.models.wikimedia.wikidata.item import Item - - -class ScholarlyArticleItem(Item): - pass From cfff1e420b4506d6b46ca6356df8de12c4d6a83a Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:59:18 +0200 Subject: [PATCH 09/37] Delete unused import --- src/models/wikimedia/wikidata/item/main_subject.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index 6dfe7ab..926363a 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -59,8 +59,6 @@ def __is_not_scientific_article__(qid: str): if claim.mainsnak.property_number == Property.INSTANCE_OF.value: qid = claim.mainsnak.datavalue["value"]["id"] logger.info(f"Found P31 with value {qid}") - from src.helpers.console import console - # console.print(claim.mainsnak) if qid == Qid.SCHOLARLY_ARTICLE.value: logger.debug("__is_not_scientific_article__:returning false now") From f2d585013fc4acd72c074ef1a84bea6dc73ecbf7 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 12:25:56 +0200 Subject: [PATCH 10/37] Move SparqlItem to item/ and remove qid from Item. --- src/helpers/menus.py | 2 +- src/models/items/__init__.py | 2 +- src/models/wikimedia/wikidata/item/__init__.py | 13 ++++++------- .../wikidata/{sparql_item.py => item/sparql.py} | 0 src/models/wikimedia/wikidata/query/__init__.py | 2 +- tests/test_sparql_item.py | 2 +- 6 files changed, 10 insertions(+), 11 deletions(-) rename src/models/wikimedia/wikidata/{sparql_item.py => item/sparql.py} (100%) diff --git a/src/helpers/menus.py b/src/helpers/menus.py index 97f7c7e..4468d71 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -11,7 +11,7 @@ def select_suggestion(suggestions: List[MainSubjectItem], item: Item): - if not item or not item.qid or not suggestions: + if not item or not item.id or not suggestions: raise ValueError("Did not get what we need") logger = logging.getLogger(__name__) menu = SelectionMenu( diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py index 1aeca09..3ec8ca9 100644 --- a/src/models/items/__init__.py +++ b/src/models/items/__init__.py @@ -7,7 +7,7 @@ from pydantic import BaseModel -from src.models.wikimedia.wikidata.sparql_item import SparqlItem +from src.models.wikimedia.wikidata.item.sparql import SparqlItem # if TYPE_CHECKING: # from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem diff --git a/src/models/wikimedia/wikidata/item/__init__.py b/src/models/wikimedia/wikidata/item/__init__.py index da3ecbc..17b0873 100644 --- a/src/models/wikimedia/wikidata/item/__init__.py +++ b/src/models/wikimedia/wikidata/item/__init__.py @@ -21,7 +21,6 @@ class Item(Entity): args: Optional[argparse.Namespace] = None confirmation: bool = False description: Optional[str] = None - qid: str = "" task: Optional[Task] = None class Config: @@ -45,7 +44,7 @@ def __fetch_label_and_description_and_aliases__(self, task: Task = None): ): wbi = WikibaseIntegrator() if not self.id: - id = self.qid + id = self.id item = wbi.item.get(id) label = item.labels.get(self.task.language_code.value) if label: @@ -63,8 +62,8 @@ def 
__fetch_label_and_description_and_aliases__(self, task: Task = None): # logging.debug(f"aliases:{self.aliases}") def __strip_qid_prefix__(self): - if "https://www.wikidata.org/wiki/" in self.qid: - self.qid = self.qid[30:] - if "http://www.wikidata.org/entity/" in self.qid: - self.qid = self.qid[31:] - # logger.debug(f"qid:{qid}") + if "https://www.wikidata.org/wiki/" in self.id: + self.id = self.id[30:] + if "http://www.wikidata.org/entity/" in self.id: + self.id = self.id[31:] + # logger.debug(f"id:{id}") diff --git a/src/models/wikimedia/wikidata/sparql_item.py b/src/models/wikimedia/wikidata/item/sparql.py similarity index 100% rename from src/models/wikimedia/wikidata/sparql_item.py rename to src/models/wikimedia/wikidata/item/sparql.py diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py index e6fc4be..4478e9b 100644 --- a/src/models/wikimedia/wikidata/query/__init__.py +++ b/src/models/wikimedia/wikidata/query/__init__.py @@ -5,7 +5,7 @@ from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore from src.models.wikimedia.wikidata.item import Item -from src.models.wikimedia.wikidata.sparql_item import SparqlItem +from src.models.wikimedia.wikidata.item.sparql import SparqlItem logger = logging.getLogger(__name__) diff --git a/tests/test_sparql_item.py b/tests/test_sparql_item.py index 185f3aa..2c4e2f1 100644 --- a/tests/test_sparql_item.py +++ b/tests/test_sparql_item.py @@ -1,7 +1,7 @@ from unittest import TestCase from src import console -from src.models.wikimedia.wikidata.sparql_item import SparqlItem, Value +from src.models.wikimedia.wikidata.item.sparql import SparqlItem, Value class TestSparqlItem(TestCase): From 71995a16b3e1ebcd51b821c0261bd6578f80bc83 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 12:32:42 +0200 Subject: [PATCH 11/37] Fix qid -> id --- src/helpers/jobs.py | 4 ++-- src/models/login.py | 2 +- src/models/wikimedia/wikidata/item/__init__.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index df0b46a..8c69477 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -32,7 +32,7 @@ def process_user_supplied_qids_into_batch_jobs( print_best_practice(task) jobs = [] for qid in args.add: - main_subject_item = MainSubjectItem(qid=qid, args=args, task=task) + main_subject_item = MainSubjectItem(id=qid, args=args, task=task) job = main_subject_item.fetch_items_and_get_job() if job: jobs.append(job) @@ -87,7 +87,7 @@ def get_validated_main_subjects_as_jobs( qid = random.choice(qid_subjects_not_picked_yet) qid_subjects_not_picked_yet.remove(qid) main_subject_item = MainSubjectItem( - qid=qid, args=args, task=task, confirmation=args.no_confirmation + id=qid, args=args, task=task, confirmation=args.no_confirmation ) job = main_subject_item.fetch_items_and_get_job() if job: diff --git a/src/models/login.py b/src/models/login.py index d2e229b..ff2a74e 100644 --- a/src/models/login.py +++ b/src/models/login.py @@ -1,4 +1,4 @@ -from wikibaseintegrator import wbi_config, wbi_login +from wikibaseintegrator import wbi_config, wbi_login # type: ignore import config diff --git a/src/models/wikimedia/wikidata/item/__init__.py b/src/models/wikimedia/wikidata/item/__init__.py index 17b0873..9b21d57 100644 --- a/src/models/wikimedia/wikidata/item/__init__.py +++ b/src/models/wikimedia/wikidata/item/__init__.py @@ -29,7 +29,7 @@ class Config: def __str__(self): return 
f"{self.label}, see {self.url}" - def __fetch_label_and_description_and_aliases__(self, task: Task = None): + def __fetch_label_and_description_and_aliases__(self): """Fetch label and aliases in the task language from the Wikidata API""" if not self.task: raise ValueError("self.task was None") From 8c409c8869593c3746e9dbe62f18336104670ff3 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 12:39:20 +0200 Subject: [PATCH 12/37] Fix id --- src/models/wikimedia/wikidata/item/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/models/wikimedia/wikidata/item/__init__.py b/src/models/wikimedia/wikidata/item/__init__.py index 9b21d57..3b49510 100644 --- a/src/models/wikimedia/wikidata/item/__init__.py +++ b/src/models/wikimedia/wikidata/item/__init__.py @@ -43,9 +43,7 @@ def __fetch_label_and_description_and_aliases__(self): f"Fetching {self.task.language_code.name.title()} label and aliases from the Wikidata API..." ): wbi = WikibaseIntegrator() - if not self.id: - id = self.id - item = wbi.item.get(id) + item = wbi.item.get(self.id) label = item.labels.get(self.task.language_code.value) if label: self.label = str(label) From 21917f02e67d4c7b4458d58ef47e6cd03ffa1f79 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 12:46:28 +0200 Subject: [PATCH 13/37] published_article.py: Make sure the cirrussearch params are set and the checks are run. Add new methods to increase code readability --- .../wikimedia/wikidata/query/__init__.py | 4 +- .../wikidata/query/preprint_article.py | 2 +- .../wikidata/query/published_article.py | 38 ++++++++++--------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py index 4478e9b..c80d5f2 100644 --- a/src/models/wikimedia/wikidata/query/__init__.py +++ b/src/models/wikimedia/wikidata/query/__init__.py @@ -53,11 +53,11 @@ def __execute__(self): def get_results(self): """Do everything needed to get the results""" self.__strip_bad_chars__() - self.__build_query__() + self.__prepare_and_build_query__() self.__execute__() self.__parse_results__() - def __build_query__(self): + def __prepare_and_build_query__(self): pass def print_number_of_results(self): diff --git a/src/models/wikimedia/wikidata/query/preprint_article.py b/src/models/wikimedia/wikidata/query/preprint_article.py index a517162..6c9e4a8 100644 --- a/src/models/wikimedia/wikidata/query/preprint_article.py +++ b/src/models/wikimedia/wikidata/query/preprint_article.py @@ -3,7 +3,7 @@ class PreprintArticleQuery(ArticleQuery): - def __build_query__(self): + def __prepare_and_build_query__(self): self.query_string = f""" #{config.user_agent} SELECT DISTINCT ?item ?itemLabel diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py index d259164..ae9c457 100644 --- a/src/models/wikimedia/wikidata/query/published_article.py +++ b/src/models/wikimedia/wikidata/query/published_article.py @@ -6,13 +6,11 @@ class PublishedArticleQuery(ArticleQuery): cirrussearch_parameters: str = "" - def check_we_got_everything_we_need(self): + def __check_we_got_everything_we_need__(self): if not self.main_subject_item: - raise ValueError("suggestion was None") - if not self.main_subject_item: - raise ValueError("suggestion.main_subject_item was None") + raise ValueError("main_subject_item was 
None") if not self.main_subject_item.args: - raise ValueError("suggestion.args was None") + raise ValueError("main_subject_item.args was None") if self.main_subject_item.args.limit_to_items_without_p921: raise Exception( "Limiting to items without P921 is not " "supported yet for this task." @@ -21,25 +19,19 @@ def check_we_got_everything_we_need(self): raise ValueError("task was None") if self.main_subject_item.task.language_code is None: raise ValueError("task.language_code was None") - if self.main_subject_item.args.limit_to_items_without_p921: - console.print( - "Limiting to scholarly articles without P921 main subject only" - ) - cirrussearch_parameters = ( - f"haswbstatement:P31=Q13442814 -haswbstatement:P921" - ) - else: - cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={self.main_subject_item.id}" if self.main_subject_item.task is None: raise ValueError("task was None") if self.main_subject_item.task.language_code is None: raise ValueError("task.language_code was None") - if cirrussearch_parameters is None: - raise ValueError("cirrussearch_parameters was None") - def __build_query__( + def __prepare_and_build_query__( self, ): + self.__check_we_got_everything_we_need__() + self.__setup_cirrussearch_params__() + self.__build_query__() + + def __build_query__(self): # This query uses https://www.w3.org/TR/sparql11-property-paths/ to # find subjects that are subclass of one another up to 3 hops away # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI @@ -80,3 +72,15 @@ def __build_query__( SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} }} """ + + def __setup_cirrussearch_params__(self): + if self.main_subject_item.args.limit_to_items_without_p921: + console.print( + "Limiting to scholarly articles without P921 main subject only" + ) + self.cirrussearch_parameters = ( + f"haswbstatement:P31=Q13442814 -haswbstatement:P921" + ) + else: + self.cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={self.main_subject_item.id}" + From 66224990d31507fdbd59b88ab9bd74e33fd061f7 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 18:29:38 +0200 Subject: [PATCH 14/37] Rewrite a bit more and reenable Riksdagen Documents 1 new file and 14 modified --- src/helpers/cli_messages.py | 2 +- src/helpers/console.py | 7 -- src/helpers/jobs.py | 2 +- src/helpers/menus.py | 1 - src/models/items/__init__.py | 9 +- src/models/items/riksdagen_documents.py | 96 +++++-------------- src/models/items/scholarly_articles.py | 10 -- src/models/login.py | 2 +- src/models/suggestion.py | 6 +- .../wikimedia/wikidata/item/main_subject.py | 5 +- .../wikimedia/wikidata/query/__init__.py | 4 +- .../wikimedia/wikidata/query/article.py | 8 -- .../wikidata/query/preprint_article.py | 8 +- .../wikidata/query/published_article.py | 5 +- .../wikidata/query/riksdagen_document.py | 33 +++++++ src/tasks.py | 16 ++-- 16 files changed, 91 insertions(+), 123 deletions(-) delete mode 100644 src/models/wikimedia/wikidata/query/article.py create mode 100644 src/models/wikimedia/wikidata/query/riksdagen_document.py diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py index b97132b..1caf3c7 100644 --- a/src/helpers/cli_messages.py +++ b/src/helpers/cli_messages.py @@ -1,4 +1,4 @@ -from __future__ import annotations +# from __future__ import annotations import argparse from typing import Set diff --git a/src/helpers/console.py 
b/src/helpers/console.py index de8fe73..68951a2 100644 --- a/src/helpers/console.py +++ b/src/helpers/console.py @@ -1,12 +1,5 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - from rich.console import Console -if TYPE_CHECKING: - pass - console = Console() diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py index 8c69477..9311745 100644 --- a/src/helpers/jobs.py +++ b/src/helpers/jobs.py @@ -1,4 +1,4 @@ -from __future__ import annotations +# from __future__ import annotations import argparse import logging diff --git a/src/helpers/menus.py b/src/helpers/menus.py index 4468d71..360a144 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -13,7 +13,6 @@ def select_suggestion(suggestions: List[MainSubjectItem], item: Item): if not item or not item.id or not suggestions: raise ValueError("Did not get what we need") - logger = logging.getLogger(__name__) menu = SelectionMenu( suggestions, f"Does any of these fit the label \n'{item.label}'" ) diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py index 3ec8ca9..6f4058b 100644 --- a/src/models/items/__init__.py +++ b/src/models/items/__init__.py @@ -1,4 +1,4 @@ -from __future__ import annotations +# from __future__ import annotations import argparse import logging @@ -7,6 +7,7 @@ from pydantic import BaseModel +from src.helpers.console import console from src.models.wikimedia.wikidata.item.sparql import SparqlItem # if TYPE_CHECKING: @@ -41,3 +42,9 @@ def remove_duplicates(self): logger.debug(f"{len(self.sparql_items)} before duplicate removal") self.sparql_items = list(set(self.sparql_items)) logger.debug(f"{len(self.sparql_items)} after duplicate removal") + + def print_total_items(self): + console.print(f"Got a total of {len(self.sparql_items)} items") + + def execute_queries(self): + pass diff --git a/src/models/items/riksdagen_documents.py b/src/models/items/riksdagen_documents.py index 51585ca..dac12b0 100644 --- a/src/models/items/riksdagen_documents.py +++ b/src/models/items/riksdagen_documents.py @@ -1,72 +1,24 @@ -# import logging -# -# from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore -# -# import config -# from src.helpers.console import console -# from src.models.items import Items -# from src.models.suggestion import Suggestion -# from src.models.task import Task -# from src.models.wikimedia.wikidata.sparql_item import SparqlItem -# -# -# class RiksdagenDocumentItems(Items): -# def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): -# # logger = logging.getLogger(__name__) -# if suggestion is None: -# raise ValueError("suggestion was None") -# if suggestion.main_subject_item is None: -# raise ValueError("suggestion.main_subject_item was None") -# if suggestion.args is None: -# raise ValueError("suggestion.args was None") -# if suggestion.args.limit_to_items_without_p921: -# raise Exception( -# "Limiting to items without P921 is not " "supported yet for this task." 
-# ) -# if suggestion.search_strings is None: -# raise ValueError("suggestion.search_strings was None") -# if task is None: -# raise ValueError("task was None") -# if task.language_code is None: -# raise ValueError("task.language_code was None") -# # Fetch all items matching the search strings -# self.list = [] -# # Include spaces around the n-gram to avoid edits like this one -# # https://www.wikidata.org/w/index.php?title=Q40671507&diff=1497186802&oldid=1496945583 -# # Lowercase is not needed here as Elastic matches anyway -# for search_string in suggestion.search_strings: -# results = execute_sparql_query( -# f""" -# #{config.user_agent} -# SELECT DISTINCT ?main_subject_item ?itemLabel -# WHERE {{ -# hint:Query hint:optimizer "None". -# SERVICE wikibase:mwapi {{ -# bd:serviceParam wikibase:api "Search"; -# wikibase:endpoint "www.wikidata.org"; -# mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921={suggestion.main_subject_item.id} "{search_string}"' . -# ?title wikibase:apiOutput mwapi:title. -# }} -# BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?main_subject_item) -# ?main_subject_item rdfs:label ?label. -# # We lowercase the label first and search for the -# # string in both the beginning, middle and end of the label -# FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || -# REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || -# REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) -# # remove more specific forms of the main subject also -# # Thanks to Jan Ainali for this improvement :) -# MINUS {{?main_subject_item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.main_subject_item.id}. }} -# SERVICE wikibase:label {{ bd:serviceParam wikibase:language "sv". 
}} -# }} -# """, -# ) -# for item_json in results["results"]["bindings"]: -# logging.debug(f"item_json:{item_json}") -# item = SparqlItem(**item_json) -# self.list.append(item) -# logging.info( -# f'Got {len(results["results"]["bindings"])} items from ' -# f"WDQS using the search string {search_string}" -# ) -# console.print(f"Got a total of {len(self.list)} items") +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore + +from src.models.items import Items +from src.models.wikimedia.wikidata.query.riksdagen_document import ( + RiksdagenDocumentQuery, +) + + +# logger = logging.getLogger(__name__) + + +class RiksdagenDocumentItems(Items): + def fetch_based_on_label(self): + self.execute_queries() + self.print_total_items() + + def execute_queries(self): + # Fetch all items matching the search strings + for search_string in self.main_subject_item.search_strings: + riksdagen_query = RiksdagenDocumentQuery( + main_subject_item=self.main_subject_item, search_string=search_string + ) + riksdagen_query.get_results() + self.sparql_items.extend(riksdagen_query.items) diff --git a/src/models/items/scholarly_articles.py b/src/models/items/scholarly_articles.py index 3f2c0eb..d19a583 100644 --- a/src/models/items/scholarly_articles.py +++ b/src/models/items/scholarly_articles.py @@ -3,7 +3,6 @@ from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore -from src.helpers.console import console from src.models.items import Items from src.models.wikimedia.wikidata.query.preprint_article import PreprintArticleQuery from src.models.wikimedia.wikidata.query.published_article import PublishedArticleQuery @@ -15,7 +14,6 @@ class ScholarlyArticleItems(Items): """This supports both published peer reviewed articles and preprints""" cirrussearch_parameters: str = "" - query: str = "" results: Dict = {} def fetch_based_on_label(self): @@ -34,17 +32,9 @@ def execute_queries(self): # https://pythonexamples.org/python-append-list-to-another-list/ self.sparql_items.extend(published_article_query.items) published_article_query.print_number_of_results() - # preprints - # We don't use CirrusSearch in this query because we can do it more easily in - # SPARQL on a small subgraph like this - # find all items that are ?main_subject_item wdt:P31/wd:P279* wd:Q1266946 - # minus the Qid we want to add preprint_query = PreprintArticleQuery( search_string=search_string, main_subject_item=self.main_subject_item ) preprint_query.get_results() preprint_query.print_number_of_results() self.sparql_items.extend(preprint_query.items) - - def print_total_items(self): - console.print(f"Got a total of {len(self.sparql_items)} items") diff --git a/src/models/login.py b/src/models/login.py index ff2a74e..ccb2206 100644 --- a/src/models/login.py +++ b/src/models/login.py @@ -1,4 +1,4 @@ -from wikibaseintegrator import wbi_config, wbi_login # type: ignore +from wikibaseintegrator import wbi_config, wbi_login # type: ignore import config diff --git a/src/models/suggestion.py b/src/models/suggestion.py index cd1a393..a992433 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -1,7 +1,6 @@ -from __future__ import annotations +# from __future__ import annotations import logging -from typing import TYPE_CHECKING from pydantic import BaseModel from wikibaseintegrator import WikibaseIntegrator # type: ignore @@ -9,9 +8,6 @@ from wikibaseintegrator.models import Claim # type: ignore from wikibaseintegrator.wbi_helpers import search_entities # type: ignore -if TYPE_CHECKING: - pass - 
logger = logging.getLogger(__name__) diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index 926363a..b918a14 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -14,6 +14,7 @@ from src.helpers.console import console from src.helpers.questions import ask_yes_no_question from src.models.items import Items +from src.models.items.riksdagen_documents import RiksdagenDocumentItems from src.models.items.scholarly_articles import ScholarlyArticleItems from src.models.wikimedia.wikidata.enums import Property, Qid from src.models.wikimedia.wikidata.item import Item @@ -221,8 +222,8 @@ def __check_we_got_what_we_need__(self): def __instantiate_the_right_class_for_this_task__(self): if self.task.id == TaskIds.SCHOLARLY_ARTICLES: self.items = ScholarlyArticleItems(main_subject_item=self) - # elif self.task.id == TaskIds.RIKSDAGEN_DOCUMENTS: - # items = RiksdagenDocumentItems(main_subject_item=self) + elif self.task.id == TaskIds.RIKSDAGEN_DOCUMENTS: + self.items = RiksdagenDocumentItems(main_subject_item=self) # elif self.task.id == TaskIds.THESIS: # items = ThesisItems(main_subject_item=self) # elif self.task.id == TaskIds.ACADEMIC_JOURNALS: diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py index c80d5f2..c9125d2 100644 --- a/src/models/wikimedia/wikidata/query/__init__.py +++ b/src/models/wikimedia/wikidata/query/__init__.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, List +from typing import Dict, List, Any from pydantic import BaseModel from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore @@ -15,6 +15,8 @@ class Query(BaseModel): search_string = "" query_string = "" items: List[Item] = [] + # any here because of pydantic error + main_subject_item: Any def __parse_results__(self) -> None: # console.print(self.results) diff --git a/src/models/wikimedia/wikidata/query/article.py b/src/models/wikimedia/wikidata/query/article.py deleted file mode 100644 index 91e14f5..0000000 --- a/src/models/wikimedia/wikidata/query/article.py +++ /dev/null @@ -1,8 +0,0 @@ -from typing import Any - -from src.models.wikimedia.wikidata.query import Query - - -class ArticleQuery(Query): - # any here because of pydantic error - main_subject_item: Any diff --git a/src/models/wikimedia/wikidata/query/preprint_article.py b/src/models/wikimedia/wikidata/query/preprint_article.py index 6c9e4a8..356422a 100644 --- a/src/models/wikimedia/wikidata/query/preprint_article.py +++ b/src/models/wikimedia/wikidata/query/preprint_article.py @@ -1,9 +1,13 @@ import config -from src.models.wikimedia.wikidata.query.article import ArticleQuery +from src.models.wikimedia.wikidata.query import Query -class PreprintArticleQuery(ArticleQuery): +class PreprintArticleQuery(Query): def __prepare_and_build_query__(self): + # We don't use CirrusSearch in this query because we can do it more easily in + # SPARQL on a small subgraph like this + # find all items that are ?main_subject_item wdt:P31/wd:P279* wd:Q1266946 + # minus the Qid we want to add self.query_string = f""" #{config.user_agent} SELECT DISTINCT ?item ?itemLabel diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py index ae9c457..9343b8b 100644 --- a/src/models/wikimedia/wikidata/query/published_article.py +++ b/src/models/wikimedia/wikidata/query/published_article.py @@ -1,9 
+1,9 @@ import config from src.helpers.console import console -from src.models.wikimedia.wikidata.query.article import ArticleQuery +from src.models.wikimedia.wikidata.query import Query -class PublishedArticleQuery(ArticleQuery): +class PublishedArticleQuery(Query): cirrussearch_parameters: str = "" def __check_we_got_everything_we_need__(self): @@ -83,4 +83,3 @@ def __setup_cirrussearch_params__(self): ) else: self.cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={self.main_subject_item.id}" - diff --git a/src/models/wikimedia/wikidata/query/riksdagen_document.py b/src/models/wikimedia/wikidata/query/riksdagen_document.py new file mode 100644 index 0000000..fa06308 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/riksdagen_document.py @@ -0,0 +1,33 @@ +import config +from src.models.wikimedia.wikidata.query import Query + + +class RiksdagenDocumentQuery(Query): + def __prepare_and_build_query__(self): + lang = self.main_subject_item.task.language_code.value + self.query_string = f"""
+        #{config.user_agent}
+        SELECT DISTINCT ?item ?itemLabel
+        WHERE {{
+          hint:Query hint:optimizer "None".
+          SERVICE wikibase:mwapi {{
+            bd:serviceParam wikibase:api "Search";
+            wikibase:endpoint "www.wikidata.org";
+            mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921={self.main_subject_item.id} "{self.search_string}"' .
+            ?title wikibase:apiOutput mwapi:title.
+          }}
+          BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item)
+          ?item rdfs:label ?label.
+          # We lowercase the label first and search for the
+          # string in both the beginning, middle and end of the label
+          FILTER(CONTAINS(
+          LCASE(?label), " {self.search_string.lower()} "@{lang}) ||
+          REGEX(LCASE(?label), ".* {self.search_string.lower()}$"@{lang}) ||
+          REGEX(LCASE(?label), "^{self.search_string.lower()} .*"@{lang})
+          )
+          # remove more specific forms of the main subject also
+          # Thanks to Jan Ainali for this improvement :)
+          MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{self.main_subject_item.id}. }}
+          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "sv". }}
+        }}
+        """ diff --git a/src/tasks.py b/src/tasks.py index 051cb91..3e72592 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -25,14 +25,14 @@ number_of_queries_per_search_string=2, ) ), - # Task( - # **dict( - # id=TaskIds.RIKSDAGEN_DOCUMENTS, - # label="Add main subject to documents from Riksdagen", - # language_code=SupportedLanguageCode.SWEDISH, - # best_practice_information=None, - # ) - # ), + Task( + **dict( + id=TaskIds.RIKSDAGEN_DOCUMENTS, + label="Add main subject to documents from Riksdagen", + language_code=SupportedLanguageCode.SWEDISH, + best_practice_information=None, + ) + ), # Task( # **dict( # id=TaskIds.THESIS, From 9bcc8565edbee9fd520d4c80519ff7ba849971f6 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 20:25:03 +0200 Subject: [PATCH 15/37] Update classes.puml and comment out dead code. 
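The updated diagram documents the new Query hierarchy, where the abstract base class owns the fixed get_results() workflow and each concrete query only overrides __prepare_and_build_query__(). A minimal sketch of that template-method pattern (simplified and partly hypothetical, not the actual classes; see src/models/wikimedia/wikidata/query/ for the real code):

    from typing import Any, List

    from pydantic import BaseModel


    class Query(BaseModel):
        search_string: str = ""
        query_string: str = ""
        items: List[Any] = []

        def get_results(self):
            """Fixed workflow; subclasses only vary the query building"""
            self.__strip_bad_chars__()
            self.__prepare_and_build_query__()
            self.__execute__()
            self.__parse_results__()

        def __strip_bad_chars__(self):
            # The real class removes more symbols than this
            self.search_string = self.search_string.replace('"', "")

        def __prepare_and_build_query__(self):
            pass  # overridden in each concrete query class

        def __execute__(self):
            # The real class calls execute_sparql_query() against WDQS here
            print(f"would execute:\n{self.query_string}")

        def __parse_results__(self):
            pass  # the real class fills self.items from the response


    class PreprintArticleQuery(Query):
        def __prepare_and_build_query__(self):
            self.query_string = f'SELECT ?item WHERE {{ ... "{self.search_string}" ... }}'

Calling PreprintArticleQuery(search_string="test").get_results() then runs all four steps in order without the subclass needing to know about any of them.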
--- diagrams/classes.puml | 111 ++++++++++++-------- src/models/wikimedia/wikidata/foreign_id.py | 46 ++++---- 2 files changed, 90 insertions(+), 67 deletions(-) diff --git a/diagrams/classes.puml b/diagrams/classes.puml index 3bf280d..5593a04 100644 --- a/diagrams/classes.puml +++ b/diagrams/classes.puml @@ -46,17 +46,44 @@ package wikimedia { class EntityID{ letter: WikidataNamespaceLetters rest: str - __init__() __str__() } - class ForeignID{ - __init__() + abstract class Query{ + __execute__() + __parse_results__() + __prepare_and_build_query__() + __strip_bad_chars__() + get_results() + print_number_of_results() + } + class PreprintArticleQuery { + __prepare_and_build_query__() + } + class RiksdagenDocumentQuery { + __prepare_and_build_query__() + } + class PublishedArticleQuery { + __build_query__() + __check_we_got_everything_we_need__() + __prepare_and_build_query__() + __setup_cirrussearch_params__() } class SparqlItem{ item: Value itemLabel: Value validate_qid_and_copy_label() } + class MainSubjectItem { + item: Item = None + search_strings: List[str] = None + task: Task = None + args: argparse.Namespace = None + __init__() + __str__() + add_to_items() + extract_search_strings() + search_urls ()) + } class Item{ label: Optional[str] = None description: Optional[str] = None @@ -84,51 +111,40 @@ package wikimedia { SUPINE THIRD_PERSON_SINGULAR } - enum WikidataLexicalCategory { - ADJECTIVE - ADVERB - AFFIX - NOUN - PROPER_NOUN - VERB - } - enum WikidataNamespaceLetters { - ITEM - LEXEME - PROPERTY - } +' enum WikidataLexicalCategory { +' ADJECTIVE +' ADVERB +' AFFIX +' NOUN +' PROPER_NOUN +' VERB +' } +' enum WikidataNamespaceLetters { +' ITEM +' LEXEME +' PROPERTY +' } } } package items { - abstract class Items - class AcademicJournalItems { - fetch_based_on_label() + abstract class Items { + execute_queries() + fetch_based_on_label() + number_of_sparql_items() + print_items_list() + print_total_items() + random_shuffle_items() + remove_duplicates() } class RiksdagenDocumentItems { - +list - +fetch_based_on_label() +execute_queries() +fetch_based_on_label() } - class ScholarlyArticleItems { - +list - +fetch_based_on_label() - } - class ThesisItems { - list - fetch_based_on_label() +execute_queries() +fetch_based_on_label() } } -class Suggestion { - item: Item = None - search_strings: List[str] = None - task: Task = None - args: argparse.Namespace = None - __init__() - __str__() - add_to_items() - extract_search_strings() - search_urls ()) -} class Task { best_practice_information: Union[str, None] = None @@ -136,7 +152,6 @@ class Task { label: str = None language_code: SupportedLanguageCode = None number_of_queries_per_search_string = 1 - __init__() __str__() } @@ -152,18 +167,26 @@ class BatchJob { +items: Items run() } - -Items <|-- AcademicJournalItems +class ItemSubjector { + export_jobs_to_dataframe() + match_main_subjects_from_sparql() + run() +} +'Items <|-- AcademicJournalItems Items <|-- RiksdagenDocumentItems Items <|-- ScholarlyArticleItems -Items <|-- ThesisItems +'Items <|-- ThesisItems BaseModel <|-- Entity BaseModel <|-- Task -BaseModel <|-- Suggestion BaseModel <|-- BatchJob BaseModel <|-- BatchJobs BaseModel <|-- Items +BaseModel <|-- ItemSubjector Entity <|-- Item Item <|-- SparqlItem +Item <|-- MainSubjectItem +Query <|-- PreprintArticleQuery +Query <|-- PublishedArticleQuery +Query <|-- RiksdagenDocumentQuery @enduml \ No newline at end of file diff --git a/src/models/wikimedia/wikidata/foreign_id.py b/src/models/wikimedia/wikidata/foreign_id.py index 
f370c95..6c402eb 100644 --- a/src/models/wikimedia/wikidata/foreign_id.py +++ b/src/models/wikimedia/wikidata/foreign_id.py @@ -1,23 +1,23 @@ -from typing import Optional - -from src.models.wikimedia.wikidata.entiyt_id import EntityId - - -class ForeignID: - id: Optional[str] - property: Optional[str] # This is the property with type ExternalId - source_item_id: Optional[str] # This is the Q-main_subject_item for the source - - def __init__( - self, - id: Optional[str] = None, - property: Optional[str] = None, - source_item_id: Optional[str] = None, - ): - self.id = id - if property is None: - raise ValueError("property was None") - self.property = str(EntityId(property)) - if source_item_id is None: - raise ValueError("source_item_id was None") - self.source_item_id = str(EntityId(source_item_id)) +# from typing import Optional +# +# from src.models.wikimedia.wikidata.entiyt_id import EntityId +# +# +# class ForeignID: +# id: Optional[str] +# property: Optional[str] # This is the property with type ExternalId +# source_item_id: Optional[str] # This is the Q-main_subject_item for the source +# +# def __init__( +# self, +# id: Optional[str] = None, +# property: Optional[str] = None, +# source_item_id: Optional[str] = None, +# ): +# self.id = id +# if property is None: +# raise ValueError("property was None") +# self.property = str(EntityId(property)) +# if source_item_id is None: +# raise ValueError("source_item_id was None") +# self.source_item_id = str(EntityId(source_item_id)) From 9f4f382289de4f023bc40e19c9967511ecc23339 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 21:16:33 +0200 Subject: [PATCH 16/37] Use properties instead of len() in multiple places. 3 files modified --- src/__init__.py | 10 +++++----- src/helpers/cli_messages.py | 15 ++++++++------- src/models/batch_jobs.py | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 050b440..dc7f651 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -69,8 +69,8 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None): ) for item_json in results["results"]["bindings"]: logging.debug(f"item_json:{item_json}") - main_subjects.append(item_json["main_subject_item"]["value"]) - if len(main_subjects) > 0: + main_subjects.append(item_json["item"]["value"]) + if main_subjects: console.print(f"Got {len(main_subjects)} results") batchjobs = get_validated_main_subjects_as_jobs( args=args, main_subjects=main_subjects @@ -87,13 +87,13 @@ def export_jobs_to_dataframe(): logger.info("Exporting jobs to DataFrame. 
All jobs are appended to one frame") batchjobs = parse_job_pickle() if batchjobs: - if batchjobs and batchjobs.job_count > 0: - logger.info(f"Found {batchjobs.job_count} jobs") + if batchjobs and batchjobs.number_of_jobs > 0: + logger.info(f"Found {batchjobs.number_of_jobs} jobs") df = pd.DataFrame() count = 1 for job in batchjobs.jobs: count += 1 - logger.info(f"Working on job {count}/{batchjobs.job_count}") + logger.info(f"Working on job {count}/{batchjobs.number_of_jobs}") job_df = pd.DataFrame() for item in job.main_subject_item.items.sparql_items: job_df = job_df.append( diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py index 1caf3c7..ad60b9d 100644 --- a/src/helpers/cli_messages.py +++ b/src/helpers/cli_messages.py @@ -48,12 +48,12 @@ def print_found_items_table(args: argparse.Namespace = None, items: Items = None if items.sparql_items is None: raise ValueError("items.sparql_items was None") table = Table(title="Matched items found") - if len(items.sparql_items) < 1000: + if items.number_of_sparql_items < 1000: list_to_show = items.sparql_items[0:50] else: # Show 1 sample for each 20 items in the sparql_items - list_to_show = items.sparql_items[0 : int(len(items.sparql_items) / 20)] - if len(items.sparql_items) > 4000: + list_to_show = items.sparql_items[0 : int(items.number_of_sparql_items / 20)] + if items.number_of_sparql_items > 4000: console.print( "[red]Warning: This is a very large batch, please proceed with caution[/red]" ) @@ -82,12 +82,13 @@ def print_finished(): def print_job_statistics(batchjobs: BatchJobs = None): if not batchjobs: - raise ValueError("jobs was None") + raise ValueError("batchjobs was None") if not batchjobs.jobs: - raise ValueError("batchjobs.jobs was None") + # No jobs to print information about + return if not isinstance(batchjobs.jobs, list): raise ValueError("jobs was not a sparql_items") - if not len(batchjobs.jobs): + if not batchjobs.number_of_jobs: console.print("The jobs sparql_items is empty") else: total_number_of_queries = sum([job.number_of_queries for job in batchjobs.jobs]) @@ -100,7 +101,7 @@ def print_job_statistics(batchjobs: BatchJobs = None): and job.main_subject_item.items.sparql_items ) console.print( - f"The jobs sparql_items now contain a total of {len(batchjobs.jobs)} " # type: ignore + f"The jobs sparql_items now contain a total of {batchjobs.number_of_jobs} " # type: ignore f"jobs with a total of " f"{total_number_of_items} items found from " f"{total_number_of_queries} queries" diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py index f5e27cc..3ed33e9 100644 --- a/src/models/batch_jobs.py +++ b/src/models/batch_jobs.py @@ -12,7 +12,7 @@ class BatchJobs(BaseModel): jobs: List[BatchJob] @property - def job_count(self): + def number_of_jobs(self): return len(self.jobs) def print_running_jobs(self): @@ -44,7 +44,7 @@ def run_jobs(self): self.print_running_jobs() start_time = datetime.now() for job in self.jobs: - job.main_subject_item.add_to_items(jobs=self.jobs, job_count=self.job_count) + job.main_subject_item.add_to_items(jobs=self.jobs, job_count=self.number_of_jobs) print_finished() end_time = datetime.now() console.print(f"Total runtime: {end_time - start_time}") From d56933069d896ceb993db38644a10a68857c1e98 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 23:12:17 +0200 Subject: [PATCH 17/37] main_subject.py: Better UI message --- src/models/wikimedia/wikidata/item/main_subject.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index b918a14..e9955e3 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -256,6 +256,6 @@ def fetch_items_and_get_job(self) -> Optional["BatchJob"]: else: if self.task: console.print( - f"Label for {self.task.language_code} was None on {self.url}, skipping" + f"Label for {self.task.language_code.name.title()} was None on {self.url}, skipping" ) return None From db0affe233bea0aac5d6b98513adc0bc2f4e36cb Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 23:35:58 +0200 Subject: [PATCH 18/37] Improve readability of model main_subject.py: fetch_items_and_get_job(): Split into smaller methods --- .../wikimedia/wikidata/item/main_subject.py | 47 ++++++++++++------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index e9955e3..ad0daac 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -235,27 +235,38 @@ def fetch_items_and_get_job(self) -> Optional["BatchJob"]: """This method handles all the work needed to return a job""" self.__strip_qid_prefix__() self.__fetch_label_and_description_and_aliases__() - if self.label: + if self.__got_label__(): console.print(f"Working on {self.label}") - if self.confirmation: - answer = ask_yes_no_question("Do you want to continue?") - if not answer: - return None - self.__prepare_before_fetching_items__() - if self.items: - with console.status( + if self.__is_confirmed__(): + return self.__fetch_and_parse__() + return None + def __is_confirmed__(self) -> bool: + if self.confirmation: + return ask_yes_no_question("Do you want to continue?") + else: + return True + + def __fetch_and_parse__(self) -> Optional[BatchJob]: + self.__prepare_before_fetching_items__() + if self.items: + with console.status( f"Fetching items with labels that have one of " f"the search strings by running a total of " f"{self.number_of_queries} " f"queries on WDQS..." - ): - self.items.fetch_based_on_label() - return self.__parse_into_job__() - else: - raise ValueError("items was None") + ): + self.items.fetch_based_on_label() + return self.__parse_into_job__() else: - if self.task: - console.print( - f"Label for {self.task.language_code.name.title()} was None on {self.url}, skipping" - ) - return None + raise ValueError("items was None") + + def __got_label__(self) -> bool: + if not self.label: + if not self.task: + raise ValueError("task was None") + console.print( + f"Label for {self.task.language_code.name.title()} was None, see {self.url}, skipping" + ) + return False + else: + return True From 972f61d7dde21c35f8aa7a461283c4ce8a58f7e3 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 23:47:53 +0200 Subject: [PATCH 19/37] Remove export to dataframe support to keep the tool simple and focused. 
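Anyone who still needs the DataFrame export can build the same frame outside the tool. A rough standalone sketch, assuming parse_job_pickle() stays importable from src.helpers.pickle and that jobs still expose main_subject_item.items.sparql_items as in this patch series; collecting the rows up front also avoids the deprecated DataFrame.append() the removed code relied on:

    import pandas as pd

    from src.helpers.pickle import parse_job_pickle

    batchjobs = parse_job_pickle()
    if batchjobs and batchjobs.number_of_jobs > 0:
        rows = [
            dict(qid=item.id, label=item.label, description=item.description)
            for job in batchjobs.jobs
            for item in job.main_subject_item.items.sparql_items
        ]
        # One row per matched item, mirroring the removed export
        pd.DataFrame(rows).to_pickle("dataframe.pkl.gz")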
2 files modified --- src/__init__.py | 61 +++++------------------------------ src/helpers/argparse_setup.py | 6 ---- 2 files changed, 8 insertions(+), 59 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index dc7f651..6d2ccf2 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -43,14 +43,13 @@ from src.tasks import tasks logging.basicConfig(level=config.loglevel) - +logger = logging.getLogger(__name__) class ItemSubjector(BaseModel): @staticmethod def match_main_subjects_from_sparql(args: argparse.Namespace = None): """Collect subjects via SPARQL and call get_validated_main_subjects() If we get any validated jobs we handle them""" - logger = logging.getLogger(__name__) if args is None or args.sparql is None: raise ValueError("args.sparql was None") if "P1889" not in args.sparql: @@ -81,55 +80,14 @@ def match_main_subjects_from_sparql(args: argparse.Namespace = None): else: console.print("Got 0 results. Try another query or debug it using --debug") - @staticmethod - def export_jobs_to_dataframe(): - logger = logging.getLogger(__name__) - logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame") - batchjobs = parse_job_pickle() - if batchjobs: - if batchjobs and batchjobs.number_of_jobs > 0: - logger.info(f"Found {batchjobs.number_of_jobs} jobs") - df = pd.DataFrame() - count = 1 - for job in batchjobs.jobs: - count += 1 - logger.info(f"Working on job {count}/{batchjobs.number_of_jobs}") - job_df = pd.DataFrame() - for item in job.main_subject_item.items.sparql_items: - job_df = job_df.append( - pd.DataFrame( - data=[ - dict( - qid=item.id, - label=item.label, - description=item.description, - ) - ] - ) - ) - df = df.append(job_df) - logger.debug( - f"Added {len(job.main_subject_item.items.sparql_items)} items to the dataframe" - ) - logger.debug(f"Exporting {len(df)} rows to pickle") - pickle_filename = "dataframe.pkl.gz" - df.to_pickle(pickle_filename) - console.print(f"Wrote to {pickle_filename} in the current directory") - else: - console.print( - "No jobs found. Create a job sparql_items first by using '--prepare-jobs'" - ) - def run(self): """This is the main function that makes everything else happen""" - logger = logging.getLogger(__name__) migrate_pickle_detection() args = setup_argparse_and_return_args() # console.print(args.sparql_items) if args.remove_prepared_jobs is True: remove_job_pickle() console.print("Removed the job sparql_items.") - # exit(0) if args.prepare_jobs is True: logger.info("Preparing jobs") if check_if_pickle_exists(config.job_pickle_file_path): @@ -145,18 +103,15 @@ def run(self): batchjobs.run_jobs() # Remove the pickle afterwards remove_job_pickle(hash=file_hash) - elif args.export_jobs_to_dataframe: - self.export_jobs_to_dataframe() elif args.sparql: self.match_main_subjects_from_sparql(args=args) else: - # if not args.run_prepared_jobs: if args.add is None: console.print("Got no arguments or QIDs. 
Try '--help' for help.") - exit(0) - batchjobs = get_validated_main_subjects_as_jobs( - args=args, main_subjects=args.add - ) - handle_job_preparation_or_run_directly_if_any_jobs( - args=args, batchjobs=batchjobs - ) + else: + batchjobs = get_validated_main_subjects_as_jobs( + args=args, main_subjects=args.add + ) + handle_job_preparation_or_run_directly_if_any_jobs( + args=args, batchjobs=batchjobs + ) diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py index 8b794ab..49cceec 100644 --- a/src/helpers/argparse_setup.py +++ b/src/helpers/argparse_setup.py @@ -106,10 +106,4 @@ def setup_argparse_and_return_args(): type=int, help="When working on SPARQL queries of e.g. galaxies, match more until this many matches are in the job sparql_items", ) - parser.add_argument( - "--export-jobs-to-dataframe", - action="store_true", - help="Export the prepared job sparql_items to a Pandas DataFrame.", - default=False, - ) return parser.parse_args() From b78623ccfa868f794b876939b020f77c35400b4f Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 23:49:10 +0200 Subject: [PATCH 20/37] Remove deprecated match existing main subjects. --- src/helpers/argparse_setup.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py index 49cceec..0d35fb4 100644 --- a/src/helpers/argparse_setup.py +++ b/src/helpers/argparse_setup.py @@ -59,15 +59,6 @@ def setup_argparse_and_return_args(): action="store_true", help="Remove prepared jobs", ) - parser.add_argument( - "-m", - "--match-existing-main-subjects", - action="store_true", - help=( - "Match from sparql_items of 136.000 already used " - "main subjects on other scientific articles" - ), - ) parser.add_argument( "-w", "--limit-to-items-without-p921", From 14e38ac655e5d4e76a8dd7db13cd2af779d10bd2 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 23:51:25 +0200 Subject: [PATCH 21/37] main_subject.py: Fix typing --- src/models/wikimedia/wikidata/item/main_subject.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index ad0daac..b80c1e7 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -246,7 +246,7 @@ def __is_confirmed__(self) -> bool: else: return True - def __fetch_and_parse__(self) -> Optional[BatchJob]: + def __fetch_and_parse__(self) -> Optional["BatchJob"]: self.__prepare_before_fetching_items__() if self.items: with console.status( From d2e156338537c5ba7b37ba7cf2fad35b0a366aa1 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Mon, 3 Oct 2022 23:56:28 +0200 Subject: [PATCH 22/37] pre-commit fixes --- pyproject.toml | 2 +- src/__init__.py | 1 + src/models/batch_jobs.py | 4 +++- src/models/items/riksdagen_documents.py | 1 - src/models/wikimedia/wikidata/item/main_subject.py | 9 +++++---- src/models/wikimedia/wikidata/query/__init__.py | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6e6a2d3..e2b1f91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "itemsubjector" -version = "0.3.2" +version = "0.3.3" description = "CLI-tool to easily add \"main subject\" aka topics in bulk to groups of items on Wikidata" authors = 
["Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>"] license = "GPLv3+" diff --git a/src/__init__.py b/src/__init__.py index 6d2ccf2..1e29f45 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -45,6 +45,7 @@ logging.basicConfig(level=config.loglevel) logger = logging.getLogger(__name__) + class ItemSubjector(BaseModel): @staticmethod def match_main_subjects_from_sparql(args: argparse.Namespace = None): diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py index 3ed33e9..bea3de1 100644 --- a/src/models/batch_jobs.py +++ b/src/models/batch_jobs.py @@ -44,7 +44,9 @@ def run_jobs(self): self.print_running_jobs() start_time = datetime.now() for job in self.jobs: - job.main_subject_item.add_to_items(jobs=self.jobs, job_count=self.number_of_jobs) + job.main_subject_item.add_to_items( + jobs=self.jobs, job_count=self.number_of_jobs + ) print_finished() end_time = datetime.now() console.print(f"Total runtime: {end_time - start_time}") diff --git a/src/models/items/riksdagen_documents.py b/src/models/items/riksdagen_documents.py index dac12b0..c1163fc 100644 --- a/src/models/items/riksdagen_documents.py +++ b/src/models/items/riksdagen_documents.py @@ -5,7 +5,6 @@ RiksdagenDocumentQuery, ) - # logger = logging.getLogger(__name__) diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index b80c1e7..63060ff 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -240,6 +240,7 @@ def fetch_items_and_get_job(self) -> Optional["BatchJob"]: if self.__is_confirmed__(): return self.__fetch_and_parse__() return None + def __is_confirmed__(self) -> bool: if self.confirmation: return ask_yes_no_question("Do you want to continue?") @@ -250,10 +251,10 @@ def __fetch_and_parse__(self) -> Optional["BatchJob"]: self.__prepare_before_fetching_items__() if self.items: with console.status( - f"Fetching items with labels that have one of " - f"the search strings by running a total of " - f"{self.number_of_queries} " - f"queries on WDQS..." + f"Fetching items with labels that have one of " + f"the search strings by running a total of " + f"{self.number_of_queries} " + f"queries on WDQS..." ): self.items.fetch_based_on_label() return self.__parse_into_job__() diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py index c9125d2..b95ce66 100644 --- a/src/models/wikimedia/wikidata/query/__init__.py +++ b/src/models/wikimedia/wikidata/query/__init__.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, List, Any +from typing import Any, Dict, List from pydantic import BaseModel from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore From 3bf273eeb3feb13e0a30fa79ee86b2394f23bf63 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Tue, 4 Oct 2022 00:27:45 +0200 Subject: [PATCH 23/37] scholarly_articles.py: execute_queries(): Use it thesis.py: New file with new class. 
--- src/models/items/scholarly_articles.py | 9 +++++- src/models/wikimedia/wikidata/query/thesis.py | 30 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 src/models/wikimedia/wikidata/query/thesis.py diff --git a/src/models/items/scholarly_articles.py b/src/models/items/scholarly_articles.py index d19a583..af3411e 100644 --- a/src/models/items/scholarly_articles.py +++ b/src/models/items/scholarly_articles.py @@ -6,12 +6,13 @@ from src.models.items import Items from src.models.wikimedia.wikidata.query.preprint_article import PreprintArticleQuery from src.models.wikimedia.wikidata.query.published_article import PublishedArticleQuery +from src.models.wikimedia.wikidata.query.thesis import ThesisQuery logger = logging.getLogger(__name__) class ScholarlyArticleItems(Items): - """This supports both published peer reviewed articles and preprints""" + """This supports published peer-reviewed articles, theses and preprints""" cirrussearch_parameters: str = "" results: Dict = {} @@ -38,3 +39,9 @@ def execute_queries(self): preprint_query.get_results() preprint_query.print_number_of_results() self.sparql_items.extend(preprint_query.items) + thesis_query = ThesisQuery( + search_string=search_string, main_subject_item=self.main_subject_item + ) + thesis_query.get_results() + thesis_query.print_number_of_results() + self.sparql_items.extend(thesis_query.items) diff --git a/src/models/wikimedia/wikidata/query/thesis.py b/src/models/wikimedia/wikidata/query/thesis.py new file mode 100644 index 0000000..77b78eb --- /dev/null +++ b/src/models/wikimedia/wikidata/query/thesis.py @@ -0,0 +1,30 @@ +from src.models.wikimedia.wikidata.query import Query + + +class ThesisQuery(Query): + def __prepare_and_build_query__(self): + self.query_string = ( + f"""
+            SELECT DISTINCT ?item ?itemLabel
+            WHERE {{
+            {{
+            ?item wdt:P31/wd:P279* wd:Q1266946. # thesis
+            }} UNION
+            {{
+            ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation
+            }} UNION
+            {{
+            ?item wdt:P31/wd:P279* wd:Q3099732. # technical report
+            }}
+            MINUS {{
+            ?item wdt:P921 wd:{self.main_subject_item.id};
+            }}
+            ?item rdfs:label ?label.
+            FILTER(CONTAINS(LCASE(?label), " {self.search_string.lower()} "@{self.main_subject_item.task.language_code.value}) ||
+            REGEX(LCASE(?label), ".* {self.search_string.lower()}$"@{self.main_subject_item.task.language_code.value}) ||
+            REGEX(LCASE(?label), "^{self.search_string.lower()} .*"@{self.main_subject_item.task.language_code.value}))
+            MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{self.main_subject_item.id}. }}
+            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
+            }}
+            """,
+) From b12904ceaaa14916680ad781884df004564a31bf Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Tue, 4 Oct 2022 00:49:49 +0200 Subject: [PATCH 24/37] tasks.py: Fix label to include theses --- src/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks.py b/src/tasks.py index 3e72592..9a6bd80 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -7,7 +7,7 @@ Task( **dict( id=TaskIds.SCHOLARLY_ARTICLES, - label="Add main subject to scholarly articles and preprints", + label="Add main subject to scholarly articles, theses and preprints", language_code=SupportedLanguageCode.ENGLISH, best_practice_information=( "When adding Qid main subjects please try to first " From c3b4b08e2152ce68c4a87202d162367baa9662aa Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Tue, 4 Oct 2022 00:50:14 +0200 Subject: [PATCH 25/37] thesis.py: Fix tuple bug --- src/models/wikimedia/wikidata/query/thesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/wikimedia/wikidata/query/thesis.py b/src/models/wikimedia/wikidata/query/thesis.py index 77b78eb..2714157 100644 --- a/src/models/wikimedia/wikidata/query/thesis.py +++ b/src/models/wikimedia/wikidata/query/thesis.py @@ -26,5 +26,5 @@ def __prepare_and_build_query__(self): MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{self.main_subject_item.id}. }} SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} }} - """, + """ ) From cbae0135926e2a83d1cc9aab5f35a99b4ee2345a Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Tue, 4 Oct 2022 01:05:12 +0200 Subject: [PATCH 26/37] README.md: Update to reflect recent changes --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1afb081..fbd15c9 100644 --- a/README.md +++ b/README.md @@ -240,6 +240,13 @@ removed the QuickStatements export to simplify the program. * This project has been used in a scientific paper I wrote together with [Houcemeddine Turki](https://scholia.toolforge.org/author/Q53505397) +## Rewrite 2022 +* Important to break down methods so that one method does one task, to increase readability. -> helps reuse in other projects. +* Important to avoid resetting attributes and to instantiate classes instead. -> helps reuse in other projects. +* Simplify as much as possible to keep the whole thing lean and avoid scope creep. -> helps reuse in other projects. (KISS principle) +* Difficult to judge which features are used and which are not. User testing would be nice. +* UML diagrams are nice. They give a good quick overview. + # Thanks During the development of this tool the author got help multiple times from **Jan Ainali** and **Jon Søby** @@ -254,7 +261,7 @@ helpful people in the Wikimedia Cloud Services Support chat that helped with making batch jobs run successfully. Thanks also to **jsamwrites** for help with testing and suggestions -for improvement. +for improvement and for using the tool to improve a ton of items :). 
# License GPLv3+ From e2e46860e2222ca948a48f0f70c7dd2fe7e8b937 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Tue, 4 Oct 2022 01:05:23 +0200 Subject: [PATCH 27/37] sequence_sparql.puml: Update --- diagrams/sequence_sparql.puml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/diagrams/sequence_sparql.puml b/diagrams/sequence_sparql.puml index 57b3525..d55987c 100644 --- a/diagrams/sequence_sparql.puml +++ b/diagrams/sequence_sparql.puml @@ -15,7 +15,12 @@ alt "arguments: sparql && limit" ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details Wikidata -> ItemSubjector : response ItemSubjector -> User : present max 50 items + alt auto-approve < 50 items enabled + ItemSubjector -> User : auto-approving batch + end + alt auto-approve < 50 items enabled OR > 50 items ItemSubjector -> User : ask for approval of batch + end ItemSubjector -> User : show count of batches and matches in the job list in memory end alt "above limit" @@ -36,8 +41,13 @@ alt "arguments: sparql && limit && prepare-jobs" ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details Wikidata -> ItemSubjector : response ItemSubjector -> User : present max 50 items + alt auto-approve < 50 items enabled + ItemSubjector -> User : auto-approving batch + end + alt auto-approve < 50 items enabled OR > 50 items ItemSubjector -> User : ask for approval of batch - ItemSubjector -> User : show count of batches and matches in the job list in memory + end +ItemSubjector -> User : show count of batches and matches in the job list in memory end alt "above limit" ItemSubjector -> User : ask before continuing From ed595ed0cad7fbe37624eada0a94732c05e53c59 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Tue, 4 Oct 2022 01:45:51 +0200 Subject: [PATCH 28/37] Add new class MainSubjects 1 new file and 3 modified --- src/__init__.py | 55 +---- src/helpers/jobs.py | 147 -------------- src/models/main_subjects.py | 188 ++++++++++++++++++ .../wikimedia/wikidata/item/main_subject.py | 2 +- src/models/wikimedia/wikidata/query/thesis.py | 4 +- 5 files changed, 198 insertions(+), 198 deletions(-) delete mode 100644 src/helpers/jobs.py create mode 100644 src/models/main_subjects.py diff --git a/src/__init__.py b/src/__init__.py index 1e29f45..56276ad 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,4 +1,3 @@ -import argparse import logging import pandas as pd # type: ignore @@ -16,11 +15,6 @@ ) from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag from src.helpers.enums import TaskIds -from src.helpers.jobs import ( - get_validated_main_subjects_as_jobs, - handle_job_preparation_or_run_directly_if_any_jobs, - process_user_supplied_qids_into_batch_jobs, -) from src.helpers.menus import select_task from src.helpers.migration import migrate_pickle_detection from src.helpers.pickle import ( @@ -37,6 +31,7 @@ ) from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs +from src.models.main_subjects import MainSubjects from src.models.suggestion import Suggestion from src.models.task import Task from src.models.wikimedia.wikidata.entiyt_id import EntityId @@ -47,40 +42,6 @@ class ItemSubjector(BaseModel): - @staticmethod - def match_main_subjects_from_sparql(args: argparse.Namespace = None): - """Collect subjects via SPARQL and call get_validated_main_subjects() - If we get any 
validated jobs we handle them""" - if args is None or args.sparql is None: - raise ValueError("args.sparql was None") - if "P1889" not in args.sparql: - console.print( - "Your SPARQL did not contain P1889 (different from). " - "Please include 'MINUS {?main_subject_item wdt:P1889 [].}' " - "in your WHERE clause to avoid false positives." - ) - exit(0) - else: - logger.info("Detected P1889 in the query") - with console.status("Running query on WDQS..."): - main_subjects = [] - results = execute_sparql_query( - args.sparql.replace("{", "{{").replace("}", "}}"), - ) - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - main_subjects.append(item_json["item"]["value"]) - if main_subjects: - console.print(f"Got {len(main_subjects)} results") - batchjobs = get_validated_main_subjects_as_jobs( - args=args, main_subjects=main_subjects - ) - handle_job_preparation_or_run_directly_if_any_jobs( - args=args, batchjobs=batchjobs - ) - else: - console.print("Got 0 results. Try another query or debug it using --debug") - def run(self): """This is the main function that makes everything else happen""" migrate_pickle_detection() @@ -105,14 +66,14 @@ def run(self): # Remove the pickle afterwards remove_job_pickle(hash=file_hash) elif args.sparql: - self.match_main_subjects_from_sparql(args=args) + main_subjects = MainSubjects(args=args) + main_subjects.match_main_subjects_from_sparql() + main_subjects.get_validated_main_subjects_as_jobs() + main_subjects.handle_job_preparation_or_run_directly_if_any_jobs() else: if args.add is None: console.print("Got no arguments or QIDs. Try '--help' for help.") else: - batchjobs = get_validated_main_subjects_as_jobs( - args=args, main_subjects=args.add - ) - handle_job_preparation_or_run_directly_if_any_jobs( - args=args, batchjobs=batchjobs - ) + main_subjects = MainSubjects(args=args, main_subjects=args.add) + main_subjects.get_validated_main_subjects_as_jobs() + main_subjects.handle_job_preparation_or_run_directly_if_any_jobs() diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py deleted file mode 100644 index 9311745..0000000 --- a/src/helpers/jobs.py +++ /dev/null @@ -1,147 +0,0 @@ -# from __future__ import annotations - -import argparse -import logging -import random -from typing import List - -import config -from src.helpers.cli_messages import print_best_practice, print_job_statistics -from src.helpers.console import console -from src.helpers.menus import select_task -from src.helpers.questions import ask_add_to_job_queue, ask_yes_no_question -from src.models.batch_job import BatchJob -from src.models.batch_jobs import BatchJobs -from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem -from src.tasks import Task - -# TODO rewrite as OOP -logger = logging.getLogger(__name__) - - -def process_user_supplied_qids_into_batch_jobs( - args: argparse.Namespace = None, task: Task = None -) -> List[BatchJob]: - """Given a sparql_items of QIDs, we go through - them and return a sparql_items of jobs""" - # logger = logging.getLogger(__name__) - if not args: - raise ValueError("args was None") - if not task: - raise ValueError("task was None") - print_best_practice(task) - jobs = [] - for qid in args.add: - main_subject_item = MainSubjectItem(id=qid, args=args, task=task) - job = main_subject_item.fetch_items_and_get_job() - if job: - jobs.append(job) - return jobs - - -def handle_job_preparation_or_run_directly_if_any_jobs( - args: argparse.Namespace = None, batchjobs: BatchJobs = None -): - if batchjobs is None: - 
raise ValueError("batchjobs was None") - if args is None: - raise ValueError("args was None") - if len(batchjobs.jobs) > 0: - if args.prepare_jobs: - console.print(f"Adding {len(batchjobs.jobs)} job(s) " f"to the jobs file") - for job in batchjobs.jobs: - from src import add_to_job_pickle - - add_to_job_pickle(job) - print_job_statistics(batchjobs=batchjobs) - console.print( - f"You can run the jobs " - f"non-interactively e.g. on the Toolforge " - f"Kubernetes cluster using -r or --run-prepared-jobs. " - f"See Kubernetes_HOWTO.md for details." - ) - else: - batchjobs.run_jobs() - - -def get_validated_main_subjects_as_jobs( - args: argparse.Namespace, main_subjects: List[str] -) -> BatchJobs: - """This function randomly picks a subject and add it to the - sparql_items of jobs if it had any matches and the user approved it""" - if args is None: - raise ValueError("args was None") - if main_subjects is None: - raise ValueError("main subjects was None") - qid_subjects_not_picked_yet = main_subjects - task: Task = select_task() - if task is None: - raise ValueError("Got no task") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - batchjobs = BatchJobs(jobs=[]) - while True: - # Check if we have any subjects left in the sparql_items - if len(qid_subjects_not_picked_yet): - console.print(f"Picking a random main subject") - qid = random.choice(qid_subjects_not_picked_yet) - qid_subjects_not_picked_yet.remove(qid) - main_subject_item = MainSubjectItem( - id=qid, args=args, task=task, confirmation=args.no_confirmation - ) - job = main_subject_item.fetch_items_and_get_job() - if job: - # Here we check if the user has enabled no ask more limit. - if args.no_ask_match_more_limit is None: - logger.debug("No ask more was None") - if job.main_subject_item.items: - job.main_subject_item.items.print_items_list(args=args) - job.main_subject_item.print_search_strings() - answer = ask_add_to_job_queue(job) - if answer: - batchjobs.jobs.append(job) - else: - batchjobs.jobs.append(job) - logger.debug(f"joblist now has {len(batchjobs.jobs)} jobs") - print_job_statistics(batchjobs=batchjobs) - if len(qid_subjects_not_picked_yet) > 0: - if ( - args.no_ask_match_more_limit is None - or args.no_ask_match_more_limit - < sum( - len(job.main_subject_item.items.sparql_items) - for job in batchjobs.jobs - if job.main_subject_item.items - and job.main_subject_item.items.sparql_items - ) - ): - answer_was_yes = ask_yes_no_question("Match one more?") - if not answer_was_yes: - break - else: - console.print("No more subjects in the sparql_items.") - break - else: - console.print("No more subjects in the sparql_items. 
Exiting.") - break - if args.no_ask_match_more_limit: - batchjobs_limit = BatchJobs(jobs=[]) - for job in batchjobs.jobs: - if job.main_subject_item.items: - job.main_subject_item.items.print_items_list(args=args) - job.main_subject_item.print_search_strings() - if ( - config.automatically_approve_jobs_with_less_than_fifty_matches - and job.main_subject_item.items.number_of_sparql_items < 50 - ): - console.print( - f"This job with {job.main_subject_item.items.number_of_sparql_items} matching items was automatically approved", - style="green", - ) - batchjobs_limit.jobs.append(job) - else: - answer = ask_add_to_job_queue(job) - if answer: - batchjobs_limit.jobs.append(job) - return batchjobs_limit - return batchjobs diff --git a/src/models/main_subjects.py b/src/models/main_subjects.py new file mode 100644 index 0000000..6c232f9 --- /dev/null +++ b/src/models/main_subjects.py @@ -0,0 +1,188 @@ +# from __future__ import annotations + +import argparse +import logging +import random +from time import sleep +from typing import List, Optional + +from pydantic import BaseModel +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore + +import config +from src.helpers.cli_messages import print_best_practice, print_job_statistics +from src.helpers.console import console +from src.helpers.menus import select_task +from src.helpers.questions import ask_add_to_job_queue, ask_yes_no_question +from src.models.batch_job import BatchJob +from src.models.batch_jobs import BatchJobs +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.tasks import Task + +logger = logging.getLogger(__name__) + + +class MainSubjects(BaseModel): + args: argparse.Namespace + task: Optional[Task] = None + main_subjects: List[str] = [] + batchjobs: BatchJobs = BatchJobs(jobs=[]) + + class Config: + arbitrary_types_allowed = True + + def match_main_subjects_from_sparql(self): + """Collect subjects via SPARQL and call get_validated_main_subjects() + If we get any validated jobs we handle them""" + if self.args is None or self.args.sparql is None: + raise ValueError("args.sparql was None") + self.__check_different_from__() + self.__fetch_main_subjects__() + if self.main_subjects: + console.print(f"Got {len(self.main_subjects)} results") + sleep(1) + self.get_validated_main_subjects_as_jobs() + self.handle_job_preparation_or_run_directly_if_any_jobs() + else: + console.print("Got 0 results. 
Try another query or debug it using --debug")

+    def process_user_supplied_qids_into_batch_jobs(self) -> List[BatchJob]:
+        """Given a list of QIDs, we go through
+        them and return a list of jobs"""
+        # TODO this should not return anything
+        if self.task:
+            print_best_practice(self.task)
+            jobs = []
+            for qid in self.args.add:
+                main_subject_item = MainSubjectItem(id=qid, args=self.args, task=self.task)
+                job = main_subject_item.fetch_items_and_get_job_if_confirmed()
+                if job:
+                    jobs.append(job)
+            return jobs
+        return []
+
+    def handle_job_preparation_or_run_directly_if_any_jobs(self):
+        if self.batchjobs is None:
+            raise ValueError("batchjobs was None")
+        if self.args is None:
+            raise ValueError("args was None")
+        if self.batchjobs.number_of_jobs:
+            if self.args.prepare_jobs:
+                console.print(
+                    f"Adding {self.batchjobs.number_of_jobs} job(s) "
+                    f"to the jobs file"
+                )
+                for job in self.batchjobs.jobs:
+                    from src import add_to_job_pickle
+
+                    add_to_job_pickle(job)
+                print_job_statistics(batchjobs=self.batchjobs)
+                console.print(
+                    f"You can run the jobs "
+                    f"non-interactively e.g. on the Toolforge "
+                    f"Kubernetes cluster using -r or --run-prepared-jobs. "
+                    f"See Kubernetes_HOWTO.md for details."
+                )
+            else:
+                self.batchjobs.run_jobs()
+
+    def get_validated_main_subjects_as_jobs(
+        self,
+    ) -> None:
+        """This function randomly picks a subject and adds it to the
+        list of jobs if it has any matches and the user approves it"""
+        # TODO break this down into smaller methods
+        qid_subjects_not_picked_yet = self.main_subjects
+        self.__select_task__()
+        while True:
+            # Check if we have any subjects left in the list
+            if len(qid_subjects_not_picked_yet):
+                console.print(f"Picking a random main subject")
+                qid = random.choice(qid_subjects_not_picked_yet)
+                qid_subjects_not_picked_yet.remove(qid)
+                main_subject_item = MainSubjectItem(
+                    id=qid,
+                    args=self.args,
+                    task=self.task,
+                    confirmation=self.args.no_confirmation,
+                )
+                job = main_subject_item.fetch_items_and_get_job_if_confirmed()
+                if job:
+                    # Here we check if the user has enabled no ask more limit.
+                    if self.args.no_ask_match_more_limit is None:
+                        logger.debug("No ask more was None")
+                        if job.main_subject_item.items:
+                            job.main_subject_item.items.print_items_list(args=self.args)
+                            job.main_subject_item.print_search_strings()
+                            answer = ask_add_to_job_queue(job)
+                            if answer:
+                                self.batchjobs.jobs.append(job)
+                    else:
+                        self.batchjobs.jobs.append(job)
+                logger.debug(f"joblist now has {self.batchjobs.number_of_jobs} jobs")
+                print_job_statistics(batchjobs=self.batchjobs)
+                if len(qid_subjects_not_picked_yet):
+                    if (
+                        self.args.no_ask_match_more_limit is None
+                        or self.args.no_ask_match_more_limit
+                        < sum(
+                            job.main_subject_item.items.number_of_sparql_items
+                            for job in self.batchjobs.jobs
+                            if job.main_subject_item.items
+                            and job.main_subject_item.items.sparql_items
+                        )
+                    ):
+                        answer_was_yes = ask_yes_no_question("Match one more?")
+                        if not answer_was_yes:
+                            break
+                else:
+                    console.print("No more subjects in the list.")
+                    break
+            else:
+                console.print("No more subjects in the list. 
Exiting.") + break + if self.args.no_ask_match_more_limit: + for job in self.batchjobs.jobs: + if job.main_subject_item.items: + job.main_subject_item.items.print_items_list(args=self.args) + job.main_subject_item.print_search_strings() + if ( + config.automatically_approve_jobs_with_less_than_fifty_matches + and job.main_subject_item.items.number_of_sparql_items < 50 + ): + console.print( + f"This job with {job.main_subject_item.items.number_of_sparql_items} matching items was automatically approved", + style="green", + ) + self.batchjobs.jobs.append(job) + else: + answer = ask_add_to_job_queue(job) + if answer: + self.batchjobs.jobs.append(job) + + def __select_task__(self): + self.task: Task = select_task() + if self.task is None: + raise ValueError("Got no task") + + def __fetch_main_subjects__(self): + with console.status("Running query on WDQS..."): + results = execute_sparql_query( + self.args.sparql.replace("{", "{{").replace("}", "}}"), + ) + for item_json in results["results"]["bindings"]: + logging.debug(f"item_json:{item_json}") + self.main_subjects.append(item_json["item"]["value"]) + + def __check_different_from__(self): + if "P1889" not in self.args.sparql: + console.print( + "Your SPARQL did not contain P1889 (different from). " + "Please include 'MINUS {?main_subject_item wdt:P1889 [].}' " + "in your WHERE clause to avoid false positives." + ) + exit(0) + else: + logger.info("Detected P1889 in the query") + + diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py index 63060ff..c7430ae 100644 --- a/src/models/wikimedia/wikidata/item/main_subject.py +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -231,7 +231,7 @@ def __instantiate_the_right_class_for_this_task__(self): else: raise ValueError(f"{self.task.id} was not recognized") - def fetch_items_and_get_job(self) -> Optional["BatchJob"]: + def fetch_items_and_get_job_if_confirmed(self) -> Optional["BatchJob"]: """This method handles all the work needed to return a job""" self.__strip_qid_prefix__() self.__fetch_label_and_description_and_aliases__() diff --git a/src/models/wikimedia/wikidata/query/thesis.py b/src/models/wikimedia/wikidata/query/thesis.py index 2714157..71efd6d 100644 --- a/src/models/wikimedia/wikidata/query/thesis.py +++ b/src/models/wikimedia/wikidata/query/thesis.py @@ -3,8 +3,7 @@ class ThesisQuery(Query): def __prepare_and_build_query__(self): - self.query_string = ( - f""" + self.query_string = f""" SELECT DISTINCT ?item ?itemLabel WHERE {{ {{ @@ -27,4 +26,3 @@ def __prepare_and_build_query__(self): SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}}
         }}
         """
-        )

From 1cb10d18b7426b8c5f7a4f380658cf3be8b8da41 Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Tue, 4 Oct 2022 01:47:10 +0200
Subject: [PATCH 29/37] Disable dead code

---
 src/models/main_subjects.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/models/main_subjects.py b/src/models/main_subjects.py
index 6c232f9..5fcd8cc 100644
--- a/src/models/main_subjects.py
+++ b/src/models/main_subjects.py
@@ -7,14 +7,13 @@
 from typing import List, Optional
 
 from pydantic import BaseModel
-from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore
+from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore
 
 import config
-from src.helpers.cli_messages import print_best_practice, print_job_statistics
+from src.helpers.cli_messages import print_job_statistics
 from src.helpers.console import console
 from src.helpers.menus import select_task
 from src.helpers.questions import ask_add_to_job_queue, ask_yes_no_question
-from src.models.batch_job import BatchJob
 from src.models.batch_jobs import BatchJobs
 from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
 from src.tasks import Task
@@ -46,20 +45,20 @@ def match_main_subjects_from_sparql(self):
         else:
             console.print("Got 0 results. Try another query or debug it using --debug")
 
-    def process_user_supplied_qids_into_batch_jobs(self) -> List[BatchJob]:
-        """Given a list of QIDs, we go through
-        them and return a list of jobs"""
-        # TODO this should not return anything
-        if self.task:
-            print_best_practice(self.task)
-            jobs = []
-            for qid in self.args.add:
-                main_subject_item = MainSubjectItem(id=qid, args=self.args, task=self.task)
-                job = main_subject_item.fetch_items_and_get_job_if_confirmed()
-                if job:
-                    jobs.append(job)
-            return jobs
-        return []
+    # def process_user_supplied_qids_into_batch_jobs(self) -> List[BatchJob]:
+    #     """Given a list of QIDs, we go through
+    #     them and return a list of jobs"""
+    #     # TODO this should not return anything
+    #     if self.task:
+    #         print_best_practice(self.task)
+    #         jobs = []
+    #         for qid in self.args.add:
+    #             main_subject_item = MainSubjectItem(id=qid, args=self.args, task=self.task)
+    #             job = main_subject_item.fetch_items_and_get_job_if_confirmed()
+    #             if job:
+    #                 jobs.append(job)
+    #         return jobs
+    #     return []
 
     def handle_job_preparation_or_run_directly_if_any_jobs(self):
         if self.batchjobs is None:

From 3a32ce9bc6fb0359bf3feda60ebabc571f6b8b75 Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Thu, 6 Oct 2022 08:33:42 +0200
Subject: [PATCH 30/37] Avoid separate config module which just complicates
 the setup unnecessarily.

Update README.md and add section Caveat also.
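For orientation, the MainSubjects class added in patch 28 replaces the free functions that previously lived in src/helpers/jobs.py. A condensed sketch of the new call flow, mirroring the calls made in run() in the src/__init__.py diff of that patch (the wrapper function names here are hypothetical; `args` is assumed to be the namespace returned by setup_argparse_and_return_args()):

```python
# Hypothetical wrappers condensing the two call paths from the src/__init__.py
# diff in patch 28; `args` is assumed to be a parsed argparse.Namespace.
from src.models.main_subjects import MainSubjects


def run_sparql_flow(args):
    # --sparql: fetch candidate subjects from WDQS, validate them into jobs,
    # then prepare the jobs on disk or run them directly.
    main_subjects = MainSubjects(args=args)
    main_subjects.match_main_subjects_from_sparql()
    main_subjects.get_validated_main_subjects_as_jobs()
    main_subjects.handle_job_preparation_or_run_directly_if_any_jobs()


def run_qid_flow(args):
    # --add: user-supplied QIDs skip the SPARQL fetch entirely.
    main_subjects = MainSubjects(args=args, main_subjects=args.add)
    main_subjects.get_validated_main_subjects_as_jobs()
    main_subjects.handle_job_preparation_or_run_directly_if_any_jobs()
```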
---
 .gitignore | 4 +--
 README.md | 16 +++++++++--
 .../__init__example.py => config.example.py | 28 +++++++++++++++++--
 config/items.py | 24 ----------------
 .../wikimedia/wikidata/item/main_subject.py | 7 ++---
 src/models/wikimedia/wikidata/item/sparql.py | 5 ++--
 6 files changed, 47 insertions(+), 37 deletions(-)
 rename config/__init__example.py => config.example.py (59%)
 delete mode 100644 config/items.py

diff --git a/.gitignore b/.gitignore
index 64f6dda..6572641 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
-config/__init__.py
-pickle.dat
\ No newline at end of file
+pickle.dat
+config.py

diff --git a/README.md b/README.md
index fbd15c9..a014ea4 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,11 @@ open graph editable by anyone and maintained by the community itself for the pur
 scientists find each other's work. Wikipedia and Scholia can fill that gap but we need
 good tooling to curate the millions of items.
 
+# Caveat
+This type of matching is SUBOPTIMAL because it ONLY takes the label into account,
+not the underlying structured data. You are very welcome to suggest or contribute
+improvements so the tool can help you make better edits.
+
 # Features
 This tool has the following features:
 * Adding a list of manually supplied main subjects to a few selected subgraphs
@@ -34,6 +39,11 @@ so that batches can easily be undone later if needed.
 Click "details" in the summary of edits to see more.
 
 # Installation
+Download the latest release with:
+
+`$ pip install itemsubjector`
+
+# Alternative installation in venv
 Download the release tarball or clone the tool using Git.
 
 ## Clone the repository
@@ -41,7 +51,7 @@ Download the release tarball or clone the tool using Git.
 
 Then checkout the latest release.
 
-`git checkout v0.x` where x is the latest number on the release page.
+`git checkout vx.x.x` where x.x.x is the latest version number on the release page.
 
 ## Setup the environment
 
@@ -72,6 +82,8 @@ issues.
 
 ## Wikimedia Cloud Services Kubernetes Beta cluster
 
+*Note: this is for advanced users experienced with an SSH console environment; ask in the [Telegram WikiCite group](https://meta.m.wikimedia.org/wiki/Telegram#Wikidata) if you need help*
+
 See [Kubernetes_HOWTO.md](Kubernetes_HOWTO.md)
 
 # Setup
@@ -82,7 +94,7 @@ config/__init__.py and enter the botusername
 for your account
 and make sure you give it the *edit page permission* and *high volume permissions*)
 
-* e.g. `cd config && cp __init__example.py __init__.py && nano __init__.py`
+* e.g. 
`cp config.example.py config.py && nano config.py`
 
 *GNU Nano is an editor, press `ctrl+x` when you are done
 and `y` to save your changes*
 
diff --git a/config/__init__example.py b/config.example.py
similarity index 59%
rename from config/__init__example.py
rename to config.example.py
index 7b7398f..e5cef59 100644
--- a/config/__init__example.py
+++ b/config.example.py
@@ -1,14 +1,14 @@
 import logging
 import tempfile
+from typing import List
 
 # Rename this file to config.py
 
 # Add your botpassword and login here:
-
 username = ""
 password = ""
 
-# Settings
+# General settings
 automatically_approve_jobs_with_less_than_fifty_matches = False
 loglevel = logging.WARNING
 wiki_user = "User:Username"  # Change this to your username
@@ -21,3 +21,27 @@
 # This should work for all platforms except kubernetes
 job_pickle_file_path = f"{tempfile.gettempdir()}/pickle.dat"
 # job_pickle_file_path = "~/pickle.dat"  # works on kubernetes
+
+"""
+Settings for items
+"""
+
+list_of_allowed_aliases: List[str] = []  # Add elements like this ["API"]
+
+# Scholarly items settings
+blocklist_for_scholarly_items: List[str] = [
+    "Q28196260",
+    "Q28196260",
+    "Q28196266",  # iodine
+    "Q27863114",  # testosterone
+    "Q28196266",
+    "Q28196260",
+    "Q109270553",  # dieback
+]
+no_alias_for_scholarly_items: List[str] = [
+    "Q407541",
+    "Q423930",
+    "Q502327",
+    "Q416950",
+    "Q95566669",  # hypertension
+]
diff --git a/config/items.py b/config/items.py
deleted file mode 100644
index bffb1a4..0000000
--- a/config/items.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""
-Settings for items
-"""
-from typing import List
-
-list_of_allowed_aliases: List[str] = []  # Add elements like this ["API"]
-
-# Scholarly items settings
-blocklist_for_scholarly_items: List[str] = [
-    "Q28196260",
-    "Q28196260",
-    "Q28196266",  # iodine
-    "Q27863114",  # testosterone
-    "Q28196266",
-    "Q28196260",
-    "Q109270553",  # dieback
-]
-no_alias_for_scholarly_items: List[str] = [
-    "Q407541",
-    "Q423930",
-    "Q502327",
-    "Q416950",
-    "Q95566669",  # hypertension
-]
diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py
index c7430ae..e5f867d 100644
--- a/src/models/wikimedia/wikidata/item/main_subject.py
+++ b/src/models/wikimedia/wikidata/item/main_subject.py
@@ -8,7 +8,6 @@
 from wikibaseintegrator.wbi_helpers import search_entities  # type: ignore
 
 import config
-import config.items
 from src.helpers.calculations import calculate_random_editgroups_hash
 from src.helpers.cleaning import clean_rich_formatting
 from src.helpers.console import console
@@ -139,7 +138,7 @@
         if self.args.no_aliases is True:
             console.print("Alias matching is turned off")
             no_aliases = True
-        elif self.id in config.items.no_alias_for_scholarly_items:
+        elif self.id in config.no_alias_for_scholarly_items:
             logger.info(
                 f"Alias matching is turned off for this main_subject_item: {self.label}"
             )
@@ -153,7 +152,7 @@
         if self.aliases and no_aliases is False:
             for alias in self.aliases:
                 # logger.debug(f"extracting alias:{alias}")
-                if len(alias) < 5 and alias not in config.items.list_of_allowed_aliases:
+                if len(alias) < 5 and alias not in config.list_of_allowed_aliases:
                     console.print(
                         f"Skipping short alias '{alias}' to avoid false positives",
                         style="#FF8000",
@@ -164,7 +163,7 @@
                         f"in a label of at least one Qid that is not a scholarly article",
                         style="#FF8000",
                     )
-                elif alias in config.items.list_of_allowed_aliases:
+                elif alias in 
config.list_of_allowed_aliases: console.print(f"Found {alias} in the allow sparql_items") self.search_strings.add(self.__clean_special_symbols__(alias)) else: diff --git a/src/models/wikimedia/wikidata/item/sparql.py b/src/models/wikimedia/wikidata/item/sparql.py index 3e207e3..89591cc 100644 --- a/src/models/wikimedia/wikidata/item/sparql.py +++ b/src/models/wikimedia/wikidata/item/sparql.py @@ -1,7 +1,6 @@ from pydantic import BaseModel import config -import config.items from src.models.wikimedia.wikidata.entiyt_id import EntityId from src.models.wikimedia.wikidata.item import Item @@ -23,11 +22,11 @@ def validate_qid_and_copy_label(self): def is_in_blocklist(self) -> bool: if self.id is None: raise ValueError("did not get an id") - if config.items.blocklist_for_scholarly_items is None: + if config.blocklist_for_scholarly_items is None: raise ValueError( "config.blocklist_for_scholarly_items was None, please fix" ) - if self.id in config.items.blocklist_for_scholarly_items: + if self.id in config.blocklist_for_scholarly_items: return True else: return False From cfba4eb37efb20a2294519e6be5519c9a7826b31 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 6 Oct 2022 08:34:11 +0200 Subject: [PATCH 31/37] pre-commit fixes --- src/models/main_subjects.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/models/main_subjects.py b/src/models/main_subjects.py index 5fcd8cc..62e6dd5 100644 --- a/src/models/main_subjects.py +++ b/src/models/main_subjects.py @@ -183,5 +183,3 @@ def __check_different_from__(self): exit(0) else: logger.info("Detected P1889 in the query") - - From 6bd1a354c0b1ae77d8aa9dcaa1a39b5ef9c2ca8e Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 6 Oct 2022 08:39:03 +0200 Subject: [PATCH 32/37] inspection fixes --- src/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/__init__.py b/src/__init__.py index 56276ad..998d65a 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -42,7 +42,8 @@ class ItemSubjector(BaseModel): - def run(self): + @staticmethod + def run(): """This is the main function that makes everything else happen""" migrate_pickle_detection() args = setup_argparse_and_return_args() From b2f61bebdc11f2f5ae422e97e9ba795e0269bb82 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 6 Oct 2022 09:15:26 +0200 Subject: [PATCH 33/37] poetry.lock: Update --- poetry.lock | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/poetry.lock b/poetry.lock index dc7db08..0b9b7b7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -313,7 +313,7 @@ gitdb = ">=4.0.1,<5" [[package]] name = "identify" -version = "2.5.5" +version = "2.5.6" description = "File identification library for Python" category = "dev" optional = false @@ -567,7 +567,7 @@ optional = false python-versions = ">=3.6" [[package]] -name = "Pygments" +name = "pygments" version = "2.13.0" description = "Pygments is a syntax highlighting package written in Python." category = "main" @@ -645,7 +645,7 @@ reference = "pypi-test" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.4" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -653,14 +653,14 @@ python-versions = "*" [[package]] name = "pyupgrade" -version = "2.38.2" +version = "2.38.4" description = "A tool to automatically upgrade syntax for newer versions." 
category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -tokenize-rt = ">=3.2.0" +tokenize-rt = "<5" [[package]] name = "PyYAML" @@ -705,7 +705,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] [[package]] name = "rich" -version = "12.5.1" +version = "12.6.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" category = "main" optional = false @@ -744,7 +744,7 @@ python-versions = ">=3.5" [[package]] name = "safety" -version = "2.2.0" +version = "2.3.1" description = "Checks installed dependencies for known vulnerabilities and licenses." category = "dev" optional = false @@ -758,6 +758,10 @@ requests = "*" "ruamel.yaml" = ">=0.17.21" setuptools = ">=19.3" +[package.extras] +github = ["jinja2 (>=3.1.0)", "pygithub (>=1.43.3)"] +gitlab = ["python-gitlab (>=1.3.0)"] + [[package]] name = "setuptools" version = "65.4.1" @@ -1079,8 +1083,8 @@ GitPython = [ {file = "GitPython-3.1.27.tar.gz", hash = "sha256:1c885ce809e8ba2d88a29befeb385fcea06338d3640712b59ca623c220bb5704"}, ] identify = [ - {file = "identify-2.5.5-py2.py3-none-any.whl", hash = "sha256:ef78c0d96098a3b5fe7720be4a97e73f439af7cf088ebf47b620aeaa10fadf97"}, - {file = "identify-2.5.5.tar.gz", hash = "sha256:322a5699daecf7c6fd60e68852f36f2ecbb6a36ff6e6e973e0d2bb6fca203ee6"}, + {file = "identify-2.5.6-py2.py3-none-any.whl", hash = "sha256:b276db7ec52d7e89f5bc4653380e33054ddc803d25875952ad90b0f012cbcdaa"}, + {file = "identify-2.5.6.tar.gz", hash = "sha256:6c32dbd747aa4ceee1df33f25fed0b0f6e0d65721b15bd151307ff7056d50245"}, ] idna = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, @@ -1273,7 +1277,7 @@ pyflakes = [ {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] -Pygments = [ +pygments = [ {file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f258f2a4c5bb73c7b9daae54f90aa5fa3aba3390164e059cc0408606a12d0647"}, {file = "Pygments-2.13.0.tar.gz", hash = "sha256:e9a08f2ce610f4e86142d6a6d8209f61bea945865993fe46de701f40aed43cdd"}, ] @@ -1294,12 +1298,12 @@ python-dateutil = [ {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:1efd93a2e222eb7360b5396108fdfa04e9753637d24143b8026dfb48ffbc755b"}, ] pytz = [ - {file = "pytz-2022.2.1-py2.py3-none-any.whl", hash = "sha256:220f481bdafa09c3955dfbdddb7b57780e9a94f5127e35456a48589b9e0c0197"}, - {file = "pytz-2022.2.1.tar.gz", hash = "sha256:cea221417204f2d1a2aa03ddae3e867921971d0d76f14d87abb4414415bbdcf5"}, + {file = "pytz-2022.4-py2.py3-none-any.whl", hash = "sha256:2c0784747071402c6e99f0bafdb7da0fa22645f06554c7ae06bf6358897e9c91"}, + {file = "pytz-2022.4.tar.gz", hash = "sha256:48ce799d83b6f8aab2020e369b627446696619e79645419610b9facd909b3174"}, ] pyupgrade = [ - {file = "pyupgrade-2.38.2-py2.py3-none-any.whl", hash = "sha256:41bb9a9fd48fe57163b0dacffff433d6d5a63a0f7c2402918917b5f1a533342b"}, - {file = "pyupgrade-2.38.2.tar.gz", hash = "sha256:a5d778c9de0b53975c6a9eac2d0df5adfad244a9f7d7993d8a114223ebbda367"}, + {file = "pyupgrade-2.38.4-py2.py3-none-any.whl", hash = "sha256:944ff993c396ddc2b9012eb3de4cda138eb4c149b22c6c560d4c8bfd0e180982"}, + {file = "pyupgrade-2.38.4.tar.gz", hash = "sha256:1eb43a49f416752929741ba4d706bf3f33593d3cac9bdc217fc1ef55c047c1f4"}, ] PyYAML = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, @@ -1352,8 +1356,8 @@ requests-oauthlib = [ {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, ] rich = [ - {file = "rich-12.5.1-py3-none-any.whl", hash = "sha256:2eb4e6894cde1e017976d2975ac210ef515d7548bc595ba20e195fb9628acdeb"}, - {file = "rich-12.5.1.tar.gz", hash = "sha256:63a5c5ce3673d3d5fbbf23cd87e11ab84b6b451436f1b7f19ec54b6bc36ed7ca"}, + {file = "rich-12.6.0-py3-none-any.whl", hash = "sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e"}, + {file = "rich-12.6.0.tar.gz", hash = "sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"}, ] "ruamel.yaml" = [ {file = "ruamel.yaml-0.17.21-py3-none-any.whl", hash = "sha256:742b35d3d665023981bd6d16b3d24248ce5df75fdb4e2924e93a05c1f8b61ca7"}, @@ -1392,8 +1396,8 @@ rich = [ {file = "ruamel.yaml.clib-0.2.6.tar.gz", hash = "sha256:4ff604ce439abb20794f05613c374759ce10e3595d1867764dd1ae675b85acbd"}, ] safety = [ - {file = "safety-2.2.0-py3-none-any.whl", hash = "sha256:b1a0f4c34fb41c502a7a5c54774c18376da382bc9d866ee26b39b2c747c0de40"}, - {file = "safety-2.2.0.tar.gz", hash = "sha256:6745de12acbd60a58001fe66cb540355187d7b991b30104d9ef14ff4e4826073"}, + {file = "safety-2.3.1-py3-none-any.whl", hash = "sha256:8f098d12b607db2756886280e85c28ece8db1bba4f45fc5f981f4663217bd619"}, + {file = "safety-2.3.1.tar.gz", hash = "sha256:6e6fcb7d4e8321098cf289f59b65051cafd3467f089c6e57c9f894ae32c23b71"}, ] setuptools = [ {file = "setuptools-65.4.1-py3-none-any.whl", hash = "sha256:1b6bdc6161661409c5f21508763dc63ab20a9ac2f8ba20029aaaa7fdb9118012"}, From 8330d6055cc8625e9c4bf529bdda1417ec301d21 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 6 Oct 2022 09:35:10 +0200 Subject: [PATCH 34/37] Convert tests from Suggestion to MainSubjectItem. --- tests/test_main_subject_item.py | 37 ++++++++++++++++++++++++++ tests/test_suggestion.py | 46 --------------------------------- 2 files changed, 37 insertions(+), 46 deletions(-) create mode 100644 tests/test_main_subject_item.py delete mode 100644 tests/test_suggestion.py diff --git a/tests/test_main_subject_item.py b/tests/test_main_subject_item.py new file mode 100644 index 0000000..4629604 --- /dev/null +++ b/tests/test_main_subject_item.py @@ -0,0 +1,37 @@ +import argparse +from unittest import TestCase + +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.tasks import tasks + + +class TestMainSubjectItem(TestCase): + def test_extract_search_strings(self): + msi = MainSubjectItem( + id="Q407541", + label="fentanyl", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + ), + ) + msi.__extract_search_strings__() + if not len(msi.search_strings) == 1: + self.fail() + + def test_extract_search_strings_with_problematic_alias(self): + # Note this will fail if anyone adds or remove an alias on the item. 
+ msi = MainSubjectItem( + id="Q273510", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + ), + ) + msi.__fetch_label_and_description_and_aliases__() + msi.__extract_search_strings__() + msi.print_search_strings() + print(len(msi.search_strings)) + assert len(msi.search_strings) == 10 diff --git a/tests/test_suggestion.py b/tests/test_suggestion.py deleted file mode 100644 index 67c0a35..0000000 --- a/tests/test_suggestion.py +++ /dev/null @@ -1,46 +0,0 @@ -# import argparse -# from unittest import TestCase -# -# from src.models.suggestion import Suggestion -# from src.models.wikimedia.wikidata.sparql_item import SparqlItem, Value -# from src.tasks import tasks -# -# -# class TestSuggestion(TestCase): -# def test_extract_search_strings(self): -# item = SparqlItem( -# item=Value(value="Q407541"), itemLabel=Value(value="fentanyl") -# ) -# item.validate_qid_and_copy_label() -# suggestion = Suggestion( -# main_subject_item=item, -# task=tasks[0], -# args=argparse.Namespace( -# no_aliases=dict(no_aliases=False), -# show_search_urls=dict(show_search_urls=False), -# ), -# ) -# suggestion.__extract_search_strings__() -# # suggestion.print_search_strings() -# if not len(suggestion.search_strings) == 1: -# self.fail() -# -# def test_extract_search_strings_with_problematic_alias(self): -# """This has a problematic alias "thrush" which is also a bird""" -# item = SparqlItem( -# item=Value(value="Q273510"), itemLabel=Value(value="candidadis") -# ) -# item.validate_qid_and_copy_label() -# item.__fetch_label_and_description_and_aliases__(task=tasks[0]) -# suggestion = Suggestion( -# main_subject_item=item, -# task=tasks[0], -# args=argparse.Namespace( -# no_aliases=dict(no_aliases=False), -# show_search_urls=dict(show_search_urls=False), -# ), -# ) -# suggestion.__extract_search_strings__() -# suggestion.print_search_strings() -# print(len(suggestion.search_strings)) -# assert len(suggestion.search_strings) == 10 From 4f38971af0e1664eba6aa06f2c455032fbea8906 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 6 Oct 2022 10:44:12 +0200 Subject: [PATCH 35/37] Add tests for queries. 4 new files and 3 modified --- .../wikidata/query/published_article.py | 8 +-- src/models/wikimedia/wikidata/query/thesis.py | 2 + tests/test_preprint_article.py | 46 +++++++++++++++ tests/test_published_article.py | 58 +++++++++++++++++++ tests/test_query.py | 6 ++ tests/test_riksdagen_document.py | 51 ++++++++++++++++ tests/test_thesis.py | 49 ++++++++++++++++ 7 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 tests/test_preprint_article.py create mode 100644 tests/test_published_article.py create mode 100644 tests/test_riksdagen_document.py create mode 100644 tests/test_thesis.py diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py index 9343b8b..578c454 100644 --- a/src/models/wikimedia/wikidata/query/published_article.py +++ b/src/models/wikimedia/wikidata/query/published_article.py @@ -11,10 +11,10 @@ def __check_we_got_everything_we_need__(self): raise ValueError("main_subject_item was None") if not self.main_subject_item.args: raise ValueError("main_subject_item.args was None") - if self.main_subject_item.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." 
- ) + # if self.main_subject_item.args.limit_to_items_without_p921: + # raise Exception( + # "Limiting to items without P921 is not " "supported yet for this task." + # ) if self.main_subject_item.task is None: raise ValueError("task was None") if self.main_subject_item.task.language_code is None: diff --git a/src/models/wikimedia/wikidata/query/thesis.py b/src/models/wikimedia/wikidata/query/thesis.py index 71efd6d..0e74935 100644 --- a/src/models/wikimedia/wikidata/query/thesis.py +++ b/src/models/wikimedia/wikidata/query/thesis.py @@ -1,9 +1,11 @@ +import config from src.models.wikimedia.wikidata.query import Query class ThesisQuery(Query): def __prepare_and_build_query__(self): self.query_string = f""" + #{config.user_agent} SELECT DISTINCT ?item ?itemLabel WHERE {{ {{ diff --git a/tests/test_preprint_article.py b/tests/test_preprint_article.py new file mode 100644 index 0000000..9bf1dec --- /dev/null +++ b/tests/test_preprint_article.py @@ -0,0 +1,46 @@ +import argparse +from unittest import TestCase + +from src import tasks +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.models.wikimedia.wikidata.query.preprint_article import PreprintArticleQuery + + +class TestPreprintArticle(TestCase): + def test_preprint_article_query(self): + msi = MainSubjectItem( + id="Q407541", + label="fentanyl", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + ), + ) + msi.__extract_search_strings__() + q = PreprintArticleQuery(main_subject_item=msi) + for string in msi.search_strings: + q.search_string = string + q.__prepare_and_build_query__() + print(q.query_string) + assert q.query_string.replace(" ","").strip() == """ + #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q + SELECT DISTINCT ?item ?itemLabel + WHERE { + ?item wdt:P31/wd:P279* wd:Q580922. # preprint + MINUS { + ?item wdt:P921 wd:Q407541; + } + ?item rdfs:label ?label. + FILTER(CONTAINS( + LCASE(?label), " fentanyl " + @en) || + REGEX(LCASE(?label), ".* fentanyl$" + @en) || + REGEX(LCASE(?label), "^fentanyl .*" + @en) + ) + MINUS {?item wdt:P921/wdt:P279 wd:Q407541. } + SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } + }""".replace(" ","").strip() + break \ No newline at end of file diff --git a/tests/test_published_article.py b/tests/test_published_article.py new file mode 100644 index 0000000..5e8cdc3 --- /dev/null +++ b/tests/test_published_article.py @@ -0,0 +1,58 @@ +import argparse +from unittest import TestCase + +from src import tasks +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.models.wikimedia.wikidata.query.published_article import PublishedArticleQuery + + +class TestPublishedArticleQuery(TestCase): + def test_published_article_query(self): + msi = MainSubjectItem( + id="Q407541", + label="fentanyl", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + limit_to_items_without_p921=dict(limit_to_items_without_p921=False), + ), + ) + msi.__extract_search_strings__() + q = PublishedArticleQuery(main_subject_item=msi) + for string in msi.search_strings: + q.search_string = string + q.__prepare_and_build_query__() + print(q.query_string) + assert q.query_string.replace(" ","").replace("\\", "").strip() == """ + #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q + SELECT DISTINCT ?item ?itemLabel + WHERE { + hint:Query hint:optimizer "None". 
+ BIND(STR('haswbstatement:P31=Q13442814 -haswbstatement:P921 "fentanyl"') as ?search_string) + SERVICE wikibase:mwapi { + bd:serviceParam wikibase:api "Search"; + wikibase:endpoint "www.wikidata.org"; + mwapi:srsearch ?search_string. + ?title wikibase:apiOutput mwapi:title. + } + BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) + ?item rdfs:label ?label. + BIND(REPLACE(LCASE(?label), ",", "") as ?label1) + BIND(REPLACE(?label1, ":", "") as ?label2) + BIND(REPLACE(?label2, ";", "") as ?label3) + BIND(REPLACE(?label3, "\\(", "") as ?label4) + BIND(REPLACE(?label4, "\\)", "") as ?label5) + BIND(REPLACE(?label5, "\\[", "") as ?label6) + BIND(REPLACE(?label6, "\\]", "") as ?label7) + BIND(REPLACE(?label7, "\\\\", "") as ?label8) + BIND(?label8 as ?cleaned_label) + FILTER(CONTAINS(?cleaned_label, ' fentanyl '@en) || + REGEX(?cleaned_label, '.* fentanyl$'@en) || + REGEX(?cleaned_label, '^fentanyl .*'@en)) + MINUS {?item wdt:P921/wdt:P279 wd:Q407541. } + MINUS {?item wdt:P921/wdt:P279/wdt:P279 wd:Q407541. } + MINUS {?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:Q407541. } + SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } + }""".replace(" ","").replace("\\", "").strip() + break \ No newline at end of file diff --git a/tests/test_query.py b/tests/test_query.py index 3ae7bb3..14bd9fe 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -1,3 +1,9 @@ +from src.models.wikimedia.wikidata.query import Query + + class TestQuery: def test_parse_results(self): pass + def test_build_query(self): + """We test that all the parameters are taken into consideration""" + q = Query() \ No newline at end of file diff --git a/tests/test_riksdagen_document.py b/tests/test_riksdagen_document.py new file mode 100644 index 0000000..bb6b974 --- /dev/null +++ b/tests/test_riksdagen_document.py @@ -0,0 +1,51 @@ +import argparse +from unittest import TestCase + +from src import tasks +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.models.wikimedia.wikidata.query.riksdagen_document import RiksdagenDocumentQuery + + +class TestRiksdagenDocumentQuery(TestCase): + def test_riksdagen_document_query(self): + msi = MainSubjectItem( + id="Q407541", + label="fentanyl", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + ), + ) + msi.__extract_search_strings__() + q = RiksdagenDocumentQuery(main_subject_item=msi) + for string in msi.search_strings: + q.search_string = string + q.__prepare_and_build_query__() + print(q.query_string) + assert q.query_string.replace(" ","").strip() == """ + #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q + SELECT DISTINCT ?item ?itemLabel + WHERE { + hint:Query hint:optimizer "None". + SERVICE wikibase:mwapi { + bd:serviceParam wikibase:api "Search"; + wikibase:endpoint "www.wikidata.org"; + mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921=Q407541 "fentanyl"' . + ?title wikibase:apiOutput mwapi:title. + } + BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) + ?item rdfs:label ?label. + # We lowercase the label first and search for the + # string in both the beginning, middle and end of the label + FILTER(CONTAINS( + LCASE(?label), " fentanyl "@en) || + REGEX(LCASE(?label), ".* fentanyl$"@en) || + REGEX(LCASE(?label), "^fentanyl .*"@en) + ) + # remove more specific forms of the main subject also + # Thanks to Jan Ainali for this improvement :) + MINUS {?main_subject_item wdt:P921 ?topic. ?topic wdt:P279 wd:Q407541. 
} + SERVICE wikibase:label { bd:serviceParam wikibase:language "sv". } + }""".replace(" ","").strip() + break \ No newline at end of file diff --git a/tests/test_thesis.py b/tests/test_thesis.py new file mode 100644 index 0000000..62efe70 --- /dev/null +++ b/tests/test_thesis.py @@ -0,0 +1,49 @@ +import argparse +from unittest import TestCase + +from src import tasks +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.models.wikimedia.wikidata.query.thesis import ThesisQuery + + +class TestThesisQuery(TestCase): + def test_thesis_query(self): + msi = MainSubjectItem( + id="Q407541", + label="fentanyl", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + ), + ) + msi.__extract_search_strings__() + q = ThesisQuery(main_subject_item=msi) + for string in msi.search_strings: + q.search_string = string + q.__prepare_and_build_query__() + print(q.query_string) + assert q.query_string.replace(" ","").strip() == """ + #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q + SELECT DISTINCT ?item ?itemLabel + WHERE { + { + ?item wdt:P31/wd:P279* wd:Q1266946. # thesis + } UNION + { + ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation + } UNION + { + ?item wdt:P31/wd:P279* wd:Q3099732. # technical report + } + MINUS { + ?item wdt:P921 wd:Q407541; + } + ?item rdfs:label ?label. + FILTER(CONTAINS(LCASE(?label), " fentanyl "@en) || + REGEX(LCASE(?label), ".* fentanyl$"@en) || + REGEX(LCASE(?label), "^fentanyl .*"@en)) + MINUS {?item wdt:P921 ?topic. ?topic wdt:P279 wd:Q407541. } + SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } + }""".replace(" ","").strip() + break \ No newline at end of file From a97f3ea24875e8526b61c36c7a1e78422afec4a8 Mon Sep 17 00:00:00 2001 From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com> Date: Thu, 6 Oct 2022 10:48:05 +0200 Subject: [PATCH 36/37] Disable --limit-to-items-without-p921 for now Update README.md --- README.md | 12 ++++++------ src/helpers/argparse_setup.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a014ea4..1e5c0ff 100644 --- a/README.md +++ b/README.md @@ -160,10 +160,10 @@ Usage example: `poetry run python itemsubjector.py -a Q34 --show-item-urls` (the shorthand `-iu` also works) -### Limit to scholarly articles without main subject -Usage example: -`poetry run python itemsubjector.py -a Q34 --limit-to-items-without-p921` -(the shorthand `-w` also works) +[//]: # (### Limit to scholarly articles without main subject) +[//]: # (Usage example:) +[//]: # (`poetry run python itemsubjector.py -a Q34 --limit-to-items-without-p921` ) +[//]: # ((the shorthand `-w` also works)) ## Matching main subjects based on a SPARQL query. The tool can create a list of jobs by picking random subjects from a @@ -225,8 +225,6 @@ optional arguments: Remove prepared jobs -m, --match-existing-main-subjects Match from list of 136.000 already used main subjects on other scientific articles - -w, --limit-to-items-without-p921 - Limit matching to scientific articles without P921 main subject -su, --show-search-urls Show an extra column in the table of search strings with links -iu, --show-item-urls @@ -258,6 +256,8 @@ removed the QuickStatements export to simplify the program. * Simplify as much as possible to keep the whole thing lean and avoid scope creep. -> helps reuse in other projects. 
(KISS-principle)
 * Difficult to judge which features are used and which are not. User testing would be nice.
 * UML diagrams are nice. They give a good quick overview.
+* Removing options that no one seems to use helps keep the tool simple. It would be valuable to get better insight into how the
+program is used by the users. A discussion on GitHub might help with this.
 
 # Thanks
 During the development of this tool the author got
diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py
index 0d35fb4..15d2549 100644
--- a/src/helpers/argparse_setup.py
+++ b/src/helpers/argparse_setup.py
@@ -59,12 +59,12 @@ def setup_argparse_and_return_args():
         action="store_true",
         help="Remove prepared jobs",
     )
-    parser.add_argument(
-        "-w",
-        "--limit-to-items-without-p921",
-        action="store_true",
-        help="Limit matching to scientific articles without P921 main subject",
-    )
+    # parser.add_argument(
+    #     "-w",
+    #     "--limit-to-items-without-p921",
+    #     action="store_true",
+    #     help="Limit matching to scientific articles without P921 main subject",
+    # )
     parser.add_argument(
         "-su",
         "--show-search-urls",

From 54f2c9efba0c9b61d6257fddf3c387025fe5085a Mon Sep 17 00:00:00 2001
From: Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>
Date: Thu, 6 Oct 2022 10:50:27 +0200
Subject: [PATCH 37/37] Remove empty test_query.py

---
 tests/test_preprint_article.py | 11 ++++++++---
 tests/test_published_article.py | 13 ++++++++++---
 tests/test_query.py | 9 ---------
 tests/test_riksdagen_document.py | 15 +++++++++++----
 tests/test_thesis.py | 11 ++++++++---
 5 files changed, 37 insertions(+), 22 deletions(-)
 delete mode 100644 tests/test_query.py

diff --git a/tests/test_preprint_article.py b/tests/test_preprint_article.py
index 9bf1dec..57db65c 100644
--- a/tests/test_preprint_article.py
+++ b/tests/test_preprint_article.py
@@ -23,7 +23,9 @@ def test_preprint_article_query(self):
             q.search_string = string
             q.__prepare_and_build_query__()
             print(q.query_string)
-            assert q.query_string.replace(" ","").strip() == """
+            assert (
+                q.query_string.replace(" ", "").strip()
+                == """
 #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q
 SELECT DISTINCT ?item ?itemLabel
 WHERE {
@@ -42,5 +44,8 @@ def test_preprint_article_query(self):
 )
 MINUS {?item wdt:P921/wdt:P279 wd:Q407541. }
 SERVICE wikibase:label { bd:serviceParam wikibase:language "en". 
} - }""".replace(" ","").replace("\\", "").strip() - break \ No newline at end of file + }""".replace( + " ", "" + ) + .replace("\\", "") + .strip() + ) + break diff --git a/tests/test_query.py b/tests/test_query.py deleted file mode 100644 index 14bd9fe..0000000 --- a/tests/test_query.py +++ /dev/null @@ -1,9 +0,0 @@ -from src.models.wikimedia.wikidata.query import Query - - -class TestQuery: - def test_parse_results(self): - pass - def test_build_query(self): - """We test that all the parameters are taken into consideration""" - q = Query() \ No newline at end of file diff --git a/tests/test_riksdagen_document.py b/tests/test_riksdagen_document.py index bb6b974..d66604a 100644 --- a/tests/test_riksdagen_document.py +++ b/tests/test_riksdagen_document.py @@ -3,7 +3,9 @@ from src import tasks from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem -from src.models.wikimedia.wikidata.query.riksdagen_document import RiksdagenDocumentQuery +from src.models.wikimedia.wikidata.query.riksdagen_document import ( + RiksdagenDocumentQuery, +) class TestRiksdagenDocumentQuery(TestCase): @@ -23,7 +25,9 @@ def test_riksdagen_document_query(self): q.search_string = string q.__prepare_and_build_query__() print(q.query_string) - assert q.query_string.replace(" ","").strip() == """ + assert ( + q.query_string.replace(" ", "").strip() + == """ #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q SELECT DISTINCT ?item ?itemLabel WHERE { @@ -47,5 +51,8 @@ def test_riksdagen_document_query(self): # Thanks to Jan Ainali for this improvement :) MINUS {?main_subject_item wdt:P921 ?topic. ?topic wdt:P279 wd:Q407541. } SERVICE wikibase:label { bd:serviceParam wikibase:language "sv". } - }""".replace(" ","").strip() - break \ No newline at end of file + }""".replace( + " ", "" + ).strip() + ) + break diff --git a/tests/test_thesis.py b/tests/test_thesis.py index 62efe70..784ac6b 100644 --- a/tests/test_thesis.py +++ b/tests/test_thesis.py @@ -23,7 +23,9 @@ def test_thesis_query(self): q.search_string = string q.__prepare_and_build_query__() print(q.query_string) - assert q.query_string.replace(" ","").strip() == """ + assert ( + q.query_string.replace(" ", "").strip() + == """ #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q SELECT DISTINCT ?item ?itemLabel WHERE { @@ -45,5 +47,8 @@ def test_thesis_query(self): REGEX(LCASE(?label), "^fentanyl .*"@en)) MINUS {?item wdt:P921 ?topic. ?topic wdt:P279 wd:Q407541. } SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } - }""".replace(" ","").strip() - break \ No newline at end of file + }""".replace( + " ", "" + ).strip() + ) + break
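A note on the assertion style shared by the query tests in patches 35 and 37 above: the generated and the expected SPARQL are both normalized by stripping every space before they are compared, so indentation and line-wrapping differences cannot break the tests. A minimal self-contained sketch of the pattern (the normalized() helper is hypothetical; the real tests inline the replace()/strip() calls on both sides of the assert):

```python
# Hypothetical helper illustrating the whitespace-insensitive comparison
# used by the query tests; the real tests inline these calls.
def normalized(query: str) -> str:
    return query.replace(" ", "").strip()


generated = "SELECT DISTINCT ?item ?itemLabel WHERE { ?item wdt:P31 wd:Q1266946. }"
expected = "SELECT DISTINCT ?item ?itemLabel WHERE {?item  wdt:P31  wd:Q1266946.}"

# Equal after normalization even though the spacing differs.
assert normalized(generated) == normalized(expected)

# Trade-off: because ALL spaces are removed, a genuinely missing space
# between two tokens in the generated query would also go undetected.
```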