diff --git a/.github/workflows/lint_python.yml b/.github/workflows/lint_python.yml
index 69fc6c5..370cd51 100644
--- a/.github/workflows/lint_python.yml
+++ b/.github/workflows/lint_python.yml
@@ -17,7 +17,7 @@ jobs:
     - run: poetry install --with=dev
     - run: poetry run bandit --recursive --skip B301,B105,B403,B311,B101,B324 src # B101 is assert statements
     - run: poetry run black --check .
     - run: poetry run codespell # --ignore-words-list="" --skip="*.css,*.js,*.lock"
     - run: poetry run flake8 --ignore=C408,C416,E203,F401,F541,R501,R502,R503,R504,W503 --max-complexity=21 --max-line-length=162 --show-source --statistics .
     - run: poetry run isort --check-only --profile black .
diff --git a/.gitignore b/.gitignore
index 64f6dda..6572641 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
-config/__init__.py
-pickle.dat
\ No newline at end of file
+pickle.dat
+config.py
diff --git a/README.md b/README.md
index 1afb081..1e5c0ff 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,11 @@ open graph editable by anyone and maintained by the community itself for the pur
 scientists find each other's work. Wikipedia and Scholia can fill that gap but
 we need good tooling to curate the millions of items.
 
+# Caveat
+This type of matching, which takes ONLY the label and not the underlying structured
+data into account, is SUBOPTIMAL. You are very welcome to suggest or contribute improvements
+so we can improve the tool to help you make better edits.
+
 # Features
 This tool has the following features:
 * Adding a list of manually supplied main subjects to a few selected subgraphs
@@ -34,6 +39,11 @@ so that batches can easily be undone later if needed. Click "details" in the
 summary of edits to see more.
 
 # Installation
+Download the latest release with:
+
+`$ pip install itemsubjector`
+
+# Alternative installation in venv
 Download the release tarball or clone the tool using Git.
 
 ## Clone the repository
@@ -41,7 +51,7 @@ Download the release tarball or clone the tool using Git.
 
 Then checkout the latest release.
 
-`git checkout v0.x` where x is the latest number on the release page.
+`git checkout vx.x.x` where x is the latest number on the release page.
 
 ## Setup the environment
@@ -72,6 +82,8 @@ issues.
 
 ## Wikimedia Cloud Services Kubernetes Beta cluster
+*Note: this is for advanced users experienced with an SSH console environment; ask in the [Telegram WikiCite group](https://meta.m.wikimedia.org/wiki/Telegram#Wikidata) if you need help*
+
 See [Kubernetes_HOWTO.md](Kubernetes_HOWTO.md)
 
 # Setup
@@ -82,7 +94,7 @@ config/__init__.py and enter the botusername
 for your account and make sure you give it the *edit page permission*
 and *high volume permissions*)
 
-* e.g. `cd config && cp __init__example.py __init__.py && nano __init__.py`
+* e.g. `cp config_example.py config.py && nano config.py`
 
 *GNU Nano is an editor, press `ctrl+x` when you are done and `y` to save your changes*
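+
+A minimal `config.py` might look like the following sketch (all values here are placeholders; create your own bot password via [Special:BotPasswords](https://www.wikidata.org/wiki/Special:BotPasswords)):
+
+```python
+# placeholder credentials - replace with your own
+username = "MyUsername@itemsubjector"  # bot password name
+password = "my-bot-password-secret"  # bot password secret
+wiki_user = "User:MyUsername"  # your Wikidata username
+```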
@@ -148,10 +160,10 @@ Usage example:
 `poetry run python itemsubjector.py -a Q34 --show-item-urls`
 (the shorthand `-iu` also works)
 
-### Limit to scholarly articles without main subject
-Usage example:
-`poetry run python itemsubjector.py -a Q34 --limit-to-items-without-p921`
-(the shorthand `-w` also works)
+[//]: # (### Limit to scholarly articles without main subject)
+[//]: # (Usage example:)
+[//]: # (`poetry run python itemsubjector.py -a Q34 --limit-to-items-without-p921`)
+[//]: # ((the shorthand `-w` also works))
 
 ## Matching main subjects based on a SPARQL query.
 The tool can create a list of jobs by picking random subjects from a
@@ -213,8 +225,6 @@ optional arguments:
                         Remove prepared jobs
   -m, --match-existing-main-subjects
                         Match from list of 136.000 already used main subjects on other scientific articles
-  -w, --limit-to-items-without-p921
-                        Limit matching to scientific articles without P921 main subject
   -su, --show-search-urls
                         Show an extra column in the table of search strings with links
   -iu, --show-item-urls
@@ -240,6 +250,15 @@ removed the QuickStatements export to simplify the program.
 * This project has been used in a scientific paper I wrote together with
 [Houcemeddine Turki](https://scholia.toolforge.org/author/Q53505397)
 
+## Rewrite 2022
+* Important to break methods down to one method, one task, to increase readability -> helps reuse in other projects.
+* Important to avoid resetting attributes and to instantiate classes instead -> helps reuse in other projects.
+* Simplify as much as possible to keep the whole thing lean and avoid scope creep (KISS principle) -> helps reuse in other projects.
+* Difficult to judge which features are used and which are not. User testing would be nice.
+* UML diagrams are nice. They give a good quick overview.
+* Removing options that no one seems to use helps keep the tool simple. It would be valuable to get better insight into how the
+program is used by users. A discussion on GitHub might help with this.
+
 # Thanks
 During the development of this tool the author got help multiple times
 from **Jan Ainali** and **Jon Søby**
@@ -254,7 +273,7 @@ helpful people in the Wikimedia Cloud Services Support chat that helped
 with making batch jobs run successfully.
 
 Thanks also to **jsamwrites** for help with testing and suggestions
-for improvement.
+for improvement and for using the tool to improve a ton of items :).
 # License
 GPLv3+
diff --git a/config/__init__example.py b/config.example.py
similarity index 59%
rename from config/__init__example.py
rename to config.example.py
index 7b7398f..e5cef59 100644
--- a/config/__init__example.py
+++ b/config.example.py
@@ -1,14 +1,14 @@
 import logging
 import tempfile
+from typing import List
 
-# Rename this file to __init__.py
+# Rename this file to config.py
 # Add your botpassword and login here:
-
 username = ""
 password = ""
 
-# Settings
+# General settings
 automatically_approve_jobs_with_less_than_fifty_matches = False
 loglevel = logging.WARNING
 wiki_user = "User:Username"  # Change this to your username
@@ -21,3 +21,27 @@
 # This should work for all platforms except kubernetes
 job_pickle_file_path = f"{tempfile.gettempdir()}/pickle.dat"
 # job_pickle_file_path = "~/pickle.dat" # works on kubernetes
+
+"""
+Settings for items
+"""
+
+list_of_allowed_aliases: List[str] = []  # Add elements like this ["API"]
+
+# Scholarly items settings
+blocklist_for_scholarly_items: List[str] = [
+    "Q28196260",
+    "Q28196260",
+    "Q28196266",  # iodine
+    "Q27863114",  # testosterone
+    "Q28196266",
+    "Q28196260",
+    "Q109270553",  # dieback
+]
+no_alias_for_scholarly_items: List[str] = [
+    "Q407541",
+    "Q423930",
+    "Q502327",
+    "Q416950",
+    "Q95566669",  # hypertension
+]
diff --git a/config/items.py b/config/items.py
deleted file mode 100644
index bffb1a4..0000000
--- a/config/items.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""
-Settings for items
-"""
-from typing import List
-
-list_of_allowed_aliases: List[str] = []  # Add elements like this ["API"]
-
-# Scholarly items settings
-blocklist_for_scholarly_items: List[str] = [
-    "Q28196260",
-    "Q28196260",
-    "Q28196266",  # iodine
-    "Q27863114",  # testosterone
-    "Q28196266",
-    "Q28196260",
-    "Q109270553",  # dieback
-]
-no_alias_for_scholarly_items: List[str] = [
-    "Q407541",
-    "Q423930",
-    "Q502327",
-    "Q416950",
-    "Q95566669",  # hypertension
-]
diff --git a/diagrams/classes.puml b/diagrams/classes.puml
index 3bf280d..5593a04 100644
--- a/diagrams/classes.puml
+++ b/diagrams/classes.puml
@@ -46,17 +46,44 @@ package wikimedia {
     class EntityID{
         letter: WikidataNamespaceLetters
         rest: str
-        __init__()
         __str__()
     }
-    class ForeignID{
-        __init__()
+    abstract class Query{
+        __execute__()
+        __parse_results__()
+        __prepare_and_build_query__()
+        __strip_bad_chars__()
+        get_results()
+        print_number_of_results()
+    }
+    class PreprintArticleQuery {
+        __prepare_and_build_query__()
+    }
+    class RiksdagenDocumentQuery {
+        __prepare_and_build_query__()
+    }
+    class PublishedArticleQuery {
+        __build_query__()
+        __check_we_got_everything_we_need__()
+        __prepare_and_build_query__()
+        __setup_cirrussearch_params__()
     }
     class SparqlItem{
         item: Value
         itemLabel: Value
         validate_qid_and_copy_label()
     }
+    class MainSubjectItem {
+        item: Item = None
+        search_strings: List[str] = None
+        task: Task = None
+        args: argparse.Namespace = None
+        __init__()
+        __str__()
+        add_to_items()
+        extract_search_strings()
+        search_urls()
+    }
     class Item{
         label: Optional[str] = None
         description: Optional[str] = None
@@ -84,51 +111,40 @@ package wikimedia {
         SUPINE
         THIRD_PERSON_SINGULAR
     }
-    enum WikidataLexicalCategory {
-        ADJECTIVE
-        ADVERB
-        AFFIX
-        NOUN
-        PROPER_NOUN
-        VERB
-    }
-    enum WikidataNamespaceLetters {
-        ITEM
-        LEXEME
-        PROPERTY
-    }
+'    enum WikidataLexicalCategory {
+'        ADJECTIVE
+'        ADVERB
+'        AFFIX
+'        NOUN
+'        PROPER_NOUN
+'        VERB
+'    }
+'    enum WikidataNamespaceLetters {
+'        ITEM
+'        LEXEME
+'        PROPERTY
+'    }
 }
 }
 package items {
-    abstract class Items
-    class AcademicJournalItems {
-        fetch_based_on_label()
+    abstract class Items {
+        execute_queries()
+        fetch_based_on_label()
+        number_of_sparql_items()
+        print_items_list()
+        print_total_items()
+        random_shuffle_items()
+        remove_duplicates()
     }
     class RiksdagenDocumentItems {
-        +list
-        +fetch_based_on_label()
+        execute_queries()
+        fetch_based_on_label()
     }
-    class ScholarlyArticleItems {
-        +list
-        +fetch_based_on_label()
-    }
-    class ThesisItems {
-        list
-        fetch_based_on_label()
+        execute_queries()
+        fetch_based_on_label()
     }
 }
-class Suggestion {
-    item: Item = None
-    search_strings: List[str] = None
-    task: Task = None
-    args: argparse.Namespace = None
-    __init__()
-    __str__()
-    add_to_items()
-    extract_search_strings()
-    search_urls ())
-}
 
 class Task {
     best_practice_information: Union[str, None] = None
@@ -136,7 +152,6 @@ class Task {
     label: str = None
     language_code: SupportedLanguageCode = None
     number_of_queries_per_search_string = 1
-    __init__()
     __str__()
 }
 
@@ -152,18 +167,26 @@ class BatchJob {
     +items: Items
     run()
 }
-
-Items <|-- AcademicJournalItems
+class ItemSubjector {
+    export_jobs_to_dataframe()
+    match_main_subjects_from_sparql()
+    run()
+}
+'Items <|-- AcademicJournalItems
 Items <|-- RiksdagenDocumentItems
 Items <|-- ScholarlyArticleItems
-Items <|-- ThesisItems
+'Items <|-- ThesisItems
 BaseModel <|-- Entity
 BaseModel <|-- Task
-BaseModel <|-- Suggestion
 BaseModel <|-- BatchJob
 BaseModel <|-- BatchJobs
 BaseModel <|-- Items
+BaseModel <|-- ItemSubjector
 Entity <|-- Item
 Item <|-- SparqlItem
+Item <|-- MainSubjectItem
+Query <|-- PreprintArticleQuery
+Query <|-- PublishedArticleQuery
+Query <|-- RiksdagenDocumentQuery
 @enduml
\ No newline at end of file
diff --git a/diagrams/sequence_sparql.puml b/diagrams/sequence_sparql.puml
index 57b3525..d55987c 100644
--- a/diagrams/sequence_sparql.puml
+++ b/diagrams/sequence_sparql.puml
@@ -15,7 +15,12 @@ alt "arguments: sparql && limit"
     ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details
     Wikidata -> ItemSubjector : response
     ItemSubjector -> User : present max 50 items
+    alt auto-approve < 50 items enabled
+        ItemSubjector -> User : auto-approving batch
+    end
+    alt auto-approve < 50 items disabled OR > 50 items
     ItemSubjector -> User : ask for approval of batch
+    end
     ItemSubjector -> User : show count of batches and matches in the job list in memory
 end
 alt "above limit"
     ItemSubjector -> User : ask before continuing
@@ -36,8 +41,13 @@ alt "arguments: sparql && limit && prepare-jobs"
     ItemSubjector -> Wikidata : fetch scientific articles according to SPARQL query built based on the details
     Wikidata -> ItemSubjector : response
     ItemSubjector -> User : present max 50 items
+    alt auto-approve < 50 items enabled
+        ItemSubjector -> User : auto-approving batch
+    end
+    alt auto-approve < 50 items disabled OR > 50 items
     ItemSubjector -> User : ask for approval of batch
-    ItemSubjector -> User : show count of batches and matches in the job list in memory
+    end
+    ItemSubjector -> User : show count of batches and matches in the job list in memory
 end
 alt "above limit"
     ItemSubjector -> User : ask before continuing
diff --git a/poetry.lock b/poetry.lock
index dc7db08..0b9b7b7 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -313,7 +313,7 @@ gitdb = ">=4.0.1,<5"
 
 [[package]]
 name = "identify"
-version = "2.5.5"
+version = "2.5.6"
 description = "File identification library for Python"
 category = "dev"
 optional = false
@@ -567,7 +567,7 @@ optional = false
 python-versions = ">=3.6"
 
 [[package]]
-name = "Pygments"
+name = "pygments"
 version = "2.13.0"
 description = "Pygments is a syntax highlighting package written in Python."
category = "main" @@ -645,7 +645,7 @@ reference = "pypi-test" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.4" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -653,14 +653,14 @@ python-versions = "*" [[package]] name = "pyupgrade" -version = "2.38.2" +version = "2.38.4" description = "A tool to automatically upgrade syntax for newer versions." category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -tokenize-rt = ">=3.2.0" +tokenize-rt = "<5" [[package]] name = "PyYAML" @@ -705,7 +705,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] [[package]] name = "rich" -version = "12.5.1" +version = "12.6.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" category = "main" optional = false @@ -744,7 +744,7 @@ python-versions = ">=3.5" [[package]] name = "safety" -version = "2.2.0" +version = "2.3.1" description = "Checks installed dependencies for known vulnerabilities and licenses." category = "dev" optional = false @@ -758,6 +758,10 @@ requests = "*" "ruamel.yaml" = ">=0.17.21" setuptools = ">=19.3" +[package.extras] +github = ["jinja2 (>=3.1.0)", "pygithub (>=1.43.3)"] +gitlab = ["python-gitlab (>=1.3.0)"] + [[package]] name = "setuptools" version = "65.4.1" @@ -1079,8 +1083,8 @@ GitPython = [ {file = "GitPython-3.1.27.tar.gz", hash = "sha256:1c885ce809e8ba2d88a29befeb385fcea06338d3640712b59ca623c220bb5704"}, ] identify = [ - {file = "identify-2.5.5-py2.py3-none-any.whl", hash = "sha256:ef78c0d96098a3b5fe7720be4a97e73f439af7cf088ebf47b620aeaa10fadf97"}, - {file = "identify-2.5.5.tar.gz", hash = "sha256:322a5699daecf7c6fd60e68852f36f2ecbb6a36ff6e6e973e0d2bb6fca203ee6"}, + {file = "identify-2.5.6-py2.py3-none-any.whl", hash = "sha256:b276db7ec52d7e89f5bc4653380e33054ddc803d25875952ad90b0f012cbcdaa"}, + {file = "identify-2.5.6.tar.gz", hash = "sha256:6c32dbd747aa4ceee1df33f25fed0b0f6e0d65721b15bd151307ff7056d50245"}, ] idna = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, @@ -1273,7 +1277,7 @@ pyflakes = [ {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] -Pygments = [ +pygments = [ {file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f258f2a4c5bb73c7b9daae54f90aa5fa3aba3390164e059cc0408606a12d0647"}, {file = "Pygments-2.13.0.tar.gz", hash = "sha256:e9a08f2ce610f4e86142d6a6d8209f61bea945865993fe46de701f40aed43cdd"}, ] @@ -1294,12 +1298,12 @@ python-dateutil = [ {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:1efd93a2e222eb7360b5396108fdfa04e9753637d24143b8026dfb48ffbc755b"}, ] pytz = [ - {file = "pytz-2022.2.1-py2.py3-none-any.whl", hash = "sha256:220f481bdafa09c3955dfbdddb7b57780e9a94f5127e35456a48589b9e0c0197"}, - {file = "pytz-2022.2.1.tar.gz", hash = "sha256:cea221417204f2d1a2aa03ddae3e867921971d0d76f14d87abb4414415bbdcf5"}, + {file = "pytz-2022.4-py2.py3-none-any.whl", hash = "sha256:2c0784747071402c6e99f0bafdb7da0fa22645f06554c7ae06bf6358897e9c91"}, + {file = "pytz-2022.4.tar.gz", hash = "sha256:48ce799d83b6f8aab2020e369b627446696619e79645419610b9facd909b3174"}, ] pyupgrade = [ - {file = "pyupgrade-2.38.2-py2.py3-none-any.whl", hash = "sha256:41bb9a9fd48fe57163b0dacffff433d6d5a63a0f7c2402918917b5f1a533342b"}, - {file = 
"pyupgrade-2.38.2.tar.gz", hash = "sha256:a5d778c9de0b53975c6a9eac2d0df5adfad244a9f7d7993d8a114223ebbda367"}, + {file = "pyupgrade-2.38.4-py2.py3-none-any.whl", hash = "sha256:944ff993c396ddc2b9012eb3de4cda138eb4c149b22c6c560d4c8bfd0e180982"}, + {file = "pyupgrade-2.38.4.tar.gz", hash = "sha256:1eb43a49f416752929741ba4d706bf3f33593d3cac9bdc217fc1ef55c047c1f4"}, ] PyYAML = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, @@ -1352,8 +1356,8 @@ requests-oauthlib = [ {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, ] rich = [ - {file = "rich-12.5.1-py3-none-any.whl", hash = "sha256:2eb4e6894cde1e017976d2975ac210ef515d7548bc595ba20e195fb9628acdeb"}, - {file = "rich-12.5.1.tar.gz", hash = "sha256:63a5c5ce3673d3d5fbbf23cd87e11ab84b6b451436f1b7f19ec54b6bc36ed7ca"}, + {file = "rich-12.6.0-py3-none-any.whl", hash = "sha256:a4eb26484f2c82589bd9a17c73d32a010b1e29d89f1604cd9bf3a2097b81bb5e"}, + {file = "rich-12.6.0.tar.gz", hash = "sha256:ba3a3775974105c221d31141f2c116f4fd65c5ceb0698657a11e9f295ec93fd0"}, ] "ruamel.yaml" = [ {file = "ruamel.yaml-0.17.21-py3-none-any.whl", hash = "sha256:742b35d3d665023981bd6d16b3d24248ce5df75fdb4e2924e93a05c1f8b61ca7"}, @@ -1392,8 +1396,8 @@ rich = [ {file = "ruamel.yaml.clib-0.2.6.tar.gz", hash = "sha256:4ff604ce439abb20794f05613c374759ce10e3595d1867764dd1ae675b85acbd"}, ] safety = [ - {file = "safety-2.2.0-py3-none-any.whl", hash = "sha256:b1a0f4c34fb41c502a7a5c54774c18376da382bc9d866ee26b39b2c747c0de40"}, - {file = "safety-2.2.0.tar.gz", hash = "sha256:6745de12acbd60a58001fe66cb540355187d7b991b30104d9ef14ff4e4826073"}, + {file = "safety-2.3.1-py3-none-any.whl", hash = "sha256:8f098d12b607db2756886280e85c28ece8db1bba4f45fc5f981f4663217bd619"}, + {file = "safety-2.3.1.tar.gz", hash = "sha256:6e6fcb7d4e8321098cf289f59b65051cafd3467f089c6e57c9f894ae32c23b71"}, ] setuptools = [ {file = "setuptools-65.4.1-py3-none-any.whl", hash = "sha256:1b6bdc6161661409c5f21508763dc63ab20a9ac2f8ba20029aaaa7fdb9118012"}, diff --git a/pyproject.toml b/pyproject.toml index 6e6a2d3..e2b1f91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "itemsubjector" -version = "0.3.2" +version = "0.3.3" description = "CLI-tool to easily add \"main subject\" aka topics in bulk to groups of items on Wikidata" authors = ["Dennis Priskorn <68460690+dpriskorn@users.noreply.github.com>"] license = "GPLv3+" diff --git a/src/__init__.py b/src/__init__.py index 51fd1a7..998d65a 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,4 +1,3 @@ -import argparse import logging import pandas as pd # type: ignore @@ -8,25 +7,14 @@ import config from src.helpers.argparse_setup import setup_argparse_and_return_args -from src.helpers.cleaning import strip_prefix -from src.helpers.console import ( - ask_add_to_job_queue, - ask_discard_existing_job_pickle, - ask_yes_no_question, - console, +from src.helpers.cli_messages import ( print_best_practice, print_finished, print_found_items_table, print_job_statistics, - print_keep_an_eye_on_wdqs_lag, ) +from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag from src.helpers.enums import TaskIds -from src.helpers.jobs import ( - get_validated_main_subjects_as_jobs, - handle_job_preparation_or_run_directly_if_any_jobs, - process_qid_into_job, - process_user_supplied_qids_into_batch_jobs, -) from src.helpers.menus import select_task from 
src.helpers.migration import migrate_pickle_detection from src.helpers.pickle import ( @@ -36,99 +24,33 @@ parse_job_pickle, remove_job_pickle, ) +from src.helpers.questions import ( + ask_add_to_job_queue, + ask_discard_existing_job_pickle, + ask_yes_no_question, +) from src.models.batch_job import BatchJob from src.models.batch_jobs import BatchJobs +from src.models.main_subjects import MainSubjects from src.models.suggestion import Suggestion from src.models.task import Task from src.models.wikimedia.wikidata.entiyt_id import EntityId from src.tasks import tasks logging.basicConfig(level=config.loglevel) +logger = logging.getLogger(__name__) class ItemSubjector(BaseModel): @staticmethod - def match_main_subjects_from_sparql(args: argparse.Namespace = None): - """Collect subjects via SPARQL and call get_validated_main_subjects() - If we get any validated jobs we handle them""" - logger = logging.getLogger(__name__) - if args is None or args.sparql is None: - raise ValueError("args.sparql was None") - if "P1889" not in args.sparql: - console.print( - "Your SPARQL did not contain P1889 (different from). " - "Please include 'MINUS {?item wdt:P1889 [].}' " - "in your WHERE clause to avoid false positives." - ) - exit(0) - else: - logger.info("Detected P1889 in the query") - with console.status("Running query on WDQS..."): - main_subjects = [] - results = execute_sparql_query( - args.sparql.replace("{", "{{").replace("}", "}}"), - ) - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - main_subjects.append(item_json["item"]["value"]) - if len(main_subjects) > 0: - console.print(f"Got {len(main_subjects)} results") - batchjobs = get_validated_main_subjects_as_jobs( - args=args, main_subjects=main_subjects - ) - handle_job_preparation_or_run_directly_if_any_jobs( - args=args, batchjobs=batchjobs - ) - else: - console.print("Got 0 results. Try another query or debug it using --debug") - - @staticmethod - def export_jobs_to_dataframe(): - logger = logging.getLogger(__name__) - logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame") - batchjobs = parse_job_pickle() - if batchjobs is not None: - if batchjobs is not None and batchjobs.job_count > 0: - logger.info(f"Found {batchjobs.job_count} jobs") - df = pd.DataFrame() - count = 1 - for job in batchjobs.jobs: - count += 1 - logger.info(f"Working on job {count}/{batchjobs.job_count}") - job_df = pd.DataFrame() - for item in job.items.list: - job_df = job_df.append( - pd.DataFrame( - data=[ - dict( - qid=item.id, - label=item.label, - description=item.description, - ) - ] - ) - ) - df = df.append(job_df) - logger.debug(f"Added {len(job.items.list)} items to the dataframe") - logger.debug(f"Exporting {len(df)} rows to pickle") - pickle_filename = "dataframe.pkl.gz" - df.to_pickle(pickle_filename) - console.print(f"Wrote to {pickle_filename} in the current directory") - else: - console.print( - "No jobs found. 
Create a job list first by using '--prepare-jobs'" - ) - - def run(self): + def run(): """This is the main function that makes everything else happen""" - logger = logging.getLogger(__name__) migrate_pickle_detection() args = setup_argparse_and_return_args() - # console.print(args.list) + # console.print(args.sparql_items) if args.remove_prepared_jobs is True: remove_job_pickle() - console.print("Removed the job list.") - # exit(0) + console.print("Removed the job sparql_items.") if args.prepare_jobs is True: logger.info("Preparing jobs") if check_if_pickle_exists(config.job_pickle_file_path): @@ -139,28 +61,20 @@ def run(self): if args.run_prepared_jobs is True: logger.info("Running prepared jobs") batchjobs = parse_job_pickle() - if batchjobs is not None and len(batchjobs.jobs) > 0: + if batchjobs and len(batchjobs.jobs) > 0: file_hash = get_hash_of_job_pickle() batchjobs.run_jobs() # Remove the pickle afterwards remove_job_pickle(hash=file_hash) - elif args.export_jobs_to_dataframe: - self.export_jobs_to_dataframe() elif args.sparql: - self.match_main_subjects_from_sparql(args=args) + main_subjects = MainSubjects(args=args) + main_subjects.match_main_subjects_from_sparql() + main_subjects.get_validated_main_subjects_as_jobs() + main_subjects.handle_job_preparation_or_run_directly_if_any_jobs() else: - # if not args.run_prepared_jobs: if args.add is None: console.print("Got no arguments or QIDs. Try '--help' for help.") - exit(0) - task: Task = select_task() - if task is None: - raise ValueError("Got no task") - jobs = [] - jobs.extend( - process_user_supplied_qids_into_batch_jobs(args=args, task=task) - ) - batchjobs = BatchJobs(jobs=jobs) - handle_job_preparation_or_run_directly_if_any_jobs( - args=args, batchjobs=batchjobs - ) + else: + main_subjects = MainSubjects(args=args, main_subjects=args.add) + main_subjects.get_validated_main_subjects_as_jobs() + main_subjects.handle_job_preparation_or_run_directly_if_any_jobs() diff --git a/src/helpers/argparse_setup.py b/src/helpers/argparse_setup.py index 328a693..15d2549 100644 --- a/src/helpers/argparse_setup.py +++ b/src/helpers/argparse_setup.py @@ -6,16 +6,16 @@ def setup_argparse_and_return_args(): formatter_class=argparse.RawDescriptionHelpFormatter, description=""" ItemSubjector enables working main subject statements on items based on a -heuristic matching the subject with the title of the item. +heuristic matching the subject with the title of the main_subject_item. Example adding one Qid: '$ itemsubjector.py -a Q1234' -Example adding one Qid and prepare a job list to be run non-interactively later: +Example adding one Qid and prepare a job sparql_items to be run non-interactively later: '$ itemsubjector.py -a Q1234 -p' Example working on all diseases: -'$ itemsubjector.py --sparql "SELECT ?item WHERE {?item wdt:P31 wd:Q12136. MINUS {?item wdt:P1889 [].}}"' +'$ itemsubjector.py --sparql "SELECT ?main_subject_item WHERE {?main_subject_item wdt:P31 wd:Q12136. 
MINUS {?main_subject_item wdt:P1889 [].}}"' """, ) parser.add_argument( @@ -59,21 +59,12 @@ def setup_argparse_and_return_args(): action="store_true", help="Remove prepared jobs", ) - parser.add_argument( - "-m", - "--match-existing-main-subjects", - action="store_true", - help=( - "Match from list of 136.000 already used " - "main subjects on other scientific articles" - ), - ) - parser.add_argument( - "-w", - "--limit-to-items-without-p921", - action="store_true", - help="Limit matching to scientific articles without P921 main subject", - ) + # parser.add_argument( + # "-w", + # "--limit-to-items-without-p921", + # action="store_true", + # help="Limit matching to scientific articles without P921 main subject", + # ) parser.add_argument( "-su", "--show-search-urls", @@ -82,7 +73,7 @@ def setup_argparse_and_return_args(): ) parser.add_argument( "-iu", - "--show-item-urls", + "--show-main_subject_item-urls", action="store_true", help="Show an extra column in the table of items with links", ) @@ -90,8 +81,8 @@ def setup_argparse_and_return_args(): "--sparql", nargs="?", help="Work on main subject items returned by this SPARQL query.\n" - 'Note: "?item" has to be selected for it to work, see the example above.\n' - "Note: MINUS {?item wdt:P1889 [].} must be present in the query to avoid false positives.", + 'Note: "?main_subject_item" has to be selected for it to work, see the example above.\n' + "Note: MINUS {?main_subject_item wdt:P1889 [].} must be present in the query to avoid false positives.", ) parser.add_argument( "--debug-sparql", @@ -104,12 +95,6 @@ def setup_argparse_and_return_args(): "--limit", nargs="?", type=int, - help="When working on SPARQL queries of e.g. galaxies, match more until this many matches are in the job list", - ) - parser.add_argument( - "--export-jobs-to-dataframe", - action="store_true", - help="Export the prepared job list to a Pandas DataFrame.", - default=False, + help="When working on SPARQL queries of e.g. galaxies, match more until this many matches are in the job sparql_items", ) return parser.parse_args() diff --git a/src/helpers/cleaning.py b/src/helpers/cleaning.py index 840850e..44a5631 100644 --- a/src/helpers/cleaning.py +++ b/src/helpers/cleaning.py @@ -1,33 +1,3 @@ -def strip_bad_chars(string): - # Note this has to match the cleaning done in the sparql query - # We lowercase and remove common symbols - # We replace like this to save CPU cycles see - # https://stackoverflow.com/questions/3411771/best-way-to-replace-multiple-characters-in-a-string - return ( - string - # Needed for matching backslashes e.g. 
"Dmel\CG5330" on Q29717230 - .replace("\\", "\\\\") - # Needed for when labels contain apostrophe - .replace("'", "\\'") - .replace(",", "") - .replace(":", "") - .replace(";", "") - .replace("(", "") - .replace(")", "") - .replace("[", "") - .replace("]", "") - ) - - def clean_rich_formatting(label): # Fix rich parse bug with "[/TSUP]" and "[/ITAL]" return label.replace("[/", "['/") - - -def strip_prefix(qid): - if "https://www.wikidata.org/wiki/" in qid: - qid = qid[30:] - if "http://www.wikidata.org/entity/" in qid: - qid = qid[31:] - # logger.debug(f"qid:{qid}") - return qid diff --git a/src/helpers/cli_messages.py b/src/helpers/cli_messages.py new file mode 100644 index 0000000..ad60b9d --- /dev/null +++ b/src/helpers/cli_messages.py @@ -0,0 +1,108 @@ +# from __future__ import annotations + +import argparse +from typing import Set +from urllib.parse import quote + +from rich.table import Table + +from src.helpers.cleaning import clean_rich_formatting +from src.helpers.console import console, press_enter_to_continue +from src.models.batch_jobs import BatchJobs +from src.models.items import Items +from src.models.task import Task + + +def print_best_practice(task: Task): + if task.best_practice_information: + console.print(task.best_practice_information) + press_enter_to_continue() + + +def print_search_strings_table( + args: argparse.Namespace = None, search_strings: Set[str] = None +): + if args is None: + raise ValueError("args was None") + if search_strings is None: + raise ValueError("search strings was None") + table = Table(title="Search strings") + table.add_column(f"Extracted the following {len(search_strings)} search strings") + if args.show_search_urls: + table.add_column(f"Wikidata search URL") + for string in search_strings: + if args.show_search_urls: + table.add_row( + string, f"https://www.wikidata.org/w/index.php?search={quote(string)}" + ) + else: + table.add_row(string) + console.print(table) + + +def print_found_items_table(args: argparse.Namespace = None, items: Items = None): + if args is None: + raise ValueError("args was None") + if items is None: + raise ValueError("items was None") + if items.sparql_items is None: + raise ValueError("items.sparql_items was None") + table = Table(title="Matched items found") + if items.number_of_sparql_items < 1000: + list_to_show = items.sparql_items[0:50] + else: + # Show 1 sample for each 20 items in the sparql_items + list_to_show = items.sparql_items[0 : int(items.number_of_sparql_items / 20)] + if items.number_of_sparql_items > 4000: + console.print( + "[red]Warning: This is a very large batch, please proceed with caution[/red]" + ) + press_enter_to_continue() + table.add_column( + f"Showing a random subset of {len(list_to_show)} " + f"items, please review as many as possible for false " + f"positives and reject the batch if you find any." 
+    )
+    if getattr(args, "show_item_urls", False):
+        table.add_column(f"Wikidata URL")
+    for item in list_to_show:
+        if item.label is None:
+            raise ValueError("item.label was None")
+        if getattr(args, "show_item_urls", False):
+            label = clean_rich_formatting(item.label)
+            table.add_row(label, item.url)
+        else:
+            table.add_row(item.label)
+    console.print(table)
+
+
+def print_finished():
+    console.print("All jobs finished successfully")
+
+
+def print_job_statistics(batchjobs: BatchJobs = None):
+    if not batchjobs:
+        raise ValueError("batchjobs was None")
+    if not batchjobs.jobs:
+        # No jobs to print information about
+        return
+    if not isinstance(batchjobs.jobs, list):
+        raise ValueError("jobs was not a list")
+    if not batchjobs.number_of_jobs:
+        console.print("The jobs list is empty")
+    else:
+        total_number_of_queries = sum([job.number_of_queries for job in batchjobs.jobs])
+        total_number_of_items = sum(
+            job.main_subject_item.items.number_of_sparql_items
+            for job in batchjobs.jobs
+            if batchjobs.jobs
+            and job
+            and job.main_subject_item.items
+            and job.main_subject_item.items.sparql_items
+        )
+        console.print(
+            f"The jobs list now contains a total of {batchjobs.number_of_jobs} "  # type: ignore
+            f"jobs with a total of "
+            f"{total_number_of_items} items found from "
+            f"{total_number_of_queries} queries"
+        )
diff --git a/src/helpers/console.py b/src/helpers/console.py
index febb26d..68951a2 100644
--- a/src/helpers/console.py
+++ b/src/helpers/console.py
@@ -1,37 +1,8 @@
-from __future__ import annotations
-
-import argparse
-from typing import TYPE_CHECKING, Set
-from urllib.parse import quote
-
 from rich.console import Console
-from rich.table import Table
-
-from src.helpers.cleaning import clean_rich_formatting
-from src.models.batch_job import BatchJob
-from src.models.batch_jobs import BatchJobs
-
-if TYPE_CHECKING:
-    from src.models.items import Items
-    from src.models.task import Task
 
 console = Console()
 
 
-def ask_yes_no_question(message: str):
-    # https://www.quora.com/
-    # I%E2%80%99m-new-to-Python-how-can-I-write-a-yes-no-question
-    # this will loop forever
-    while True:
-        answer = console.input(message + " [Y/Enter/n]: ")
-        if len(answer) == 0 or answer[0].lower() in ("y", "n"):
-            if len(answer) == 0:
-                return True
-            else:
-                # the == operator just returns a boolean,
-                return answer[0].lower() == "y"
-
-
 def print_keep_an_eye_on_wdqs_lag():
     console.print(
         "Please keep an eye on the lag of the WDQS cluster here and avoid "
@@ -49,122 +20,3 @@
 def press_enter_to_continue():
     console.input("Press Enter to continue.")
-
-
-def print_best_practice(task: Task):
-    if task.best_practice_information is not None:
-        console.print(task.best_practice_information)
-        press_enter_to_continue()
-
-
-def print_search_strings_table(
-    args: argparse.Namespace = None, search_strings: Set[str] = None
-):
-    if args is 
None: - raise ValueError("args was None") - if items is None: - raise ValueError("items was None") - if items.list is None: - raise ValueError("items.list was None") - table = Table(title="Matched items found") - if len(items.list) < 1000: - list_to_show = items.list[0:50] - else: - # Show 1 sample for each 20 items in the list - list_to_show = items.list[0 : int(len(items.list) / 20)] - if len(items.list) > 4000: - console.print( - "[red]Warning: This is a very large batch, please proceed with caution[/red]" - ) - press_enter_to_continue() - table.add_column( - f"Showing a random subset of {len(list_to_show)} " - f"items, please review as many as possible for false " - f"positives and reject the batch if you find any." - ) - if args.show_item_urls: - table.add_column(f"Wikidata URL") - for item in list_to_show: - if item.label is None: - raise ValueError("item.label was None") - if args.show_item_urls: - label = clean_rich_formatting(item.label) - table.add_row(label, item.url()) - else: - table.add_row(item.label) - console.print(table) - - -def ask_add_to_job_queue(job: BatchJob = None): - if job is None: - raise ValueError("job was None") - if job.suggestion.item is None: - raise ValueError("job.suggestion.item was None") - if job.suggestion.item.label is None: - raise ValueError("job.suggestion.item.label was None") - if job.suggestion.item.description is None: - job.suggestion.item.description = "" - if job.items.list is None: - raise ValueError("job.items.list was None") - return ask_yes_no_question( - f"Do you want to add this job for " - f"[magenta]{job.suggestion.item.label}: " - f"{job.suggestion.item.description}[/magenta] with " - f"{len(job.items.list)} items to the queue? (see {job.suggestion.item.url()})" - ) - - -def print_finished(): - console.print("All jobs finished successfully") - - -def print_job_statistics(batchjobs: BatchJobs = None): - if batchjobs is None: - raise ValueError("jobs was None") - if batchjobs.jobs is None: - raise ValueError("batchjobs.jobs was None") - if not isinstance(batchjobs.jobs, list): - raise ValueError("jobs was not a list") - if len(batchjobs.jobs) == 0: - console.print("The jobs list is empty") - else: - total_number_of_queries = sum([job.number_of_queries for job in batchjobs.jobs]) - total_number_of_items = sum( - len(job.items.list) - for job in batchjobs.jobs - if batchjobs.jobs is not None - and job is not None - and job.items is not None - and job.items.list is not None - ) - console.print( - f"The jobs list now contain a total of {len(batchjobs.jobs)} " # type: ignore - f"jobs with a total of " - f"{total_number_of_items} items found from " - f"{total_number_of_queries} queries" - ) - - -def ask_discard_existing_job_pickle(): - return ask_yes_no_question( - "A prepared list of jobs already exist, " "do you want to delete it?" 
- ) diff --git a/src/helpers/jobs.py b/src/helpers/jobs.py deleted file mode 100644 index a79e1d5..0000000 --- a/src/helpers/jobs.py +++ /dev/null @@ -1,236 +0,0 @@ -from __future__ import annotations - -import argparse -import logging -import random -from typing import TYPE_CHECKING, List, Optional, Union - -import config -from src import ( - TaskIds, - ask_add_to_job_queue, - ask_yes_no_question, - console, - print_best_practice, - print_job_statistics, - strip_prefix, -) -from src.helpers.menus import select_task -from src.models.batch_jobs import BatchJobs -from src.models.items import Items -from src.models.items.academic_journals import AcademicJournalItems -from src.models.items.riksdagen_documents import RiksdagenDocumentItems -from src.models.items.scholarly_articles import ScholarlyArticleItems -from src.models.items.thesis import ThesisItems -from src.tasks import Task - -if TYPE_CHECKING: - from src import BatchJob - -# TODO rewrite as OOP -logger = logging.getLogger(__name__) - - -def process_qid_into_job( - qid: str = None, - task: Task = None, - args: argparse.Namespace = None, - confirmation: bool = False, -) -> Union[BatchJob, None]: - if qid is None: - raise ValueError("qid was None") - if args is None: - raise ValueError("args was None") - if task is None: - raise ValueError("task was None") - from src.models.wikimedia.wikidata.item import Item - - item = Item( - id=strip_prefix(qid), - ) - item.fetch_label_and_description_and_aliases(task=task) - if item.label is not None: - console.print(f"Working on {item}") - # generate suggestion with all we need - from src import Suggestion - - suggestion = Suggestion(item=item, task=task, args=args) - if confirmation: - answer = ask_yes_no_question("Do you want to continue?") - if not answer: - return None - suggestion.extract_search_strings() - if config.loglevel == logging.INFO: - suggestion.print_search_strings() - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - number_of_queries = ( - len(suggestion.search_strings) * task.number_of_queries_per_search_string - ) - with console.status( - f"Fetching items with labels that have one of " - f"the search strings by running a total of " - f"{number_of_queries} " - f"queries on WDQS..." 
- ): - items: Optional[Items] = None - if task.id == TaskIds.SCHOLARLY_ARTICLES: - items = ScholarlyArticleItems() - elif task.id == TaskIds.RIKSDAGEN_DOCUMENTS: - items = RiksdagenDocumentItems() - elif task.id == TaskIds.THESIS: - items = ThesisItems() - elif task.id == TaskIds.ACADEMIC_JOURNALS: - items = AcademicJournalItems() - else: - raise ValueError(f"{task.id} was not recognized") - items.fetch_based_on_label(suggestion=suggestion, task=task) - if items.list is None: - raise ValueError("items.list was None") - if len(items.list) > 0: - # Remove duplicates - logger.debug(f"{len(items.list)} before duplicate removal") - items.list = list(set(items.list)) - logger.debug(f"{len(items.list)} after duplicate removal") - # Randomize the list - items.random_shuffle_list() - from src import BatchJob - - job = BatchJob( - items=items, number_of_queries=number_of_queries, suggestion=suggestion - ) - return job - else: - console.print("No matching items found") - return None - else: - console.print( - f"Label for {task.language_code} was None on {item.url()}, skipping" - ) - return None - - -def process_user_supplied_qids_into_batch_jobs( - args: argparse.Namespace = None, task: Task = None -) -> List[BatchJob]: - """Given a list of QIDs, we go through - them and return a list of jobs""" - # logger = logging.getLogger(__name__) - if args is None: - raise ValueError("args was None") - if task is None: - raise ValueError("task was None") - print_best_practice(task) - jobs = [] - for qid in args.add: - job = process_qid_into_job(qid=qid, task=task, args=args) - if job is not None: - jobs.append(job) - return jobs - - -def handle_job_preparation_or_run_directly_if_any_jobs( - args: argparse.Namespace = None, batchjobs: BatchJobs = None -): - if batchjobs is None: - raise ValueError("batchjobs was None") - if args is None: - raise ValueError("args was None") - if len(batchjobs.jobs) > 0: - if args.prepare_jobs: - console.print(f"Adding {len(batchjobs.jobs)} job(s) " f"to the jobs file") - for job in batchjobs.jobs: - from src import add_to_job_pickle - - add_to_job_pickle(job) - print_job_statistics(batchjobs=batchjobs) - console.print( - f"You can run the jobs " - f"non-interactively e.g. on the Toolforge " - f"Kubernetes cluster using -r or --run-prepared-jobs. " - f"See Kubernetes_HOWTO.md for details." - ) - else: - batchjobs.run_jobs() - - -def get_validated_main_subjects_as_jobs( - args: argparse.Namespace = None, main_subjects: List[str] = None -) -> BatchJobs: - """This function randomly picks a subject and add it to the - list of jobs if it had any matches and the user approved it""" - if args is None: - raise ValueError("args was None") - if main_subjects is None: - raise ValueError("main subjects was None") - subjects_not_picked_yet = main_subjects - task: Task = select_task() - if task is None: - raise ValueError("Got no task") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - batchjobs = BatchJobs(jobs=[]) - while True: - # Check if we have any subjects left in the list - if len(subjects_not_picked_yet) > 0: - console.print(f"Picking a random main subject") - qid = random.choice(subjects_not_picked_yet) - subjects_not_picked_yet.remove(qid) - job = process_qid_into_job( - qid=qid, - task=task, - args=args, - confirmation=args.no_confirmation, - ) - if job is not None: - # Here we check if the user has enabled no ask more limit. 
- if args.no_ask_match_more_limit is None: - logger.debug("No ask more was None") - job.items.print_items_list(args=args) - job.suggestion.print_search_strings() - answer = ask_add_to_job_queue(job) - if answer: - batchjobs.jobs.append(job) - else: - batchjobs.jobs.append(job) - logger.debug(f"joblist now has {len(batchjobs.jobs)} jobs") - print_job_statistics(batchjobs=batchjobs) - if len(subjects_not_picked_yet) > 0: - if ( - args.no_ask_match_more_limit is None - or args.no_ask_match_more_limit - < sum( - len(job.items.list) - for job in batchjobs.jobs - if job.items.list is not None - ) - ): - answer_was_yes = ask_yes_no_question("Match one more?") - if not answer_was_yes: - break - else: - console.print("No more subjects in the list.") - break - else: - console.print("No more subjects in the list. Exiting.") - break - if args.no_ask_match_more_limit is not None: - batchjobs_limit = BatchJobs(jobs=[]) - for job in batchjobs.jobs: - job.items.print_items_list(args=args) - job.suggestion.print_search_strings() - if ( - config.automatically_approve_jobs_with_less_than_fifty_matches - and job.items.number_of_items < 50 - ): - console.print( - f"This job with {job.items.number_of_items} matching items was automatically approved", - style="green", - ) - batchjobs_limit.jobs.append(job) - else: - answer = ask_add_to_job_queue(job) - if answer: - batchjobs_limit.jobs.append(job) - return batchjobs_limit - return batchjobs diff --git a/src/helpers/menus.py b/src/helpers/menus.py index 49c2290..360a144 100644 --- a/src/helpers/menus.py +++ b/src/helpers/menus.py @@ -3,15 +3,16 @@ from consolemenu import SelectionMenu # type: ignore -from src.models.suggestion import Suggestion from src.models.wikimedia.wikidata.item import Item +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem from src.tasks import Task +logger = logging.getLogger(__name__) -def select_suggestion(suggestions: List[Suggestion] = None, item: Item = None): - if item is None or suggestions is None: + +def select_suggestion(suggestions: List[MainSubjectItem], item: Item): + if not item or not item.id or not suggestions: raise ValueError("Did not get what we need") - logger = logging.getLogger(__name__) menu = SelectionMenu( suggestions, f"Does any of these fit the label \n'{item.label}'" ) @@ -31,7 +32,7 @@ def select_suggestion(suggestions: List[Suggestion] = None, item: Item = None): def select_task() -> Task: - logger = logging.getLogger(__name__) + # TODO use questionary here? 
     from src.tasks import tasks
 
     labels = [task.label for task in tasks]
@@ -55,8 +56,8 @@ def select_task() -> Task:
 # menu.join()
 # selected_language_index = menu.selected_option
 # mapping = {}
-# for index, item in enumerate(WikimediaLanguageCode):
-#     mapping[index] = item
+# for index, main_subject_item in enumerate(WikimediaLanguageCode):
+#     mapping[index] = main_subject_item
 # selected_language = mapping[selected_language_index]
 # logger.debug(f"selected:{selected_language_index}="
 #              f"{selected_language}")
@@ -69,8 +70,8 @@ def select_task() -> Task:
 # menu.join()
 # selected_lexical_category_index = menu.selected_option
 # category_mapping = {}
-# for index, item in enumerate(WikidataLexicalCategory):
-#     category_mapping[index] = item
+# for index, main_subject_item in enumerate(WikidataLexicalCategory):
+#     category_mapping[index] = main_subject_item
 # selected_lexical_category = category_mapping[selected_lexical_category_index]
 # logger.debug(f"selected:{selected_lexical_category_index}="
 #              f"{selected_lexical_category}")
diff --git a/src/helpers/pickle.py b/src/helpers/pickle.py
index 738d87f..5661075 100644
--- a/src/helpers/pickle.py
+++ b/src/helpers/pickle.py
@@ -36,7 +36,7 @@ def check_if_pickle_exists(path):
 
 def parse_job_pickle(silent: bool = False) -> Optional[BatchJobs]:
     """Reads the pickle into a list of batch jobs"""
     if check_if_pickle_exists(config.job_pickle_file_path):
         jobs: List[BatchJob] = []
         for job in read_from_pickle(config.job_pickle_file_path):
@@ -58,16 +58,16 @@ def remove_job_pickle(silent: bool = False, hash: str = None):
         if os.path.exists(config.job_pickle_file_path):
             os.remove(config.job_pickle_file_path)
             if not silent:
                 console.print("The job list file was removed")
     if os.path.exists(config.job_pickle_file_path):
         hash_now = get_hash_of_job_pickle()
         if hash == hash_now:
             os.remove(config.job_pickle_file_path)
             if not silent:
                 console.print("The job list file was removed")
         else:
             console.print(
                 "Job list file not deleted because the contents "
                 "has changed since this batch of jobs was started."
             )
     else:
diff --git a/src/helpers/questions.py b/src/helpers/questions.py
new file mode 100644
index 0000000..aa768f9
--- /dev/null
+++ b/src/helpers/questions.py
@@ -0,0 +1,49 @@
+# from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from src.helpers.console import console
+
+if TYPE_CHECKING:
+    from src.models.batch_jobs import BatchJob
+
+
+def ask_add_to_job_queue(job: "BatchJob" = None):
+    if not job:
+        raise ValueError("job was None")
+    if not job.main_subject_item:
+        raise ValueError("job.main_subject_item was None")
+    if not job.main_subject_item.label:
+        raise ValueError("job.main_subject_item.label was None")
+    if not job.main_subject_item.description:
+        job.main_subject_item.description = ""
+    if not job.main_subject_item.items:
+        raise ValueError("items was None")
+    if not job.main_subject_item.items.sparql_items:
+        raise ValueError("sparql_items was None")
+    return ask_yes_no_question(
+        f"Do you want to add this job for "
+        f"[magenta]{job.main_subject_item.label}: "
+        f"{job.main_subject_item.description}[/magenta] with "
+        f"{len(job.main_subject_item.items.sparql_items)} items to the queue? "
+        f"(see {job.main_subject_item.url})"
+    )
+
+
+def ask_discard_existing_job_pickle():
+    return ask_yes_no_question(
+        "A prepared list of jobs already exists, do you want to delete it?"
+    )
+
+
+def ask_yes_no_question(message: str):
+    # https://www.quora.com/
+    # I%E2%80%99m-new-to-Python-how-can-I-write-a-yes-no-question
+    # this will loop forever
+    while True:
+        answer = console.input(message + " [Y/Enter/n]: ")
+        if len(answer) == 0 or answer[0].lower() in ("y", "n"):
+            if len(answer) == 0:
+                return True
+            else:
+                # the == operator just returns a boolean
+                return answer[0].lower() == "y"
diff --git a/src/models/batch_job.py b/src/models/batch_job.py
index 0822a88..40142d1 100644
--- a/src/models/batch_job.py
+++ b/src/models/batch_job.py
@@ -1,12 +1,10 @@
 from pydantic import BaseModel
 
-from src.models.items import Items
-from src.models.suggestion import Suggestion
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
 
 
 class BatchJob(BaseModel):
     """Models a batch job intended to be run non-interactively"""
 
-    suggestion: Suggestion
-    items: Items
+    main_subject_item: MainSubjectItem
     number_of_queries: int
diff --git a/src/models/batch_jobs.py b/src/models/batch_jobs.py
index 90f7dc3..bea3de1 100644
--- a/src/models/batch_jobs.py
+++ b/src/models/batch_jobs.py
@@ -12,27 +12,29 @@ class BatchJobs(BaseModel):
     jobs: List[BatchJob]
 
     @property
-    def job_count(self):
+    def number_of_jobs(self):
         return len(self.jobs)
 
     def print_running_jobs(self):
         if not isinstance(self.jobs, list):
             raise ValueError("jobs is not a list")
         from src.helpers.console import console
 
+        number_of_items = sum(
+            job.main_subject_item.items.number_of_sparql_items
+            for job in self.jobs
+            if job.main_subject_item.items and job.main_subject_item.items.sparql_items
+        )
         console.print(
             f"Running {len(self.jobs)} job(s) with a total of "
-            f"{sum(len(job.items.list) for job in self.jobs if job.items.list is not None)} items "
+            f"{number_of_items} items "
             f"non-interactively now. 
You can take a " f"coffee break and lean back :)" ) def run_jobs(self): - from src.helpers.console import ( - console, - print_finished, - print_keep_an_eye_on_wdqs_lag, - ) + from src import print_finished + from src.helpers.console import console, print_keep_an_eye_on_wdqs_lag if self.jobs is None or len(self.jobs) == 0: raise ValueError("did not get what we need") @@ -42,8 +44,8 @@ def run_jobs(self): self.print_running_jobs() start_time = datetime.now() for job in self.jobs: - job.suggestion.add_to_items( - items=job.items, jobs=self.jobs, job_count=self.job_count + job.main_subject_item.add_to_items( + jobs=self.jobs, job_count=self.number_of_jobs ) print_finished() end_time = datetime.now() diff --git a/src/models/items/__init__.py b/src/models/items/__init__.py index b59e27a..6f4058b 100644 --- a/src/models/items/__init__.py +++ b/src/models/items/__init__.py @@ -1,32 +1,50 @@ -from __future__ import annotations +# from __future__ import annotations import argparse +import logging import random -from typing import TYPE_CHECKING, List, Optional +from typing import Any, List from pydantic import BaseModel -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem +from src.helpers.console import console +from src.models.wikimedia.wikidata.item.sparql import SparqlItem -if TYPE_CHECKING: - from src.models.suggestion import Suggestion +# if TYPE_CHECKING: +# from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem + +logger = logging.getLogger(__name__) class Items(BaseModel): - list: Optional[List[SparqlItem]] + # pydantic forwardref error + main_subject_item: Any # type MainSubjectItem + sparql_items: List[SparqlItem] = [] @property - def number_of_items(self): - return len(self.list) + def number_of_sparql_items(self): + return len(self.sparql_items) - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): + def fetch_based_on_label(self): pass - def random_shuffle_list(self): - random.shuffle(self.list) + def random_shuffle_items(self): + random.shuffle(self.sparql_items) def print_items_list(self, args: argparse.Namespace): from src import print_found_items_table print_found_items_table(args=args, items=self) + + def remove_duplicates(self): + if self.sparql_items is None: + raise ValueError("items.sparql_items was None") + logger.debug(f"{len(self.sparql_items)} before duplicate removal") + self.sparql_items = list(set(self.sparql_items)) + logger.debug(f"{len(self.sparql_items)} after duplicate removal") + + def print_total_items(self): + console.print(f"Got a total of {len(self.sparql_items)} items") + + def execute_queries(self): + pass diff --git a/src/models/items/academic_journals.py b/src/models/items/academic_journals.py index d155850..5e0e449 100644 --- a/src/models/items/academic_journals.py +++ b/src/models/items/academic_journals.py @@ -1,72 +1,72 @@ -import logging - -from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore - -import config -from src.helpers.cleaning import strip_bad_chars -from src.helpers.console import console -from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem - - -class AcademicJournalItems(Items): - """This supports both published peer reviewed articles and preprints""" - - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - def process_results(results): - # TODO refactor into 
private method - items = [] - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - items.append(item) - return items - - # logger = logging.getLogger(__name__) - if suggestion is None: - raise ValueError("suggestion was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if suggestion.item.id is None: - raise ValueError("suggestion.item.id was None") - if suggestion.args is None: - raise ValueError("suggestion.args was None") - # Fetch all items matching the search strings - self.list = [] - for search_string in suggestion.search_strings: - search_string = strip_bad_chars(search_string) - results = execute_sparql_query( - f""" - #{config.user_agent} - SELECT ?item ?itemLabel - WHERE - {{ - ?item wdt:P31 wd:Q737498. - minus {{?item wdt:P921 wd:{suggestion.item.id}.}} - ?item rdfs:label ?label. - # We lowercase the label first and search for the - # string in both the beginning, middle and end of the label - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} - }} - """, - ) - logging.info( - f'Got {len(results["results"]["bindings"])} academic journal items from ' - f"WDQS using the search string {search_string}" - ) - self.list.extend(process_results(results)) - console.print(f"Got a total of {len(self.list)} items") +# import logging +# +# from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore +# +# import config +# from src.helpers.cleaning import __strip_bad_chars__ +# from src.helpers.console import console +# from src.models.items import Items +# from src.models.suggestion import Suggestion +# from src.models.task import Task +# from src.models.wikimedia.wikidata.sparql_item import SparqlItem +# +# +# class AcademicJournalItems(Items): +# """This supports both published peer reviewed articles and preprints""" +# +# def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): +# def process_results(results): +# # TODO refactor into private method +# items = [] +# for item_json in results["results"]["bindings"]: +# logging.debug(f"item_json:{item_json}") +# item = SparqlItem(**item_json) +# items.append(item) +# return items +# +# # logger = logging.getLogger(__name__) +# if suggestion is None: +# raise ValueError("suggestion was None") +# if task is None: +# raise ValueError("task was None") +# if task.language_code is None: +# raise ValueError("task.language_code was None") +# if suggestion.search_strings is None: +# raise ValueError("suggestion.search_strings was None") +# if suggestion.main_subject_item is None: +# raise ValueError("suggestion.main_subject_item was None") +# if suggestion.main_subject_item.id is None: +# raise ValueError("suggestion.main_subject_item.id was None") +# if suggestion.args is None: +# raise ValueError("suggestion.args was None") +# # Fetch all items matching the search strings +# self.list = [] +# for search_string in suggestion.search_strings: +# search_string = __strip_bad_chars__(search_string) +# results = execute_sparql_query( +# f""" +# #{config.user_agent} +# SELECT ?main_subject_item ?itemLabel +# WHERE +# {{ +# ?main_subject_item wdt:P31 wd:Q737498. +# minus {{?main_subject_item wdt:P921 wd:{suggestion.main_subject_item.id}.}} +# ?main_subject_item rdfs:label ?label. +# # We lowercase the label first and search for the +# # string in both the beginning, middle and end of the label +# FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || +# REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || +# REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) +# MINUS {{?main_subject_item wdt:P921/wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# MINUS {{?main_subject_item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# MINUS {{?main_subject_item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} +# }} +# """, +# ) +# logging.info( +# f'Got {len(results["results"]["bindings"])} academic journal items from ' +# f"WDQS using the search string {search_string}" +# ) +# self.list.extend(process_results(results)) +# console.print(f"Got a total of {len(self.list)} items") diff --git a/src/models/items/riksdagen_documents.py b/src/models/items/riksdagen_documents.py index c85ab07..c1163fc 100644 --- a/src/models/items/riksdagen_documents.py +++ b/src/models/items/riksdagen_documents.py @@ -1,72 +1,23 @@ -import logging - from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore -import config -from src.helpers.console import console from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem +from src.models.wikimedia.wikidata.query.riksdagen_document import ( + RiksdagenDocumentQuery, +) + +# logger = logging.getLogger(__name__) class RiksdagenDocumentItems(Items): - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - # logger = logging.getLogger(__name__) - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if suggestion.args is None: - raise ValueError("suggestion.args was None") - if suggestion.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." - ) - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") + def fetch_based_on_label(self): + self.execute_queries() + self.print_total_items() + + def execute_queries(self): # Fetch all items matching the search strings - self.list = [] - # Include spaces around the n-gram to avoid edits like this one - # https://www.wikidata.org/w/index.php?title=Q40671507&diff=1497186802&oldid=1496945583 - # Lowercase is not needed here as Elastic matches anyway - for search_string in suggestion.search_strings: - results = execute_sparql_query( - f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - hint:Query hint:optimizer "None". - SERVICE wikibase:mwapi {{ - bd:serviceParam wikibase:api "Search"; - wikibase:endpoint "www.wikidata.org"; - mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921={suggestion.item.id} "{search_string}"' . - ?title wikibase:apiOutput mwapi:title. - }} - BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) - ?item rdfs:label ?label. - # We lowercase the label first and search for the - # string in both the beginning, middle and end of the label - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - # remove more specific forms of the main subject also - # Thanks to Jan Ainali for this improvement :) - MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "sv". 
}} - }} - """, - ) - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - self.list.append(item) - logging.info( - f'Got {len(results["results"]["bindings"])} items from ' - f"WDQS using the search string {search_string}" + for search_string in self.main_subject_item.search_strings: + riksdagen_query = RiksdagenDocumentQuery( + main_subject_item=self.main_subject_item, search_string=search_string ) - console.print(f"Got a total of {len(self.list)} items") + riksdagen_query.get_results() + self.sparql_items.extend(riksdagen_query.items) diff --git a/src/models/items/scholarly_articles.py b/src/models/items/scholarly_articles.py index 73995c9..af3411e 100644 --- a/src/models/items/scholarly_articles.py +++ b/src/models/items/scholarly_articles.py @@ -1,163 +1,47 @@ import logging +from typing import Dict from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore -import config -from src.helpers.cleaning import strip_bad_chars -from src.helpers.console import console from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task -from src.models.wikimedia.wikidata.sparql_item import SparqlItem +from src.models.wikimedia.wikidata.query.preprint_article import PreprintArticleQuery +from src.models.wikimedia.wikidata.query.published_article import PublishedArticleQuery +from src.models.wikimedia.wikidata.query.thesis import ThesisQuery logger = logging.getLogger(__name__) class ScholarlyArticleItems(Items): - """This supports both published peer reviewed articles and preprints""" + """This supports both published peer reviewed articles, thesis' and preprints""" - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - def build_query( - suggestion: Suggestion = None, - search_string: str = None, - task: Task = None, - cirrussearch_parameters: str = None, - ): - # TODO refactor - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if search_string is None: - raise ValueError("search_string was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - if cirrussearch_parameters is None: - raise ValueError("cirrussearch_parameters was None") - # This query uses https://www.w3.org/TR/sparql11-property-paths/ to - # find subjects that are subclass of one another up to 3 hops away - # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI - # which has a hardcoded limit of 10,000 items so you will never get more matches than that - # This query use regex to match beginning, middle and end of the label of matched items - # The replacing lines should match the similar python replacements in cleaning.py - # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in - # SPARQL where it becomes "\\" and thus match a single backslash - return f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - hint:Query hint:optimizer "None". - BIND(STR('{cirrussearch_parameters} \"{search_string}\"') as ?search_string) - SERVICE wikibase:mwapi {{ - bd:serviceParam wikibase:api "Search"; - wikibase:endpoint "www.wikidata.org"; - mwapi:srsearch ?search_string. - ?title wikibase:apiOutput mwapi:title. - }} - BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) - ?item rdfs:label ?label. 
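The refactored `RiksdagenDocumentItems` above relies on an `Items` base class that this hunk does not show. Judging only from the attributes used here (`main_subject_item`, `sparql_items`, `execute_queries`, `print_total_items`), a minimal sketch of that base class could look as follows; the field names come from the patch, everything else is an assumption:

```python
# Sketch of the assumed Items base class; sparql_items and
# main_subject_item are named in the patch, the rest is guesswork.
from typing import Any, List

from pydantic import BaseModel


class Items(BaseModel):
    # Any sidesteps a circular import between Items and MainSubjectItem,
    # mirroring the trick used in the Query base class later in this patch.
    main_subject_item: Any = None
    sparql_items: List[Any] = []

    @property
    def number_of_sparql_items(self) -> int:
        return len(self.sparql_items)

    def fetch_based_on_label(self):
        raise NotImplementedError()

    def execute_queries(self):
        raise NotImplementedError()

    def print_total_items(self):
        print(f"Got a total of {self.number_of_sparql_items} items")
```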
- BIND(REPLACE(LCASE(?label), ",", "") as ?label1) - BIND(REPLACE(?label1, ":", "") as ?label2) - BIND(REPLACE(?label2, ";", "") as ?label3) - BIND(REPLACE(?label3, "\\\\(", "") as ?label4) - BIND(REPLACE(?label4, "\\\\)", "") as ?label5) - BIND(REPLACE(?label5, "\\\\[", "") as ?label6) - BIND(REPLACE(?label6, "\\\\]", "") as ?label7) - BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8) - BIND(?label8 as ?cleaned_label) - FILTER(CONTAINS(?cleaned_label, ' {search_string.lower()} '@{task.language_code.value}) || - REGEX(?cleaned_label, '.* {search_string.lower()}$'@{task.language_code.value}) || - REGEX(?cleaned_label, '^{search_string.lower()} .*'@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} - }} - """ + cirrussearch_parameters: str = "" + results: Dict = {} - def process_results(results): - # TODO refactor - items = [] - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - item.validate_qid_and_copy_label() - if not item.is_in_blocklist(): - items.append(item) - else: - logger.info(f"{item.label} found in blocklist, skipping") - return items + def fetch_based_on_label(self): + self.execute_queries() + self.print_total_items() - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.item is None: - raise ValueError("suggestion.item was None") - if suggestion.args is None: - raise ValueError("suggestion.args was None") - if suggestion.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." 
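The removed `process_results` above builds a `SparqlItem` directly from each WDQS binding via `SparqlItem(**item_json)`, which works because each binding is a dict keyed by the `SELECT` variables (`item`, `itemLabel`). A self-contained sketch of that mapping with stand-in pydantic models (illustrative, not the project's own classes):

```python
# Stand-in models showing how a WDQS JSON binding unpacks into a
# pydantic object; extra keys like "type" are ignored by default.
from pydantic import BaseModel


class Value(BaseModel):
    value: str


class Binding(BaseModel):
    item: Value
    itemLabel: Value


binding = {
    "item": {"type": "uri", "value": "http://www.wikidata.org/entity/Q1"},
    "itemLabel": {"type": "literal", "value": "universe"},
}
parsed = Binding(**binding)
print(parsed.item.value, "-", parsed.itemLabel.value)
```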
- ) - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - if suggestion.args.limit_to_items_without_p921: - console.print( - "Limiting to scholarly articles without P921 main subject only" - ) - cirrussearch_parameters = ( - f"haswbstatement:P31=Q13442814 -haswbstatement:P921" - ) - else: - cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={suggestion.item.id}" + def execute_queries(self): # Fetch all items matching the search strings - self.list = [] - for search_string in suggestion.search_strings: - search_string = strip_bad_chars(search_string) - results = execute_sparql_query( - build_query( - cirrussearch_parameters=cirrussearch_parameters, - suggestion=suggestion, - search_string=search_string, - task=task, - ) - ) - logging.info( - f'Got {len(results["results"]["bindings"])} scholarly items from ' - f"WDQS using the search string {search_string}" + for search_string in self.main_subject_item.search_strings: + published_article_query = PublishedArticleQuery( + search_string=search_string, + main_subject_item=self.main_subject_item, + cirrussearch_parameters=self.cirrussearch_parameters, ) - self.list.extend(process_results(results)) - # preprints - # We don't use CirrusSearch in this query because we can do it more easily in - # SPARQL on a small subgraph like this - # find all items that are ?item wdt:P31/wd:P279* wd:Q1266946 - # minus the Qid we want to add - results_preprint = execute_sparql_query( - f""" - #{config.user_agent} - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - ?item wdt:P31/wd:P279* wd:Q580922. # preprint - MINUS {{ - ?item wdt:P921 wd:{suggestion.item.id}; - }} - ?item rdfs:label ?label. - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - MINUS {{?item wdt:P921/wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} - }} - """, + published_article_query.get_results() + # https://pythonexamples.org/python-append-list-to-another-list/ + self.sparql_items.extend(published_article_query.items) + published_article_query.print_number_of_results() + preprint_query = PreprintArticleQuery( + search_string=search_string, main_subject_item=self.main_subject_item ) - logging.info( - f'Got {len(results["results"]["bindings"])} preprint items from ' - f"WDQS using the search string {search_string}" + preprint_query.get_results() + preprint_query.print_number_of_results() + self.sparql_items.extend(preprint_query.items) + thesis_query = ThesisQuery( + search_string=search_string, main_subject_item=self.main_subject_item ) - self.list.extend(process_results(results_preprint)) - console.print(f"Got a total of {len(self.list)} items") + thesis_query.get_results() + thesis_query.print_number_of_results() + self.sparql_items.extend(thesis_query.items) diff --git a/src/models/items/thesis.py b/src/models/items/thesis.py index 6d256fa..2aa9d93 100644 --- a/src/models/items/thesis.py +++ b/src/models/items/thesis.py @@ -1,69 +1,69 @@ -import logging - -from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore - -from src.helpers.console import console -from src.models.items import Items -from src.models.suggestion import Suggestion -from src.models.task import Task - -# There were ~16.000 thesis' in WD when this was written -from src.models.wikimedia.wikidata.sparql_item import SparqlItem - - -class ThesisItems(Items): - def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): - # logger = logging.getLogger(__name__) - if suggestion is None: - raise ValueError("suggestion was None") - if suggestion.search_strings is None: - raise ValueError("suggestion.search_strings was None") - if suggestion.args.limit_to_items_without_p921: - raise Exception( - "Limiting to items without P921 is not " "supported yet for this task." - ) - if task is None: - raise ValueError("task was None") - if task.language_code is None: - raise ValueError("task.language_code was None") - # Fetch all items matching the search strings - self.list = [] - for search_string in suggestion.search_strings: - # We don't use CirrusSearch in this query because we can do it more easily in - # SPARQL on a small subgraph like this - # find all items that are ?item wdt:P31/wd:P279* wd:Q1266946 - # minus the Qid we want to add - results = execute_sparql_query( - f""" - SELECT DISTINCT ?item ?itemLabel - WHERE {{ - {{ - ?item wdt:P31/wd:P279* wd:Q1266946. # thesis - }} UNION - {{ - ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation - }} UNION - {{ - ?item wdt:P31/wd:P279* wd:Q3099732. # technical report - }} - MINUS {{ - ?item wdt:P921 wd:{suggestion.item.id}; - }} - ?item rdfs:label ?label. - FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || - REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || - REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) - MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.item.id}. }} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
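All of these queries, old and new, match the search string as a whole word at the start, middle, or end of the label, with surrounding spaces so that e.g. "ape" does not match "grape". The same predicate in plain Python, for illustration only:

```python
# Python equivalent of the SPARQL FILTER used throughout this patch:
# match the lowercased search string at the start, middle, or end of
# the label, always bounded by a space.
def label_matches(label: str, search_string: str) -> bool:
    label = label.lower()
    needle = search_string.lower()
    return (
        f" {needle} " in label              # middle of the label
        or label.endswith(f" {needle}")     # end of the label
        or label.startswith(f"{needle} ")   # start of the label
    )


assert label_matches("Fentanyl overdose in mice", "fentanyl")
assert not label_matches("grape harvest", "ape")
```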
}} - }} - """, - ) - for item_json in results["results"]["bindings"]: - logging.debug(f"item_json:{item_json}") - item = SparqlItem(**item_json) - self.list.append(item) - logging.info( - f'Got {len(results["results"]["bindings"])} items from ' - f"WDQS using the search string {search_string}" - ) - console.print(f"Got a total of {len(self.list)} items") +# import logging +# +# from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore +# +# from src.helpers.console import console +# from src.models.items import Items +# from src.models.suggestion import Suggestion +# from src.models.task import Task +# +# # There were ~16.000 thesis' in WD when this was written +# from src.models.wikimedia.wikidata.sparql_item import SparqlItem +# +# +# class ThesisItems(Items): +# def fetch_based_on_label(self, suggestion: Suggestion = None, task: Task = None): +# # logger = logging.getLogger(__name__) +# if suggestion is None: +# raise ValueError("suggestion was None") +# if suggestion.search_strings is None: +# raise ValueError("suggestion.search_strings was None") +# if suggestion.args.limit_to_items_without_p921: +# raise Exception( +# "Limiting to items without P921 is not " "supported yet for this task." +# ) +# if task is None: +# raise ValueError("task was None") +# if task.language_code is None: +# raise ValueError("task.language_code was None") +# # Fetch all items matching the search strings +# self.list = [] +# for search_string in suggestion.search_strings: +# # We don't use CirrusSearch in this query because we can do it more easily in +# # SPARQL on a small subgraph like this +# # find all items that are ?main_subject_item wdt:P31/wd:P279* wd:Q1266946 +# # minus the Qid we want to add +# results = execute_sparql_query( +# f""" +# SELECT DISTINCT ?item ?itemLabel +# WHERE {{ +# {{ +# ?item wdt:P31/wd:P279* wd:Q1266946. # thesis +# }} UNION +# {{ +# ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation +# }} UNION +# {{ +# ?item wdt:P31/wd:P279* wd:Q3099732. # technical report +# }} +# MINUS {{ +# ?item wdt:P921 wd:{suggestion.main_subject_item.id}; +# }} +# ?item rdfs:label ?label. +# FILTER(CONTAINS(LCASE(?label), " {search_string.lower()} "@{task.language_code.value}) || +# REGEX(LCASE(?label), ".* {search_string.lower()}$"@{task.language_code.value}) || +# REGEX(LCASE(?label), "^{search_string.lower()} .*"@{task.language_code.value})) +# MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{suggestion.main_subject_item.id}. }} +# SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
}} +# }} +# """, +# ) +# for item_json in results["results"]["bindings"]: +# logging.debug(f"item_json:{item_json}") +# item = SparqlItem(**item_json) +# self.list.append(item) +# logging.info( +# f'Got {len(results["results"]["bindings"])} items from ' +# f"WDQS using the search string {search_string}" +# ) +# console.print(f"Got a total of {len(self.list)} items") diff --git a/src/models/login.py b/src/models/login.py index d2e229b..ccb2206 100644 --- a/src/models/login.py +++ b/src/models/login.py @@ -1,4 +1,4 @@ -from wikibaseintegrator import wbi_config, wbi_login +from wikibaseintegrator import wbi_config, wbi_login # type: ignore import config diff --git a/src/models/main_subjects.py b/src/models/main_subjects.py new file mode 100644 index 0000000..62e6dd5 --- /dev/null +++ b/src/models/main_subjects.py @@ -0,0 +1,185 @@ +# from __future__ import annotations + +import argparse +import logging +import random +from time import sleep +from typing import List, Optional + +from pydantic import BaseModel +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore + +import config +from src.helpers.cli_messages import print_job_statistics +from src.helpers.console import console +from src.helpers.menus import select_task +from src.helpers.questions import ask_add_to_job_queue, ask_yes_no_question +from src.models.batch_jobs import BatchJobs +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.tasks import Task + +logger = logging.getLogger(__name__) + + +class MainSubjects(BaseModel): + args: argparse.Namespace + task: Optional[Task] = None + main_subjects: List[str] = [] + batchjobs: BatchJobs = BatchJobs(jobs=[]) + + class Config: + arbitrary_types_allowed = True + + def match_main_subjects_from_sparql(self): + """Collect subjects via SPARQL and call get_validated_main_subjects() + If we get any validated jobs we handle them""" + if self.args is None or self.args.sparql is None: + raise ValueError("args.sparql was None") + self.__check_different_from__() + self.__fetch_main_subjects__() + if self.main_subjects: + console.print(f"Got {len(self.main_subjects)} results") + sleep(1) + self.get_validated_main_subjects_as_jobs() + self.handle_job_preparation_or_run_directly_if_any_jobs() + else: + console.print("Got 0 results. Try another query or debug it using --debug") + + # def process_user_supplied_qids_into_batch_jobs(self) -> List[BatchJob]: + # """Given a sparql_items of QIDs, we go through + # them and return a sparql_items of jobs""" + # # TODO this should not return anything + # if self.task: + # print_best_practice(self.task) + # jobs = [] + # for qid in self.args.add: + # main_subject_item = MainSubjectItem(id=qid, args=self.args, task=self.task) + # job = main_subject_item.fetch_items_and_get_job_if_confirmed() + # if job: + # jobs.append(job) + # return jobs + # return [] + + def handle_job_preparation_or_run_directly_if_any_jobs(self): + if self.batchjobs is None: + raise ValueError("batchjobs was None") + if self.args is None: + raise ValueError("args was None") + if self.batchjobs.number_of_jobs: + if self.args.prepare_jobs: + console.print( + f"Adding {self.batchjobs.number_of_jobs} job(s) " + f"to the jobs file" + ) + for job in self.batchjobs.jobs: + from src import add_to_job_pickle + + add_to_job_pickle(job) + print_job_statistics(batchjobs=self.batchjobs) + console.print( + f"You can run the jobs " + f"non-interactively e.g. on the Toolforge " + f"Kubernetes cluster using -r or --run-prepared-jobs. 
" + f"See Kubernetes_HOWTO.md for details." + ) + else: + self.batchjobs.run_jobs() + + def get_validated_main_subjects_as_jobs( + self, + ) -> None: + """This function randomly picks a subject and add it to the + sparql_items of jobs if it had any matches and the user approved it""" + # TODO break this down into smaller methods + qid_subjects_not_picked_yet = self.main_subjects + self.__select_task__() + while True: + # Check if we have any subjects left in the sparql_items + if len(qid_subjects_not_picked_yet): + console.print(f"Picking a random main subject") + qid = random.choice(qid_subjects_not_picked_yet) + qid_subjects_not_picked_yet.remove(qid) + main_subject_item = MainSubjectItem( + id=qid, + args=self.args, + task=self.task, + confirmation=self.args.no_confirmation, + ) + job = main_subject_item.fetch_items_and_get_job_if_confirmed() + if job: + # Here we check if the user has enabled no ask more limit. + if self.args.no_ask_match_more_limit is None: + logger.debug("No ask more was None") + if job.main_subject_item.items: + job.main_subject_item.items.print_items_list(args=self.args) + job.main_subject_item.print_search_strings() + answer = ask_add_to_job_queue(job) + if answer: + self.batchjobs.jobs.append(job) + else: + self.batchjobs.jobs.append(job) + logger.debug(f"joblist now has {self.batchjobs.number_of_jobs} jobs") + print_job_statistics(batchjobs=self.batchjobs) + if len(qid_subjects_not_picked_yet): + if ( + self.args.no_ask_match_more_limit is None + or self.args.no_ask_match_more_limit + < sum( + job.main_subject_item.items.number_of_sparql_items + for job in self.batchjobs.jobs + if job.main_subject_item.items + and job.main_subject_item.items.sparql_items + ) + ): + answer_was_yes = ask_yes_no_question("Match one more?") + if not answer_was_yes: + break + else: + console.print("No more subjects in the sparql_items.") + break + else: + console.print("No more subjects in the sparql_items. Exiting.") + break + if self.args.no_ask_match_more_limit: + for job in self.batchjobs.jobs: + if job.main_subject_item.items: + job.main_subject_item.items.print_items_list(args=self.args) + job.main_subject_item.print_search_strings() + if ( + config.automatically_approve_jobs_with_less_than_fifty_matches + and job.main_subject_item.items.number_of_sparql_items < 50 + ): + console.print( + f"This job with {job.main_subject_item.items.number_of_sparql_items} matching items was automatically approved", + style="green", + ) + self.batchjobs.jobs.append(job) + else: + answer = ask_add_to_job_queue(job) + if answer: + self.batchjobs.jobs.append(job) + + def __select_task__(self): + self.task: Task = select_task() + if self.task is None: + raise ValueError("Got no task") + + def __fetch_main_subjects__(self): + with console.status("Running query on WDQS..."): + results = execute_sparql_query( + self.args.sparql.replace("{", "{{").replace("}", "}}"), + ) + for item_json in results["results"]["bindings"]: + logging.debug(f"item_json:{item_json}") + self.main_subjects.append(item_json["item"]["value"]) + + def __check_different_from__(self): + if "P1889" not in self.args.sparql: + console.print( + "Your SPARQL did not contain P1889 (different from). " + "Please include 'MINUS {?main_subject_item wdt:P1889 [].}' " + "in your WHERE clause to avoid false positives." 
+ ) + exit(0) + else: + logger.info("Detected P1889 in the query") diff --git a/src/models/suggestion.py b/src/models/suggestion.py index 07510a8..a992433 100644 --- a/src/models/suggestion.py +++ b/src/models/suggestion.py @@ -1,9 +1,6 @@ -from __future__ import annotations +# from __future__ import annotations -import argparse import logging -from typing import TYPE_CHECKING, List, Optional, Set -from urllib.parse import quote from pydantic import BaseModel from wikibaseintegrator import WikibaseIntegrator # type: ignore @@ -11,181 +8,8 @@ from wikibaseintegrator.models import Claim # type: ignore from wikibaseintegrator.wbi_helpers import search_entities # type: ignore -import config -import config.items -from src.helpers.calculations import calculate_random_editgroups_hash -from src.helpers.cleaning import clean_rich_formatting -from src.models.items import Items -from src.models.task import Task -from src.models.wikimedia.wikidata.enums import Property, Qid -from src.models.wikimedia.wikidata.item import Item - -if TYPE_CHECKING: - from src.models.batch_job import BatchJob - logger = logging.getLogger(__name__) class Suggestion(BaseModel): - item: Item - task: Task - args: argparse.Namespace - search_strings: Optional[Set[str]] = None - - class Config: - arbitrary_types_allowed = True - - def __alias_appears_in_label_of_a_qid__(self, alias: str) -> bool: - if alias is None: - raise ValueError("alias was none") - results = search_entities(alias, dict_result=True) - for result in results: - if result["label"] == alias: - qid = result["id"] - logger.info(f"Found {alias} as label in {qid}") - # verify that it is not a scientific article - return self.__is_not_scientific_article__(qid=qid) - return False - - @staticmethod - def __is_not_scientific_article__(qid: str): - """Looks up the QID in Wikidata to check whether it is a scholarly article or not. - We negate the result""" - if qid is None: - raise ValueError("qid was None") - wbi = WikibaseIntegrator() - item = wbi.item.get(qid) - claims: List[Claim] = item.claims - for claim in claims: - if claim.mainsnak.property_number == Property.INSTANCE_OF.value: - qid = claim.mainsnak.datavalue["value"]["id"] - logger.info(f"Found P31 with value {qid}") - from src.helpers.console import console - - # console.print(claim.mainsnak) - if qid == Qid.SCHOLARLY_ARTICLE.value: - logger.debug("__is_not_scientific_article__:returning false now") - return False - else: - return True - - def __str__(self): - """Return label and description, the latter cut to 50 chars""" - if self.item is not None: - string = ( - f"label: [bold]{clean_rich_formatting(self.item.label)}[/bold]\n" - f"aliases: {', '.join(self.item.aliases)}\n" - f"description: {self.item.description[:70]}\n" - f"{self.item.url()}\n" - ) - for url in self.search_urls(): - string = string + f"{url}\n" - return string - - def add_to_items( - self, items: Items = None, jobs: List[BatchJob] = None, job_count: int = None - ): - """Add a suggested Qid as main subject on all items that - have a label that matches one of the search strings for this Qid - We calculate a new edit group hash each time this function is - called so similar edits are grouped and easily be undone. 
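`__check_different_from__` only performs a substring test for "P1889", so any query mentioning the property passes. A user-supplied `--sparql` query that satisfies the check might look like this; note the `SELECT` variable should be `?item`, since `__fetch_main_subjects__` reads each binding under the `item` key (Q12136 is only a placeholder subject area):

```python
# Example --sparql value that passes __check_different_from__;
# Q12136 (disease) is just a placeholder subject area.
sparql = """
SELECT ?item WHERE {
  ?item wdt:P279 wd:Q12136.
  MINUS {?item wdt:P1889 [].}
}
"""
assert "P1889" in sparql  # the only thing the guard actually checks
```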
- - This function is non-interactive""" - if items is None: - raise ValueError("Items was None") - if items.list is None: - raise ValueError("items.list was None") - if jobs is None: - raise ValueError("jobs was None") - if job_count is None: - raise ValueError("job count was None") - editgroups_hash: str = calculate_random_editgroups_hash() - count = 0 - for target_item in items.list: - count += 1 - from src import console - - with console.status( - f"Uploading main subject " - f"[green]{clean_rich_formatting(self.item.label)}[/green] " - f"to {clean_rich_formatting(target_item.label)}" - ): - main_subject_property = "P921" - reference = ItemType( - "Q69652283", # inferred from title - prop_nr="P887", # based on heuristic - ) - statement = ItemType( - self.item.id, prop_nr=main_subject_property, references=[reference] - ) - target_item.upload_one_statement_to_wikidata( - statement=statement, - summary=f"[[Property:{main_subject_property}]]: [[{self.item.id}]]", - editgroups_hash=editgroups_hash, - ) - console.print( - f"(job {job_count}/{len(jobs)})(item {count}/{len(items.list)}) " - f"Added '{clean_rich_formatting(self.item.label)}' to " - f"{clean_rich_formatting(target_item.label)}: {target_item.url()}" - ) - # input("Press enter to continue") - - def extract_search_strings(self): - def clean_special_symbols(string: str): - return string.replace("®", "").replace("™", "").replace('"', "") - - from src.helpers.console import console - - logger = logging.getLogger(__name__) - if self.args is None: - raise ValueError("args was None") - else: - logger.debug(f"args:{self.args}") - if self.args.no_aliases is True: - console.print("Alias matching is turned off") - no_aliases = True - elif self.item.id in config.items.no_alias_for_scholarly_items: - logger.info( - f"Alias matching is turned off for this item: {self.item.label}" - ) - no_aliases = True - else: - no_aliases = False - if self.item.label is None: - raise ValueError("self.item.label was None") - self.search_strings: Set[str] = set() - self.search_strings.add(clean_special_symbols(self.item.label)) - if self.item.aliases is not None and no_aliases is False: - for alias in self.item.aliases: - # logger.debug(f"extracting alias:{alias}") - if len(alias) < 5 and alias not in config.items.list_of_allowed_aliases: - console.print( - f"Skipping short alias '{alias}' to avoid false positives", - style="#FF8000", - ) - elif self.__alias_appears_in_label_of_a_qid__(alias=alias): - console.print( - f"Skipped '{alias}' because it appears " - f"in a label of at least one Qid that is not a scholarly article", - style="#FF8000", - ) - elif alias in config.items.list_of_allowed_aliases: - console.print(f"Found {alias} in the allow list") - self.search_strings.add(clean_special_symbols(alias)) - else: - self.search_strings.add(clean_special_symbols(alias)) - - def print_search_strings(self): - # logger.debug(f"search_strings:{self.search_strings}") - from src.helpers.console import print_search_strings_table - - print_search_strings_table(args=self.args, search_strings=self.search_strings) - - def search_urls(self) -> List[str]: - if self.search_strings is None: - raise ValueError("self.search_strings was None") - urls = [] - for search_string in self.search_strings: - search_term = quote(f'"{search_string}"') - urls.append(f"https://www.wikidata.org/w/index.php?search={search_term}") - return urls + pass diff --git a/src/models/wikimedia/wikidata/entity.py b/src/models/wikimedia/wikidata/entity.py index ba3c82c..3449c6d 100644 --- 
a/src/models/wikimedia/wikidata/entity.py +++ b/src/models/wikimedia/wikidata/entity.py @@ -37,7 +37,7 @@ def upload_one_statement_to_wikidata( This mandates an editgroups hash to be supplied""" logger = logging.getLogger(__name__) if self.id is None: - raise ValueError("no id on item") + raise ValueError("no id on main_subject_item") if statement is None: raise ValueError("Statement was None") if summary is None: @@ -58,5 +58,6 @@ def upload_one_statement_to_wikidata( logger.error(f"Got error from the API: {e}") # logger.debug(f"result from WBI:{result}") + @property def url(self): return f"http://www.wikidata.org/entity/{self.id}" diff --git a/src/models/wikimedia/wikidata/entiyt_id.py b/src/models/wikimedia/wikidata/entiyt_id.py index 9a0dfd4..29d0992 100644 --- a/src/models/wikimedia/wikidata/entiyt_id.py +++ b/src/models/wikimedia/wikidata/entiyt_id.py @@ -13,7 +13,7 @@ class EntityId: rest: str def __init__(self, entity_id: str): - if entity_id is not None: + if entity_id: # Remove prefix if found if config.wd_prefix in entity_id: logger.debug("Removing prefix") diff --git a/src/models/wikimedia/wikidata/foreign_id.py b/src/models/wikimedia/wikidata/foreign_id.py index de1c9bb..6c402eb 100644 --- a/src/models/wikimedia/wikidata/foreign_id.py +++ b/src/models/wikimedia/wikidata/foreign_id.py @@ -1,23 +1,23 @@ -from typing import Optional - -from src.models.wikimedia.wikidata.entiyt_id import EntityId - - -class ForeignID: - id: Optional[str] - property: Optional[str] # This is the property with type ExternalId - source_item_id: Optional[str] # This is the Q-item for the source - - def __init__( - self, - id: Optional[str] = None, - property: Optional[str] = None, - source_item_id: Optional[str] = None, - ): - self.id = id - if property is None: - raise ValueError("property was None") - self.property = str(EntityId(property)) - if source_item_id is None: - raise ValueError("source_item_id was None") - self.source_item_id = str(EntityId(source_item_id)) +# from typing import Optional +# +# from src.models.wikimedia.wikidata.entiyt_id import EntityId +# +# +# class ForeignID: +# id: Optional[str] +# property: Optional[str] # This is the property with type ExternalId +# source_item_id: Optional[str] # This is the Q-main_subject_item for the source +# +# def __init__( +# self, +# id: Optional[str] = None, +# property: Optional[str] = None, +# source_item_id: Optional[str] = None, +# ): +# self.id = id +# if property is None: +# raise ValueError("property was None") +# self.property = str(EntityId(property)) +# if source_item_id is None: +# raise ValueError("source_item_id was None") +# self.source_item_id = str(EntityId(source_item_id)) diff --git a/src/models/wikimedia/wikidata/item.py b/src/models/wikimedia/wikidata/item.py deleted file mode 100644 index 3a68362..0000000 --- a/src/models/wikimedia/wikidata/item.py +++ /dev/null @@ -1,53 +0,0 @@ -from typing import List, Optional - -from wikibaseintegrator import WikibaseIntegrator # type: ignore -from wikibaseintegrator import wbi_config # type: ignore -from wikibaseintegrator.models import Alias # type: ignore - -import config -from src.models.task import Task -from src.models.wikimedia.wikidata.entity import Entity - -wbi_config.config["USER_AGENT"] = config.user_agent - - -class Item(Entity): - """This represents an item in Wikidata - We always work on one language at a time, - so we don't bother with languages here and keep to simple strings""" - - description: Optional[str] = None - aliases: Optional[List[str]] = None - - def 
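With `url` turned into a property in entity.py above, call sites change from `item.url()` to `item.url`; the new `Item.__str__` later in this patch already uses the attribute form. In miniature:

```python
# Minimal illustration of the url() -> url change in entity.py.
class Entity:
    def __init__(self, id: str):
        self.id = id

    @property
    def url(self) -> str:
        return f"http://www.wikidata.org/entity/{self.id}"


entity = Entity("Q407541")
print(entity.url)  # note: no parentheses anymore
```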
__str__(self): - return f"{self.label}, see {self.url()}" - - def fetch_label_and_description_and_aliases(self, task: Task = None): - """Fetch label and aliases in the task language from the Wikidata API""" - if task is None: - raise ValueError("task was None") - if not isinstance(task, Task): - raise ValueError("task was not a Task object") - if task.language_code is None: - raise ValueError("task.language_code was None") - from src.helpers.console import console - - with console.status( - f"Fetching {task.language_code.name.title()} label and aliases from the Wikidata API..." - ): - wbi = WikibaseIntegrator() - item = wbi.item.get(self.id) - label = item.labels.get(task.language_code.value) - if label is not None: - self.label = str(label) - description = item.descriptions.get(task.language_code.value) - if description is not None: - self.description = str(description) - aliases: List[Alias] = item.aliases.get(task.language_code.value) - # logging.debug(f"aliases from wbi:{item.aliases.get('en')}") - if aliases is not None: - self.aliases = [] - for alias in aliases: - self.aliases.append(str(alias)) - # logging.debug(f"appended:{alias.value}") - # logging.debug(f"aliases:{self.aliases}") diff --git a/src/models/wikimedia/wikidata/item/__init__.py b/src/models/wikimedia/wikidata/item/__init__.py new file mode 100644 index 0000000..3b49510 --- /dev/null +++ b/src/models/wikimedia/wikidata/item/__init__.py @@ -0,0 +1,67 @@ +import argparse +from typing import List, Optional + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator import wbi_config # type: ignore +from wikibaseintegrator.models import Alias # type: ignore + +import config +from src.models.task import Task +from src.models.wikimedia.wikidata.entity import Entity + +wbi_config.config["USER_AGENT"] = config.user_agent + + +class Item(Entity): + """This represents an main_subject_item in Wikidata + We always work on one language at a time, + so we don't bother with languages here and keep to simple strings""" + + aliases: Optional[List[str]] = None + args: Optional[argparse.Namespace] = None + confirmation: bool = False + description: Optional[str] = None + task: Optional[Task] = None + + class Config: + arbitrary_types_allowed = True + + def __str__(self): + return f"{self.label}, see {self.url}" + + def __fetch_label_and_description_and_aliases__(self): + """Fetch label and aliases in the task language from the Wikidata API""" + if not self.task: + raise ValueError("self.task was None") + if not isinstance(self.task, Task): + raise ValueError("self.task was not a Task object") + if self.task.language_code is None: + raise ValueError("self.task.language_code was None") + from src.helpers.console import console + + with console.status( + f"Fetching {self.task.language_code.name.title()} label and aliases from the Wikidata API..." 
+ ): + wbi = WikibaseIntegrator() + item = wbi.item.get(self.id) + label = item.labels.get(self.task.language_code.value) + if label: + self.label = str(label) + description = item.descriptions.get(self.task.language_code.value) + if description: + self.description = str(description) + aliases: List[Alias] = item.aliases.get(self.task.language_code.value) + # logging.debug(f"aliases from wbi:{main_subject_item.aliases.get('en')}") + if aliases: + self.aliases = [] + for alias in aliases: + self.aliases.append(str(alias)) + # logging.debug(f"appended:{alias.value}") + # logging.debug(f"aliases:{self.aliases}") + + def __strip_qid_prefix__(self): + if "https://www.wikidata.org/wiki/" in self.id: + self.id = self.id[30:] + if "http://www.wikidata.org/entity/" in self.id: + self.id = self.id[31:] + # logger.debug(f"id:{id}") diff --git a/src/models/wikimedia/wikidata/item/main_subject.py b/src/models/wikimedia/wikidata/item/main_subject.py new file mode 100644 index 0000000..e5f867d --- /dev/null +++ b/src/models/wikimedia/wikidata/item/main_subject.py @@ -0,0 +1,272 @@ +import logging +from typing import TYPE_CHECKING, List, Optional, Set +from urllib.parse import quote + +from wikibaseintegrator import WikibaseIntegrator # type: ignore +from wikibaseintegrator.datatypes import Item as ItemType # type: ignore +from wikibaseintegrator.models import Claim # type: ignore +from wikibaseintegrator.wbi_helpers import search_entities # type: ignore + +import config +from src.helpers.calculations import calculate_random_editgroups_hash +from src.helpers.cleaning import clean_rich_formatting +from src.helpers.console import console +from src.helpers.questions import ask_yes_no_question +from src.models.items import Items +from src.models.items.riksdagen_documents import RiksdagenDocumentItems +from src.models.items.scholarly_articles import ScholarlyArticleItems +from src.models.wikimedia.wikidata.enums import Property, Qid +from src.models.wikimedia.wikidata.item import Item +from src.tasks import TaskIds + +if TYPE_CHECKING: + from src.models.batch_job import BatchJob + +logger = logging.getLogger(__name__) + + +class MainSubjectItem(Item): + search_strings: Set[str] = set() + items: Optional[Items] = None + number_of_queries: int = 0 + + class Config: + arbitrary_types_allowed = True + + def __alias_appears_in_label_of_a_qid__(self, alias: str) -> bool: + if not alias: + raise ValueError("alias was none") + results = search_entities(alias, dict_result=True) + for result in results: + if result["label"] == alias: + qid = result["id"] + logger.info(f"Found {alias} as label in {qid}") + # verify that it is not a scientific article + return self.__is_not_scientific_article__(qid=qid) + return False + + @staticmethod + def __is_not_scientific_article__(qid: str): + """Looks up the QID in Wikidata to check whether it is a scholarly article or not. 
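`__strip_qid_prefix__` above slices off the two common Wikidata URL prefixes at hard-coded offsets (30 and 31 characters, the lengths of the two prefixes). A suggested equivalent without magic numbers, assuming Python 3.9+ for `str.removeprefix` (a sketch, not what the patch does):

```python
# Suggested alternative to the hard-coded slicing offsets 30 and 31;
# str.removeprefix is a no-op when the prefix is absent.
def strip_qid_prefix(entity_id: str) -> str:
    for prefix in (
        "https://www.wikidata.org/wiki/",
        "http://www.wikidata.org/entity/",
    ):
        entity_id = entity_id.removeprefix(prefix)
    return entity_id


assert strip_qid_prefix("http://www.wikidata.org/entity/Q407541") == "Q407541"
assert strip_qid_prefix("Q407541") == "Q407541"
```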
+ We negate the result""" + # TODO avoid negating here + if not qid: + raise ValueError("qid was None") + wbi = WikibaseIntegrator() + item = wbi.item.get(qid) + claims: List[Claim] = item.claims + for claim in claims: + if claim.mainsnak.property_number == Property.INSTANCE_OF.value: + qid = claim.mainsnak.datavalue["value"]["id"] + logger.info(f"Found P31 with value {qid}") + # console.print(claim.mainsnak) + if qid == Qid.SCHOLARLY_ARTICLE.value: + logger.debug("__is_not_scientific_article__:returning false now") + return False + else: + return True + + def __str__(self): + """Return label and description, the latter cut to 50 chars""" + string = ( + f"label: [bold]{clean_rich_formatting(self.label)}[/bold]\n" + f"aliases: {', '.join(self.aliases)}\n" + f"description: {self.description[:70]}\n" + f"{self.url}\n" + ) + for url in self.search_urls(): + string = string + f"{url}\n" + return string + + def add_to_items(self, jobs: List["BatchJob"] = None, job_count: int = None): + """Add a suggested Qid as main subject on all items that + have a label that matches one of the search strings for this Qid + We calculate a new edit group hash each time this function is + called so similar edits are grouped and easily be undone. + + This function is non-interactive""" + if not self.items: + raise ValueError("Items was None") + if not self.items.sparql_items: + raise ValueError("items.sparql_items was None") + if not jobs: + raise ValueError("jobs was None") + if not job_count: + raise ValueError("job count was None") + editgroups_hash: str = calculate_random_editgroups_hash() + count = 0 + for target_item in self.items.sparql_items: + count += 1 + if not target_item.label: + target_item.label = "main_subject_item with missing label" + with console.status( + f"Uploading main subject " + f"[green]{clean_rich_formatting(self.label)}[/green] " + f"to {clean_rich_formatting(target_item.label)} ({target_item.id})" + ): + main_subject_property = "P921" + reference = ItemType( + "Q69652283", # inferred from title + prop_nr="P887", # based on heuristic + ) + statement = ItemType( + self.id, + prop_nr=main_subject_property, + references=[reference], + ) + target_item.upload_one_statement_to_wikidata( + statement=statement, + summary=f"[[Property:{main_subject_property}]]: [[{self.id}]]", + editgroups_hash=editgroups_hash, + ) + console.print( + f"(job {job_count}/{len(jobs)})(main_subject_item {count}/{self.items.number_of_sparql_items} " + f"Added '{clean_rich_formatting(self.label)}' to " + f"{clean_rich_formatting(target_item.label)}: {target_item.url}" + ) + # input("Press enter to continue") + + @staticmethod + def __clean_special_symbols__(string: str): + return string.replace("®", "").replace("™", "").replace('"', "") + + def __extract_search_strings__(self): + if not self.args: + raise ValueError("args was None") + else: + logger.debug(f"args:{self.args}") + if self.args.no_aliases is True: + console.print("Alias matching is turned off") + no_aliases = True + elif self.id in config.no_alias_for_scholarly_items: + logger.info( + f"Alias matching is turned off for this main_subject_item: {self.label}" + ) + no_aliases = True + else: + no_aliases = False + if not self.label: + raise ValueError("self.label was None") + self.search_strings: Set[str] = set() + self.search_strings.add(self.__clean_special_symbols__(self.label)) + if self.aliases and no_aliases is False: + for alias in self.aliases: + # logger.debug(f"extracting alias:{alias}") + if len(alias) < 5 and alias not in 
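`add_to_items` above writes one P921 (main subject) claim per matched item, referenced with P887 (based on heuristic) = Q69652283 (inferred from title), and stamps every upload in a batch with the same random editgroups hash so the whole batch can be undone in one go. Stripped of the console handling, the claim construction reduces to this sketch, reusing the patch's own calls (the subject QID is a placeholder):

```python
# Sketch of the claim built in add_to_items, reusing the patch's own
# wikibaseintegrator calls; Q407541 stands in for self.id.
from wikibaseintegrator.datatypes import Item as ItemType  # type: ignore

subject_qid = "Q407541"  # placeholder for the suggested main subject

reference = ItemType(
    "Q69652283",  # inferred from title
    prop_nr="P887",  # based on heuristic
)
statement = ItemType(
    subject_qid,
    prop_nr="P921",  # main subject
    references=[reference],
)
summary = f"[[Property:P921]]: [[{subject_qid}]]"  # edit summary per upload
```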
config.list_of_allowed_aliases: + console.print( + f"Skipping short alias '{alias}' to avoid false positives", + style="#FF8000", + ) + elif self.__alias_appears_in_label_of_a_qid__(alias=alias): + console.print( + f"Skipped '{alias}' because it appears " + f"in a label of at least one Qid that is not a scholarly article", + style="#FF8000", + ) + elif alias in config.list_of_allowed_aliases: + console.print(f"Found {alias} in the allow sparql_items") + self.search_strings.add(self.__clean_special_symbols__(alias)) + else: + self.search_strings.add(self.__clean_special_symbols__(alias)) + + def print_search_strings(self): + # logger.debug(f"search_strings:{self.search_strings}") + from src.helpers.cli_messages import print_search_strings_table + + print_search_strings_table(args=self.args, search_strings=self.search_strings) + + def search_urls(self) -> List[str]: + if not self.search_strings: + raise ValueError("self.search_strings was None") + urls = [] + for search_string in self.search_strings: + search_term = quote(f'"{search_string}"') + urls.append(f"https://www.wikidata.org/w/index.php?search={search_term}") + return urls + + def __prepare_before_fetching_items__(self): + self.__extract_search_strings__() + self.__check_we_got_what_we_need__() + if config.loglevel in [logging.INFO, logging.DEBUG]: + self.print_search_strings() + self.__count_number_of_queries__() + self.__instantiate_the_right_class_for_this_task__() + + def __parse_into_job__(self): + if self.items.number_of_sparql_items: + self.items.remove_duplicates() + self.items.random_shuffle_items() + from src import BatchJob + + job = BatchJob( + number_of_queries=self.number_of_queries, + main_subject_item=self, + ) + return job + else: + console.print("No matching items found") + return None + + def __count_number_of_queries__(self): + self.number_of_queries = ( + len(self.search_strings) * self.task.number_of_queries_per_search_string + ) + + def __check_we_got_what_we_need__(self): + if not self.search_strings: + raise ValueError("search_strings was None") + if not self.task: + raise ValueError("task was None") + + def __instantiate_the_right_class_for_this_task__(self): + if self.task.id == TaskIds.SCHOLARLY_ARTICLES: + self.items = ScholarlyArticleItems(main_subject_item=self) + elif self.task.id == TaskIds.RIKSDAGEN_DOCUMENTS: + self.items = RiksdagenDocumentItems(main_subject_item=self) + # elif self.task.id == TaskIds.THESIS: + # items = ThesisItems(main_subject_item=self) + # elif self.task.id == TaskIds.ACADEMIC_JOURNALS: + # items = AcademicJournalItems(main_subject_item=self) + else: + raise ValueError(f"{self.task.id} was not recognized") + + def fetch_items_and_get_job_if_confirmed(self) -> Optional["BatchJob"]: + """This method handles all the work needed to return a job""" + self.__strip_qid_prefix__() + self.__fetch_label_and_description_and_aliases__() + if self.__got_label__(): + console.print(f"Working on {self.label}") + if self.__is_confirmed__(): + return self.__fetch_and_parse__() + return None + + def __is_confirmed__(self) -> bool: + if self.confirmation: + return ask_yes_no_question("Do you want to continue?") + else: + return True + + def __fetch_and_parse__(self) -> Optional["BatchJob"]: + self.__prepare_before_fetching_items__() + if self.items: + with console.status( + f"Fetching items with labels that have one of " + f"the search strings by running a total of " + f"{self.number_of_queries} " + f"queries on WDQS..." 
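`__count_number_of_queries__` above multiplies the number of search strings by the task's queries-per-string. For the scholarly-articles task, `execute_queries` earlier in this patch runs three queries per search string (published, preprint, thesis), so a subject whose label comes with four usable aliases would trigger 5 × 3 = 15 WDQS queries. Worked through in code (the strings are placeholders, not real aliases):

```python
# Worked example of __count_number_of_queries__ for the
# scholarly-articles task: published + preprint + thesis queries run
# once per search string.
number_of_queries_per_search_string = 3  # assumed value for this task
search_strings = {"label", "alias one", "alias two",
                  "alias three", "alias four"}  # placeholders
number_of_queries = len(search_strings) * number_of_queries_per_search_string
print(number_of_queries)  # 15
```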
+ ): + self.items.fetch_based_on_label() + return self.__parse_into_job__() + else: + raise ValueError("items was None") + + def __got_label__(self) -> bool: + if not self.label: + if not self.task: + raise ValueError("task was None") + console.print( + f"Label for {self.task.language_code.name.title()} was None, see {self.url}, skipping" + ) + return False + else: + return True diff --git a/src/models/wikimedia/wikidata/sparql_item.py b/src/models/wikimedia/wikidata/item/sparql.py similarity index 83% rename from src/models/wikimedia/wikidata/sparql_item.py rename to src/models/wikimedia/wikidata/item/sparql.py index 3e207e3..89591cc 100644 --- a/src/models/wikimedia/wikidata/sparql_item.py +++ b/src/models/wikimedia/wikidata/item/sparql.py @@ -1,7 +1,6 @@ from pydantic import BaseModel import config -import config.items from src.models.wikimedia.wikidata.entiyt_id import EntityId from src.models.wikimedia.wikidata.item import Item @@ -23,11 +22,11 @@ def validate_qid_and_copy_label(self): def is_in_blocklist(self) -> bool: if self.id is None: raise ValueError("did not get an id") - if config.items.blocklist_for_scholarly_items is None: + if config.blocklist_for_scholarly_items is None: raise ValueError( "config.blocklist_for_scholarly_items was None, please fix" ) - if self.id in config.items.blocklist_for_scholarly_items: + if self.id in config.blocklist_for_scholarly_items: return True else: return False diff --git a/src/models/wikimedia/wikidata/query/__init__.py b/src/models/wikimedia/wikidata/query/__init__.py new file mode 100644 index 0000000..b95ce66 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/__init__.py @@ -0,0 +1,69 @@ +import logging +from typing import Any, Dict, List + +from pydantic import BaseModel +from wikibaseintegrator.wbi_helpers import execute_sparql_query # type: ignore + +from src.models.wikimedia.wikidata.item import Item +from src.models.wikimedia.wikidata.item.sparql import SparqlItem + +logger = logging.getLogger(__name__) + + +class Query(BaseModel): + results: Dict = {} + search_string = "" + query_string = "" + items: List[Item] = [] + # any here because of pydantic error + main_subject_item: Any + + def __parse_results__(self) -> None: + # console.print(self.results) + for item_json in self.results["results"]["bindings"]: + logging.debug(f"item_json:{item_json}") + item = SparqlItem(**item_json) + item.validate_qid_and_copy_label() + if not item.is_in_blocklist(): + self.items.append(item) + else: + logger.info(f"{item.label} found in blocklist, skipping") + + def __strip_bad_chars__(self): + # Note this has to match the cleaning done in the sparql query + # We lowercase and remove common symbols + # We replace like this to save CPU cycles see + # https://stackoverflow.com/questions/3411771/best-way-to-replace-multiple-characters-in-a-string + self.search_string = ( + self.search_string + # Needed for matching backslashes e.g. 
"Dmel\CG5330" on Q29717230 + .replace("\\", "\\\\") + # Needed for when labels contain apostrophe + .replace("'", "\\'") + .replace(",", "") + .replace(":", "") + .replace(";", "") + .replace("(", "") + .replace(")", "") + .replace("[", "") + .replace("]", "") + ) + + def __execute__(self): + self.results = execute_sparql_query(self.query_string) + + def get_results(self): + """Do everything needed to get the results""" + self.__strip_bad_chars__() + self.__prepare_and_build_query__() + self.__execute__() + self.__parse_results__() + + def __prepare_and_build_query__(self): + pass + + def print_number_of_results(self): + logging.info( + f"Got {len(self.items)} items from " + f"WDQS using the search string {self.search_string}" + ) diff --git a/src/models/wikimedia/wikidata/query/preprint_article.py b/src/models/wikimedia/wikidata/query/preprint_article.py new file mode 100644 index 0000000..356422a --- /dev/null +++ b/src/models/wikimedia/wikidata/query/preprint_article.py @@ -0,0 +1,31 @@ +import config +from src.models.wikimedia.wikidata.query import Query + + +class PreprintArticleQuery(Query): + def __prepare_and_build_query__(self): + # We don't use CirrusSearch in this query because we can do it more easily in + # SPARQL on a small subgraph like this + # find all items that are ?main_subject_item wdt:P31/wd:P279* wd:Q1266946 + # minus the Qid we want to add + self.query_string = f""" + #{config.user_agent} + SELECT DISTINCT ?item ?itemLabel + WHERE {{ + ?item wdt:P31/wd:P279* wd:Q580922. # preprint + MINUS {{ + ?item wdt:P921 wd:{self.main_subject_item.id}; + }} + ?item rdfs:label ?label. + FILTER(CONTAINS( + LCASE(?label), " {self.search_string.lower()} " + @{self.main_subject_item.task.language_code.value}) || + REGEX(LCASE(?label), ".* {self.search_string.lower()}$" + @{self.main_subject_item.task.language_code.value}) || + REGEX(LCASE(?label), "^{self.search_string.lower()} .*" + @{self.main_subject_item.task.language_code.value}) + ) + MINUS {{?item wdt:P921/wdt:P279 wd:{self.main_subject_item.id}. }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} + }} + """ diff --git a/src/models/wikimedia/wikidata/query/published_article.py b/src/models/wikimedia/wikidata/query/published_article.py new file mode 100644 index 0000000..578c454 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/published_article.py @@ -0,0 +1,85 @@ +import config +from src.helpers.console import console +from src.models.wikimedia.wikidata.query import Query + + +class PublishedArticleQuery(Query): + cirrussearch_parameters: str = "" + + def __check_we_got_everything_we_need__(self): + if not self.main_subject_item: + raise ValueError("main_subject_item was None") + if not self.main_subject_item.args: + raise ValueError("main_subject_item.args was None") + # if self.main_subject_item.args.limit_to_items_without_p921: + # raise Exception( + # "Limiting to items without P921 is not " "supported yet for this task." 
+ # ) + if self.main_subject_item.task is None: + raise ValueError("task was None") + if self.main_subject_item.task.language_code is None: + raise ValueError("task.language_code was None") + if self.main_subject_item.task is None: + raise ValueError("task was None") + if self.main_subject_item.task.language_code is None: + raise ValueError("task.language_code was None") + + def __prepare_and_build_query__( + self, + ): + self.__check_we_got_everything_we_need__() + self.__setup_cirrussearch_params__() + self.__build_query__() + + def __build_query__(self): + # This query uses https://www.w3.org/TR/sparql11-property-paths/ to + # find subjects that are subclass of one another up to 3 hops away + # This query also uses the https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI + # which has a hardcoded limit of 10,000 items so you will never get more matches than that + # This query use regex to match beginning, middle and end of the label of matched items + # The replacing lines should match the similar python replacements in cleaning.py + # The replacing with "\\\\\\\\" becomes "\\\\" after leaving python and then it works in + # SPARQL where it becomes "\\" and thus match a single backslash + self.query_string = f""" + #{config.user_agent} + SELECT DISTINCT ?item ?itemLabel + WHERE {{ + hint:Query hint:optimizer "None". + BIND(STR('{self.cirrussearch_parameters} \"{self.search_string}\"') as ?search_string) + SERVICE wikibase:mwapi {{ + bd:serviceParam wikibase:api "Search"; + wikibase:endpoint "www.wikidata.org"; + mwapi:srsearch ?search_string. + ?title wikibase:apiOutput mwapi:title. + }} + BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) + ?item rdfs:label ?label. + BIND(REPLACE(LCASE(?label), ",", "") as ?label1) + BIND(REPLACE(?label1, ":", "") as ?label2) + BIND(REPLACE(?label2, ";", "") as ?label3) + BIND(REPLACE(?label3, "\\\\(", "") as ?label4) + BIND(REPLACE(?label4, "\\\\)", "") as ?label5) + BIND(REPLACE(?label5, "\\\\[", "") as ?label6) + BIND(REPLACE(?label6, "\\\\]", "") as ?label7) + BIND(REPLACE(?label7, "\\\\\\\\", "") as ?label8) + BIND(?label8 as ?cleaned_label) + FILTER(CONTAINS(?cleaned_label, ' {self.search_string.lower()} '@{self.main_subject_item.task.language_code.value}) || + REGEX(?cleaned_label, '.* {self.search_string.lower()}$'@{self.main_subject_item.task.language_code.value}) || + REGEX(?cleaned_label, '^{self.search_string.lower()} .*'@{self.main_subject_item.task.language_code.value})) + MINUS {{?item wdt:P921/wdt:P279 wd:{self.main_subject_item.id}. }} + MINUS {{?item wdt:P921/wdt:P279/wdt:P279 wd:{self.main_subject_item.id}. }} + MINUS {{?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:{self.main_subject_item.id}. }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
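The backslash comment in `__build_query__` above can be verified in isolation: eight backslashes in Python source are four characters in the built query string, which SPARQL's own string escaping reduces to two, i.e. one escaped backslash for the regex engine. A small demonstration, approximating the SPARQL unescaping step with Python's `unicode_escape` codec:

```python
# Demonstration of the backslash comment in __build_query__:
# 8 backslashes in source -> 4 in the string sent to WDQS -> 2 after
# SPARQL string unescaping, i.e. one escaped literal backslash.
python_source = "\\\\\\\\"  # eight backslashes in source code
print(len(python_source))   # 4: what actually gets sent to WDQS
sparql_sees = python_source.encode().decode("unicode_escape")
print(sparql_sees, len(sparql_sees))  # \\ 2: an escaped literal backslash
```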
}} + }} + """ + + def __setup_cirrussearch_params__(self): + if self.main_subject_item.args.limit_to_items_without_p921: + console.print( + "Limiting to scholarly articles without P921 main subject only" + ) + self.cirrussearch_parameters = ( + f"haswbstatement:P31=Q13442814 -haswbstatement:P921" + ) + else: + self.cirrussearch_parameters = f"haswbstatement:P31=Q13442814 -haswbstatement:P921={self.main_subject_item.id}" diff --git a/src/models/wikimedia/wikidata/query/riksdagen_document.py b/src/models/wikimedia/wikidata/query/riksdagen_document.py new file mode 100644 index 0000000..fa06308 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/riksdagen_document.py @@ -0,0 +1,33 @@ +import config +from src.models.wikimedia.wikidata.query import Query + + +class RiksdagenDocumentQuery(Query): + def __prepare_and_build_query__(self): + lang = self.main_subject_item.task.language_code.value + self.query_string = f""" + #{config.user_agent} + SELECT DISTINCT ?item ?itemLabel + WHERE {{ + hint:Query hint:optimizer "None". + SERVICE wikibase:mwapi {{ + bd:serviceParam wikibase:api "Search"; + wikibase:endpoint "www.wikidata.org"; + mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921={self.main_subject_item.id} "{self.search_string}"' . + ?title wikibase:apiOutput mwapi:title. + }} + BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) + ?item rdfs:label ?label. + # We lowercase the label first and search for the + # string in both the beginning, middle and end of the label + FILTER(CONTAINS( + LCASE(?label), " {self.search_string.lower()} "@{lang}) || + REGEX(LCASE(?label), ".* {self.search_string.lower()}$"@{lang}) || + REGEX(LCASE(?label), "^{self.search_string.lower()} .*"@{lang}) + ) + # remove more specific forms of the main subject also + # Thanks to Jan Ainali for this improvement :) + MINUS {{?main_subject_item wdt:P921 ?topic. ?topic wdt:P279 wd:{self.main_subject_item.id}. }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "sv". }} + }} + """ diff --git a/src/models/wikimedia/wikidata/query/thesis.py b/src/models/wikimedia/wikidata/query/thesis.py new file mode 100644 index 0000000..0e74935 --- /dev/null +++ b/src/models/wikimedia/wikidata/query/thesis.py @@ -0,0 +1,30 @@ +import config +from src.models.wikimedia.wikidata.query import Query + + +class ThesisQuery(Query): + def __prepare_and_build_query__(self): + self.query_string = f""" + #{config.user_agent} + SELECT DISTINCT ?item ?itemLabel + WHERE {{ + {{ + ?item wdt:P31/wd:P279* wd:Q1266946. # thesis + }} UNION + {{ + ?item wdt:P31/wd:P279* wd:Q1385450. # dissertation + }} UNION + {{ + ?item wdt:P31/wd:P279* wd:Q3099732. # technical report + }} + MINUS {{ + ?item wdt:P921 wd:{self.main_subject_item.id}; + }} + ?item rdfs:label ?label. + FILTER(CONTAINS(LCASE(?label), " {self.search_string.lower()} "@{self.main_subject_item.task.language_code.value}) || + REGEX(LCASE(?label), ".* {self.search_string.lower()}$"@{self.main_subject_item.task.language_code.value}) || + REGEX(LCASE(?label), "^{self.search_string.lower()} .*"@{self.main_subject_item.task.language_code.value})) + MINUS {{?item wdt:P921 ?topic. ?topic wdt:P279 wd:{self.main_subject_item.id}. }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". 
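`__setup_cirrussearch_params__` above prepends the `haswbstatement` filters to the CirrusSearch string, and `__build_query__` then wraps the quoted search string in it. Combined, the `srsearch` value handed to the MWAPI service comes out as follows (Q407541 is a placeholder subject QID):

```python
# The srsearch string produced once __setup_cirrussearch_params__ and
# __build_query__ are combined; Q407541 is a placeholder subject.
subject_qid = "Q407541"
search_string = "fentanyl"
cirrussearch_parameters = (
    f"haswbstatement:P31=Q13442814 -haswbstatement:P921={subject_qid}"
)
srsearch = f'{cirrussearch_parameters} "{search_string}"'
print(srsearch)
# haswbstatement:P31=Q13442814 -haswbstatement:P921=Q407541 "fentanyl"
```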
+            }}
+        }}
+        """
diff --git a/src/tasks.py b/src/tasks.py
index 24d6f06..9a6bd80 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -7,7 +7,7 @@
     Task(
         **dict(
             id=TaskIds.SCHOLARLY_ARTICLES,
-            label="Add main subject to scholarly articles and preprints",
+            label="Add main subject to scholarly articles, theses and preprints",
             language_code=SupportedLanguageCode.ENGLISH,
             best_practice_information=(
                 "When adding Qid main subjects please try to first "
@@ -33,32 +33,32 @@
             best_practice_information=None,
         )
     ),
-    Task(
-        **dict(
-            id=TaskIds.THESIS,
-            label="Add main subject to thesis' and technical reports",
-            language_code=SupportedLanguageCode.ENGLISH,
-            best_practice_information=(
-                "When adding Qid main subjects please try to first "
-                "educate yourself about the subarea of science a little "
-                "and find/create items as specific as possible.\n"
-                "E.g. when searching for 'cancer screening' in Wikidata "
-                "we find 'gastric cancer screening' in labels of "
-                "scientific articles but there is "
-                "perhaps no item for this yet.\n"
-                "In this case it is preferred to first create that item "
-                "(done in Q108532542 and add that as main subject and "
-                "avoid the more general 'cancer screening' until all "
-                "sub forms of screening have been matched."
-            ),
-        )
-    ),
-    Task(
-        **dict(
-            id=TaskIds.ACADEMIC_JOURNALS,
-            label="Add main subject to academic journals",
-            language_code=SupportedLanguageCode.ENGLISH,
-            best_practice_information=None,
-        )
-    ),
+    # Task(
+    #     **dict(
+    #         id=TaskIds.THESIS,
+    #         label="Add main subject to thesis' and technical reports",
+    #         language_code=SupportedLanguageCode.ENGLISH,
+    #         best_practice_information=(
+    #             "When adding Qid main subjects please try to first "
+    #             "educate yourself about the subarea of science a little "
+    #             "and find/create items as specific as possible.\n"
+    #             "E.g. when searching for 'cancer screening' in Wikidata "
+    #             "we find 'gastric cancer screening' in labels of "
+    #             "scientific articles but there is "
+    #             "perhaps no item for this yet.\n"
+    #             "In this case it is preferred to first create that item "
+    #             "(done in Q108532542 and add that as main subject and "
+    #             "avoid the more general 'cancer screening' until all "
+    #             "sub forms of screening have been matched."
+    #             ),
+    #         )
+    #     ),
+    #     Task(
+    #         **dict(
+    #             id=TaskIds.ACADEMIC_JOURNALS,
+    #             label="Add main subject to academic journals",
+    #             language_code=SupportedLanguageCode.ENGLISH,
+    #             best_practice_information=None,
+    #         )
+    #     ),
 ]
diff --git a/tests/test_main_subject_item.py b/tests/test_main_subject_item.py
new file mode 100644
index 0000000..4629604
--- /dev/null
+++ b/tests/test_main_subject_item.py
@@ -0,0 +1,37 @@
+import argparse
+from unittest import TestCase
+
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
+from src.tasks import tasks
+
+
+class TestMainSubjectItem(TestCase):
+    def test_extract_search_strings(self):
+        msi = MainSubjectItem(
+            id="Q407541",
+            label="fentanyl",
+            task=tasks[0],
+            args=argparse.Namespace(
+                no_aliases=dict(no_aliases=False),
+                show_search_urls=dict(show_search_urls=False),
+            ),
+        )
+        msi.__extract_search_strings__()
+        self.assertEqual(len(msi.search_strings), 1)
+
+    def test_extract_search_strings_with_problematic_alias(self):
+        # This item has a problematic alias ("thrush", which is also a bird).
+        # Note: this will fail if anyone adds or removes an alias on the item.
+        msi = MainSubjectItem(
+            id="Q273510",
+            task=tasks[0],
+            args=argparse.Namespace(
+                no_aliases=dict(no_aliases=False),
+                show_search_urls=dict(show_search_urls=False),
+            ),
+        )
+        msi.__fetch_label_and_description_and_aliases__()
+        msi.__extract_search_strings__()
+        msi.print_search_strings()
+        print(len(msi.search_strings))
+        assert len(msi.search_strings) == 10
diff --git a/tests/test_preprint_article.py b/tests/test_preprint_article.py
new file mode 100644
index 0000000..57db65c
--- /dev/null
+++ b/tests/test_preprint_article.py
@@ -0,0 +1,51 @@
+import argparse
+from unittest import TestCase
+
+from src import tasks
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
+from src.models.wikimedia.wikidata.query.preprint_article import PreprintArticleQuery
+
+
+class TestPreprintArticle(TestCase):
+    def test_preprint_article_query(self):
+        msi = MainSubjectItem(
+            id="Q407541",
+            label="fentanyl",
+            task=tasks[0],
+            args=argparse.Namespace(
+                no_aliases=dict(no_aliases=False),
+                show_search_urls=dict(show_search_urls=False),
+            ),
+        )
+        msi.__extract_search_strings__()
+        q = PreprintArticleQuery(main_subject_item=msi)
+        for string in msi.search_strings:
+            q.search_string = string
+            q.__prepare_and_build_query__()
+            print(q.query_string)
+            assert (
+                q.query_string.replace(" ", "").strip()
+                == """
+        #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q
+        SELECT DISTINCT ?item ?itemLabel
+        WHERE {
+            ?item wdt:P31/wd:P279* wd:Q580922. # preprint
+            MINUS {
+                ?item wdt:P921 wd:Q407541;
+            }
+            ?item rdfs:label ?label.
+            FILTER(CONTAINS(
+                LCASE(?label), " fentanyl "
+                @en) ||
+                REGEX(LCASE(?label), ".* fentanyl$"
+                @en) ||
+                REGEX(LCASE(?label), "^fentanyl .*"
+                @en)
+            )
+            MINUS {?item wdt:P921/wdt:P279 wd:Q407541. }
+            SERVICE wikibase:label { bd:serviceParam wikibase:language "en".
} + }""".replace( + " ", "" + ).strip() + ) + break diff --git a/tests/test_published_article.py b/tests/test_published_article.py new file mode 100644 index 0000000..137a1fe --- /dev/null +++ b/tests/test_published_article.py @@ -0,0 +1,65 @@ +import argparse +from unittest import TestCase + +from src import tasks +from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem +from src.models.wikimedia.wikidata.query.published_article import PublishedArticleQuery + + +class TestPublishedArticleQuery(TestCase): + def test_published_article_query(self): + msi = MainSubjectItem( + id="Q407541", + label="fentanyl", + task=tasks[0], + args=argparse.Namespace( + no_aliases=dict(no_aliases=False), + show_search_urls=dict(show_search_urls=False), + limit_to_items_without_p921=dict(limit_to_items_without_p921=False), + ), + ) + msi.__extract_search_strings__() + q = PublishedArticleQuery(main_subject_item=msi) + for string in msi.search_strings: + q.search_string = string + q.__prepare_and_build_query__() + print(q.query_string) + assert ( + q.query_string.replace(" ", "").replace("\\", "").strip() + == """ + #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q + SELECT DISTINCT ?item ?itemLabel + WHERE { + hint:Query hint:optimizer "None". + BIND(STR('haswbstatement:P31=Q13442814 -haswbstatement:P921 "fentanyl"') as ?search_string) + SERVICE wikibase:mwapi { + bd:serviceParam wikibase:api "Search"; + wikibase:endpoint "www.wikidata.org"; + mwapi:srsearch ?search_string. + ?title wikibase:apiOutput mwapi:title. + } + BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item) + ?item rdfs:label ?label. + BIND(REPLACE(LCASE(?label), ",", "") as ?label1) + BIND(REPLACE(?label1, ":", "") as ?label2) + BIND(REPLACE(?label2, ";", "") as ?label3) + BIND(REPLACE(?label3, "\\(", "") as ?label4) + BIND(REPLACE(?label4, "\\)", "") as ?label5) + BIND(REPLACE(?label5, "\\[", "") as ?label6) + BIND(REPLACE(?label6, "\\]", "") as ?label7) + BIND(REPLACE(?label7, "\\\\", "") as ?label8) + BIND(?label8 as ?cleaned_label) + FILTER(CONTAINS(?cleaned_label, ' fentanyl '@en) || + REGEX(?cleaned_label, '.* fentanyl$'@en) || + REGEX(?cleaned_label, '^fentanyl .*'@en)) + MINUS {?item wdt:P921/wdt:P279 wd:Q407541. } + MINUS {?item wdt:P921/wdt:P279/wdt:P279 wd:Q407541. } + MINUS {?item wdt:P921/wdt:P279/wdt:P279/wdt:P279 wd:Q407541. } + SERVICE wikibase:label { bd:serviceParam wikibase:language "en". 
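+            # The label service automatically binds ?itemLabel for each ?item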
+                }
+            }""".replace(
+                " ", ""
+            )
+            .replace("\\", "")
+            .strip()
+        )
+            break
diff --git a/tests/test_riksdagen_document.py b/tests/test_riksdagen_document.py
new file mode 100644
index 0000000..d66604a
--- /dev/null
+++ b/tests/test_riksdagen_document.py
@@ -0,0 +1,58 @@
+import argparse
+from unittest import TestCase
+
+from src import tasks
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
+from src.models.wikimedia.wikidata.query.riksdagen_document import (
+    RiksdagenDocumentQuery,
+)
+
+
+class TestRiksdagenDocumentQuery(TestCase):
+    def test_riksdagen_document_query(self):
+        msi = MainSubjectItem(
+            id="Q407541",
+            label="fentanyl",
+            task=tasks[0],
+            args=argparse.Namespace(
+                no_aliases=dict(no_aliases=False),
+                show_search_urls=dict(show_search_urls=False),
+            ),
+        )
+        msi.__extract_search_strings__()
+        q = RiksdagenDocumentQuery(main_subject_item=msi)
+        for string in msi.search_strings:
+            q.search_string = string
+            q.__prepare_and_build_query__()
+            print(q.query_string)
+            assert (
+                q.query_string.replace(" ", "").strip()
+                == """
+        #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q
+        SELECT DISTINCT ?item ?itemLabel
+        WHERE {
+            hint:Query hint:optimizer "None".
+            SERVICE wikibase:mwapi {
+                bd:serviceParam wikibase:api "Search";
+                wikibase:endpoint "www.wikidata.org";
+                mwapi:srsearch 'haswbstatement:P8433 -haswbstatement:P921=Q407541 "fentanyl"' .
+                ?title wikibase:apiOutput mwapi:title.
+            }
+            BIND(IRI(CONCAT(STR(wd:), ?title)) AS ?item)
+            ?item rdfs:label ?label.
+            # We lowercase the label first and search for the
+            # string at the beginning, middle and end of the label
+            FILTER(CONTAINS(
+                LCASE(?label), " fentanyl "@en) ||
+                REGEX(LCASE(?label), ".* fentanyl$"@en) ||
+                REGEX(LCASE(?label), "^fentanyl .*"@en)
+            )
+            # Also remove more specific forms of the main subject
+            # Thanks to Jan Ainali for this improvement :)
+            MINUS {?item wdt:P921 ?topic. ?topic wdt:P279 wd:Q407541. }
+            SERVICE wikibase:label { bd:serviceParam wikibase:language "sv".
+                }
+            }""".replace(
+                " ", ""
+            ).strip()
+        )
+            break
diff --git a/tests/test_sparql_item.py b/tests/test_sparql_item.py
index 185f3aa..2c4e2f1 100644
--- a/tests/test_sparql_item.py
+++ b/tests/test_sparql_item.py
@@ -1,7 +1,7 @@
 from unittest import TestCase
 
 from src import console
-from src.models.wikimedia.wikidata.sparql_item import SparqlItem, Value
+from src.models.wikimedia.wikidata.item.sparql import SparqlItem, Value
 
 
 class TestSparqlItem(TestCase):
diff --git a/tests/test_suggestion.py b/tests/test_suggestion.py
deleted file mode 100644
index d585f50..0000000
--- a/tests/test_suggestion.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import argparse
-from unittest import TestCase
-
-from src.models.suggestion import Suggestion
-from src.models.wikimedia.wikidata.sparql_item import SparqlItem, Value
-from src.tasks import tasks
-
-
-class TestSuggestion(TestCase):
-    def test_extract_search_strings(self):
-        item = SparqlItem(
-            item=Value(value="Q407541"), itemLabel=Value(value="fentanyl")
-        )
-        item.validate_qid_and_copy_label()
-        suggestion = Suggestion(
-            item=item,
-            task=tasks[0],
-            args=argparse.Namespace(
-                no_aliases=dict(no_aliases=False),
-                show_search_urls=dict(show_search_urls=False),
-            ),
-        )
-        suggestion.extract_search_strings()
-        # suggestion.print_search_strings()
-        if not len(suggestion.search_strings) == 1:
-            self.fail()
-
-    def test_extract_search_strings_with_problematic_alias(self):
-        """This has a problematic alias "thrush" which is also a bird"""
-        item = SparqlItem(
-            item=Value(value="Q273510"), itemLabel=Value(value="candidadis")
-        )
-        item.validate_qid_and_copy_label()
-        item.fetch_label_and_description_and_aliases(task=tasks[0])
-        suggestion = Suggestion(
-            item=item,
-            task=tasks[0],
-            args=argparse.Namespace(
-                no_aliases=dict(no_aliases=False),
-                show_search_urls=dict(show_search_urls=False),
-            ),
-        )
-        suggestion.extract_search_strings()
-        suggestion.print_search_strings()
-        print(len(suggestion.search_strings))
-        assert len(suggestion.search_strings) == 10
diff --git a/tests/test_thesis.py b/tests/test_thesis.py
new file mode 100644
index 0000000..784ac6b
--- /dev/null
+++ b/tests/test_thesis.py
@@ -0,0 +1,54 @@
+import argparse
+from unittest import TestCase
+
+from src import tasks
+from src.models.wikimedia.wikidata.item.main_subject import MainSubjectItem
+from src.models.wikimedia.wikidata.query.thesis import ThesisQuery
+
+
+class TestThesisQuery(TestCase):
+    def test_thesis_query(self):
+        msi = MainSubjectItem(
+            id="Q407541",
+            label="fentanyl",
+            task=tasks[0],
+            args=argparse.Namespace(
+                no_aliases=dict(no_aliases=False),
+                show_search_urls=dict(show_search_urls=False),
+            ),
+        )
+        msi.__extract_search_strings__()
+        q = ThesisQuery(main_subject_item=msi)
+        for string in msi.search_strings:
+            q.search_string = string
+            q.__prepare_and_build_query__()
+            print(q.query_string)
+            assert (
+                q.query_string.replace(" ", "").strip()
+                == """
+        #ItemSubjector (https://github.com/dpriskorn/ItemSubjector), User:So9q
+        SELECT DISTINCT ?item ?itemLabel
+        WHERE {
+            {
+                ?item wdt:P31/wdt:P279* wd:Q1266946. # thesis
+            } UNION
+            {
+                ?item wdt:P31/wdt:P279* wd:Q1385450. # dissertation
+            } UNION
+            {
+                ?item wdt:P31/wdt:P279* wd:Q3099732. # technical report
+            }
+            MINUS {
+                ?item wdt:P921 wd:Q407541.
+            }
+            ?item rdfs:label ?label.
+            FILTER(CONTAINS(LCASE(?label), " fentanyl "@en) ||
+            REGEX(LCASE(?label), ".* fentanyl$"@en) ||
+            REGEX(LCASE(?label), "^fentanyl .*"@en))
+            MINUS {?item wdt:P921 ?topic. ?topic wdt:P279 wd:Q407541.
} + SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } + }""".replace( + " ", "" + ).strip() + ) + break