Merge pull request #52
Add ItemSubjector class
dpriskorn authored Mar 31, 2022
2 parents 9827b3f + ba54de3 commit af76b9f
Showing 2 changed files with 133 additions and 130 deletions.
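
In short, the commit moves the module-level functions in src/__init__.py (login, match_main_subjects_from_sparql, export_jobs_to_dataframe and main) onto a pydantic BaseModel subclass, and itemsubjector.py now instantiates that class and calls run(). A minimal sketch of the resulting shape, using only names that appear in the diff below (method bodies elided):

from pydantic import BaseModel


class ItemSubjector(BaseModel):
    @staticmethod
    def login():
        ...  # body unchanged from the old module-level login()

    @staticmethod
    def match_main_subjects_from_sparql(args=None):
        ...  # body unchanged from the old module-level function

    @staticmethod
    def export_jobs_to_dataframe():
        ...  # body unchanged from the old module-level function

    def run(self):
        ...  # former main(); dispatches to the static methods above


# New entry point in itemsubjector.py:
# ItemSubjector().run()
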
8 changes: 5 additions & 3 deletions itemsubjector.py
@@ -1,6 +1,8 @@
 import logging

-import src
+import config
+from src import ItemSubjector

-logging.basicConfig(level=logging.DEBUG)
-src.main()
+logging.basicConfig(level=config.loglevel)
+itemsubjector = ItemSubjector()
+itemsubjector.run()
255 changes: 128 additions & 127 deletions src/__init__.py
@@ -2,6 +2,7 @@
 import logging

 import pandas as pd  # type: ignore
+from pydantic import BaseModel
 from wikibaseintegrator import wbi_login, wbi_config  # type: ignore
 from wikibaseintegrator.wbi_helpers import execute_sparql_query  # type: ignore

@@ -45,134 +46,134 @@
 logging.basicConfig(level=config.loglevel)


-def login():
-    with console.status("Logging in with WikibaseIntegrator..."):
-        config.login_instance = wbi_login.Login(
-            auth_method="login",
-            user=config.username,
-            password=config.password,
-            debug=False,
-        )
-        # Set User-Agent
-        wbi_config.config["USER_AGENT_DEFAULT"] = config.user_agent
+class ItemSubjector(BaseModel):
+    @staticmethod
+    def login():
+        with console.status("Logging in with WikibaseIntegrator..."):
+            config.login_instance = wbi_login.Login(
+                auth_method="login",
+                user=config.username,
+                password=config.password,
+                debug=False,
+            )
+            # Set User-Agent
+            wbi_config.config["USER_AGENT_DEFAULT"] = config.user_agent

-
-def match_main_subjects_from_sparql(args: argparse.Namespace = None):
-    """Collect subjects via SPARQL and call get_validated_main_subjects()
-    If we get any validated jobs we handle them"""
-    logger = logging.getLogger(__name__)
-    if args is None or args.sparql is None:
-        raise ValueError("args.sparql was None")
-    if "P1889" not in args.sparql:
-        console.print(
-            "Your SPARQL did not contain P1889 (different from). "
-            "Please include 'MINUS {?item wdt:P1889 [].}' "
-            "in your WHERE clause to avoid false positives."
-        )
-        exit(0)
-    else:
-        logger.info("Detected P1889 in the query")
-    with console.status("Running query on WDQS..."):
-        main_subjects = []
-        results = execute_sparql_query(
-            args.sparql.replace("{", "{{").replace("}", "}}"), debug=args.debug_sparql
-        )
-        for item_json in results["results"]["bindings"]:
-            logging.debug(f"item_json:{item_json}")
-            main_subjects.append(item_json["item"]["value"])
-        if len(main_subjects) > 0:
-            console.print(f"Got {len(main_subjects)} results")
-            batchjobs = get_validated_main_subjects_as_jobs(
-                args=args, main_subjects=main_subjects
-            )
-            handle_job_preparation_or_run_directly_if_any_jobs(
-                args=args, batchjobs=batchjobs
-            )
-        else:
-            console.print("Got 0 results. Try another query or debug it using --debug")
-
-
-def export_jobs_to_dataframe():
-    logger = logging.getLogger(__name__)
-    logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame")
-    batchjobs = parse_job_pickle()
-    if batchjobs is not None:
-        if batchjobs is not None and batchjobs.job_count > 0:
-            logger.info(f"Found {batchjobs.job_count} jobs")
-            df = pd.DataFrame()
-            count = 1
-            for job in batchjobs.jobs:
-                count += 1
-                logger.info(f"Working on job {count}/{batchjobs.job_count}")
-                job_df = pd.DataFrame()
-                for item in job.items.list:
-                    job_df = job_df.append(
-                        pd.DataFrame(
-                            data=[
-                                dict(
-                                    qid=item.id,
-                                    label=item.label,
-                                    description=item.description,
-                                )
-                            ]
-                        )
-                    )
-                df = df.append(job_df)
-                logger.debug(f"Added {len(job.items.list)} items to the dataframe")
-            logger.debug(f"Exporting {len(df)} rows to pickle")
-            pickle_filename = "dataframe.pkl.gz"
-            df.to_pickle(pickle_filename)
-            console.print(f"Wrote to {pickle_filename} in the current directory")
-    else:
-        console.print(
-            "No jobs found. Create a job list first by using '--prepare-jobs'"
-        )
-
-
-def main():
-    """This is the main function that makes everything else happen"""
-    logger = logging.getLogger(__name__)
-    migrate_pickle_detection()
-    args = setup_argparse_and_return_args()
-    # console.print(args.list)
-    if args.remove_prepared_jobs is True:
-        remove_job_pickle()
-        console.print("Removed the job list.")
-        # exit(0)
-    if args.prepare_jobs is True:
-        logger.info("Preparing jobs")
-        if check_if_pickle_exists(config.job_pickle_file_path):
-            if ask_discard_existing_job_pickle():
-                remove_job_pickle(silent=True)
-            else:
-                console.print("Quitting.")
-    if args.run_prepared_jobs is True:
-        logger.info("Running prepared jobs")
-        batchjobs = parse_job_pickle()
-        if batchjobs is not None and len(batchjobs.jobs) > 0:
-            file_hash = get_hash_of_job_pickle()
-            batchjobs.run_jobs()
-            # Remove the pickle afterwards
-            remove_job_pickle(hash=file_hash)
-    elif args.export_jobs_to_dataframe:
-        export_jobs_to_dataframe()
-    elif args.sparql:
-        match_main_subjects_from_sparql(args=args)
-    else:
-        # if not args.run_prepared_jobs:
-        if args.add is None:
-            console.print("Got no QIDs. Quitting")
-            exit(0)
-        task: Task = select_task()
-        if task is None:
-            raise ValueError("Got no task")
-        jobs = []
-        jobs.extend(process_user_supplied_qids_into_batch_jobs(args=args, task=task))
-        batchjobs = BatchJobs(jobs=jobs)
-        handle_job_preparation_or_run_directly_if_any_jobs(
-            args=args, batchjobs=batchjobs
-        )
-
-
-if __name__ == "__main__":
-    main()
+    @staticmethod
+    def match_main_subjects_from_sparql(args: argparse.Namespace = None):
+        """Collect subjects via SPARQL and call get_validated_main_subjects()
+        If we get any validated jobs we handle them"""
+        logger = logging.getLogger(__name__)
+        if args is None or args.sparql is None:
+            raise ValueError("args.sparql was None")
+        if "P1889" not in args.sparql:
+            console.print(
+                "Your SPARQL did not contain P1889 (different from). "
+                "Please include 'MINUS {?item wdt:P1889 [].}' "
+                "in your WHERE clause to avoid false positives."
+            )
+            exit(0)
+        else:
+            logger.info("Detected P1889 in the query")
+        with console.status("Running query on WDQS..."):
+            main_subjects = []
+            results = execute_sparql_query(
+                args.sparql.replace("{", "{{").replace("}", "}}"),
+                debug=args.debug_sparql,
+            )
+            for item_json in results["results"]["bindings"]:
+                logging.debug(f"item_json:{item_json}")
+                main_subjects.append(item_json["item"]["value"])
+            if len(main_subjects) > 0:
+                console.print(f"Got {len(main_subjects)} results")
+                batchjobs = get_validated_main_subjects_as_jobs(
+                    args=args, main_subjects=main_subjects
+                )
+                handle_job_preparation_or_run_directly_if_any_jobs(
+                    args=args, batchjobs=batchjobs
+                )
+            else:
+                console.print("Got 0 results. Try another query or debug it using --debug")
+
+    @staticmethod
+    def export_jobs_to_dataframe():
+        logger = logging.getLogger(__name__)
+        logger.info("Exporting jobs to DataFrame. All jobs are appended to one frame")
+        batchjobs = parse_job_pickle()
+        if batchjobs is not None:
+            if batchjobs is not None and batchjobs.job_count > 0:
+                logger.info(f"Found {batchjobs.job_count} jobs")
+                df = pd.DataFrame()
+                count = 1
+                for job in batchjobs.jobs:
+                    count += 1
+                    logger.info(f"Working on job {count}/{batchjobs.job_count}")
+                    job_df = pd.DataFrame()
+                    for item in job.items.list:
+                        job_df = job_df.append(
+                            pd.DataFrame(
+                                data=[
+                                    dict(
+                                        qid=item.id,
+                                        label=item.label,
+                                        description=item.description,
+                                    )
+                                ]
+                            )
+                        )
+                    df = df.append(job_df)
+                    logger.debug(f"Added {len(job.items.list)} items to the dataframe")
+                logger.debug(f"Exporting {len(df)} rows to pickle")
+                pickle_filename = "dataframe.pkl.gz"
+                df.to_pickle(pickle_filename)
+                console.print(f"Wrote to {pickle_filename} in the current directory")
+        else:
+            console.print(
+                "No jobs found. Create a job list first by using '--prepare-jobs'"
+            )
+
+    def run(self):
+        """This is the main function that makes everything else happen"""
+        logger = logging.getLogger(__name__)
+        migrate_pickle_detection()
+        args = setup_argparse_and_return_args()
+        # console.print(args.list)
+        if args.remove_prepared_jobs is True:
+            remove_job_pickle()
+            console.print("Removed the job list.")
+            # exit(0)
+        if args.prepare_jobs is True:
+            logger.info("Preparing jobs")
+            if check_if_pickle_exists(config.job_pickle_file_path):
+                if ask_discard_existing_job_pickle():
+                    remove_job_pickle(silent=True)
+                else:
+                    console.print("Quitting.")
+        if args.run_prepared_jobs is True:
+            logger.info("Running prepared jobs")
+            batchjobs = parse_job_pickle()
+            if batchjobs is not None and len(batchjobs.jobs) > 0:
+                file_hash = get_hash_of_job_pickle()
+                batchjobs.run_jobs()
+                # Remove the pickle afterwards
+                remove_job_pickle(hash=file_hash)
+        elif args.export_jobs_to_dataframe:
+            self.export_jobs_to_dataframe()
+        elif args.sparql:
+            self.match_main_subjects_from_sparql(args=args)
+        else:
+            # if not args.run_prepared_jobs:
+            if args.add is None:
+                console.print("Got no QIDs. Quitting")
+                exit(0)
+            task: Task = select_task()
+            if task is None:
+                raise ValueError("Got no task")
+            jobs = []
+            jobs.extend(
+                process_user_supplied_qids_into_batch_jobs(args=args, task=task)
+            )
+            batchjobs = BatchJobs(jobs=jobs)
+            handle_job_preparation_or_run_directly_if_any_jobs(
+                args=args, batchjobs=batchjobs
+            )
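
Usage note: the new match_main_subjects_from_sparql static method, like the old module-level function, prints a warning and exits unless the query passed via --sparql mentions P1889 (different from). A hypothetical query that would satisfy that check is sketched below; the P31/Q13442814 restriction is purely illustrative, and only the MINUS clause is what the code checks for.

# Hypothetical value for --sparql; only the MINUS { ?item wdt:P1889 [] . } clause
# is required by match_main_subjects_from_sparql, the rest is an illustrative example.
EXAMPLE_SPARQL = """
SELECT ?item WHERE {
  ?item wdt:P31 wd:Q13442814 .      # instance of: scholarly article (illustrative)
  MINUS { ?item wdt:P1889 [] . }    # skip items that already have a "different from" statement
}
LIMIT 50
"""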
