diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..63030ef
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+config.py
+*.csv
diff --git a/example.config.py b/example.config.py
new file mode 100644
index 0000000..d3d84a7
--- /dev/null
+++ b/example.config.py
@@ -0,0 +1,3 @@
+report_url = 'https://library.cca.edu/cgi-bin/koha/svc/report?id=345'
+opac_url = 'https://library.cca.edu/cgi-bin/koha/opac-detail.pl?biblionumber={id}'
+log_filename = 'linkcheck.csv'
diff --git a/linkcheck.py b/linkcheck.py
new file mode 100644
index 0000000..e1aac3c
--- /dev/null
+++ b/linkcheck.py
@@ -0,0 +1,70 @@
+"""Check each URL (856$u) in a public Koha report and log the broken ones.
+
+Problems are written to both the console and ``config.log_filename`` as
+CSV rows: timestamp, level, title, OPAC URL, HTTP status (or
+'HTTP Exception'), checked URL.
+"""
+import csv
+import io
+import logging
+
+import requests
+import urllib3
+
+import config
+
+# seconds to wait on any single request so one unresponsive server cannot
+# stall the whole run; optionally overridden with `timeout` in config.py
+TIMEOUT = getattr(config, 'timeout', 30)
+
+# log to both file & console in CSV-like format, we have to get pretty hacky to
+# do CSV formatting for logging a list (not single message value)
+logging.basicConfig(
+    datefmt='%Y-%m-%d %H:%M:%S',
+    format='"%(asctime)s","%(levelname)s",%(message)s',
+    handlers=[
+        logging.FileHandler(config.log_filename),
+        logging.StreamHandler(),
+    ],
+)
+logger = logging.getLogger()
+
+
+def quote(fields):
+    """Return `fields` as one fully-quoted CSV row without a line ending."""
+    output = io.StringIO()
+    writer = csv.writer(output, quoting=csv.QUOTE_ALL)
+    writer.writerow(fields)
+    return output.getvalue().strip()
+
+
+def log_problem(level, title, bib_id, status, url):
+    """Log one CSV row describing a link problem at the given logging level."""
+    opac_url = config.opac_url.format(id=bib_id)
+    logger.log(level, quote([title, opac_url, status, url]))
+
+
+# our Koha cert isn't recognized but it's fine, silence this warning
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+report = requests.get(config.report_url, verify=False, timeout=TIMEOUT)
+# fail with a clear HTTP error rather than a confusing JSON parse error
+report.raise_for_status()
+
+for bib in report.json():
+    # bibs are arrays like [urls string, title, biblionumber]
+    urls, title, bib_id = bib
+    # urls are separated by " | "
+    for url in urls.split(' | '):
+        try:
+            r = requests.get(url, timeout=TIMEOUT)
+        except Exception:
+            # any failure to fetch (DNS, TLS, timeout, malformed URL) is a
+            # problem with the link and must not end the run; unlike a bare
+            # except this still lets Ctrl-C & SystemExit through
+            log_problem(logging.ERROR, title, bib_id, 'HTTP Exception', url)
+            continue
+        # distinguish between severity of 5XX & 4XX HTTP errors
+        if r.status_code >= 500:
+            log_problem(logging.ERROR, title, bib_id, r.status_code, url)
+        elif r.status_code >= 400:
+            log_problem(logging.WARNING, title, bib_id, r.status_code, url)
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..23118c1
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,11 @@
+# Check links in MARC records
+
+Takes a public Koha report and checks each URL (`856$u`) to see if they resolve successfully.
+
+## Notes
+
+Use the included `report.sql` to create a SQL report in Koha, be sure to set "Public" to "Yes" so the report JSON can be publicly accessed.
+
+The app prints URLs with non-200 HTTP response statuses. It also catches HTTP exceptions within the requests library, which can occur when a domain is unavailable.
+
+Some websites have poor server hygiene and send successful HTTP responses with non-200 error codes. Not a lot we can do about that.
diff --git a/report.sql b/report.sql
new file mode 100644
index 0000000..9d66adc
--- /dev/null
+++ b/report.sql
@@ -0,0 +1,4 @@
+SELECT bi.url, b.title, b.biblionumber
+FROM biblio b
+JOIN biblioitems bi USING (biblionumber)
+WHERE bi.url <> ''