Skip to content

Commit 3ce9865

Browse files
authored
Merge pull request #4 from datacite/simplified-resource-type-graph
Simplified resource type graph
2 parents 6514b8c + ade01e9 commit 3ce9865

6 files changed

Lines changed: 328 additions & 179 deletions

File tree

examples/api.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,10 @@
22

33
from datacitekit.extractors import extract_doi
44
from datacitekit.related_works import get_full_corpus_doi_attributes
5-
from datacitekit.reports import RelatedWorkReports
5+
from datacitekit.resource_type_graph import RelatedWorkReports
66
from flask import Flask, jsonify
77

8-
DOI_API = os.getenv("DOI_API", "https://api.datacite.org/dois/")
8+
DOI_API = os.getenv("DOI_API", "https://api.stage.datacite.org/dois/")
99
app = Flask(__name__)
1010

1111

@@ -15,14 +15,20 @@ def related_works(doi):
1515
if not doi:
1616
return jsonify({"error": "Does not match DOI format"}), 400
1717

18-
full_doi_attributes = get_full_corpus_doi_attributes(doi, DOI_API)
18+
full_doi_attributes = get_full_corpus_doi_attributes(
19+
doi, RelatedWorkReports.parser, DOI_API
20+
)
1921
if not full_doi_attributes:
2022
return jsonify({"error": "DOI not found"}), 404
21-
report = RelatedWorkReports(full_doi_attributes)
2223

23-
graph = {"nodes": report.aggregate_counts, "edges": report.type_connection_report}
24+
report = RelatedWorkReports(full_doi_attributes)
2425

25-
return jsonify(graph)
26+
return jsonify(
27+
{
28+
"nodes": report.aggregate_counts,
29+
"edges": report.type_connection_report,
30+
}
31+
)
2632

2733

2834
if __name__ == "__main__":

src/datacitekit/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
""" Toolkit to fetch and parse data from DataCite """
2-
__version__ = "0.6.0"
2+
__version__ = "0.7.0"

src/datacitekit/related_works.py

Lines changed: 18 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -1,110 +1,10 @@
11
# coding: utf-8
2-
from glom import Coalesce, Iter, glom
32

4-
from .extractors import extract_doi, extract_orcid, extract_ror_id
5-
from .reports import RelatedWorkReports
3+
from .extractors import extract_doi
4+
from .resource_type_graph import RelatedWorkReports
65
from .searchers import DoiListSearcher, DoiSearcher
76

87

9-
def is_a_doi(rid):
10-
return bool(extract_doi(rid.get("relatedIdentifier", "")))
11-
12-
13-
def parse_attributes(doi_result):
14-
doi_result = doi_result.get("attributes", {}) or doi_result
15-
if not doi_result:
16-
return {}
17-
spec = {
18-
"doi": ("doi"),
19-
"resourceTypeGeneral": Coalesce("types.resourceTypeGeneral", default=""),
20-
"resourceType": Coalesce("types.resourceType", default=""),
21-
"creator_orcid_ids": Coalesce(
22-
(
23-
"creators",
24-
[("nameIdentifiers", (["nameIdentifier"]))],
25-
Iter()
26-
.flatten()
27-
.map(lambda x: extract_orcid(x))
28-
.filter(lambda x: x is not None)
29-
.all(),
30-
),
31-
default=[],
32-
),
33-
"creator_ror_ids": Coalesce(
34-
(
35-
"creator",
36-
[("nameIdentifiers", (["nameIdentifier"]))],
37-
Iter()
38-
.flatten()
39-
.map(lambda x: extract_ror_id(x))
40-
.filter(lambda x: x is not None)
41-
.all(),
42-
),
43-
default=[],
44-
),
45-
"creator_affiliation_ror_ids": Coalesce(
46-
(
47-
"creators",
48-
[("affiliation", (["affiliationIdentifier"]))],
49-
Iter()
50-
.flatten()
51-
.map(lambda x: extract_ror_id(x))
52-
.filter(lambda x: x is not None)
53-
.all(),
54-
),
55-
default=[],
56-
),
57-
"contributor_orcid_ids": Coalesce(
58-
(
59-
"contributors",
60-
[("nameIdentifiers", (["nameIdentifier"]))],
61-
Iter()
62-
.flatten()
63-
.map(lambda x: extract_orcid(x))
64-
.filter(lambda x: x is not None)
65-
.all(),
66-
),
67-
default=[],
68-
),
69-
"contributor_ror_ids": Coalesce(
70-
(
71-
"contributors",
72-
[("nameIdentifiers", (["nameIdentifier"]))],
73-
Iter()
74-
.flatten()
75-
.map(lambda x: extract_ror_id(x))
76-
.filter(lambda x: x is not None)
77-
.all(),
78-
),
79-
default=[],
80-
),
81-
"contributor_affiliation_ror_ids": Coalesce(
82-
(
83-
"contributors",
84-
[("affiliation", (["affiliationIdentifier"]))],
85-
Iter()
86-
.flatten()
87-
.map(lambda x: extract_ror_id(x))
88-
.filter(lambda x: x is not None)
89-
.all(),
90-
),
91-
default="BOB",
92-
),
93-
"related_identifiers": Coalesce(
94-
(
95-
"relatedIdentifiers",
96-
Iter().filter(lambda r: is_a_doi(r)).all(),
97-
),
98-
default=[],
99-
),
100-
}
101-
return glom(doi_result, spec)
102-
103-
104-
def parse_list(doi_list):
105-
return {d["id"]: parse_attributes(d) for d in doi_list}
106-
107-
1088
def get_relation_types_grouped_by_doi(related_dois):
1099
res = {}
11010
for r in related_dois:
@@ -114,31 +14,37 @@ def get_relation_types_grouped_by_doi(related_dois):
11414
return res
11515

11616

117-
def get_incoming_and_primary_attributes(doi_query, doi_url):
17+
def parse_list(doi_list, parser):
18+
return {d["id"]: parser(d) for d in doi_list}
19+
20+
21+
def get_incoming_and_primary_attributes(doi_query, doi_url, parser):
11822
# Get incoming links and primary doi
11923
doi_list = DoiSearcher(doi_query, doi_url).search()
120-
doi_attributes = parse_list(doi_list)
24+
doi_attributes = parse_list(doi_list, parser)
12125
return doi_attributes
12226

12327

124-
def get_outgoing_link_attributes(primary_doi, doi_url):
28+
def get_outgoing_link_attributes(primary_doi, doi_url, parser):
12529
relations_grouped_by_doi = get_relation_types_grouped_by_doi(
12630
primary_doi.get("related_identifiers", [])
12731
)
12832
# Get outgoing links
12933
outgoing_dois = relations_grouped_by_doi.keys()
13034
outgoing_doi_list = DoiListSearcher(outgoing_dois, doi_url).search()
131-
outgoing_doi_attributes = parse_list(outgoing_doi_list)
35+
outgoing_doi_attributes = parse_list(outgoing_doi_list, parser)
13236
return outgoing_doi_attributes
13337

13438

13539
def get_full_corpus_doi_attributes(
136-
doi_query, api_url="https://api.stage.datacite.org/dois/"
40+
doi_query, parser, api_url="https://api.stage.datacite.org/dois/"
13741
):
138-
doi_attributes = get_incoming_and_primary_attributes(doi_query, api_url)
42+
doi_attributes = get_incoming_and_primary_attributes(doi_query, api_url, parser)
13943
if doi_query in doi_attributes.keys():
14044
primary_doi = doi_attributes.get(doi_query, {})
141-
outgoing_doi_attributes = get_outgoing_link_attributes(primary_doi, api_url)
45+
outgoing_doi_attributes = get_outgoing_link_attributes(
46+
primary_doi, api_url, parser
47+
)
14248
else:
14349
outgoing_doi_attributes = {}
14450

@@ -164,7 +70,9 @@ def _get_query():
16470
DOI_API = "https://api.stage.datacite.org/dois/"
16571
DOI_API = "https://api.datacite.org/dois/"
16672
doi_query = _get_query()
167-
full_doi_attributes = get_full_corpus_doi_attributes(doi_query, DOI_API)
73+
full_doi_attributes = get_full_corpus_doi_attributes(
74+
doi_query, RelatedWorkReports.parser, DOI_API
75+
)
16876
report = RelatedWorkReports(full_doi_attributes)
16977

17078
graph = {"nodes": report.aggregate_counts, "edges": report.type_connection_report}

0 commit comments

Comments
 (0)