Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Major changes and bug fixing #162

Merged
merged 11 commits into from
Feb 28, 2024
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Continuous integration Unit tests

on:
push:
branches: [ master ]
branches: [ main ]
pull_request:
branches: [ master ]
branches: [ main ]

jobs:
PythonBlack:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pythonapp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Python application

on:
push:
branches: [ master ]
branches: [ main ]
pull_request:
branches: [ master ]
branches: [ main ]

jobs:
build:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Python package

on:
push:
branches: [ master ]
branches: [ main ]
pull_request:
branches: [ master ]
branches: [ main ]

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion sdrf_pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.24"
__version__ = "0.0.25"
51 changes: 39 additions & 12 deletions sdrf_pipelines/sdrf/sdrf_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ def ontology_term_parser(cell_value: str = None):

class SDRFColumn(Column):
def __init__(
self,
name: str,
validations: typing.Iterable["_BaseValidation"] = None,
optional_validations: typing.Iterable["_BaseValidation"] = None,
allow_empty=False,
optional_type=True,
self,
name: str,
validations: typing.Iterable["_BaseValidation"] = None,
optional_validations: typing.Iterable["_BaseValidation"] = None,
allow_empty=False,
optional_type=True,
):
if validations is None:
validations = []
Expand Down Expand Up @@ -145,8 +145,8 @@ def validate(self, series: pd.Series) -> pd.Series:

if ontology_terms is not None:
query_labels = [o["label"].lower() for o in ontology_terms]
for label in query_labels:
labels.append(label)
if term[TERM_NAME] in query_labels:
labels.append(term[TERM_NAME])
if self._not_available:
labels.append(NOT_AVAILABLE)
if self._not_applicable:
Expand Down Expand Up @@ -179,6 +179,10 @@ def validate(self, panda_sdrf: sdrf = None) -> typing.List[LogicError]:
)
errors.append(LogicError(error_message, error_type=logging.WARN))

empty_cells_errors = self.validate_empty_cells(panda_sdrf)
if empty_cells_errors:
errors.extend(empty_cells_errors)

# Check the mandatory fields
error_mandatory = self.validate_mandatory_columns(panda_sdrf)
if error_mandatory is not None:
Expand Down Expand Up @@ -218,9 +222,9 @@ def validate_column_names(self, panda_sdrf):
errors.append(cname)
elif m.group().startswith("factor value"):
if (
m.group().replace("factor value", "comment") not in panda_sdrf.columns
and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns
and m.group() not in panda_sdrf.columns
m.group().replace("factor value", "comment") not in panda_sdrf.columns
and m.group().replace("factor value", "characteristics") not in panda_sdrf.columns
and m.group() not in panda_sdrf.columns
):
error_message = "The " + cname + " column should also be in the characteristics or comment"
logerror.append(LogicError(error_message, error_type=logging.ERROR))
Expand Down Expand Up @@ -260,7 +264,7 @@ def validate_columns_order(panda_sdrf):
error_message = "The column " + column + "cannot be before the assay name"
error_columns_order.append(LogicError(error_message, error_type=logging.ERROR))
if (
"characteristics" in column or ("material type" in column and "factor value" not in column)
"characteristics" in column or ("material type" in column and "factor value" not in column)
) and cnames.index(column) > index:
error_message = "The column " + column + "cannot be after the assay name"
error_columns_order.append(LogicError(error_message, error_type=logging.ERROR))
Expand Down Expand Up @@ -310,6 +314,29 @@ def check_recommendations(self, panda_sdrf):
warnings += column.validate_optional(series)
return sorted(warnings, key=lambda e: e.row)

def validate_empty_cells(self, panda_sdrf):
    """
    Check for empty cells in the SDRF. This method will return a list of errors if any empty cell is found.

    A cell is reported as empty when it is ``None``, a missing value (NaN/NA), the literal
    string "nan", or a string that contains only whitespace. Cells holding non-string
    values are judged on their string representation instead of raising.

    :param panda_sdrf: SDRF dataframe
    :return: List of errors (one LogicError per empty cell, ordered row by row, then column by column)
    """

    def is_empty(cell_value):
        # Missing values (None, float NaN, pd.NA, NaT) are empty by definition.
        if pd.api.types.is_scalar(cell_value) and pd.isna(cell_value):
            return True
        # Non-string cells have no .strip(); compare their text form so numbers do not raise.
        if not isinstance(cell_value, str):
            cell_value = str(cell_value)
        # "nan" is what a missing value becomes once the frame has been cast to str.
        return cell_value == "nan" or not cell_value.strip()

    errors = []

    # Walk the raw values positionally: this keeps the row-major reporting order, avoids a
    # label lookup per cell, and stays correct when index or column labels are duplicated.
    for row, row_values in zip(panda_sdrf.index, panda_sdrf.to_numpy(dtype=object)):
        for col, cell_value in zip(panda_sdrf.columns, row_values):
            if is_empty(cell_value):
                message = f"Empty value found Row: {row}, Column: {col}"
                errors.append(LogicError(message, error_type=logging.ERROR))
    return errors


default_schema = SDRFSchema(
[
Expand Down
77 changes: 56 additions & 21 deletions sdrf_pipelines/zooma/ols.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,17 @@ def get_ancestors(self, ont, iri):
raise ex

def search(
self,
name,
query_fields=None,
ontology=None,
field_list=None,
children_of=None,
exact=None,
bytype="class",
self,
name: str,
query_fields=None,
ontology: str=None,
field_list=None,
children_of=None,
exact: bool=None,
bytype: str="class",
rows: int=10,
num_retries:int=10,
start: int=0,
):
"""
Searches the OLS with the given term
Expand All @@ -124,6 +127,8 @@ def search(
@:param exact: Forces exact match if not `None`
@:param bytype: restrict to terms one of {class,property,individual,ontology}
@:param childrenOf: Search only under a certain term.
@:param rows: number of rows to query on each call of OLS search
@:param num_retries: Number of retries to OLS when it fails.
"""
params = {"q": name}
if ontology is not None:
Expand All @@ -135,6 +140,9 @@ def search(
if bytype:
params["type"] = _concat_str_or_list(bytype)

if rows:
params["rows"] = rows

if ontology:
params["ontology"] = _concat_str_or_list(ontology)
elif self.ontology:
Expand All @@ -155,26 +163,53 @@ def search(
if len(children_of) > 0:
params["childrenOf"] = _concat_str_or_list(children_of)

retry_num = 0
if start:
params["start"] = start

docs_found = []

while retry_num < 10:
for retry_num in range(num_retries):
try:
req = self.session.get(self.ontology_search, params=params)
logger.debug("Request to OLS search API: %s - %s", req.status_code, name)
logger.debug("Request to OLS search API term %s, status code %s", name, req.status_code)

req.raise_for_status()
if req.json()["response"]["numFound"]:
return req.json()["response"]["docs"]
if exact:
logger.debug("OLS exact search returned empty response for %s", name)
if req.status_code != 200:
logger.error("OLS search term %s error tried number %s", name, retry_num)
req.raise_for_status()
else:
logger.debug("OLS search returned empty response for %s", name)
return None
if req.json()["response"]["numFound"] == 0:
if exact:
logger.debug("OLS exact search returned empty response for %s", name)
else:
logger.debug("OLS search returned empty response for %s", name)
return docs_found
elif len(req.json()["response"]["docs"]) < rows:
return req.json()["response"]["docs"]
else:
docs_found = req.json()["response"]["docs"]
docs_found.extend(self.search(name, query_fields=query_fields, ontology=ontology,
field_list=field_list, children_of=children_of, exact=exact,
bytype=bytype, rows=rows, num_retries=num_retries,
start=(rows + (start))))
return docs_found

if req.status_code == 200 and req.json()["response"]["numFound"] == 0:
if exact:
logger.debug("OLS exact search returned empty response for %s", name)
else:
logger.debug("OLS search returned empty response for %s", name)
return None
elif req.status_code != 200 and req.json()["response"]["numFound"] > 0:
if len(req.json()["response"]["docs"]) <= rows:
return req.json()["response"]["docs"]
else:
start = 0
docs_found = req.json()["response"]["docs"]

except Exception as ex:
retry_num += 1
logger.debug("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)
logger.exception("OLS error searching the following term -- %s iteration %s.\n%e", req.url, retry_num, ex)

return None
return docs_found

def suggest(self, name, ontology=None):
"""Suggest terms from an optional list of ontologies
Expand Down