Skip to content

Commit 01cf113

Browse files
committed
add more error handling for API responses
1 parent b9b8d92 commit 01cf113

3 files changed

Lines changed: 194 additions & 5 deletions

File tree

.dlt/config.toml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,16 @@ http_show_error_body = "true"
66
log_level = "INFO"
77
log_format = "JSON"
88

9+
# REST Client retries
10+
# default: 60
11+
request_timeout = 60
12+
# default: 5
13+
request_max_attempts = 2
14+
# default: 1
15+
request_backoff_factor = 1.5
16+
# default: 300
17+
request_max_retry_delay = 600
18+
919
# do not raise an error if a job fails
1020
load.raise_on_failed_jobs = "false"
1121

src/cdm_data_loaders/pipelines/ncbi_rest_api.py

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
)
2020
from pydantic import AliasChoices, Field, model_validator
2121
from pydantic_settings import CliSuppress, SettingsConfigDict
22+
from requests.exceptions import HTTPError
2223

2324
from cdm_data_loaders.pipelines.core import (
2425
run_cli,
@@ -38,6 +39,7 @@
3839

3940
DATASET = "dataset"
4041
ANNOTATION = "annotation"
42+
ERROR = "error"
4143

4244
dlt_logger = logging.getLogger("dlt")
4345

@@ -143,6 +145,49 @@ def save_raw_response(config: Settings, response: Response, *_: Any, **__: Any)
143145
)
144146

145147

148+
def add_error(
149+
error_list: list[dict[str, Any]],
150+
error: Exception,
151+
error_from: str,
152+
assembly_id: str | None = None,
153+
assembly_id_list: list[str] | None = None,
154+
) -> None:
155+
"""Add an error to the list of output errors.
156+
157+
:param error_list: running list of errors
158+
:type error_list: list[dict[str, Any]]
159+
:param error: the error object from the exception handler
160+
:type error: Exception
161+
:param error_from: what type of request was being made when the error occurred
162+
:type error_from: str
163+
:param assembly_id: ID of the assembly being fetched when the error occurred, defaults to None
164+
:type assembly_id: str | None, optional
165+
:param assembly_id_list: list of IDs being fetched when the error occurred, defaults to None
166+
:type assembly_id_list: list[str] | None, optional
167+
"""
168+
err_args = {
169+
"assembly_id": assembly_id or None,
170+
"assembly_id_list": assembly_id_list or None,
171+
"error_class": type(error).__name__,
172+
"error_from": error_from,
173+
"message": str(error),
174+
"request_url": None,
175+
"status": None,
176+
"reason": None,
177+
}
178+
179+
if isinstance(error, HTTPError):
180+
# save the URL, status code, and error message
181+
err_args = {
182+
**err_args,
183+
"request_url": error.request.url,
184+
"status": error.response.status_code,
185+
"reason": error.response.reason,
186+
}
187+
188+
error_list.append(err_args)
189+
190+
146191
def get_assembly_reports(assembly_id_list: list[str]) -> dict[str, Any]:
147192
"""Retrieve dataset and annotation reports for a list of IDs from the NCBI datasets API.
148193
@@ -154,14 +199,27 @@ def get_assembly_reports(assembly_id_list: list[str]) -> dict[str, Any]:
154199
if not assembly_id_list:
155200
return {}
156201

202+
errors = []
203+
157204
# N.b. invalid IDs will not be present in dataset_reports
158-
dataset_reports = get_dataset_reports(assembly_id_list)
159-
annotation_reports = {assembly_id: get_annotation_report(assembly_id) for assembly_id in assembly_id_list}
205+
dataset_reports = {}
206+
try:
207+
dataset_reports = get_dataset_reports(assembly_id_list)
208+
except Exception as e: # noqa: BLE001
209+
add_error(errors, e, "dataset_report", assembly_id_list=assembly_id_list)
210+
211+
annotation_reports: dict[str, Any] = {}
212+
for assembly_id in assembly_id_list:
213+
try:
214+
annotation_reports[assembly_id] = get_annotation_report(assembly_id)
215+
except Exception as e: # noqa: BLE001
216+
add_error(errors, e, "annotation_report", assembly_id=assembly_id)
160217

161218
# ensure every assembly_id in the list has either the downloaded dataset_report or None
162219
return {
163220
DATASET: {assembly_id: dataset_reports.get(assembly_id) for assembly_id in assembly_id_list},
164221
ANNOTATION: {assembly_id: annotation_reports.get(assembly_id) for assembly_id in assembly_id_list},
222+
ERROR: errors,
165223
}
166224

167225

@@ -172,6 +230,7 @@ def get_dataset_reports(assembly_id_list: list[str]) -> dict[str, None | dict[st
172230

173231
dlt_logger.info("fetching dataset reports for:\n%s", ", ".join(sorted(assembly_id_list)))
174232
assembly_dataset_reports = []
233+
175234
for page in ncbi_genome_client.paginate(
176235
f"{'%2C'.join(assembly_id_list)}/dataset_report",
177236
params={
@@ -182,7 +241,7 @@ def get_dataset_reports(assembly_id_list: list[str]) -> dict[str, None | dict[st
182241
assembly_dataset_reports.extend(page)
183242

184243
# return dataset reports, indexed by assembly_id
185-
# invalid IDs are silently dropped by the API
244+
# invalid IDs are silently dropped by the NCBI REST API
186245
datasets = {report.get("accession"): report for report in assembly_dataset_reports}
187246
# fill in the missing gaps in assembly_id_list with None
188247
return {assembly_id: datasets.get(assembly_id) for assembly_id in assembly_id_list}
@@ -192,6 +251,7 @@ def get_annotation_report(assembly_id: str) -> list[dict[str, Any]] | None:
192251
"""Fetch the annotation report for an assembly from the NCBI datasets REST API."""
193252
dlt_logger.info("fetching annotation report for %s", assembly_id)
194253
page_data = []
254+
195255
for page in ncbi_genome_client.paginate(
196256
f"{assembly_id}/annotation_report",
197257
params={
@@ -200,6 +260,7 @@ def get_annotation_report(assembly_id: str) -> list[dict[str, Any]] | None:
200260
hooks=REST_CLIENT_HOOKS, # type: ignore[reportArgumentType]
201261
):
202262
page_data.extend(page)
263+
203264
# page_data is empty if the ID is invalid
204265
return page_data or None
205266

@@ -239,6 +300,11 @@ def assembly_report_parser(
239300

240301
dataset_reports: dict[str, dict[str, Any]] = assembly_reports.get(DATASET) # type: ignore[reportAssignmentType]
241302
annotation_reports: dict[str, list[dict[str, Any]]] = assembly_reports.get(ANNOTATION) # type: ignore[reportAssignmentType]
303+
error_reports: list[dict[str, Any]] = assembly_reports.get(ERROR) # type: ignore[reportAssignmentType]
304+
305+
if error_reports:
306+
yield dlt.mark.with_table_name(error_reports, "ncbi_import_error")
307+
242308
# yield the raw data to save as tables
243309
yield dlt.mark.with_table_name(
244310
[{"assembly_id": assembly_id, **(report or {})} for assembly_id, report in dataset_reports.items()],

tests/pipelines/test_ncbi_rest_api.py

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,21 @@
22

33
from pathlib import Path
44
from typing import Any
5+
from unittest import mock
56
from unittest.mock import MagicMock, patch
67

78
import pytest
89
from pydantic import ValidationError
910
from pydantic_settings import CliApp
11+
from requests import HTTPError
1012

1113
from cdm_data_loaders.pipelines import ncbi_rest_api as ncbi_module
1214
from cdm_data_loaders.pipelines.cts_defaults import INPUT_MOUNT, OUTPUT_MOUNT, VALID_DESTINATIONS
1315
from cdm_data_loaders.pipelines.ncbi_rest_api import (
1416
ANNOTATION,
1517
DATASET,
1618
DATASET_NAME,
19+
ERROR,
1720
Settings,
1821
assembly_list,
1922
cli,
@@ -148,7 +151,7 @@ def test_invalid_destination_raises(bad: str) -> None:
148151
@pytest.mark.parametrize("output", ["-o", "--output"])
149152
@pytest.mark.parametrize("dev_mode_flag", ["--dev", "--dev-mode", "--dev_mode"])
150153
@pytest.mark.parametrize("batch_size", ["-b", "--batch-size", "--batch_size"])
151-
def test_cli_all_variants(
154+
def test_cli_all_variants( # noqa: PLR0913
152155
input_dir: str,
153156
destination: str,
154157
use_output_dir_for_pipeline_metadata: str,
@@ -378,6 +381,14 @@ def test_get_annotation_report_multi_page() -> None:
378381
check_annotation_report(annotation_report, ID_WITH_2K_ANNOTS)
379382

380383

384+
@mock.patch("tenacity.nap.time.sleep", MagicMock())
385+
@pytest.mark.vcr
386+
def test_get_annotation_report_multi_page_err() -> None:
387+
"""An error in the middle of a multi-page retrieval should stop the whole retrieval process."""
388+
with pytest.raises(HTTPError, match="500 Server Error: Internal Server Error for url"):
389+
get_annotation_report(ID_WITH_2K_ANNOTS)
390+
391+
381392
@pytest.mark.default_cassette("test_get_assembly_reports.yaml")
382393
@pytest.mark.vcr
383394
def test_get_annotation_report_invalid_id() -> None:
@@ -394,13 +405,115 @@ def test_get_assembly_reports_empty_id_list() -> None:
394405
def test_get_assembly_reports() -> None:
395406
"""Test the retrieval of annotation and dataset reports."""
396407
assembly_reports = get_assembly_reports(ALL_IDS)
397-
assert set(assembly_reports) == {DATASET, ANNOTATION}
408+
assert set(assembly_reports) == {DATASET, ANNOTATION, ERROR}
398409
for datatype in [DATASET, ANNOTATION]:
399410
assert set(assembly_reports[datatype]) == set(ALL_IDS)
400411
assert assembly_reports[datatype][INVALID_ID] is None
401412
for assembly_id in [ID_WITH_2K_ANNOTS, ID_WITH_500_ANNOTS]:
402413
check_annotation_report(assembly_reports[ANNOTATION][assembly_id], assembly_id)
403414
check_dataset_report(assembly_reports[DATASET][assembly_id], assembly_id)
415+
assert assembly_reports[ERROR] == []
416+
417+
418+
@mock.patch("tenacity.nap.time.sleep", MagicMock())
419+
@pytest.mark.default_cassette("test_get_assembly_reports_annotation_report_errors.yaml")
420+
@pytest.mark.vcr
421+
def test_get_assembly_reports_annotation_report_errors() -> None:
422+
"""Test the retrieval of assembly data when errors occur fetching annotation reports."""
423+
assembly_reports = get_assembly_reports(ALL_IDS)
424+
assert set(assembly_reports) == {DATASET, ANNOTATION, ERROR}
425+
for datatype in [DATASET, ANNOTATION]:
426+
assert set(assembly_reports[datatype]) == set(ALL_IDS)
427+
assert assembly_reports[datatype][INVALID_ID] is None
428+
for assembly_id in [ID_WITH_2K_ANNOTS, ID_WITH_500_ANNOTS]:
429+
check_dataset_report(assembly_reports[DATASET][assembly_id], assembly_id)
430+
# ID_WITH_500 succeeds, ID_WITH_2K does not
431+
check_annotation_report(assembly_reports[ANNOTATION][ID_WITH_500_ANNOTS], ID_WITH_500_ANNOTS)
432+
assert assembly_reports[ANNOTATION][ID_WITH_2K_ANNOTS] is None
433+
434+
assert assembly_reports[ERROR] == [
435+
{
436+
"assembly_id": ID_WITH_2K_ANNOTS,
437+
"assembly_id_list": None,
438+
"error_class": "HTTPError",
439+
"error_from": "annotation_report",
440+
"message": '500 Server Error: Internal Server Error for url: https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000003135.1/annotation_report?page_size=1000&page_token=eNrjYos2NDAwjAUABagBiw\nResponse: {"error":"Internal Server Error","code":500,"message":"Internal Server Error (For more help, see the NCBI Datasets Documentation at https://www.ncbi.nlm.nih.gov/datasets/docs/) (1D3311AB4E92F955000055F41870A1E3.1.1)"}\n',
441+
"request_url": "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000003135.1/annotation_report?page_size=1000&page_token=eNrjYos2NDAwjAUABagBiw",
442+
"status": 500,
443+
"reason": "Internal Server Error",
444+
}
445+
]
446+
447+
448+
@mock.patch("tenacity.nap.time.sleep", MagicMock())
449+
@pytest.mark.vcr
450+
def test_get_assembly_reports_dataset_report_errors() -> None:
451+
"""Test the retrieval of assembly data when an error occurs fetching dataset reports."""
452+
assembly_reports = get_assembly_reports(ALL_IDS)
453+
assert set(assembly_reports) == {DATASET, ANNOTATION, ERROR}
454+
for datatype in [DATASET, ANNOTATION]:
455+
assert set(assembly_reports[datatype]) == set(ALL_IDS)
456+
assert assembly_reports[datatype][INVALID_ID] is None
457+
for assembly_id in [ID_WITH_2K_ANNOTS, ID_WITH_500_ANNOTS]:
458+
check_annotation_report(assembly_reports[ANNOTATION][assembly_id], assembly_id)
459+
assert assembly_reports[DATASET][assembly_id] is None
460+
461+
assert assembly_reports[ERROR] == [
462+
{
463+
"assembly_id": None,
464+
"assembly_id_list": ALL_IDS,
465+
"error_class": "HTTPError",
466+
"error_from": "dataset_report",
467+
"message": '404 Client Error: Not Found for url: https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000007725.1%2CGCF_000003135.1%2Cinvalid_id/dataset_report?page_size=1000\nResponse: {"error":"Not Found","code":404,"message":"Your request is invalid. (For more help, see the NCBI Datasets Documentation at https://www.ncbi.nlm.nih.gov/datasets/docs/)"}\n',
468+
"request_url": "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000007725.1%2CGCF_000003135.1%2Cinvalid_id/dataset_report?page_size=1000",
469+
"status": 404,
470+
"reason": "Not Found",
471+
}
472+
]
473+
474+
475+
@mock.patch("tenacity.nap.time.sleep", MagicMock())
476+
@pytest.mark.vcr
477+
def test_get_assembly_reports_total_wipeout() -> None:
478+
"""Test the retrieval of assembly data when all queries fail."""
479+
# TODO: another type of error?
480+
output = get_assembly_reports(ALL_IDS)
481+
assert output == {
482+
DATASET: dict.fromkeys(ALL_IDS),
483+
ANNOTATION: dict.fromkeys(ALL_IDS),
484+
ERROR: [
485+
{
486+
"assembly_id": None,
487+
"assembly_id_list": ALL_IDS,
488+
"error_class": "HTTPError",
489+
"error_from": "dataset_report",
490+
"message": '404 Client Error: Not Found for url: https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000007725.1%2CGCF_000003135.1%2Cinvalid_id/dataset_report?page_size=1000\nResponse: {"error":"Not Found","code":404,"message":"Your request is invalid. (For more help, see the NCBI Datasets Documentation at https://www.ncbi.nlm.nih.gov/datasets/docs/)"}\n',
491+
"request_url": "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000007725.1%2CGCF_000003135.1%2Cinvalid_id/dataset_report?page_size=1000",
492+
"status": 404,
493+
"reason": "Not Found",
494+
},
495+
{
496+
"assembly_id": ID_WITH_500_ANNOTS,
497+
"assembly_id_list": None,
498+
"error_class": "HTTPError",
499+
"error_from": "annotation_report",
500+
"message": '404 Client Error: Not Found for url: https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000007725.1/annotation_report?page_size=1000\nResponse: {"error":"Not Found","code":404,"message":"Your request is invalid. (For more help, see the NCBI Datasets Documentation at https://www.ncbi.nlm.nih.gov/datasets/docs/)"}\n',
501+
"request_url": "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000007725.1/annotation_report?page_size=1000",
502+
"status": 404,
503+
"reason": "Not Found",
504+
},
505+
{
506+
"assembly_id": ID_WITH_2K_ANNOTS,
507+
"assembly_id_list": None,
508+
"error_class": "HTTPError",
509+
"error_from": "annotation_report",
510+
"message": '500 Server Error: Internal Server Error for url: https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000003135.1/annotation_report?page_size=1000&page_token=eNrjYos2NDAwjAUABagBiw\nResponse: {"error":"Internal Server Error","code":500,"message":"Internal Server Error (For more help, see the NCBI Datasets Documentation at https://www.ncbi.nlm.nih.gov/datasets/docs/) (1D32DF2AAA9FBB1500003B3C46C243D3.1.1)"}\n',
511+
"request_url": "https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_000003135.1/annotation_report?page_size=1000&page_token=eNrjYos2NDAwjAUABagBiw",
512+
"status": 500,
513+
"reason": "Internal Server Error",
514+
},
515+
],
516+
}
404517

405518

406519
@pytest.mark.skip("FIXME: not working, possibly due to parallelization?")

0 commit comments

Comments
 (0)