Skip to content

The zenodo submodule

This module provides functions to:

  1. handle the publication of datasets after they have been validated using safedata_validate, including the generation of HTML descriptions of datasets.
  2. maintain local copies of datasets in the folder structure expected by the safedata R package.
  3. compile a RIS format bibliographic file for published datasets.

create_deposit(zen_res, new_version=None)

Create a new deposit.

Creates a new deposit draft, possibly as a new version of an existing published record. Creating a new version requires the Zenodo ID of an existing dataset: this has to be the ID of the most recently published version of a dataset, not the concept ID used to group datasets or any of the older versions.

Parameters:

Name Type Description Default
new_version int | None

Optionally, create a new version of the dataset with the provided Zenodo ID.

None
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
def create_deposit(
    zen_res: ZenodoResources,
    new_version: int | None = None,
) -> ZenodoResponse:
    """Create a new deposit draft on Zenodo.

    The draft is either a brand new deposit or a new version of an existing published
    record. To create a new version, the Zenodo ID of the most recently published
    version of the dataset must be given: the concept ID used to group dataset
    versions and the IDs of older versions are not suitable.

    Args:
        new_version: Optionally, the Zenodo ID of the dataset for which a new version
            should be created.
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoResponse].
    """

    wants_new_version = new_version is not None

    # Both kinds of draft are created under the depositions endpoint; new versions
    # additionally target the 'newversion' action of the existing record.
    endpoint = f"{zen_res.api}/deposit/depositions"
    if wants_new_version:
        endpoint = f"{endpoint}/{new_version}/actions/newversion"

    # Request the draft
    draft_response = ZenodoResponse(
        requests.post(endpoint, params=zen_res.token, json={})
    )

    # A successful new version request responds with an update to the existing
    # record, so the new draft itself has to be fetched separately.
    if wants_new_version and draft_response.ok:
        latest_draft_url = draft_response.json_data["links"]["latest_draft"]
        return ZenodoResponse(
            requests.get(latest_draft_url, params=zen_res.token, json={})
        )

    # Failed requests and plain new deposits are returned directly.
    return draft_response

get_deposit(deposit_id, zen_res)

Download the metadata of a Zenodo deposit.

Parameters:

Name Type Description Default
deposit_id int

The Zenodo record id of an existing dataset.

required
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def get_deposit(deposit_id: int, zen_res: ZenodoResources) -> ZenodoResponse:
    """Retrieve the metadata of a Zenodo deposit.

    Args:
        deposit_id: The Zenodo record id of an existing dataset.
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoResponse].
    """

    # Build the deposit URL, make the request and wrap the raw response
    deposit_url = f"{zen_res.api}/deposit/depositions/{deposit_id}"
    raw_response = requests.get(deposit_url, params=zen_res.token, json={})

    return ZenodoResponse(raw_response)

upload_metadata(metadata, zenodo, zen_res)

Upload dataset metadata.

Takes a dictionary of dataset metadata, converts it to a JSON payload of Zenodo metadata and uploads it to a deposit.

Parameters:

Name Type Description Default
metadata dict

The metadata dictionary for a dataset

required
zenodo dict

The zenodo metadata dictionary for a deposit

required
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
def upload_metadata(
    metadata: dict, zenodo: dict, zen_res: ZenodoResources
) -> ZenodoResponse:
    """Upload dataset metadata to a deposit.

    Converts a dictionary of dataset metadata into a JSON payload of Zenodo metadata
    and puts it to the deposit's own URL.

    Args:
        metadata: The metadata dictionary for a dataset
        zenodo: The zenodo metadata dictionary for a deposit
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoResponse].

    Raises:
        ValueError: if the dataset access status is not one of 'embargo', 'open' or
            'restricted' (compared case-insensitively).
    """

    # Core fields - note that the publication_date field is deliberately not set here.
    payload = {
        "upload_type": "dataset",
        "title": metadata["title"],
        "keywords": metadata["keywords"],
        "license": "cc-by",
        "communities": [{"identifier": zen_res.community}],
    }

    # A contact person is only added when a contact name is configured
    if zen_res.name is not None:
        contact = {
            "name": zen_res.name,
            "type": "ContactPerson",
            "affiliation": zen_res.affiliation,
            "orcid": zen_res.orcid,
        }
        payload["contributors"] = [contact]

    # Translate the dataset access status into Zenodo access rights, along with
    # any extra field that the access right requires.
    access_status = metadata["access"].lower()
    if access_status == "open":
        payload["access_right"] = "open"
    elif access_status == "embargo":
        payload["access_right"] = "embargoed"
        payload["embargo_date"] = metadata["embargo_date"]
    elif access_status == "restricted":
        payload["access_right"] = "restricted"
        payload["access_conditions"] = metadata["access_conditions"]
    else:
        raise ValueError("Unknown access status")

    # Dataset creators: the author format has already been checked, so simply drop
    # unset values and the email field and pass everything else on to Zenodo.
    creators = []
    for author in metadata["authors"]:
        creators.append(
            {
                field: value
                for field, value in author.items()
                if value is not None and field != "email"
            }
        )
    payload["creators"] = creators

    # The HTML description of the dataset
    payload["description"] = dataset_description(
        dataset_metadata=metadata, resources=zen_res.resources
    )

    # Put the metadata to the deposit and wrap the response
    put_response = requests.put(
        zenodo["links"]["self"], params=zen_res.token, json={"metadata": payload}
    )

    return ZenodoResponse(put_response)

upload_files(zenodo, filepaths, zen_res, progress_bar=True)

Upload files to Zenodo.

Uploads a list of files to an unpublished Zenodo deposit. If any of the filenames already exist in the deposit, they will be replaced with the new content.

Parameters:

Name Type Description Default
zenodo dict

The Zenodo metadata dictionary for a deposit

required
filepaths list[Path]

The paths to the files to be uploaded

required
progress_bar bool

Should the upload progress be displayed

True
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
def upload_files(
    zenodo: dict,
    filepaths: list[Path],
    zen_res: ZenodoResources,
    progress_bar: bool = True,
) -> ZenodoResponse:
    """Upload files to Zenodo.

    Uploads a list of files to an unpublished Zenodo deposit. If any of the filenames
    already exist in the deposit, they will be replaced with the new content. Files
    are uploaded in the order given and the upload stops at the first failure.

    Args:
        zenodo: The Zenodo metadata dictionary for a deposit
        filepaths: The paths to the files to be uploaded
        progress_bar: Should the upload progress be displayed
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        The response for the first file that fails to upload or fails the MD5
        checksum comparison, otherwise the response for the last file uploaded. See
        [here][safedata_validator.zenodo.ZenodoResponse].

    Raises:
        ValueError: if no file paths are provided.
        OSError: if any of the paths does not exist or is not a file.
    """

    # Ensure filepaths are resolved paths
    resolved_paths = [Path(f).resolve() for f in filepaths]

    # An empty list would leave nothing to return, so fail explicitly
    if not resolved_paths:
        raise ValueError("No filepaths provided for upload")

    # Check all paths are existing files (is_file is False for missing paths)
    bad_paths = [str(f) for f in resolved_paths if not f.is_file()]

    if bad_paths:
        raise OSError(f"Filepaths unknown or not a file: {','.join(bad_paths)} ")

    # Upload each file
    for fpath in resolved_paths:
        # upload the file
        # - https://gist.github.com/tyhoff/b757e6af83c1fd2b7b83057adf02c139
        file_size = fpath.stat().st_size
        api = f"{zenodo['links']['bucket']}/{fpath.name}"

        with open(fpath, "rb") as file_io:
            print(f"Uploading {fpath.name}")
            if progress_bar:
                with tqdm(
                    total=file_size, unit="B", unit_scale=True, unit_divisor=1024
                ) as upload_monitor:
                    # Wrap the file so that reads update the progress bar
                    wrapped_file = CallbackIOWrapper(
                        upload_monitor.update, file_io, "read"
                    )
                    file_response = ZenodoResponse(
                        requests.put(api, data=wrapped_file, params=zen_res.token)
                    )
            else:
                file_response = ZenodoResponse(
                    requests.put(api, data=file_io, params=zen_res.token)
                )

        # Stop at the first failed upload
        if not file_response.ok:
            return file_response

        # TODO - could this be inside the tqdm with call?
        #      - both are looping over the file contents
        # https://medium.com/codex/chunked-uploads-with-binary-files-in-python-f0c48e373a91
        local_hash = _compute_md5(fpath)

        # Stop if the checksum reported for the upload differs from the local file
        if file_response.json_data["checksum"] != f"md5:{local_hash}":
            # TODO - this is a bit of a hack - not really a response failure
            file_response.ok = False
            file_response.error_message = "Mismatch in local and uploaded MD5 hashes"
            return file_response

    # All uploads succeeded: report the response for the last file
    return file_response

discard_deposit(zenodo, zen_res)

Discard a deposit.

Deposits can be discarded - the associated files and metadata will be deleted and the Zenodo ID no longer exists. Once deposits are published to records, they cannot be deleted via the API - contact the Zenodo team for help.

Parameters:

Name Type Description Default
zenodo dict

The Zenodo metadata dictionary for a deposit

required
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def discard_deposit(zenodo: dict, zen_res: ZenodoResources) -> ZenodoResponse:
    """Discard an unpublished deposit.

    Discarding a deposit deletes the associated files and metadata and the Zenodo ID
    no longer exists. Deposits that have been published to records cannot be deleted
    via the API - contact the Zenodo team for help.

    Args:
        zenodo: The Zenodo metadata dictionary for a deposit
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoResponse].
    """

    # Send a delete request to the deposit's own URL and wrap the response
    deposit_url = zenodo["links"]["self"]
    raw_response = requests.delete(deposit_url, params=zen_res.token)

    return ZenodoResponse(raw_response)

publish_deposit(zenodo, zen_res)

Publish a created deposit.

Parameters:

Name Type Description Default
zenodo dict

The dataset metadata dictionary for a deposit

required
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
def publish_deposit(zenodo: dict, zen_res: ZenodoResources) -> ZenodoResponse:
    """Publish a created deposit.

    Args:
        zenodo: The dataset metadata dictionary for a deposit
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoResponse].
    """

    # Post to the publish link of the deposit and wrap the response
    publish_url = zenodo["links"]["publish"]
    raw_response = requests.post(publish_url, params=zen_res.token)

    return ZenodoResponse(raw_response)

delete_files(metadata, filenames, zen_res)

Delete uploaded files from an unpublished Zenodo deposit.

Parameters:

Name Type Description Default
metadata dict

The Zenodo metadata dictionary for a deposit

required
filenames list[str]

A list of files to delete from the deposit

required
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required

Returns:

Type Description
ZenodoResponse

See here.

Source code in safedata_validator/zenodo.py
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
def delete_files(
    metadata: dict,
    filenames: list[str],
    zen_res: ZenodoResources,
) -> ZenodoResponse:
    """Delete uploaded files from an unpublished Zenodo deposit.

    The current file listing of the deposit is requested first and the requested
    filenames are matched against it. Nothing is deleted unless all of the requested
    filenames are present in the deposit.

    Args:
        metadata: The Zenodo metadata dictionary for a deposit
        filenames: A list of files to delete from the deposit
        zen_res: The zenodo resources from the safedata_validator configuration.

    Returns:
        The failed file listing response if the listing cannot be retrieved, the
        listing response marked as failed with an error message if any requested
        files are not in the deposit, and otherwise the result of deleting the
        matching files. See [here][safedata_validator.zenodo.ZenodoResponse].
    """

    # get an up to date list of existing files (metadata might be outdated)
    files_response = ZenodoResponse(
        requests.get(metadata["links"]["files"], params=zen_res.token)
    )

    # Failed to get the files, so hand back the failure
    if not files_response.ok:
        return files_response

    # Map the available file names onto their links
    files_dict = {f["filename"]: f["links"]["self"] for f in files_response.json_data}

    # Report any requested files that are not in the deposit
    unknown_files = [file for file in filenames if file not in files_dict]

    if unknown_files:
        # The listing request itself succeeded, so the response has to be explicitly
        # marked as failed for callers checking `ok` to notice the problem.
        files_response.ok = False
        files_response.error_message = (
            f"Files not found in the deposit: {','.join(unknown_files)}"
        )
        return files_response

    # All requested files are present: delete them via their links
    delete_links = [files_dict[file] for file in filenames]

    return _delete_files_from_links(delete_links=delete_links, params=zen_res.token)

dataset_description(dataset_metadata, resources=None)

Create an HTML dataset description.

This function takes the dataset metadata exported by safedata_validate and uses it to populate an HTML template file. The resulting HTML can then be used to provide a summary description of the dataset, either for local use or to upload as the description component of the Zenodo metadata.

A default template is provided with the safedata_validator package, but users can provide bespoke templates via the configuration file.

Parameters:

Name Type Description Default
dataset_metadata dict

The dataset metadata

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
div | str

Either a string of rendered HTML or a dominate.tags.div object.

Source code in safedata_validator/zenodo.py
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def dataset_description(
    dataset_metadata: dict,
    resources: Resources | None = None,
) -> tags.div | str:
    """Create an HTML dataset description.

    This function takes the dataset metadata exported by safedata_validate and uses it
    to populate an HTML template file. The resulting HTML can then be used to provide
    a summary description of the dataset, either for local use or to upload as the
    description component of the Zenodo metadata.

    A default template is provided with the safedata_validator package, but users can
    provide bespoke templates via the configuration file.

    Args:
        dataset_metadata: The dataset metadata
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        A string of HTML rendered from the template. The return annotation also
        includes dominate.tags.div, which is left unchanged here, but this function
        only returns the rendered string.

    Raises:
        FileNotFoundError: if an HTML template is configured but does not exist.
    """

    # NOTE - this could conceivably just pass the complete dataset metadata dictionary
    #        straight to the Jinja template context. That would make the template more
    #        complex but would also expose all of the metadata.

    # Load resources if needed
    if resources is None:
        resources = Resources()

    # Use the packaged template unless a bespoke template is configured
    if resources.zenodo.html_template is None:
        template_path = il_resources.files("safedata_validator.templates").joinpath(
            "description_template.html"
        )
    else:
        template_path = Path(resources.zenodo.html_template)
        if not template_path.exists():
            raise FileNotFoundError(
                f"Configured html template not found: {resources.zenodo.html_template}"
            )

    # Using autoescape=False is not generally recommended, but the title and taxa
    # context elements contain HTML tags
    # - mypy: importlib returns a Traversable, which is a protocol that Path complies
    #         with, but the attribute isn't being recognized
    env = Environment(
        loader=FileSystemLoader(template_path.parent),  # type: ignore [attr-defined]
        autoescape=False,
    )

    template = env.get_template(template_path.name)

    # Build the context dictionary that will be used to populate the Jinja template
    # - the dataset title and authors are populated in different fields by Zenodo from
    #   zenodo metadata, where this function just maintains the dataset description
    #   element of the Zenodo metadata

    # Description from the summary table, with newlines converted to markup
    context_dict = dict(
        description=dataset_metadata["description"].replace("\n", "</br>")
    )

    # Project URLs, if the dataset is associated with any projects
    if dataset_metadata["project_ids"] is not None:
        context_dict["project_urls"] = [
            resources.zenodo.project_url.replace("PROJECT_ID", str(pid))
            for pid in dataset_metadata["project_ids"]
        ]
    else:
        context_dict["project_urls"] = []

    # Funding and permit information
    context_dict["funders"] = dataset_metadata["funders"]
    context_dict["permits"] = dataset_metadata["permits"]

    # Filenames associated with the dataset
    context_dict["dataset_filename"] = dataset_metadata["filename"]

    # TODO - the external file default in the metadata definition should be an
    #        empty list, not None
    external_file_metadata = dataset_metadata["external_files"]
    context_dict["external_files"] = (
        [] if external_file_metadata is None else external_file_metadata
    )

    # Group the sheets by their 'external' file - which is None for sheets in the
    # submitted workbook - and collect them into a dictionary by source file. Because
    # you can't sort a mix of strings and None elements, this substitutes in
    # '__internal__' to represent internal sheets.
    tables_by_source = [
        (sh["external"] or "__internal__", sh)
        for sh in dataset_metadata["dataworksheets"]
    ]

    # groupby only merges adjacent entries, so sort by source before grouping
    tables_by_source.sort(key=lambda sh: sh[0])
    tables_grouped_by_source = groupby(tables_by_source, key=lambda sh: sh[0])

    # Convert to a list of table information, keyed by file.
    tables_dict_by_source = {
        ky: [val[1] for val in tpl] for ky, tpl in tables_grouped_by_source
    }

    # The internal tables get their own entry in the context, leaving only tables
    # from external files in the dictionary.
    context_dict["internal_tables"] = tables_dict_by_source.pop("__internal__", [])

    # Now need to pair any external table metadata with the external file descriptions.
    if external_file_metadata is None:
        external_files = dict()
    else:
        # Repackage external metadata to be keyed by file name and provide description
        # and a default empty list of tables
        external_files = {
            vl["file"]: {"description": vl["description"], "tables": []}
            for vl in external_file_metadata
        }
        # Add the remaining table descriptions to the appropriate files.
        for extf_key, extf_tabs in tables_dict_by_source.items():
            external_files[extf_key]["tables"] = extf_tabs

    context_dict["external_file_data"] = external_files

    # Populate a list of filenames: the dataset file followed by the external files
    context_dict["all_filenames"] = [
        context_dict["dataset_filename"],
        *external_files,
    ]

    # Add extents if populated
    context_dict["temporal_extent"] = dataset_metadata["temporal_extent"]
    context_dict["latitudinal_extent"] = dataset_metadata["latitudinal_extent"]
    context_dict["longitudinal_extent"] = dataset_metadata["longitudinal_extent"]

    # Find taxa data from each database and convert to HTML representation. The metadata
    # will be an empty list if the dataset does not contain any taxa.
    context_dict["gbif_timestamp"] = dataset_metadata["gbif_timestamp"]

    gbif_taxon_index = dataset_metadata["gbif_taxa"]

    context_dict["gbif_taxa"] = (
        taxon_index_to_text(taxa=gbif_taxon_index, html=True)
        if gbif_taxon_index
        else None
    )

    # Save both taxon index and the database metadata as sections of the context dict.
    # Both of these are separated by sheet
    sequenced_taxa_sheets = dataset_metadata["sequenced_taxa"]
    if sequenced_taxa_sheets:
        context_dict["seq_taxa"] = {
            f"{sheet_name}": {
                "database_name": taxon_data["database_name"],
                "database_version": taxon_data["database_version"],
                "database_link": taxon_data["database_link"],
                "index": taxon_index_to_text(
                    taxa=taxon_data["taxon_index"], html=True, lowest_taxa="phylum"
                ),
            }
            for sheet_name, taxon_data in sequenced_taxa_sheets.items()
        }
    else:
        context_dict["seq_taxa"] = None

    return template.render(context_dict)

generate_inspire_xml(dataset_metadata, zenodo_metadata, resources, lineage_statement=None)

Convert dataset and zenodo metadata into GEMINI XML.

Produces an INSPIRE/GEMINI formatted XML record from dataset metadata, and Zenodo record metadata using a template XML file. The dataset URL defaults to the Zenodo record but can be replaced if a separate URL (such as a project specific website) is used. The Gemini XML standard requires a statement about the lineage of a dataset - this is automatically taken from the package configuration but can be overridden for individual datasets, for example to add dataset specific links, using the lineage_statement argument.

Parameters:

Name Type Description Default
dataset_metadata dict

A dictionary of the dataset metadata

required
zenodo_metadata dict

A dictionary of the Zenodo record metadata

required
resources Resources

The safedata_validator resource configuration to be used.

required
lineage_statement str | None

An optional alternative lineage statement about the data.

None

Returns:

Type Description
str

A string containing GEMINI compliant XML.

Source code in safedata_validator/zenodo.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
def generate_inspire_xml(
    dataset_metadata: dict,
    zenodo_metadata: dict,
    resources: Resources,
    lineage_statement: str | None = None,
) -> str:
    """Convert dataset and zenodo metadata into GEMINI XML.

    Produces an INSPIRE/GEMINI formatted XML record from dataset metadata and Zenodo
    record metadata, using a template XML file. The DOI URL built from the Zenodo
    prereserved DOI is used as the dataset identifier and download link. The Gemini XML
    standard requires a statement about the lineage of a dataset - this is automatically
    taken from the package configuration but can be overridden for individual datasets,
    for example to add dataset specific links, using the `lineage_statement` argument.

    Args:
        dataset_metadata: A dictionary of the dataset metadata
        zenodo_metadata: A dictionary of the Zenodo record metadata
        resources: The safedata_validator resource configuration to be used. The XML
            section of the configuration must be complete.
        lineage_statement: An optional alternative lineage statement about the data.

    Returns:
        A string containing GEMINI compliant XML.

    Raises:
        ValueError: if any entry in the XML configuration section is None.
    """

    # Do the resources provide complete XML information
    if None in resources.xml.values():
        raise ValueError("XML configuration section is incomplete.")

    template_path = il_resources.files("safedata_validator.templates").joinpath(
        "gemini_xml_template.xml"
    )

    # Get the Jinja environment and load the template
    # - mypy: importlib returns a Traversable, which is a protocol that Path complies
    #         with, but the attribute isn't being recognized
    env = Environment(
        loader=FileSystemLoader(template_path.parent),  # type: ignore [attr-defined]
        autoescape=select_autoescape(),
    )

    template = env.get_template(template_path.name)

    # Build some reused values from the metadata
    # URIs -  form the DOI URL from the prereserved DOI metadata
    doi_url = f"https://doi.org/{zenodo_metadata['metadata']['prereserve_doi']['doi']}"

    # A true "publication" date is not available until a record is published, so use the
    # creation date of the deposit as a reasonable replacement, with the caveat that you
    # should generate the XML and publish on the same day.
    pub_date = dt.fromisoformat(zenodo_metadata["created"]).date()

    # A citation string: "A, B & C". The final separator is built explicitly rather
    # than by string replacement, which would also rewrite any earlier author whose
    # name starts with the last author's name (e.g. "Li Wei" and "Li").
    authors = [au["name"] for au in dataset_metadata["authors"]]
    if len(authors) > 1:
        author_string = ", ".join(authors[:-1]) + " & " + authors[-1]
    else:
        author_string = ", ".join(authors)

    citation_string = (
        f"{author_string} ({pub_date.year}) "
        f"{dataset_metadata['title']} [Dataset] {doi_url}"
    )

    # Resource constraints text
    if dataset_metadata["access"] == "embargo":
        # Note the trailing space that separates the two sentences.
        access_statement = (
            f"This data is under embargo until {dataset_metadata['embargo_date']}. "
            "After that date there are no restrictions to public access."
        )
    elif dataset_metadata["access"] == "restricted":
        access_statement = (
            "This dataset is currently not publicly available, please contact the "
            "Zenodo community owner to request access."
        )
    else:
        access_statement = "There are no restrictions to public access."

    # Get a copy of the project wide XML configuration from the resources. This provides
    # the following elements:
    # * languageCode, characterSet, contactCountry, contactEmail, epsgCode,
    #   topicCategories, lineageStatement
    context_dict = resources.xml.copy()

    # Generate project urls
    if dataset_metadata["project_ids"] is not None:
        project_urls = [
            resources.zenodo.project_url.replace("PROJECT_ID", str(pid))
            for pid in dataset_metadata["project_ids"]
        ]
    else:
        project_urls = []

    # Now update it with information also needed by Zenodo and the file specific
    # elements from the zenodo and dataset metadata
    context_dict.update(
        # Values also used on the Zenodo information or duplicated in the xml
        contactName=resources.zenodo.contact_name,
        contactOrcID=resources.zenodo.contact_orcid,
        pointofcontactName=resources.zenodo.contact_name,
        pointofcontactCountry=resources.xml.contactCountry,
        pointofcontactEmail=resources.xml.contactEmail,
        pointofcontactOrcID=resources.zenodo.contact_orcid,
        # Dataset specific information
        projectURL=project_urls,
        citationRSIdentifier=doi_url,
        dateStamp=pub_date.isoformat(),
        publicationDate=pub_date.isoformat(),
        fileIdentifier=str(zenodo_metadata["id"]),
        title=dataset_metadata["title"],
        authors=dataset_metadata["authors"],
        abstract=dataset_metadata["description"],
        keywords=dataset_metadata["keywords"],
        citationString=citation_string,
        embargoValue=access_statement,
        # Only the leading ten characters of the temporal extent values are used
        startDate=dataset_metadata["temporal_extent"][0][:10],
        endDate=dataset_metadata["temporal_extent"][1][:10],
        westBoundLongitude=_min_dp(dataset_metadata["longitudinal_extent"][0], 2),
        eastBoundLongitude=_min_dp(dataset_metadata["longitudinal_extent"][1], 2),
        southBoundLatitude=_min_dp(dataset_metadata["latitudinal_extent"][0], 2),
        northBoundLatitude=_min_dp(dataset_metadata["latitudinal_extent"][1], 2),
        downloadLink=doi_url,
    )

    # Override global lineage statement
    if lineage_statement is not None:
        context_dict["lineageStatement"] = lineage_statement

    xml = template.render(context_dict)

    return xml

download_ris_data(zen_res, ris_file=None)

Downloads Zenodo records into a RIS format bibliography file.

This function is used to maintain a bibliography file of the records uploaded to a safedata community on Zenodo. It accesses the Zenodo community specified in the resource configuration and downloads all records. It then optionally checks the list of downloaded DOIs against the content of an existing RIS file and then downloads citations for all new DOIs from datacite.org.

Parameters:

Name Type Description Default
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required
ris_file Path | None

The path to an existing RIS format file containing previously downloaded records.

None

Returns:

Type Description
None

Nothing is returned. RIS formatted citation data for any new records is written (or appended) to ris_file when a path is provided.

Source code in safedata_validator/zenodo.py
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
def download_ris_data(zen_res: ZenodoResources, ris_file: Path | None = None) -> None:
    """Downloads Zenodo records into a RIS format bibliography file.

    This function is used to maintain a bibliography file of the records
    uploaded to a safedata community on Zenodo. It accesses the Zenodo community
    specified in the resource configuration and downloads all records. It then
    optionally checks the list of downloaded DOIs against the content of an
    existing RIS file and then downloads citations for all new DOIs from
    datacite.org.

    Args:
        zen_res: The zenodo resources from the safedata_validator configuration.
        ris_file: The path to an existing RIS format file containing previously
            downloaded records. New citations are appended to an existing file, or
            written to a new file if the path does not yet exist. If no path is
            provided, nothing is written.

    Returns:
        None. Citation data is only written to ``ris_file``.

    Raises:
        OSError: if a page of records cannot be retrieved from the Zenodo API.
    """

    # Get the set of known record ids from an existing RIS file if one is provided.
    # The record id is taken as the final element of each entry's URL.
    known_recids: set[int] = set()
    new_doi: list[str] = []

    if ris_file is not None and ris_file.exists():
        with open(ris_file) as bibliography_file:
            for entry in rispy.load(bibliography_file):
                known_recids.add(int(entry["url"].split("/")[-1]))

    # Zenodo API call to return the records associated with the SAFE community
    api = f"{zen_res.api}/records/?q=communities:{zen_res.community}"

    # Provide feedback on DOI collection
    LOGGER.info(f"Fetching record DOIs from {api}:")
    FORMATTER.push()

    # The API is paged - it contains a set of records and a link that points
    # to the next page of records, so keep looping until there are no more next.
    # The try/finally guarantees the log indentation is restored on every exit path,
    # including the OSError below.
    n_records = 0
    try:
        while True:
            # Get the data
            safe_data = requests.get(api)

            if safe_data.status_code != 200:
                raise OSError("Cannot access Zenodo API")

            # Retrieve the record data and store the DOI for each unknown record
            safe_data_dict = safe_data.json()
            hits = safe_data_dict["hits"]["hits"]
            new_doi.extend(
                hit["doi"] for hit in hits if hit["id"] not in known_recids
            )

            # Reporting
            n_records += len(hits)
            LOGGER.info(f"{n_records}")

            # Update the link for the next page, unless there is no next page
            if "next" not in safe_data_dict["links"]:
                break
            api = safe_data_dict["links"]["next"]
    finally:
        FORMATTER.pop()

    # Nothing more to do if every record is already in the RIS file
    if not new_doi:
        LOGGER.info("No new DOIs found")
        return

    # Use the datacite API to retrieve the citation data associated with each DOI
    data = []

    LOGGER.info(
        f"Retrieving citation data from Datacite for {len(new_doi)} new records"
    )
    FORMATTER.push()

    for doi in new_doi:
        ris_data = requests.get(
            f"https://data.datacite.org/application/x-research-info-systems/{doi}"
        )

        if ris_data.status_code != 200:
            LOGGER.warning(f"DOI {doi} not found in datacite.org")
        else:
            # Write the response content to the data list. It comes in as byte
            # data so needs to be decoded to a string variable
            LOGGER.info(f"Retrieved citation for DOI {doi}")
            data.append(ris_data.content.decode("utf-8") + "\r\n")

    FORMATTER.pop()

    # Writing only occurs if a ris file path has actually been provided
    if ris_file:
        if ris_file.exists():
            LOGGER.info(f"Appending RIS data for {len(data)} new records to {ris_file}")
            write_mode = "a"
        else:
            LOGGER.info(f"Writing RIS data for {len(data)} records to {ris_file}")
            write_mode = "w"

        with open(ris_file, write_mode) as ris_file_out:
            ris_file_out.writelines(data)

sync_local_dir(datadir, zen_res, xlsx_only=True, replace_modified=False, dry_run=False)

Synchronise a local data directory with a Zenodo community.

The safedata R package defines a directory structure used to store metadata and files downloaded from a safedata community on Zenodo and from a safedata metadata server. This tool allows a safedata developer or community maintainer to create or update such a directory with all of the resources in the Zenodo community, regardless of their public access status. This forms a backup (although Zenodo is heavily backed up) but also provides local copies of the files for testing and development of the code packages.

This function requires that the resources are configured with access tokens for Zenodo and the details of the metadata server.

Parameters:

Name Type Description Default
datadir Path

The path to a local directory containing an existing safedata directory or an empty folder in which to create one.

required
zen_res ZenodoResources

The zenodo resources from the safedata_validator configuration.

required
xlsx_only bool

Should the download ignore large non-xlsx files, defaulting to True.

True
replace_modified bool

Should the synchronisation replace locally modified files with the archived version. By default, modified local files are left alone.

False
dry_run bool

Only report on the actions to be taken, without actually making any changes.

False
Source code in safedata_validator/zenodo.py
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
def sync_local_dir(
    datadir: Path,
    zen_res: ZenodoResources,
    xlsx_only: bool = True,
    replace_modified: bool = False,
    dry_run: bool = False,
) -> None:
    """Synchronise a local data directory with a Zenodo community.

    The safedata R package defines a directory structure used to store metadata and
    files downloaded from a safedata community on Zenodo and from a safedata metadata
    server. This tool allows a safedata developer or community maintainer to create or
    update such a directory with _all_ of the resources in the Zenodo community,
    regardless of their public access status. This forms a backup (although Zenodo is
    heavily backed up) but also provides local copies of the files for testing and
    development of the code packages.

    This function requires that the resources are configured with access tokens for
    Zenodo and the details of the metadata server.

    Args:
        datadir: The path to a local directory containing an existing safedata
            directory or an empty folder in which to create one.
        zen_res: The zenodo resources from the safedata_validator configuration.
        xlsx_only: Should the download ignore large non-xlsx files, defaulting
            to True.
        replace_modified: Should the synchronisation replace locally modified files with
            the archived version. By default, modified local files are left alone.
        dry_run: Only report on the actions to be taken for deposit directories, files
            and record metadata, without downloading or creating them.

    Raises:
        OSError: if ``datadir`` is not an existing directory.
        RuntimeError: if the configured API does not match the one recorded in an
            existing ``url.json`` file, or if a page of deposits cannot be retrieved.
    """

    # Private helper functions
    def _get_file(url: str, outf: Path, params: dict | None = None) -> None:
        """Download a file from a URL.

        A failed request is reported as a warning and nothing is written, so that an
        HTTP error payload is never saved to disk in place of the requested file.
        """
        # The context manager releases the streamed connection once copying is done
        with requests.get(url, params=params, stream=True) as resource:
            if not resource.ok:
                LOGGER.warning(f"Download failed ({resource.status_code}): {url}")
                return

            with open(outf, "wb") as outf_obj:
                shutil.copyfileobj(resource.raw, outf_obj)

    # The dir argument should be an existing path
    if not (datadir.exists() and datadir.is_dir()):
        raise OSError(f"{datadir} is not an existing directory")

    # Get the configured metadata api
    api = zen_res.api

    # Check for an existing API url file and check it is congruent with config
    # NOTE(review): the url file and the index files below are written even when
    # dry_run is set - confirm whether that is intended.
    url_file = datadir / "url.json"

    if url_file.exists():
        with open(url_file) as urlf:
            dir_api = simplejson.load(urlf)["url"][0]

        if api != dir_api:
            raise RuntimeError(
                "Configured api does not match existing api in directory"
            )
    else:
        with open(url_file, "w") as urlf:
            simplejson.dump({"url": [api]}, urlf)

    # Download index files - don't bother to check for updates, this isn't
    # a frequent thing to do
    LOGGER.info("Downloading index files")
    _get_file(f"{api}/api/index", datadir / "index.json")
    _get_file(f"{api}/api/gazetteer", datadir / "gazetteer.geojson")
    _get_file(f"{api}/api/location_aliases", datadir / "location_aliases.csv")

    # Get the deposits associated with the account, which includes a list of download
    # links. Need to set the page parameter to the API to track paginated results.
    params = zen_res.token.copy()
    params["page"] = 1
    deposits: list = []

    LOGGER.info("Scanning Zenodo deposits")
    while True:
        this_page = ZenodoResponse(
            requests.get(
                f"{zen_res.api}/deposit/depositions",
                params=params,
                json={},
                headers={"Content-Type": "application/json"},
            )
        )

        if not this_page.ok:
            raise RuntimeError(this_page.error_message)

        # An empty payload marks the end of the paginated results
        if this_page.json_data:
            deposits += this_page.json_data
            LOGGER.info(f"Page {params['page']}")
            params["page"] += 1
        else:
            break

    LOGGER.info(f"Processing {len(deposits)} deposits")

    # Download the files
    for dep in deposits:
        con_rec_id = str(dep["conceptrecid"])
        rec_id = str(dep["record_id"])

        if not dep["submitted"]:
            LOGGER.info(f"Unsubmitted draft {con_rec_id}/{rec_id}")
            continue

        LOGGER.info(f"Processing deposit {con_rec_id}/{rec_id}")
        FORMATTER.push()

        # Create the directory structure if needed. The record directory is nested
        # inside a concept directory that may not exist yet, so parents are created
        # as well.
        rec_dir = datadir / con_rec_id / rec_id
        if not rec_dir.exists():
            LOGGER.info("Creating directory")
            if not dry_run:
                rec_dir.mkdir(parents=True, exist_ok=True)
        else:
            LOGGER.info("Directory found")

        # loop over the files in the record
        for this_file in dep["files"]:
            if xlsx_only and not this_file["filename"].endswith(".xlsx"):
                LOGGER.info(f"Skipping non-excel file {this_file['filename']}")
                continue

            LOGGER.info(f"Processing {this_file['filename']}")
            FORMATTER.push()

            outf = rec_dir / this_file["filename"]

            if not outf.exists():
                LOGGER.info("Downloading")
                if not dry_run:
                    _get_file(this_file["links"]["download"], outf, params=params)
            elif _compute_md5(outf) != this_file["checksum"]:
                # The local copy differs from the checksum recorded for the deposit
                if replace_modified:
                    LOGGER.info("Replacing locally modified file")
                    if not dry_run:
                        _get_file(this_file["links"]["download"], outf, params=params)
                else:
                    LOGGER.warning("Local copy modified")
            else:
                LOGGER.info("Already present")

            FORMATTER.pop()

        # Get the metadata json
        metadata = rec_dir / f"{rec_id}.json"
        if metadata.exists():
            LOGGER.info("JSON Metadata found")
        else:
            LOGGER.info("Downloading JSON metadata ")
            if not dry_run:
                _get_file(f"{api}/api/record/{rec_id}", metadata)

        FORMATTER.pop()

taxon_index_to_text(taxa, html=False, indent_width=4, lowest_taxa=None)

Render a taxon index as text or html.

This function takes a taxon index and renders the contents into either a text or html representation of the taxonomic hierarchy used in the dataset. Taxonomic ranks are indented to render a nested hierarchy.

Parameters:

Name Type Description Default
taxa list[dict]

A list of taxon dictionaries containing the taxa for a dataset.

required
html bool

Render as html or text.

False
indent_width int

The indentation width to use for successive taxonomic ranks.

4
lowest_taxa str | None

The lowest taxonomic rank that the index renders, if no rank is provided then the index is rendered for all ranks.

None

Returns:

Type Description
str | div

Either a HTML or text representation of the taxa tree.

Source code in safedata_validator/taxa.py
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
def taxon_index_to_text(
    taxa: list[dict],
    html: bool = False,
    indent_width: int = 4,
    lowest_taxa: str | None = None,
) -> str | tags.div:
    """Render a taxon index as text or html.

    This function takes a taxon index and renders the contents into either a text or
    html representation of the taxonomic hierarchy used in the dataset. Taxonomic ranks
    are indented to render a nested hierarchy.

    The input list is not modified: sorting and filtering are applied to a copy.

    Args:
        taxa: A list of taxon dictionaries containing the taxa for a dataset.
        html: Render as html or text.
        indent_width: The indentation width to use for successive taxonomic ranks.
        lowest_taxa: The lowest taxonomic rank that the index renders, if no rank is
            provided then the index is rendered for all ranks.

    Returns:
        Either a HTML or text representation of the taxa tree. An index with no root
        taxa (including an empty index) renders as an empty container or string.

    Raises:
        ValueError: if ``lowest_taxa`` is provided but is not a backbone rank.
    """

    def _indent(n: int, use_html: bool = html):
        if use_html:
            return raw("&ensp;-&ensp;" * n)
        else:
            return " " * indent_width * (n - 1)

    def _format_name(tx: dict, use_html: bool = html):
        # format the canonical name
        if tx["taxon_rank"] in ["genus", "species", "subspecies"]:
            if use_html:
                return tags.i(tx["taxon_name"])
            else:
                return f"_{tx['taxon_name']}_"
        elif tx["taxon_rank"] in ["morphospecies", "functional group"]:
            return f"[{tx['worksheet_name']}, {tx['taxon_rank']}]"
        else:
            return tx["taxon_name"]

    # Container type depends on whether or not html output is required
    if html:
        # Container to hold the output
        html_out = tags.div()
    else:
        html_out = StringIO()

    # group by parent taxon, substituting 0 for None
    # secondary order is then alphabetic based on taxon name. A sorted copy is used so
    # that the caller's list is neither reordered nor truncated by the filtering below.
    taxa = sorted(taxa, key=lambda x: (x["parent_id"] or 0, x["taxon_name"]))

    # Indices of surplus taxa to be dropped before rendering
    surp_tx_ids: set[int] = set()
    # Define keys that would match in unwanted repeated entries
    match_keys = (
        "taxon_id",
        "parent_id",
        "taxon_name",
        "taxon_rank",
        "taxon_status",
    )

    # Collect the indices of entries that agree on all of the match keys
    repeated: dict[tuple, list[int]] = {}
    for idx, taxon in enumerate(taxa):
        repeated.setdefault(tuple(taxon[k] for k in match_keys), []).append(idx)

    # For repeated entries, keep only those carrying the first non-None worksheet name
    for indices in repeated.values():
        if len(indices) < 2:
            continue

        first_nm = next(
            (
                taxa[i]["worksheet_name"]
                for i in indices
                if taxa[i]["worksheet_name"] is not None
            ),
            None,
        )

        if first_nm is None:
            # No entry has a worksheet name: they are pure repeats, so keep the first
            surp_tx_ids.update(indices[1:])
        else:
            surp_tx_ids.update(
                i for i in indices if taxa[i]["worksheet_name"] != first_nm
            )

    # Eliminate any taxa with ranks below the minimum
    if lowest_taxa:
        # Check that the lowest rank appears in the full set of taxa
        if lowest_taxa not in ALL_BACKBONE_RANKS:
            raise ValueError(
                f"Rank provided to render taxa tree down to {lowest_taxa} is not a "
                f"backbone rank! Should be one of: {ALL_BACKBONE_RANKS}"
            )

        # Generate the full list of ranks that should be rendered
        rendered_ranks = ALL_BACKBONE_RANKS[: ALL_BACKBONE_RANKS.index(lowest_taxa) + 1]

        # Then add any taxa that have ranks that aren't in the list of rendered ranks to
        # the superfluous taxa index
        surp_tx_ids.update(
            idx
            for idx, taxon in enumerate(taxa)
            if taxon["taxon_rank"] not in rendered_ranks
        )

    # Drop the superfluous taxa
    taxa = [taxon for idx, taxon in enumerate(taxa) if idx not in surp_tx_ids]

    # group taxa by their parent id, preserving the sorted order within each group
    grouped: dict = {}
    for taxon in taxa:
        grouped.setdefault(taxon["parent_id"], []).append(taxon)

    # start the stack with root taxa, which will have None as a parent (kingdoms for
    # GBIF, kingdoms/superkingdoms/domains for sequenced taxa). With no root taxa there
    # is nothing to render and the stack starts empty.
    roots = grouped.get(None, [])
    stack = [({"current": roots[0]}, {"next": roots[1:]})] if roots else []

    while stack:
        # Handle the current top of the stack: format the canonical name
        current = stack[-1][0]["current"]
        canon_name = _format_name(current)

        # Look for a non-None entry in next that shares the same worksheet name
        next_ws_names = [
            tx["worksheet_name"]
            for tx in stack[-1][1]["next"]
            if tx["worksheet_name"] is not None
        ]

        if current["worksheet_name"] in next_ws_names:
            # pop out the matching entry and find which is 'accepted'
            name_pair = stack[-1][1]["next"].pop(
                next_ws_names.index(current["worksheet_name"])
            )
            if current["taxon_status"] == "accepted":
                as_name = _format_name(name_pair)
                as_status = name_pair["taxon_status"]
            else:
                as_name = canon_name
                as_status = current["taxon_status"]
                canon_name = _format_name(name_pair)

            if html:
                html_txt = [
                    _indent(len(stack)),
                    canon_name,
                    " (as ",
                    as_status,
                    ": ",
                    as_name,
                    ")",
                    tags.br(),
                ]
            else:
                txt = (
                    f"{_indent(len(stack))} {canon_name} (as {as_status}: {as_name})\n"
                )
        else:
            if html:
                html_txt = [_indent(len(stack)), canon_name, tags.br()]
            else:
                txt = f"{_indent(len(stack))} {canon_name}\n"

        if html:
            html_out += html_txt
        else:
            html_out.write(txt)

        # Is this taxon a parent for other taxa - if so add that taxon to the top of
        # the stack, otherwise start looking for a next taxon to push onto the stack.
        # If there is none at the top, pop and look down.
        parent_id = current["taxon_id"]
        if parent_id in grouped:
            stack.append(
                ({"current": grouped[parent_id][0]}, {"next": grouped[parent_id][1:]})
            )
        else:
            while stack:
                push = stack.pop()
                if push[1]["next"]:
                    stack.append(
                        ({"current": push[1]["next"][0]}, {"next": push[1]["next"][1:]})
                    )
                    break

    if html:
        return html_out
    else:
        return html_out.getvalue()

ZenodoResponse dataclass

Zenodo response processor.

This dataclass is a processor around requests.Response objects from Zenodo calls. If the response is successful, it parses the returned data payload; otherwise it formats as much information as possible into an error message.

Source code in safedata_validator/zenodo.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
@dataclass
class ZenodoResponse:
    """Zenodo response processor.

    This dataclass is a processor around `requests.Response` objects from Zenodo calls.
    If the response is successful, it parses the returned data payload; otherwise it
    formats as much information as possible into an error message.
    """

    response: InitVar[requests.Response]
    """The incoming response from a Zenodo API call."""
    ok: bool = field(init=False)
    """Was the response ok."""
    status_code: int = field(init=False)
    """The status code returned by the response."""
    json_data: dict = field(init=False, default_factory=dict)
    """The JSON data payload from a successful response."""
    error_message: str | None = field(init=False, default=None)
    """A formatted error message from a failed response."""

    def __post_init__(self, response: requests.Response) -> None:
        """Populate the ZenodoResponse object."""
        # Basic status
        self.ok = response.ok
        self.status_code = response.status_code

        # Try and handle the response content as JSON data, but not all successful
        # responses (e.g. file deletion) provide any data payload
        try:
            self.json_data = response.json()
        except requests.exceptions.JSONDecodeError:
            self.json_data = {}

        # Build the error message on failure
        if not self.ok:
            self.build_error_message(response)

    def build_error_message(self, response) -> None:
        """Format a Zenodo JSON error response as a string.

        The formatted message is stored in the `error_message` attribute.

        Args:
            response: The failed response, used for its reason, status code and URL.
        """

        # Report the immediate reason and code along with the URL endpoint with the
        # access token (and anything following it) redacted
        url = re.sub("(?<=access_token=).*$", "<redacted>", response.url)
        return_string = (
            f"\n\nZenodo error: {response.reason} "
            f"({response.status_code})\nURL: {url}\n"
        )

        # The payload is only inspected when it is a JSON object: other JSON types
        # (such as a list) have no message or errors entries to report.
        payload = self.json_data if isinstance(self.json_data, dict) else {}

        # Add the message entry from the JSON payload if present
        if "message" in payload:
            return_string += f"Message: {payload['message']}\n"

        # Add any error entries from the JSON payload, one entry per field
        errors = payload.get("errors", [])
        if errors:
            entries = []
            for e in errors:
                # Entries are expected to carry a list of messages.
                # NOTE(review): some Zenodo responses appear to provide a single
                # 'message' string instead - confirm against the API documentation.
                field_messages = e.get("messages")
                if field_messages is None:
                    field_messages = [e["message"]] if "message" in e else []

                messages = "\n    - ".join(str(m) for m in field_messages)
                entries.append(
                    f" * Messages for field {e.get('field', '<unknown>')}:"
                    f"\n    - {messages}"
                )
            # Each field entry goes on its own line
            return_string += "Errors:\n" + "\n".join(entries) + "\n"

        self.error_message = return_string

__post_init__(response)

Populate the ZenodoResponse object.

Source code in safedata_validator/zenodo.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def __post_init__(self, response: requests.Response) -> None:
    """Fill in the status, payload and error details from the response."""
    # Record the outcome of the request
    self.ok = response.ok
    self.status_code = response.status_code

    # Some successful responses (file deletion, for example) carry no JSON body at
    # all, so a payload that cannot be decoded is treated as an empty dictionary.
    try:
        payload = response.json()
    except requests.exceptions.JSONDecodeError:
        payload = {}
    self.json_data = payload

    # Nothing more to do for a successful response
    if self.ok:
        return

    # Otherwise format the failure details into the error message
    self.build_error_message(response)

build_error_message(response)

Format a Zenodo JSON error response as a string.

Source code in safedata_validator/zenodo.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def build_error_message(self, response) -> None:
    """Format a Zenodo JSON error response and store it as the error message.

    The text is assembled from the response reason, status code and URL, then
    extended with the ``message`` and ``errors`` entries of ``self.json_data``
    when they are present. Nothing is returned: the result is written to
    ``self.error_message``.

    Args:
        response: The failed response. Its ``url``, ``reason`` and
            ``status_code`` attributes are read.
    """

    # Report the immediate reason and code along with the URL endpoint with the
    # access token redacted. Everything after ``access_token=`` is replaced.
    url = re.sub(r"(?<=access_token=).*$", "<redacted>", response.url)
    parts = [
        f"\n\nZenodo error: {response.reason} "
        f"({response.status_code})\nURL: {url}\n"
    ]

    # Add the message entry from the JSON payload if present
    if "message" in self.json_data:
        parts.append(f"Message: {self.json_data['message']}\n")

    # Add any error entries from the JSON payload. Each field entry ends with its
    # own newline so that several entries do not run together on one line.
    errors = self.json_data.get("errors", [])
    if errors:
        parts.append("Errors:\n")
        for e in errors:
            messages = "\n    - ".join(e["messages"])
            parts.append(
                f" * Messages for field {e['field']}:\n    - {messages}\n"
            )

    self.error_message = "".join(parts)

error_message = field(init=False, default=None) class-attribute instance-attribute

A formatted error message from a failed response.

json_data = field(init=False, default_factory=(lambda: dict())) class-attribute instance-attribute

The JSON data payload from a successful response.

ok = field(init=False) class-attribute instance-attribute

Whether the response was ok.

status_code = field(init=False) class-attribute instance-attribute

The status code returned by the response.

ZenodoResources dataclass

Packaging for Zenodo specific resources.

This dataclass is used to package the Zenodo specific elements of the configuration. It resolves the Zenodo API to use and provides top level attributes for the key Zenodo configuration components. The instance still contains the full resource configuration details as the resources attribute for pass through to functions that need wider configuration details.

Source code in safedata_validator/zenodo.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
@dataclass
class ZenodoResources:
    """Bundle the Zenodo specific parts of the configuration.

    The dataclass resolves which Zenodo API to use and exposes the key Zenodo
    configuration components as top level attributes. The complete resource
    configuration remains available through the `resources` attribute, so it can be
    passed through to functions that need wider configuration details.
    """

    # TODO - The resolution could instead be done when Resources is created, which
    # would remove the need for this extra class. But that then assumes that all
    # Resources instances require the API and params. So maybe keep.

    resources: Resources | None = None
    """A safedata_validator resources instance."""
    api: str = field(init=False)
    """The configured Zenodo API to be used."""
    token: dict[str, Any] = field(init=False)
    """A dictionary providing the authentication token for the API."""
    community: str = field(init=False)
    """The community name to be used to publish datasets."""
    name: str = field(init=False)
    """The configured name for the community data contact."""
    affiliation: str | None = field(init=False)
    """The configured affiliation for the community data contact."""
    orcid: str | None = field(init=False)
    """The configured OrcID for the community data contact."""

    def __post_init__(self) -> None:
        """Resolve the API, token and contact details from the resources.

        Raises:
            RuntimeError: if the configuration does not set `use_sandbox` or does
                not set the token matching the selected API.
        """

        # Fall back to a default Resources instance when none was provided
        if self.resources is None:
            self.resources = Resources()

        zenodo = self.resources.zenodo

        # The sandbox setting must be set explicitly
        sandbox = zenodo.use_sandbox
        if sandbox is None:
            raise RuntimeError("safedata_validator config does not set 'use_sandbox'")

        # Pick the API endpoint and the name of the matching token setting
        self.api, token_name = (
            ("https://sandbox.zenodo.org/api", "zenodo_sandbox_token")
            if sandbox
            else ("https://zenodo.org/api", "zenodo_token")
        )

        token = getattr(zenodo, token_name)
        if token is None:
            raise RuntimeError(f"safedata_validator config does not set {token_name}")

        self.token = {"access_token": token}

        # Copy across the community and contact details
        self.community = zenodo.community_name
        self.name = zenodo.contact_name
        self.affiliation = zenodo.contact_affiliation
        self.orcid = zenodo.contact_orcid

__post_init__()

Populate the post init attributes.

Source code in safedata_validator/zenodo.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def __post_init__(self) -> None:
    """Resolve the API, token and contact details from the resources.

    Raises:
        RuntimeError: if the configuration does not set `use_sandbox` or does not
            set the token matching the selected API.
    """

    # Fall back to a default Resources instance when none was provided
    if self.resources is None:
        self.resources = Resources()

    zenodo = self.resources.zenodo

    # The sandbox setting must be set explicitly
    sandbox = zenodo.use_sandbox
    if sandbox is None:
        raise RuntimeError("safedata_validator config does not set 'use_sandbox'")

    # Pick the API endpoint and the name of the matching token setting
    self.api, token_name = (
        ("https://sandbox.zenodo.org/api", "zenodo_sandbox_token")
        if sandbox
        else ("https://zenodo.org/api", "zenodo_token")
    )

    token = getattr(zenodo, token_name)
    if token is None:
        raise RuntimeError(f"safedata_validator config does not set {token_name}")

    self.token = {"access_token": token}

    # Copy across the community and contact details
    self.community = zenodo.community_name
    self.name = zenodo.contact_name
    self.affiliation = zenodo.contact_affiliation
    self.orcid = zenodo.contact_orcid

affiliation = field(init=False) class-attribute instance-attribute

The configured affiliation for the community data contact.

api = field(init=False) class-attribute instance-attribute

The configured Zenodo API to be used.

community = field(init=False) class-attribute instance-attribute

The community name to be used to publish datasets.

name = field(init=False) class-attribute instance-attribute

The configured name for the community data contact.

orcid = field(init=False) class-attribute instance-attribute

The configured OrcID for the community data contact.

resources = None class-attribute instance-attribute

A safedata_validator resources instance.

token = field(init=False) class-attribute instance-attribute

A dictionary providing the authentication token for the API.