Skip to content

The zenodo submodule

This module provides functions to:

  1. handle the publication of datasets after they have been validated using safedata_validate, including the generation of HTML descriptions of datasets.
  2. maintain local copies of datasets in the folder structure expected by the safedata R package.
  3. compile a RIS format bibliographic file for published datasets.

create_deposit(concept_id=None, resources=None)

Create a new deposit.

Creates a new deposit draft, possibly as a new version of an existing published record.

Parameters:

Name Type Description Default
concept_id int | None

An optional concept id of a published record to create a new version of an existing dataset.

None
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
def create_deposit(
    concept_id: int | None = None, resources: Resources | None = None
) -> ZenodoFunctionResponseType:
    """Open a new draft deposit on Zenodo.

    Without a concept id, a brand new draft is requested. With a concept id, a
    new version of that existing record is requested and the resulting draft is
    then fetched with a second request.

    Args:
        concept_id: An optional concept id of a published record to create a new version
            of an existing dataset.
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].
    """

    # Resolve the API root and the authentication parameters
    config = _resources_to_zenodo_api(resources)
    base_url = config["zapi"]
    token = config["ztoken"]

    is_new_version = concept_id is not None

    # Pick the endpoint: a new version of an existing record or a fresh deposit
    if is_new_version:
        endpoint = f"{base_url}/deposit/depositions/{concept_id}/actions/newversion"
    else:
        endpoint = f"{base_url}/deposit/depositions"

    creation = requests.post(endpoint, params=token, json={})

    # Anything other than 201 (created) is reported as a failure
    if creation.status_code != 201:
        return {}, _zenodo_error_message(creation)

    # A fresh deposit response is already the draft itself
    if not is_new_version:
        return creation.json(), None

    # A new version response describes the existing record, so follow its
    # latest_draft link to retrieve the draft that was just created
    draft_url = creation.json()["links"]["latest_draft"]
    draft = requests.get(draft_url, params=token, json={})

    # Retrieval of the new draft is reported with a 200 status
    if draft.status_code == 200:
        return draft.json(), None

    return {}, _zenodo_error_message(draft)

get_deposit(deposit_id, resources=None)

Download the metadata of a Zenodo deposit.

Parameters:

Name Type Description Default
deposit_id int

The Zenodo record id of an existing dataset.

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def get_deposit(
    deposit_id: int, resources: Resources | None = None
) -> ZenodoFunctionResponseType:
    """Fetch the metadata of a single Zenodo deposit.

    Args:
        deposit_id: The Zenodo record id of an existing dataset.
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].
    """

    config = _resources_to_zenodo_api(resources)

    # Build the deposit URL from the configured API root and query it
    deposit_url = f"{config['zapi']}/deposit/depositions/{deposit_id}"
    response = requests.get(deposit_url, params=config["ztoken"], json={})

    # Only a 200 status is treated as success
    if response.status_code != 200:
        return {}, _zenodo_error_message(response)

    return response.json(), None

upload_metadata(metadata, zenodo, resources=None)

Upload dataset metadata.

Takes a dictionary of dataset metadata, converts it to a JSON payload of Zenodo metadata and uploads it to a deposit.

Parameters:

Name Type Description Default
metadata dict

The metadata dictionary for a dataset

required
zenodo dict

The zenodo metadata dictionary for a deposit

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
def upload_metadata(
    metadata: dict, zenodo: dict, resources: Resources | None = None
) -> ZenodoFunctionResponseType:
    """Upload dataset metadata.

    Takes a dictionary of dataset metadata, converts it to a JSON payload of Zenodo
    metadata and uploads it to a deposit using the deposit's ``self`` link.

    Args:
        metadata: The metadata dictionary for a dataset
        zenodo: The zenodo metadata dictionary for a deposit
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType]. The
        response dictionary is empty on both success and failure: only the error
        element distinguishes the two.

    Raises:
        ValueError: If the dataset access status is not one of ``embargo``,
            ``open`` or ``restricted`` (case insensitive).
    """

    # Get resource configuration
    zres = _resources_to_zenodo_api(resources)

    # basic contents
    zen_md = {
        "metadata": {
            "upload_type": "dataset",
            # "publication_date": datetime.date.today().isoformat(),
            "title": metadata["title"],
            "keywords": metadata["keywords"],
            "license": "cc-by",
            "communities": [{"identifier": zres["zcomm"]}],
        }
    }

    # Add a contact name to contributors if provided in config
    if zres["zcname"] is not None:
        zen_md["metadata"]["contributors"] = [
            {
                "name": zres["zcname"],
                "type": "ContactPerson",
                "affiliation": zres["zcaffil"],
                "orcid": zres["zcorc"],
            }
        ]

    # set up the access rights - embargoed and restricted datasets carry an
    # extra field alongside the access_right value
    dataset_access = metadata["access"].lower()
    if dataset_access == "embargo":
        zen_md["metadata"]["access_right"] = "embargoed"
        zen_md["metadata"]["embargo_date"] = metadata["embargo_date"]
    elif dataset_access == "open":
        zen_md["metadata"]["access_right"] = "open"
    elif dataset_access == "restricted":
        zen_md["metadata"]["access_right"] = "restricted"
        zen_md["metadata"]["access_conditions"] = metadata["access_conditions"]
    else:
        raise ValueError("Unknown access status")

    # set up the dataset creators - the format has already been checked and names
    # should be present and correct. Everything else is optional, so strip None
    # values and pass the rest to Zenodo. Email addresses are never passed on.
    zen_md["metadata"]["creators"] = [
        {ky: auth[ky] for ky in auth if auth[ky] is not None and ky != "email"}
        for auth in metadata["authors"]
    ]

    zen_md["metadata"]["description"] = dataset_description(
        metadata, zenodo, render=True, resources=resources
    )

    # attach the metadata to the deposit resource
    mtd = requests.put(zenodo["links"]["self"], params=zres["ztoken"], json=zen_md)

    # trap errors in uploading metadata, reporting them through the same helper
    # used by the other Zenodo API functions in this module
    if mtd.status_code != 200:
        return {}, _zenodo_error_message(mtd)

    return {}, None

update_published_metadata(zenodo, resources=None)

Update published deposit metadata.

Updates the metadata on a published deposit, for example to modify the access status of a deposit. In general, metadata should be updated by releasing a new version of the dataset, and this function should only be used where it is essential that the published version be altered.

Parameters:

Name Type Description Default
zenodo dict

A Zenodo metadata dictionary, with an updated metadata section

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
def update_published_metadata(
    zenodo: dict,
    resources: Resources | None = None,
) -> ZenodoFunctionResponseType:
    """Rewrite the metadata of an already published deposit.

    The published deposit is unlocked for editing, the metadata section of the
    provided dictionary is sent to the deposit, and the deposit is then
    republished. If the update or the republication fails, the function tries to
    discard the pending edit. In general, metadata should be changed by releasing
    a new version of the dataset, and this function should only be used where it
    is essential that the published version be altered.

    Args:
        zenodo: A Zenodo metadata dictionary, with an updated metadata section
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].
    """

    config = _resources_to_zenodo_api(resources)
    links = zenodo["links"]

    # Step 1: unlock the published deposit so that it can be edited
    unlock = requests.post(links["edit"], params=config["ztoken"])
    if unlock.status_code != 201:
        return {}, unlock.json()

    # From here on a failure must be followed by an attempt to discard the
    # edit, otherwise the record stays in edit mode and blocks later attempts.

    # Step 2: push the replacement metadata
    update = requests.put(
        links["self"],
        params=config["ztoken"],
        headers={"Content-Type": "application/json"},
        data=simplejson.dumps({"metadata": zenodo["metadata"]}),
    )
    outcome = update.json()

    # Step 3: republish, but only if the update itself went through
    if update.status_code == 200:
        publish = requests.post(links["publish"], params=config["ztoken"])
        outcome = publish.json()

        if publish.status_code == 202:
            return outcome, None

    # Step 4: something failed, so try to discard the edit. The most recent
    # failure payload is reported, which is the discard response if that
    # request also fails.
    discard = requests.post(links["discard"], params=config["ztoken"])
    if discard.status_code != 201:
        outcome = discard.json()

    return {}, outcome

upload_file(metadata, filepath, zenodo_filename=None, progress_bar=True, resources=None)

Upload a file to Zenodo.

Uploads the contents of a specified file to an unpublished Zenodo deposit, optionally using an alternative filename. If the file already exists in the deposit, it will be replaced.

Parameters:

Name Type Description Default
metadata dict

The Zenodo metadata dictionary for a deposit

required
filepath str

The path to the file to be uploaded

required
zenodo_filename str | None

An optional alternative file name to be used on Zenodo

None
progress_bar bool

Should the upload progress be displayed

True
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
def upload_file(
    metadata: dict,
    filepath: str,
    zenodo_filename: str | None = None,
    progress_bar: bool = True,
    resources: Resources | None = None,
) -> ZenodoFunctionResponseType:
    """Send a local file to the bucket of a Zenodo deposit.

    The file contents are PUT to the deposit bucket link, under either the local
    file name or an alternative name. After the upload, the MD5 checksum reported
    in the response is compared with the checksum of the local file.

    Args:
        metadata: The Zenodo metadata dictionary for a deposit
        filepath: The path to the file to be uploaded
        zenodo_filename: An optional alternative file name to be used on Zenodo
        progress_bar: Should the upload progress be displayed
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].

    Raises:
        OSError: If the path does not exist or is not a regular file.
    """

    config = _resources_to_zenodo_api(resources)
    token = config["ztoken"]

    # Resolve the path and insist that it points at an existing regular file
    filepath = os.path.abspath(filepath)
    if not os.path.exists(filepath) or not os.path.isfile(filepath):
        raise OSError(f"The file path is either a directory or not found: {filepath} ")

    # Default to the local file name when no alternative name is given
    file_name = (
        os.path.basename(filepath) if zenodo_filename is None else zenodo_filename
    )

    # The progress bar approach follows:
    # - https://gist.github.com/tyhoff/b757e6af83c1fd2b7b83057adf02c139
    n_bytes = os.stat(filepath).st_size
    target_url = f"{metadata['links']['bucket']}/{file_name}"

    with open(filepath, "rb") as handle:
        if not progress_bar:
            # Plain upload straight from the file handle
            response = requests.put(target_url, data=handle, params=token)
        else:
            with tqdm(
                total=n_bytes, unit="B", unit_scale=True, unit_divisor=1024
            ) as monitor:
                # Wrap reads so that each one advances the progress bar
                monitored_handle = CallbackIOWrapper(monitor.update, handle, "read")
                response = requests.put(
                    target_url, data=monitored_handle, params=token
                )

    # Only a 201 status counts as a successful upload
    if response.status_code != 201:
        return {}, _zenodo_error_message(response)

    # TODO - could this be inside with above? - both are looping over the file contents
    # https://medium.com/codex/chunked-uploads-with-binary-files-in-python-f0c48e373a91
    expected_checksum = f"md5:{_compute_md5(filepath)}"
    uploaded = response.json()

    # A checksum mismatch is reported as a failure even though the upload ran
    if uploaded["checksum"] != expected_checksum:
        return {}, "Mismatch in local and uploaded MD5 hashes"

    return uploaded, None

discard_deposit(metadata, resources=None)

Discard a deposit.

Deposits can be discarded - the associated files and metadata will be deleted and the Zenodo ID no longer exists. Once deposits are published to records, they cannot be deleted via the API - contact the Zenodo team for help.

Parameters:

Name Type Description Default
metadata dict

The Zenodo metadata dictionary for a deposit

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
def discard_deposit(
    metadata: dict, resources: Resources | None = None
) -> ZenodoFunctionResponseType:
    """Delete an unpublished deposit.

    Sends a DELETE request to the ``self`` link of the deposit. Deposits can be
    discarded - the associated files and metadata will be deleted and the Zenodo ID
    no longer exists. Once deposits are published to records, they cannot be deleted
    via the API - contact the Zenodo team for help.

    Args:
        metadata: The Zenodo metadata dictionary for a deposit
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].
    """

    token = _resources_to_zenodo_api(resources)["ztoken"]

    response = requests.delete(metadata["links"]["self"], params=token)

    # Only a 204 status is treated as a successful deletion
    if response.status_code != 204:
        return {}, _zenodo_error_message(response)

    return {"result": "success"}, None

publish_deposit(zenodo, resources=None)

Publish a created deposit.

Parameters:

Name Type Description Default
zenodo dict

The Zenodo metadata dictionary for a deposit

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
def publish_deposit(
    zenodo: dict, resources: Resources | None = None
) -> ZenodoFunctionResponseType:
    """Publish a draft deposit.

    Posts to the ``publish`` link found in the provided deposit dictionary.

    Args:
        zenodo: The dataset metadata dictionary for a deposit
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].
    """

    token = _resources_to_zenodo_api(resources)["ztoken"]

    response = requests.post(zenodo["links"]["publish"], params=token)
    payload = response.json()

    # A 202 status returns the publication metadata; any other status returns
    # the decoded response body as the error element
    if response.status_code == 202:
        return payload, None

    return {}, payload

delete_file(metadata, filename, resources=None)

Delete an uploaded file from an unpublished Zenodo deposit.

Parameters:

Name Type Description Default
metadata dict

The Zenodo metadata dictionary for a deposit

required
filename str

The file to delete from the deposit

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
ZenodoFunctionResponseType

See here.

Source code in safedata_validator/zenodo.py
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
def delete_file(
    metadata: dict, filename: str, resources: Resources | None = None
) -> ZenodoFunctionResponseType:
    """Remove a named file from an unpublished Zenodo deposit.

    The current file listing is requested from the deposit, the named file is
    looked up in that listing and its own link is then used to delete it.

    Args:
        metadata: The Zenodo metadata dictionary for a deposit
        filename: The file to delete from the deposit
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Returns:
        See [here][safedata_validator.zenodo.ZenodoFunctionResponseType].
    """

    token = _resources_to_zenodo_api(resources)["ztoken"]

    # Ask for the current file listing rather than trusting the provided
    # metadata, which might be outdated
    listing = requests.get(metadata["links"]["files"], params=token)
    if listing.status_code != 200:
        return {}, _zenodo_error_message(listing)

    # Map each file name in the deposit onto the link for that file
    link_by_name = {}
    for entry in listing.json():
        link_by_name[entry["filename"]] = entry["links"]["self"]

    if filename not in link_by_name:
        return {}, f"{filename} is not a file in the deposit"

    # Delete the file through its own link - only 204 is treated as success
    removal = requests.delete(link_by_name[filename], params=token)
    if removal.status_code == 204:
        return {"result": "success"}, None

    return {}, _zenodo_error_message(removal)

dataset_description(dataset_metadata, zenodo_metadata, render=True, extra=None, resources=None)

Create an HTML dataset description.

This function turns a dataset metadata JSON into html for inclusion in published datasets. This content is used to populate the dataset description section in the Zenodo metadata. Zenodo has a limited set of permitted HTML tags, so this is quite simple HTML.

The available tags are: a, p, br, blockquote, strong, b, u, i, em, ul, ol, li, sub, sup, div, strike. Note that <a> is currently only available on Zenodo when descriptions are uploaded programmatically as a bug in their web interface strips links.

The description can be modified for specific uses by including HTML via the extra argument. This content is inserted below the dataset description.

Parameters:

Name Type Description Default
dataset_metadata dict

The dataset metadata

required
zenodo_metadata dict

The Zenodo deposit metadata

required
render bool

Should the html be returned as text or as the underlying dominate.tags.div object.

True
extra str | None

Additional HTML content to include in the description.

None
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None

Returns:

Type Description
div | str

Either a string of rendered HTML or a dominate.tags.div object.

Source code in safedata_validator/zenodo.py
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
def dataset_description(
    dataset_metadata: dict,
    zenodo_metadata: dict,
    render: bool = True,
    extra: str | None = None,
    resources: Resources | None = None,
) -> tags.div | str:
    """Create an HTML dataset description.

    This function turns a dataset metadata JSON into html for inclusion in
    published datasets. This content is used to populate the dataset description
    section in the Zenodo metadata. Zenodo has a limited set of permitted HTML
    tags, so this is quite simple HTML.

    The available tags are: a, p, br, blockquote, strong, b, u, i, em, ul, ol,
    li, sub, sup, div, strike. Note that `<a>` is currently only available on
    Zenodo when descriptions are uploaded programmatically as a bug in their
    web interface strips links.

    The description can be modified for specific uses by including HTML via the
    extra argument. This content is inserted below the dataset description.

    The provided ``dataset_metadata`` dictionary is not modified.

    Args:
        dataset_metadata: The dataset metadata
        zenodo_metadata: The Zenodo deposit metadata. This is currently not used
            in building the description.
        render: Should the html be returned as text or as the underlying
            dominate.tags.div object.
        extra: Additional HTML content to include in the description. This is
            inserted without escaping.
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked. This is
            currently not used in building the description.

    Returns:
        Either a string of rendered HTML or a dominate.tags.div object.
    """

    # zres = _resources_to_zenodo_api(resources)
    # metadata_api = zres["mdapi"]

    # PROJECT Title and authors are added by Zenodo from zenodo metadata
    # TODO - option to include here?

    desc = tags.div()

    # Dataset summary - string children are escaped on the way in, so line
    # breaks have to be added as real <br> elements rather than as markup
    # embedded in the text.
    desc += tags.b("Description: ")
    summary = tags.p()
    for line_no, line in enumerate(dataset_metadata["description"].split("\n")):
        if line_no:
            summary += tags.br()
        summary += line
    desc += summary

    # Extra - inserted as raw HTML, so this content is trusted as is
    if extra is not None:
        desc += raw(extra)

    # proj_url = URL('projects', 'project_view', args=[metadata['project_id']],
    #               scheme=True, host=True)
    # desc += P(B('Project: '), 'This dataset was collected as part of the following '
    #                          'SAFE research project: ', A(B(title), _href=proj_url))
    ##

    # Funding information
    if dataset_metadata["funders"]:
        funder_info = []

        for fnd in dataset_metadata["funders"]:
            funder_details = [fnd["body"], "(", fnd["type"]]

            if fnd["ref"]:
                funder_details.append(str(fnd["ref"]))
            if fnd["url"]:
                funder_details.append(tags.a(fnd["url"], _href=fnd["url"]))

            funder_details.append(")")
            funder_info.append(tags.li(funder_details))

        desc += [
            tags.p(
                tags.b("Funding: "),
                "These data were collected as part of research funded by: ",
                tags.ul(funder_info),
            ),
            tags.p(
                "This dataset is released under the CC-BY 4.0 licence, requiring that "
                "you cite the dataset in any outputs, but has the additional condition "
                "that you acknowledge the contribution of these funders in any outputs."
            ),
        ]

    # Permits
    if dataset_metadata["permits"]:
        desc += tags.p(
            tags.b("Permits: "),
            "These data were collected under permit from the following authorities:",
            tags.ul(
                [
                    tags.li(
                        f"{pmt['authority']} ({pmt['type']} licence {pmt['number']})"
                    )
                    for pmt in dataset_metadata["permits"]
                ]
            ),
        )

    # Present a description of the file or files including 'external' files
    # (data files loaded directly to Zenodo).
    ds_files = [dataset_metadata["filename"]]
    n_ds_files = 1
    ex_files = []

    if dataset_metadata["external_files"]:
        ex_files = dataset_metadata["external_files"]
        ds_files += [f["file"] for f in ex_files]
        n_ds_files += len(ex_files)

    desc += tags.p(
        tags.b("Files: "),
        f"This dataset consists of {n_ds_files} files: ",
        ", ".join(ds_files),
    )

    # Group the sheets by their 'external' file and collect them into a
    # dictionary keyed by source file. Sheets in the submitted workbook have no
    # external file and are stored under the key False. get() is used here for
    # older data where external was not present.
    #
    # The grouping is built directly rather than by sorting and using groupby:
    # sorting would reorder the list held in dataset_metadata and would also
    # fail when comparing the False key against file name strings.
    tables_by_source: dict = {}
    for sheet in dataset_metadata["dataworksheets"]:
        source = sheet.get("external") or False
        tables_by_source.setdefault(source, []).append(sheet)

    # We've now got a set of files (worksheet + externals) and a dictionary of table
    # descriptions that might have an entry for each file.

    # Report the worksheet first
    desc += tags.p(tags.b(dataset_metadata["filename"]))

    # Report internal tables
    if False in tables_by_source:
        int_tabs = tables_by_source[False]
        desc += tags.p(
            f"This file contains dataset metadata and {len(int_tabs)} data tables:"
        )
        desc += tags.ol([tags.li(table_description(tab)) for tab in int_tabs])
    else:
        # No internal tables at all.
        desc += tags.p("This file only contains metadata for the files below")

    # Report on the other files
    for exf in ex_files:
        desc += tags.p(
            tags.b(exf["file"]), tags.p(f"Description: {exf['description']}")
        )

        if exf["file"] in tables_by_source:
            # Report table description
            ext_tabs = tables_by_source[exf["file"]]
            desc += tags.p(f"This file contains {len(ext_tabs)} data tables:")
            desc += tags.ol([tags.li(table_description(tab)) for tab in ext_tabs])

    # Add extents if populated
    if dataset_metadata["temporal_extent"] is not None:
        # Only the first ten characters of each temporal value are shown
        desc += tags.p(
            tags.b("Date range: "),
            "{0[0]} to {0[1]}".format(
                [x[:10] for x in dataset_metadata["temporal_extent"]]
            ),
        )
    if dataset_metadata["latitudinal_extent"] is not None:
        desc += tags.p(
            tags.b("Latitudinal extent: "),
            "{0[0]:.4f} to {0[1]:.4f}".format(dataset_metadata["latitudinal_extent"]),
        )
    if dataset_metadata["longitudinal_extent"] is not None:
        desc += tags.p(
            tags.b("Longitudinal extent: "),
            "{0[0]:.4f} to {0[1]:.4f}".format(dataset_metadata["longitudinal_extent"]),
        )

    # Find taxa data from each database (if they exist)
    gbif_taxon_index = dataset_metadata.get("gbif_taxa")
    ncbi_taxon_index = dataset_metadata.get("ncbi_taxa")

    # The general taxonomic coverage statement is only added when at least one
    # of the taxon indices is populated
    if gbif_taxon_index or ncbi_taxon_index:
        desc += tags.p(
            tags.b("Taxonomic coverage: "),
            tags.br(),
            "This dataset contains data associated with taxa and these have been "
            "validated against appropriate taxonomic authority databases.",
        )

    if gbif_taxon_index:
        desc += tags.p(
            tags.u("GBIF taxa details: "),
            tags.br(),
            tags.br(),
            "The following taxa were validated against the GBIF backbone dataset."
            " If a dataset uses a synonym, the accepted usage is shown followed by the "
            "dataset usage in brackets. Taxa that cannot be validated, including new "
            "species and other unknown taxa, morphospecies, functional groups and "
            "taxonomic levels not used in the GBIF backbone are shown in square "
            "brackets.",
            taxon_index_to_text(gbif_taxon_index, True, auth="GBIF"),
        )

    if ncbi_taxon_index:
        desc += tags.p(
            tags.u("NCBI taxa details: "),
            tags.br(),
            tags.br(),
            "The following taxa were validated against the NCBI taxonomy dataset."
            " If a dataset uses a synonym, the accepted usage is shown followed by the "
            "dataset usage in brackets. Taxa that cannot be validated, e.g. new or "
            "unknown species are shown in square brackets. Non-backbone taxonomic "
            "ranks (e.g. strains or subphyla) can be validated using the NCBI "
            "database. However, they will only be shown if the user explicitly "
            "provided a non-backbone taxon. When they are shown they will be "
            "accompanied by an message stating their rank.",
            taxon_index_to_text(ncbi_taxon_index, True, auth="NCBI"),
        )

    if render:
        return desc.render()
    else:
        return desc

table_description(tab)

Convert a dict containing table contents into an HTML table.

Function to return a description for an individual source file in a dataset. Typically datasets only have a single source file - the Excel workbook that also contains the metadata - but they may also report on external files loaded directly to Zenodo, which are described using the same mechanism.

Parameters:

Name Type Description Default
tab dict

A dict describing a data table

required

Returns:

Type Description
div

A dominate.tags.div instance containing an HTML description of the table

Source code in safedata_validator/zenodo.py
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
def table_description(tab: dict) -> tags.div:
    """Build an HTML summary of a single data table.

    The summary reports the table title and worksheet name, its description, the
    number of fields, the number of data rows and then an unordered list giving the
    name, description and field type of each field.

    Args:
        tab: A dict describing a data table. The keys read here are `title`, `name`,
            `description`, `max_col` and `fields`, plus either `n_data_row` or both
            `max_row` and `descriptors`.

    Returns:
        A `dominate.tags.div` instance containing an HTML description of the table
    """

    # Header paragraphs: title and worksheet, description, field count
    heading = tags.p(tags.b(tab["title"]), f" (described in worksheet {tab['name']})")
    tab_desc = tags.div(
        heading,
        tags.p(f"Description: {tab['description']}"),
        tags.p(f"Number of fields: {tab['max_col'] - 1}"),
    )

    # Row count: older records lack the explicit n_data_row key, so the count is
    # derived from the worksheet extent minus the descriptor rows instead.
    if "n_data_row" not in tab:
        row_text = f"Number of data rows: {tab['max_row'] - len(tab['descriptors'])}"
    elif tab["n_data_row"] == 0:
        row_text = "Number of data rows: Unavailable (table metadata description only)."
    else:
        row_text = f"Number of data rows: {tab['n_data_row']}"

    tab_desc += tags.p(row_text)

    # One list item per field
    field_list = tags.ul()
    for field in tab["fields"]:
        field_list += tags.li(
            tags.b(field["field_name"]),
            f": {field['description']} (Field type: {field['field_type']})",
        )

    tab_desc += tags.p("Fields: ")
    tab_desc += field_list

    return tab_desc

generate_inspire_xml(dataset_metadata, zenodo_metadata, resources, lineage_statement=None)

Convert dataset and zenodo metadata into GEMINI XML.

Produces an INSPIRE/GEMINI formatted XML record from dataset metadata, and Zenodo record metadata using a template XML file. The dataset URL defaults to the Zenodo record but can be replaced if a separate URL (such as a project specific website) is used. The Gemini XML standard requires a statement about the lineage of a dataset - this is automatically taken from the package configuration but can be overridden for individual datasets, for example to add dataset specific links, using the lineage_statement argument.

Parameters:

Name Type Description Default
dataset_metadata dict

A dictionary of the dataset metadata

required
zenodo_metadata dict

A dictionary of the Zenodo record metadata

required
resources Resources

The safedata_validator resource configuration to be used. This argument is required: the XML and Zenodo contact settings are read directly from it.

required
lineage_statement str | None

An optional alternative lineage statement about the data.

None

Returns:

Type Description
str

A string containing GEMINI compliant XML.

Source code in safedata_validator/zenodo.py
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
def generate_inspire_xml(
    dataset_metadata: dict,
    zenodo_metadata: dict,
    resources: Resources,
    lineage_statement: str | None = None,
) -> str:
    """Convert dataset and zenodo metadata into GEMINI XML.

    Produces an INSPIRE/GEMINI formatted XML record from dataset metadata,
    and Zenodo record metadata using a template XML file. The dataset URL
    defaults to the Zenodo record but can be replaced if a separate URL (such as
    a project specific website) is used. The Gemini XML standard requires a
    statement about the lineage of a dataset - this is automatically taken from the
    package configuration but can be overridden for individual datasets, for example to
    add dataset specific links, using the `lineage_statement` argument.

    Args:
        dataset_metadata: A dictionary of the dataset metadata
        zenodo_metadata: A dictionary of the Zenodo record metadata
        resources: The safedata_validator resource configuration to be used. This
            is required: the `xml` and `zenodo` sections are read directly from it.
        lineage_statement: An optional alternative lineage statement about the data.

    Returns:
        A string containing GEMINI compliant XML.
    """

    template_path = il_resources.files("safedata_validator.templates").joinpath(
        "gemini_xml_template.xml"
    )

    # Get the Jinja environment and load the template
    # - mypy: importlib returns a Traversable, which is a protocol that Path complies
    #         with, but the attribute isn't being recognized
    env = Environment(
        loader=FileSystemLoader(template_path.parent),  # type: ignore [attr-defined]
        autoescape=select_autoescape(),
    )

    template = env.get_template(template_path.name)

    # Build some reused values from the metadata
    # URIs -  form the DOI URL from the prereserved DOI metadata
    doi_url = f"https://doi.org/{zenodo_metadata['metadata']['prereserve_doi']['doi']}"

    # A true "publication" date is not available until a record is published, so use the
    # creation date of the deposit as a reasonable replacement, with the caveat that you
    # should generate the XML and publish on the same day.
    pub_date = dt.fromisoformat(zenodo_metadata["created"]).date()

    # A citation string: authors are comma separated, with an ampersand before the
    # final author. The string is assembled from the parts rather than by text
    # substitution, so that an earlier author whose name starts with the final
    # author's name is not altered.
    authors = [au["name"] for au in dataset_metadata["authors"]]
    if len(authors) > 1:
        author_string = f"{', '.join(authors[:-1])} & {authors[-1]}"
    else:
        author_string = ", ".join(authors)

    citation_string = (
        f"{author_string} ({pub_date.year}) "
        f"{dataset_metadata['title']} [Dataset] {doi_url}"
    )

    # Resource constraints text
    if dataset_metadata["access"] == "embargo":
        access_statement = (
            f"This data is under embargo until {dataset_metadata['embargo_date']}. "
            "After that date there are no restrictions to public access."
        )
    elif dataset_metadata["access"] == "restricted":
        access_statement = (
            "This dataset is currently not publicly available, please contact the "
            "Zenodo community owner to request access."
        )
    else:
        access_statement = "There are no restrictions to public access."

    # Get a copy of the project wide XML configuration from the resources and update it
    # with the file specific elements from the zenodo and dataset metadata
    context_dict = resources.xml.copy()

    context_dict.update(
        # Values also used on the Zenodo information or duplicated in the xml
        contactName=resources.zenodo.contact_name,
        contactOrcID=resources.zenodo.contact_orcid,
        pointofcontactName=resources.zenodo.contact_name,
        pointofcontactCountry=resources.xml.contactCountry,
        pointofcontactEmail=resources.xml.contactEmail,
        # The point of contact ORCID must be the ORCID, not the contact name
        pointofcontactOrcID=resources.zenodo.contact_orcid,
        # Dataset specific information
        citationRSIdentifier=doi_url,
        dateStamp=pub_date.isoformat(),
        publicationDate=pub_date.isoformat(),
        fileIdentifier=str(zenodo_metadata["id"]),
        title=dataset_metadata["title"],
        authors=dataset_metadata["authors"],
        abstract=dataset_metadata["description"],
        keywords=dataset_metadata["keywords"],
        citationString=citation_string,
        embargoValue=access_statement,
        # Temporal extents are truncated to the first ten characters
        startDate=dataset_metadata["temporal_extent"][0][:10],
        endDate=dataset_metadata["temporal_extent"][1][:10],
        westBoundLongitude=_min_dp(dataset_metadata["longitudinal_extent"][0], 2),
        eastBoundLongitude=_min_dp(dataset_metadata["longitudinal_extent"][1], 2),
        southBoundLatitude=_min_dp(dataset_metadata["latitudinal_extent"][0], 2),
        northBoundLatitude=_min_dp(dataset_metadata["latitudinal_extent"][1], 2),
        downloadLink=doi_url,
    )

    # Override global lineage statement
    if lineage_statement is not None:
        context_dict["lineageStatement"] = lineage_statement

    return template.render(context_dict)

download_ris_data(resources=None, ris_file=None)

Downloads Zenodo records into a RIS format bibliography file.

This function is used to maintain a bibliography file of the records uploaded to a safedata community on Zenodo. It accesses the Zenodo community specified in the resource configuration and downloads all records. It then optionally checks the list of downloaded DOIs against the content of an existing RIS file and then downloads citations for all new DOIs from datacite.org.

Parameters:

Name Type Description Default
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None
ris_file str | None

The path to an existing RIS format file containing previously downloaded records.

None

Returns:

Type Description
None

This function does not return a value. RIS formatted citation data for new records is written to ris_file when a path is provided.

Source code in safedata_validator/zenodo.py
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
def download_ris_data(
    resources: Resources | None = None, ris_file: str | None = None
) -> None:
    """Downloads Zenodo records into a RIS format bibliography file.

    This function is used to maintain a bibliography file of the records
    uploaded to a safedata community on Zenodo. It accesses the Zenodo community
    specified in the resource configuration and downloads all records. It then
    optionally checks the list of downloaded DOIs against the content of an
    existing RIS file and then downloads citations for all new DOIs from
    datacite.org.

    Args:
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.
        ris_file: The path to an existing RIS format file containing previously
            downloaded records.

    Returns:
        None. Citation data for new records is appended to `ris_file` if that file
        exists, or written to a new file at that path if not. Nothing is written
        when `ris_file` is not provided or when no new DOIs are found.

    Raises:
        OSError: If a page of the Zenodo records API does not return status 200.
    """

    if resources is None:
        resources = Resources()

    # Get the set of known record ids from an existing RIS file if one is
    # provided. The id is taken from the last component of each entry URL.
    known_recids: set[int] = set()
    new_doi: list[str] = []

    if ris_file and os.path.exists(ris_file):
        with open(ris_file) as bibliography_file:
            for entry in rispy.load(bibliography_file):
                known_recids.add(int(entry["url"].split("/")[-1]))

    # Zenodo API call to return the records associated with the SAFE community
    zres = _resources_to_zenodo_api(resources)
    z_api = zres["zapi"]
    z_cname = zres["zcomm"]

    api = f"{z_api}/records/?q=communities:{z_cname}"

    # Provide feedback on DOI collection
    LOGGER.info(f"Fetching record DOIs from {api}:")
    FORMATTER.push()

    # The try/finally guarantees that the log indentation is restored on every exit
    # path, including the API failure and the early return when nothing is new.
    try:
        # The API is paged - it contains a set of records and a link that points
        # to the next page of records, so keep looping until there are no more next
        n_records = 0
        while True:
            # Get the data
            safe_data = requests.get(api)

            if safe_data.status_code != 200:
                raise OSError("Cannot access Zenodo API")

            # Retrieve the record data and store the DOI for each new record
            safe_data_dict = safe_data.json()
            page_hits = safe_data_dict["hits"]["hits"]
            new_doi.extend(
                hit["doi"] for hit in page_hits if hit["id"] not in known_recids
            )

            # Reporting
            n_records += len(page_hits)
            LOGGER.info(f"{n_records}")

            # Update the link for the next page, unless there is no next page
            if "next" not in safe_data_dict["links"]:
                break
            api = safe_data_dict["links"]["next"]

        if not new_doi:
            LOGGER.info("No new DOIs found")
            return
    finally:
        FORMATTER.pop()

    # Use the datacite API to retrieve the citation data associated with the DOI
    # and save it out to a RIS format file
    data = []

    LOGGER.info(
        f"Retrieving citation data from Datacite for {len(new_doi)} new records"
    )
    FORMATTER.push()

    try:
        for doi in new_doi:
            ris_data = requests.get(
                f"https://data.datacite.org/application/x-research-info-systems/{doi}"
            )

            if ris_data.status_code != 200:
                LOGGER.warning(f"DOI {doi} not found in datacite.org")
            else:
                # Write the response content to the data list. It comes in as byte
                # data so needs to be decoded to a string variable
                LOGGER.info(f"Retrieved citation for DOI {doi}")
                data.append(ris_data.content.decode("utf-8") + "\r\n")
    finally:
        FORMATTER.pop()

    # Writing only occurs if a ris file path has actually been provided
    if ris_file:
        if os.path.exists(ris_file):
            LOGGER.info(f"Appending RIS data for {len(data)} new records to {ris_file}")
            write_mode = "a"
        else:
            LOGGER.info(f"Writing RIS data for {len(data)} records to {ris_file}")
            write_mode = "w"

        with open(ris_file, write_mode) as ris_file_out:
            ris_file_out.writelines(data)

sync_local_dir(datadir, xlsx_only=True, replace_modified=False, resources=None)

Synchronise a local data directory with a Zenodo community.

The safedata R package defines a directory structure used to store metadata and files downloaded from a safedata community on Zenodo and from a safedata metadata server. This tool allows a safedata developer or community maintainer to create or update such a directory with all of the resources in the Zenodo community, regardless of their public access status. This forms a backup (although Zenodo is heavily backed up) but also provides local copies of the files for testing and development of the code packages.

This function requires that the resources are configured with access tokens for Zenodo and the details of the metadata server.

Parameters:

Name Type Description Default
datadir str

The path to a local directory containing an existing safedata directory or an empty folder in which to create one.

required
resources Resources | None

The safedata_validator resource configuration to be used. If none is provided, the standard locations are checked.

None
xlsx_only bool

Should the download ignore large non-xlsx files, defaulting to True.

True
replace_modified bool

Should the synchronisation replace locally modified files with the archived version. By default, modified local files are left alone.

False
Source code in safedata_validator/zenodo.py
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
def sync_local_dir(
    datadir: str,
    xlsx_only: bool = True,
    replace_modified: bool = False,
    resources: Resources | None = None,
) -> None:
    """Synchronise a local data directory with a Zenodo community.

    The safedata R package expects a particular directory layout for the metadata and
    files it downloads from a safedata Zenodo community and metadata server. This
    function creates or updates such a directory so that it holds _all_ of the
    deposits available to the configured Zenodo account, whatever their public access
    status. The result acts as a backup and as a set of local files for testing and
    developing the code packages.

    The resources must be configured with a Zenodo access token and with the details
    of the metadata server.

    Args:
        datadir: The path to a local directory containing an existing safedata
            directory or an empty folder in which to create one.
        xlsx_only: Should the download ignore large non-xlsx files, defaulting
            to True.
        replace_modified: Should the synchronisation replace locally modified files with
            the archived version. By default, modified local files are left alone.
        resources: The safedata_validator resource configuration to be used. If
            none is provided, the standard locations are checked.

    Raises:
        OSError: If `datadir` is not an existing directory.
        RuntimeError: If the metadata API recorded in an existing `url.json` file
            differs from the configured one, or if a request for a page of Zenodo
            deposits fails.
    """

    def _get_file(url: str, outf: str, params: dict | None = None) -> None:
        """Stream the content found at a URL into a local file."""
        response = requests.get(url, params=params, stream=True)

        with open(outf, "wb") as target:
            shutil.copyfileobj(response.raw, target)

    # Resource configuration: Zenodo API root and token parameters
    zres = _resources_to_zenodo_api(resources)
    zenodo_api = zres["zapi"]
    params = zres["ztoken"]

    # The data directory has to exist already
    if not os.path.exists(datadir) or not os.path.isdir(datadir):
        raise OSError(f"{datadir} is not an existing directory")

    # The configured metadata api
    api = zres["mdapi"]

    # Record the API url in the directory on first use, otherwise check that the
    # recorded url is the same as the configured one.
    url_file = os.path.join(datadir, "url.json")

    if not os.path.exists(url_file):
        with open(url_file, "w") as urlf:
            simplejson.dump({"url": [api]}, urlf)
    else:
        with open(url_file) as urlf:
            dir_api = simplejson.load(urlf)["url"][0]

        if api != dir_api:
            raise RuntimeError(
                "Configured api does not match existing api in directory"
            )

    # The index files are always downloaded afresh: this is not run often enough
    # to be worth checking for updates.
    LOGGER.info("Downloading index files")
    index_files = (
        (f"{api}/api/index", "index.json"),
        (f"{api}/api/gazetteer", "gazetteer.geojson"),
        (f"{api}/api/location_aliases", "location_aliases.csv"),
    )
    for index_url, index_name in index_files:
        _get_file(index_url, os.path.join(datadir, index_name))

    # Page through the deposits associated with the account, which include the
    # download links for their files. An empty page marks the end.
    params["page"] = 1
    deposits = []

    LOGGER.info("Scanning Zenodo deposits")
    while True:
        this_page = requests.get(
            zenodo_api + "/deposit/depositions",
            params=params,
            json={},
            headers={"Content-Type": "application/json"},
        )

        if not this_page.ok:
            raise RuntimeError("Could not connect to Zenodo API. Invalid token?")

        page_deposits = this_page.json()
        if not page_deposits:
            break

        deposits += page_deposits
        LOGGER.info(f"Page {params['page']}")
        params["page"] += 1

    LOGGER.info(f"Processing {len(deposits)} deposits")

    # Mirror each submitted deposit into <datadir>/<concept id>/<record id>
    for dep in deposits:
        con_rec_id = str(dep["conceptrecid"])
        rec_id = str(dep["record_id"])

        if not dep["submitted"]:
            LOGGER.info(f"Unsubmitted draft {con_rec_id}/{rec_id}")
            continue

        LOGGER.info(f"Processing deposit {con_rec_id}/{rec_id}")
        FORMATTER.push()

        # Make sure the record directory exists
        rec_dir = os.path.join(datadir, con_rec_id, rec_id)
        if os.path.exists(rec_dir):
            LOGGER.info("Directory found")
        else:
            LOGGER.info("Creating directory")
            os.makedirs(rec_dir)

        # Files attached to the record
        for this_file in dep["files"]:
            file_name = this_file["filename"]

            if xlsx_only and not file_name.endswith(".xlsx"):
                LOGGER.info(f"Skipping non-excel file {file_name}")
                continue

            LOGGER.info(f"Processing {file_name}")
            FORMATTER.push()

            outf = os.path.join(rec_dir, file_name)

            if not os.path.exists(outf):
                LOGGER.info("Downloading")
                _get_file(this_file["links"]["download"], outf, params=params)
            elif _compute_md5(outf) == this_file["checksum"]:
                LOGGER.info("Already present")
            elif replace_modified:
                LOGGER.info("Replacing locally modified file")
                _get_file(this_file["links"]["download"], outf, params=params)
            else:
                LOGGER.warning("Local copy modified")

            FORMATTER.pop()

        # JSON metadata for the record, fetched from the metadata server if absent
        metadata = os.path.join(rec_dir, f"{rec_id}.json")
        if not os.path.exists(metadata):
            LOGGER.info("Downloading JSON metadata ")
            _get_file(f"{api}/api/record/{rec_id}", metadata)
        else:
            LOGGER.info("JSON Metadata found")

        FORMATTER.pop()

taxon_index_to_text(taxa, html=False, indent_width=4, auth='GBIF')

Render a taxon index as text or html.

This function takes a taxon index and renders the contents into either a text or html representation of the taxonomic hierarchy used in the dataset. Taxonomic ranks are indented to render a nested hierarchy. The auth argument is used to set whether the taxa are validated using GBIF or NCBI and this only affects the formatting of the names in the representation.

Parameters:

Name Type Description Default
taxa list[dict]

A list of taxon dictionaries containing the taxa for a dataset.

required
html bool

Render as html or text.

False
indent_width int

The indentation width to use for successive taxonomic ranks.

4
auth str

The taxonomic authority that the taxa are taken from.

'GBIF'

Returns:

Type Description
str | div

Either a HTML or text representation of the taxa tree.

Source code in safedata_validator/taxa.py
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
def taxon_index_to_text(
    taxa: list[dict], html: bool = False, indent_width: int = 4, auth: str = "GBIF"
) -> str | tags.div:
    """Render a taxon index as text or html.

    This function takes a taxon index and renders the contents into either a text or
    html representation of the taxonomic hierarchy used in the dataset. Taxonomic ranks
    are indented to render a nested hierarchy. The `auth` argument is used to set
    whether the taxa are validated using GBIF or NCBI and this only affects the
    formatting of the names in the representation.

    The input list is not modified: sorting and removal of repeated entries are
    carried out on a copy.

    Args:
        taxa: A list of taxon dictionaries containing the taxa for a dataset. Each
            dictionary must provide the keys `taxon_id`, `parent_id`, `taxon_name`,
            `taxon_rank`, `taxon_status` and `worksheet_name`. Root taxa have a
            `parent_id` of None.
        html: Render as html or text.
        indent_width: The indentation width to use for successive taxonomic ranks
            in the text representation.
        auth: The taxonomic authority that the taxa are taken from, either "GBIF"
            or "NCBI".

    Returns:
        Either a HTML or text representation of the taxa tree. An empty taxon index
        gives an empty `div` or an empty string.

    Raises:
        ValueError: If `auth` is not "GBIF" or "NCBI".
        KeyError: If the index is not empty but contains no root taxa.
    """

    if auth not in ("GBIF", "NCBI"):
        raise ValueError(f"Unknown auth value: {auth}")

    italic_ranks = ("genus", "species", "subspecies")

    def _indent(n: int):
        """Get the indentation for a taxon at depth n in the hierarchy."""
        if html:
            return raw("&ensp;-&ensp;" * n)
        return " " * indent_width * (n - 1)

    def _italic(name: str):
        """Italicise a name, using markup appropriate to the output format."""
        return tags.i(name) if html else f"_{name}_"

    def _format_name(tx: dict):
        """Format a taxon name following the conventions of the chosen authority."""
        if auth == "GBIF":
            if tx["taxon_rank"] in italic_ranks:
                return _italic(tx["taxon_name"])
            if tx["taxon_rank"] in ("morphospecies", "functional group"):
                return f"[{tx['worksheet_name']}, {tx['taxon_rank']}]"
            return tx["taxon_name"]

        # NCBI: user defined taxa are shown in square brackets and any taxon with a
        # rank outside the NCBI backbone is annotated with that rank.
        if tx["taxon_status"] == "user":
            if tx["taxon_rank"] in NCBI_BACKBONE_RANKS:
                return f"[{tx['taxon_name']}]"
            return f"[{tx['taxon_name']}]  (non-backbone rank: {tx['taxon_rank']})"

        if tx["taxon_rank"] in italic_ranks:
            return _italic(tx["taxon_name"])
        if tx["taxon_rank"] not in NCBI_BACKBONE_RANKS:
            return f"{tx['taxon_name']} (non-backbone rank: {tx['taxon_rank']})"
        return tx["taxon_name"]

    # Container type depends on whether or not html output is required
    output = tags.div() if html else StringIO()

    if not taxa:
        return output if html else output.getvalue()

    # Work on a sorted copy so the caller's list is left untouched. Taxa are ordered
    # by parent taxon, substituting 0 for None, and then alphabetically by name.
    ordered = sorted(taxa, key=lambda x: (x["parent_id"] or 0, x["taxon_name"]))

    # Find repeated entries: taxa for which all of these keys match
    match_keys = (
        "taxon_id",
        "parent_id",
        "taxon_name",
        "taxon_rank",
        "taxon_status",
    )
    repeats: dict[tuple, list[int]] = {}
    for idx, taxon in enumerate(ordered):
        repeats.setdefault(tuple(taxon[k] for k in match_keys), []).append(idx)

    # Within each set of repeated entries, keep only those sharing the first non-None
    # worksheet name. If every worksheet name is None, all entries are retained.
    surplus: set[int] = set()
    for indices in repeats.values():
        if len(indices) < 2:
            continue
        first_nm = next(
            (
                ordered[i]["worksheet_name"]
                for i in indices
                if ordered[i]["worksheet_name"] is not None
            ),
            None,
        )
        surplus.update(i for i in indices if ordered[i]["worksheet_name"] != first_nm)

    ordered = [taxon for idx, taxon in enumerate(ordered) if idx not in surplus]

    # Group taxa by their parent id. Grouping into a dictionary preserves the sorted
    # order within each group and does not rely on groups being contiguous.
    grouped: dict = {}
    for taxon in ordered:
        grouped.setdefault(taxon["parent_id"], []).append(taxon)

    # Start the stack with root taxa, which will have None as a parent (kingdoms for
    # GBIF and superkingdoms for NCBI). Each stack entry pairs the taxon being
    # rendered with the siblings that are still to be rendered after it.
    roots = grouped[None]
    stack = [(roots[0], roots[1:])]

    while stack:
        # Handle the current top of the stack: format the canonical name
        current, siblings = stack[-1]
        canon_name = _format_name(current)

        # Look for a remaining sibling sharing the same non-None worksheet name. The
        # position is found in the sibling list itself, so that it stays valid when
        # siblings with no worksheet name come before the match.
        pair_idx = None
        if current["worksheet_name"] is not None:
            pair_idx = next(
                (
                    idx
                    for idx, tx in enumerate(siblings)
                    if tx["worksheet_name"] == current["worksheet_name"]
                ),
                None,
            )

        if pair_idx is not None:
            # Pop out the matching entry and find which is 'accepted'
            name_pair = siblings.pop(pair_idx)
            if current["taxon_status"] == "accepted":
                as_name = _format_name(name_pair)
                as_status = name_pair["taxon_status"]
            else:
                as_name = canon_name
                as_status = current["taxon_status"]
                canon_name = _format_name(name_pair)

            if html:
                entry = [
                    _indent(len(stack)),
                    canon_name,
                    " (as ",
                    as_status,
                    ": ",
                    as_name,
                    ")",
                    tags.br(),
                ]
            else:
                entry = (
                    f"{_indent(len(stack))} {canon_name} (as {as_status}: {as_name})\n"
                )
        elif html:
            entry = [_indent(len(stack)), canon_name, tags.br()]
        else:
            entry = f"{_indent(len(stack))} {canon_name}\n"

        if html:
            output += entry
        else:
            output.write(entry)

        # Is this taxon a parent for other taxa - if so add the first child to the top
        # of the stack, otherwise unwind the stack until a level with a remaining
        # sibling is found and move on to that sibling.
        parent_id = current["taxon_id"]
        if parent_id in grouped:
            children = grouped[parent_id]
            stack.append((children[0], children[1:]))
        else:
            while stack:
                _, remaining = stack.pop()
                if remaining:
                    stack.append((remaining[0], remaining[1:]))
                    break

    return output if html else output.getvalue()

ZenodoFunctionResponseType = tuple[dict, Optional[str]] module-attribute

Function return value

The functions interacting with Zenodo all return a common format of tuple of length 2:

  • A dictionary containing the response content. For responses that do not generate a response content but just indicate success via HTTP status codes, an empty dictionary is returned. An empty dictionary is also returned when the function results in an error.
  • An error message on failure or None on success

So, for example:

({'key': 'value'}, None)
({}, 'Something went wrong')

The expected use pattern is then:

response, error = zenodo_function(args)