
The field submodule

This module contains the Dataset, DataWorksheet and BaseField classes, along with subclasses of BaseField for different data types. These are the core components of safedata_validator, responsible for opening a file containing formatted data and for loading and validating the field metadata and data rows in the main data tables.

Dataset

Process the components of a safedata_validator formatted dataset.

The Dataset class links a dataset with a particular configuration of the safedata_validator resources, and then loads the components of the dataset.

Parameters:

    resources (Resources | None): A Resources instance configuring safedata_validator to process a dataset. Default: None.
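
A minimal usage sketch (assuming a valid safedata_validator configuration is discoverable and that the Resources class is importable from safedata_validator.resources; the workbook name is purely illustrative):

from safedata_validator.field import Dataset
from safedata_validator.resources import Resources

# Load the configured validation resources (gazetteer, taxon databases, extents).
resources = Resources()

# Link a Dataset to those resources and validate a formatted workbook.
# 'my_dataset.xlsx' is an illustrative file name.
ds = Dataset(resources)
ds.load_from_workbook("my_dataset.xlsx")

print(ds.passed, ds.n_errors)
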
Source code in safedata_validator/field.py
class Dataset:
    """Process the components of a safedata_validator formatted dataset.

    The Dataset class links a dataset with a particular configuration of the
    safedata_validator resources, and then loads the components of the dataset.

    Args:
        resources: An Resources instance configuring safedata_validator to process a
            dataset.
    """

    def __init__(self, resources: Resources | None = None) -> None:
        # Try and load the default resources if None provided
        if resources is None:
            resources = Resources()

        self.filename: str | None = None
        self.resources = resources
        self.summary = Summary(resources)
        self.taxa = Taxa(resources)
        self.dataworksheets: list[DataWorksheet] = []
        self.n_errors = 0
        self.passed = False

        # Extents - these can be loaded from the Summary or compiled from the
        # data, so the Summary and dataset extents are held separately so that
        # they can be validated against one another once all data is checked.

        self.temporal_extent = Extent(
            "temporal extent",
            (datetime.date,),
            hard_bounds=resources.extents.temporal_hard_extent,
            soft_bounds=resources.extents.temporal_soft_extent,
        )
        self.latitudinal_extent = Extent(
            "latitudinal extent",
            (float, int),
            hard_bounds=resources.extents.latitudinal_hard_extent,
            soft_bounds=resources.extents.latitudinal_soft_extent,
        )
        self.longitudinal_extent = Extent(
            "longitudinal extent",
            (float, int),
            hard_bounds=resources.extents.longitudinal_hard_extent,
            soft_bounds=resources.extents.longitudinal_soft_extent,
        )

        # Create Locations and pass in dataset extents
        self.locations = Locations(
            resources,
            latitudinal_extent=self.latitudinal_extent,
            longitudinal_extent=self.longitudinal_extent,
        )

    def load_from_workbook(
        self,
        filename: str,
        validate_doi: bool = False,
        chunk_size: int = 1000,
        console_log: bool = True,
    ) -> None:
        """Populate a Dataset instance from a safedata_validator formatted Excel file.

        This method populates a Dataset using the safedata_validator format for Excel
        workbooks in .xlsx format.

        Logging messages are used to report the progress of the validation process.
        These are always captured in an internal log (`logging.LOG`) but are also
        printed by default to the command line. The `console_log` argument can be used
        to suppress console logging, when this command is being run programmatically
        rather than via the `safedata_validate` command line script.

        Args:
            filename: A path to the workbook containing the dataset
            validate_doi: Should DOIs in the dataset summary be validated
            chunk_size: Data is read from worksheets in chunks of rows - this
                argument sets the size of that chunk.
            console_log: Suppress command line logging.
        """

        # Handle logging details - flush and reset from previous runs.
        handler = get_handler()
        handler.reset()

        if console_log:
            handler.setLevel("DEBUG")
        else:
            handler.setLevel("CRITICAL")

        # Open the workbook with:
        #  - read_only to use the memory optimised read_only implementation.
        #    This is a bit restricted as it only exposes row by row iteration
        #    to avoid expensive XML traversal but has a lower memory footprint.
        #  - data_only to load values not formulae for equations.
        self.filename = os.path.basename(filename)
        wb = load_workbook(filename, read_only=True, data_only=True)

        # Populate summary
        if "Summary" in wb.sheetnames:
            self.summary.load(
                wb["Summary"],
                wb.sheetnames,
                validate_doi=validate_doi,
            )
        else:
            # No summary is impossible - so an error and no dataworksheets
            # will be loaded, but taxon and locations could be checked if present
            LOGGER.error("No summary worksheet found - moving on")

        # Populate locations
        if "Locations" in wb.sheetnames:
            self.locations.load(wb["Locations"])
        else:
            # No locations is pretty implausible - lab experiments?
            LOGGER.warning("No locations worksheet found - moving on")

        # Throw an error if both Taxa and GBIFTaxa have been given as worksheet names
        gbif_sheets = {"GBIFTaxa", "Taxa"}.intersection(wb.sheetnames)

        if len(gbif_sheets) == 2:
            LOGGER.error(
                "Both Taxa and GBIFTaxa provided as sheet names, this "
                "is not allowed! Only checking GBIFTaxa sheet"
            )
            self.taxa.gbif_taxa.load(wb["GBIFTaxa"])
        # Otherwise populate gbif_taxa from the one that has been provided
        elif len(gbif_sheets) == 1:
            self.taxa.gbif_taxa.load(wb[list(gbif_sheets)[0]])

        # Populate ncbi taxa
        ncbi_sheet = "NCBITaxa" in wb.sheetnames
        if ncbi_sheet:
            self.taxa.ncbi_taxa.load(wb["NCBITaxa"])

        if not gbif_sheets and not ncbi_sheet:
            # Leave the default empty Taxa object
            LOGGER.warning("Neither Taxa worksheet found - moving on")

        # Load data worksheets
        for sheet_meta in self.summary.data_worksheets:
            sheet_name = sheet_meta["name"]

            if sheet_name in wb.sheetnames:
                dwsh = DataWorksheet(sheet_meta, self)
                dwsh.load_from_worksheet(wb[sheet_name], row_chunk_size=chunk_size)
                self.dataworksheets.append(dwsh)

        # Run final checks
        self.final_checks()

    @loggerinfo_push_pop("Running final checking")
    def final_checks(self) -> None:
        """Run final validation checks on a Dataset instance.

        This method checks that:
        1. all locations and taxa provided have been used in the data worksheets scanned
        2. no worksheet taxon names are duplicated between GBIFTaxa and NCBITaxa.
        3. extents in the data are congruent with the summary extents

        Finally, the method reports the total number of errors and warnings from
        processing the dataset.
        """

        LOGGER.info("Checking provided locations and taxa all used in data worksheets")
        FORMATTER.push()

        # check locations and taxa
        # - Can't validate when there are external files which might use any unused ones
        # - If the sets are not the same, then the worksheet reports will catch
        #   undocumented ones so here only look for ones that are provided but not used
        #   in the data.
        if not self.locations.is_empty:
            if self.summary.external_files:
                LOGGER.warning(
                    "Location list cannot be validated when external data files are"
                    " used"
                )
            elif self.locations.locations_used == self.locations.locations:
                LOGGER.info("Provided locations all used in datasets")
            elif self.locations.locations_used == (
                self.locations.locations & self.locations.locations_used
            ):
                LOGGER.error(
                    "Provided locations not used: ",
                    extra={
                        "join": self.locations.locations - self.locations.locations_used
                    },
                )

        # check taxa
        if not self.taxa.is_empty:
            if self.summary.external_files:
                LOGGER.warning(
                    "Taxon list cannot be validated when external data files are used"
                )
            else:
                if self.taxa.taxon_names_used == self.taxa.taxon_names:
                    LOGGER.info("Provided taxa all used in datasets")
                elif self.taxa.taxon_names_used != self.taxa.taxon_names.union(
                    self.taxa.taxon_names_used
                ):
                    LOGGER.error(
                        "Provided taxa not used: ",
                        extra={
                            "join": self.taxa.taxon_names - self.taxa.taxon_names_used
                        },
                    )

                if self.taxa.repeat_names != set():
                    LOGGER.error(
                        "The following taxa are defined in both GBIFTaxa and "
                        "NCBITaxa: ",
                        extra={"join": self.taxa.repeat_names},
                    )

        # Check the extents - there are both summary and dataset extents so
        # check at least one is populated for each extent and that they are
        # compatible
        FORMATTER.pop()
        LOGGER.info("Checking temporal and geographic extents")
        FORMATTER.push()

        extents_to_check = (
            ("Temporal", "temporal_extent"),
            ("Latitudinal", "latitudinal_extent"),
            ("Longitudinal", "longitudinal_extent"),
        )

        for label, this_extent in extents_to_check:
            dataset_extent = getattr(self, this_extent)
            summary_extent = getattr(self.summary, this_extent)

            # If neither: need to provide in summary. If both: consistent.
            if not (dataset_extent.populated or summary_extent.populated):
                LOGGER.error(
                    f"{label} extent not set from data or provided in summary: "
                    "add extents to dataset Summary"
                )
            elif (dataset_extent.populated and summary_extent.populated) and (
                (dataset_extent.extent[0] < summary_extent.extent[0])
                or (dataset_extent.extent[1] > summary_extent.extent[1])
            ):
                LOGGER.error(
                    f"{label} extent values from the data fall outside the extents "
                    "set in the Summary sheet "
                    f"({[str(x) for x in dataset_extent.extent]})"
                )
            elif dataset_extent.populated and summary_extent.populated:
                LOGGER.warning(
                    f"The {label} extent is set in Summary but also "
                    "is populated from the data - this may be deliberate!"
                )
            elif dataset_extent.populated and not summary_extent.populated:
                summary_extent = dataset_extent
                LOGGER.info(
                    f"The {label} extent in Summary is not set and has "
                    "been populated from the data"
                )

        # Dedent for final result
        FORMATTER.pop(n=2)
        handler = get_handler()

        if handler.counters["ERROR"] > 0:
            self.n_errors = handler.counters["ERROR"]
            if handler.counters["WARNING"] > 0:
                LOGGER.info(
                    f"FAIL: file contained {handler.counters['ERROR']} errors "
                    f"and {handler.counters['WARNING']} warnings"
                )
            else:
                LOGGER.info(f"FAIL: file contained {handler.counters['ERROR']} errors")
        else:
            self.passed = True

            if handler.counters["WARNING"] > 0:
                LOGGER.info(
                    "PASS: file formatted correctly but with "
                    f"{handler.counters['WARNING']} warnings"
                )
            else:
                LOGGER.info("PASS: file formatted correctly with no warnings")

    def to_json(self):
        """Export a Dataset to JSON.

        This method exports key data about the dataset in JSON format. This
        method is used to export a description of a dataset that can be used
        to populate a metadata server and publish datasets to Zenodo.
        """

        # TODO - continue with class.to_dict() methods?

        json_dict = dict(
            # Summary information
            title=self.summary.title,
            description=self.summary.description,
            authors=self.summary.authors,
            filename=self.filename,
            external_files=self.summary.external_files,
            access=self.summary.access["access"],
            embargo_date=self.summary.access["embargo_date"],
            access_conditions=self.summary.access["access_conditions"],
            funders=self.summary.funders,
            permits=self.summary.permits,
            keywords=self.summary.keywords,
            dataworksheets=[dwsh.to_dict() for dwsh in self.dataworksheets],
            # Taxa
            # Gbif taxa if they exist
            gbif_timestamp=self.resources.gbif_timestamp,
            gbif_taxa=[
                dict(
                    zip(
                        (
                            "worksheet_name",
                            "taxon_id",
                            "parent_id",
                            "taxon_name",
                            "taxon_rank",
                            "taxon_status",
                        ),
                        tx,
                    )
                )
                for tx in self.taxa.gbif_taxa.taxon_index
            ],
            # NCBI taxa if they exist
            ncbi_timestamp=self.resources.ncbi_timestamp,
            ncbi_taxa=[
                dict(
                    zip(
                        (
                            "worksheet_name",
                            "taxon_id",
                            "parent_id",
                            "taxon_name",
                            "taxon_rank",
                            "taxon_status",
                        ),
                        tx,
                    )
                )
                for tx in self.taxa.ncbi_taxa.taxon_index
            ],
            # Locations
            locations=[
                dict(zip(("name", "new_location", "wkt_wgs84"), lc))
                for lc in self.locations.location_index
            ],
            # Publication details - these are populated by the
            # Zenodo publication mechanism.
            zenodo_concept_id=None,
            zenodo_record_id=None,
            zenodo_publication_date=None,
        )

        # Extents - summary take priority over dataset.
        for ext in ("temporal_extent", "latitudinal_extent", "longitudinal_extent"):
            summary_extent = getattr(self.summary, ext)
            data_extent = getattr(self, ext)
            if summary_extent.populated:
                json_dict[ext] = summary_extent.extent
            else:
                json_dict[ext] = data_extent.extent

        return simplejson.dumps(json_dict, default=str, indent=2)

    @property
    def error_breakdown(self):
        """Get error counts broken down by dataset component."""
        return {
            "total": self.n_errors,
            "summary": self.summary.n_errors,
            "locations": self.locations.n_errors,
            "gbif": self.taxa.gbif_taxa.n_errors,
            "ncbi": self.taxa.ncbi_taxa.n_errors,
            "data": [(d.name, d.n_errors) for d in self.dataworksheets],
        }

error_breakdown property

Get error counts broken down by dataset component.
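
For example, after a validation run the breakdown can be inspected to see which components contributed errors (a sketch, reusing a validated Dataset instance named ds):

breakdown = ds.error_breakdown
print(f"Total errors: {breakdown['total']}")
print(f"Summary errors: {breakdown['summary']}, location errors: {breakdown['locations']}")

# Per-worksheet error counts are reported as (name, n_errors) pairs.
for sheet_name, sheet_errors in breakdown["data"]:
    print(f"Worksheet '{sheet_name}': {sheet_errors} errors")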

final_checks()

Run final validation checks on a Dataset instance.

This method checks that:

  1. All locations and taxa provided have been used in the data worksheets scanned.
  2. No worksheet taxon names are duplicated between GBIFTaxa and NCBITaxa.
  3. Extents in the data are congruent with the summary extents.

Finally, the method reports the total number of errors and warnings from processing the dataset.

Source code in safedata_validator/field.py
@loggerinfo_push_pop("Running final checking")
def final_checks(self) -> None:
    """Run final validation checks on a Dataset instance.

    This method checks that:
    1. all locations and taxa provided have been used in the data worksheets scanned
    2. no worksheet taxon names are duplicated between GBIFTaxa and NCBITaxa.
    3. extents in the data are congruent with the summary extents

    Finally, the method reports the total number of errors and warnings from
    processing the dataset.
    """

    LOGGER.info("Checking provided locations and taxa all used in data worksheets")
    FORMATTER.push()

    # check locations and taxa
    # - Can't validate when there are external files which might use any unused ones
    # - If the sets are not the same, then the worksheet reports will catch
    #   undocumented ones so here only look for ones that are provided but not used
    #   in the data.
    if not self.locations.is_empty:
        if self.summary.external_files:
            LOGGER.warning(
                "Location list cannot be validated when external data files are"
                " used"
            )
        elif self.locations.locations_used == self.locations.locations:
            LOGGER.info("Provided locations all used in datasets")
        elif self.locations.locations_used == (
            self.locations.locations & self.locations.locations_used
        ):
            LOGGER.error(
                "Provided locations not used: ",
                extra={
                    "join": self.locations.locations - self.locations.locations_used
                },
            )

    # check taxa
    if not self.taxa.is_empty:
        if self.summary.external_files:
            LOGGER.warning(
                "Taxon list cannot be validated when external data files are used"
            )
        else:
            if self.taxa.taxon_names_used == self.taxa.taxon_names:
                LOGGER.info("Provided taxa all used in datasets")
            elif self.taxa.taxon_names_used != self.taxa.taxon_names.union(
                self.taxa.taxon_names_used
            ):
                LOGGER.error(
                    "Provided taxa not used: ",
                    extra={
                        "join": self.taxa.taxon_names - self.taxa.taxon_names_used
                    },
                )

            if self.taxa.repeat_names != set():
                LOGGER.error(
                    "The following taxa are defined in both GBIFTaxa and "
                    "NCBITaxa: ",
                    extra={"join": self.taxa.repeat_names},
                )

    # Check the extents - there are both summary and dataset extents so
    # check at least one is populated for each extent and that they are
    # compatible
    FORMATTER.pop()
    LOGGER.info("Checking temporal and geographic extents")
    FORMATTER.push()

    extents_to_check = (
        ("Temporal", "temporal_extent"),
        ("Latitudinal", "latitudinal_extent"),
        ("Longitudinal", "longitudinal_extent"),
    )

    for label, this_extent in extents_to_check:
        dataset_extent = getattr(self, this_extent)
        summary_extent = getattr(self.summary, this_extent)

        # If neither: need to provide in summary. If both: consistent.
        if not (dataset_extent.populated or summary_extent.populated):
            LOGGER.error(
                f"{label} extent not set from data or provided in summary: "
                "add extents to dataset Summary"
            )
        elif (dataset_extent.populated and summary_extent.populated) and (
            (dataset_extent.extent[0] < summary_extent.extent[0])
            or (dataset_extent.extent[1] > summary_extent.extent[1])
        ):
            LOGGER.error(
                f"{label} extent values from the data fall outside the extents "
                "set in the Summary sheet "
                f"({[str(x) for x in dataset_extent.extent]})"
            )
        elif dataset_extent.populated and summary_extent.populated:
            LOGGER.warning(
                f"The {label} extent is set in Summary but also "
                "is populated from the data - this may be deliberate!"
            )
        elif dataset_extent.populated and not summary_extent.populated:
            summary_extent = dataset_extent
            LOGGER.info(
                f"The {label} extent in Summary is not set and has "
                "been populated from the data"
            )

    # Dedent for final result
    FORMATTER.pop(n=2)
    handler = get_handler()

    if handler.counters["ERROR"] > 0:
        self.n_errors = handler.counters["ERROR"]
        if handler.counters["WARNING"] > 0:
            LOGGER.info(
                f"FAIL: file contained {handler.counters['ERROR']} errors "
                f"and {handler.counters['WARNING']} warnings"
            )
        else:
            LOGGER.info(f"FAIL: file contained {handler.counters['ERROR']} errors")
    else:
        self.passed = True

        if handler.counters["WARNING"] > 0:
            LOGGER.info(
                "PASS: file formatted correctly but with "
                f"{handler.counters['WARNING']} warnings"
            )
        else:
            LOGGER.info("PASS: file formatted correctly with no warnings")

load_from_workbook(filename, validate_doi=False, chunk_size=1000, console_log=True)

Populate a Dataset instance from a safedata_validator formatted Excel file.

This method populates a Dataset using the safedata_validator format for Excel workbooks in .xlsx format.

Logging messages are used to report the progress of the validation process. These are always captured in an internal log (logging.LOG) but are also printed by default to the command line. The console_log argument can be used to suppress console logging, when this command is being run programmatically rather than via the safedata_validate command line script.

Parameters:

    filename (str): A path to the workbook containing the dataset. Required.
    validate_doi (bool): Should DOIs in the dataset summary be validated. Default: False.
    chunk_size (int): Data is read from worksheets in chunks of rows - this argument sets the size of that chunk. Default: 1000.
    console_log (bool): Should validation progress be logged to the console; set to False to suppress console output when running programmatically. Default: True.
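
A sketch of a programmatic call with console logging suppressed (the workbook name is illustrative; messages are still captured by the internal log handler):

ds = Dataset()
ds.load_from_workbook(
    "my_dataset.xlsx",
    validate_doi=False,
    chunk_size=500,
    console_log=False,
)

if not ds.passed:
    print(f"Validation failed with {ds.n_errors} errors")
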
Source code in safedata_validator/field.py
def load_from_workbook(
    self,
    filename: str,
    validate_doi: bool = False,
    chunk_size: int = 1000,
    console_log: bool = True,
) -> None:
    """Populate a Dataset instance from a safedata_validator formatted Excel file.

    This method populates a Dataset using the safedata_validator format for Excel
    workbooks in .xlsx format.

    Logging messages are used to report the progress of the validation process.
    These are always captured in an internal log (`logging.LOG`) but are also
    printed by default to the command line. The `console_log` argument can be used
    to suppress console logging, when this command is being run programmatically
    rather than via the `safedata_validate` command line script.

    Args:
        filename: A path to the workbook containing the dataset
        validate_doi: Should DOIs in the dataset summary be validated
        chunk_size: Data is read from worksheets in chunks of rows - this
            argument sets the size of that chunk.
        console_log: Suppress command line logging.
    """

    # Handle logging details - flush and reset from previous runs.
    handler = get_handler()
    handler.reset()

    if console_log:
        handler.setLevel("DEBUG")
    else:
        handler.setLevel("CRITICAL")

    # Open the workbook with:
    #  - read_only to use the memory optimised read_only implementation.
    #    This is a bit restricted as it only exposes row by row iteration
    #    to avoid expensive XML traversal but has a lower memory footprint.
    #  - data_only to load values not formulae for equations.
    self.filename = os.path.basename(filename)
    wb = load_workbook(filename, read_only=True, data_only=True)

    # Populate summary
    if "Summary" in wb.sheetnames:
        self.summary.load(
            wb["Summary"],
            wb.sheetnames,
            validate_doi=validate_doi,
        )
    else:
        # No summary is impossible - so an error and no dataworksheets
        # will be loaded, but taxon and locations could be checked if present
        LOGGER.error("No summary worksheet found - moving on")

    # Populate locations
    if "Locations" in wb.sheetnames:
        self.locations.load(wb["Locations"])
    else:
        # No locations is pretty implausible - lab experiments?
        LOGGER.warning("No locations worksheet found - moving on")

    # Throw an error if both Taxa and GBIFTaxa have been given as worksheet names
    gbif_sheets = {"GBIFTaxa", "Taxa"}.intersection(wb.sheetnames)

    if len(gbif_sheets) == 2:
        LOGGER.error(
            "Both Taxa and GBIFTaxa provided as sheet names, this "
            "is not allowed! Only checking GBIFTaxa sheet"
        )
        self.taxa.gbif_taxa.load(wb["GBIFTaxa"])
    # Otherwise populate gbif_taxa from the one that has been provided
    elif len(gbif_sheets) == 1:
        self.taxa.gbif_taxa.load(wb[list(gbif_sheets)[0]])

    # Populate ncbi taxa
    ncbi_sheet = "NCBITaxa" in wb.sheetnames
    if ncbi_sheet:
        self.taxa.ncbi_taxa.load(wb["NCBITaxa"])

    if not gbif_sheets and not ncbi_sheet:
        # Leave the default empty Taxa object
        LOGGER.warning("Neither Taxa worksheet found - moving on")

    # Load data worksheets
    for sheet_meta in self.summary.data_worksheets:
        sheet_name = sheet_meta["name"]

        if sheet_name in wb.sheetnames:
            dwsh = DataWorksheet(sheet_meta, self)
            dwsh.load_from_worksheet(wb[sheet_name], row_chunk_size=chunk_size)
            self.dataworksheets.append(dwsh)

    # Run final checks
    self.final_checks()

to_json()

Export a Dataset to JSON.

This method exports key data about the dataset in JSON format. This method is used to export a description of a dataset that can be used to populate a metadata server and publish datasets to Zenodo.
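
For example, the JSON description of a validated dataset might be written out for use by a metadata server (a sketch; ds is a Dataset instance that has already been validated and the output path is illustrative):

metadata_json = ds.to_json()

with open("my_dataset_metadata.json", "w") as json_file:
    json_file.write(metadata_json)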

Source code in safedata_validator/field.py
def to_json(self):
    """Export a Dataset to JSON.

    This method exports key data about the dataset in JSON format. This
    method is used to export a description of a dataset that can be used
    to populate a metadata server and publish datasets to Zenodo.
    """

    # TODO - continue with class.to_dict() methods?

    json_dict = dict(
        # Summary information
        title=self.summary.title,
        description=self.summary.description,
        authors=self.summary.authors,
        filename=self.filename,
        external_files=self.summary.external_files,
        access=self.summary.access["access"],
        embargo_date=self.summary.access["embargo_date"],
        access_conditions=self.summary.access["access_conditions"],
        funders=self.summary.funders,
        permits=self.summary.permits,
        keywords=self.summary.keywords,
        dataworksheets=[dwsh.to_dict() for dwsh in self.dataworksheets],
        # Taxa
        # Gbif taxa if they exist
        gbif_timestamp=self.resources.gbif_timestamp,
        gbif_taxa=[
            dict(
                zip(
                    (
                        "worksheet_name",
                        "taxon_id",
                        "parent_id",
                        "taxon_name",
                        "taxon_rank",
                        "taxon_status",
                    ),
                    tx,
                )
            )
            for tx in self.taxa.gbif_taxa.taxon_index
        ],
        # NCBI taxa if they exist
        ncbi_timestamp=self.resources.ncbi_timestamp,
        ncbi_taxa=[
            dict(
                zip(
                    (
                        "worksheet_name",
                        "taxon_id",
                        "parent_id",
                        "taxon_name",
                        "taxon_rank",
                        "taxon_status",
                    ),
                    tx,
                )
            )
            for tx in self.taxa.ncbi_taxa.taxon_index
        ],
        # Locations
        locations=[
            dict(zip(("name", "new_location", "wkt_wgs84"), lc))
            for lc in self.locations.location_index
        ],
        # Publication details - these are populated by the
        # Zenodo publication mechanism.
        zenodo_concept_id=None,
        zenodo_record_id=None,
        zenodo_publication_date=None,
    )

    # Extents - summary take priority over dataset.
    for ext in ("temporal_extent", "latitudinal_extent", "longitudinal_extent"):
        summary_extent = getattr(self.summary, ext)
        data_extent = getattr(self, ext)
        if summary_extent.populated:
            json_dict[ext] = summary_extent.extent
        else:
            json_dict[ext] = data_extent.extent

    return simplejson.dumps(json_dict, default=str, indent=2)

DataWorksheet

Process the contents of a safedata_validator formatted data table.

This class is used to load and check the formatting and content of a data table using the safedata_validator format. It requires the containing Dataset as an argument to get access to Summary, Locations, Taxa and Extents attributes and the specific data table metadata from the Summary instance.

The workflow for methods validating a data table is:

  1. Create a DataWorksheet instance using the summary metadata.
  2. Parse the rows at the top of the file containing field metadata and use validate_field_meta to validate the contents.
  3. Parse the data rows below the field metadata using the validate_data_rows method. This can be done repeatedly to handle chunked inputs.
  4. Use the report method to obtain all of the logging associated with data validation and run any final checks.

This workflow is currently implemented for Excel format data worksheets in the load_from_worksheet method.

Parameters:

    sheet_meta (dict): The metadata dictionary for this worksheet from the Summary instance, containing the worksheet name, title and description and the names of any external files to be associated with this worksheet. Required.
    dataset (Dataset | None): A safedata_validator Dataset object, providing Taxa, Locations and Summary information. Default: None.
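
A sketch of the manual workflow listed above, assuming dataset is a configured Dataset instance; the sheet metadata, descriptor names and values are purely illustrative and would need to follow the safedata_validator format for the chosen field types:

from safedata_validator.field import DataWorksheet

# Worksheet metadata as it would appear in the Summary data_worksheets list.
sheet_meta = {
    "name": "MyData",
    "title": "An example data table",
    "description": "Illustrative field measurements",
    "external": None,
}

dwsh = DataWorksheet(sheet_meta, dataset)

# Field metadata: descriptor name -> one value per field, with field_name
# provided as the last descriptor.
dwsh.validate_field_meta(
    {
        "field_type": ("id", "numeric"),
        "description": ("Sample identifier", "Stem length"),
        "method": (None, "Measured with a ruler"),
        "units": (None, "mm"),
        "field_name": ("sample_id", "stem_length"),
    }
)

# Data rows: the row number comes first, followed by one value per field.
# Chunks of rows can be passed repeatedly before the final report.
dwsh.validate_data_rows(
    [
        [1, "S1", 10.2],
        [2, "S2", 11.5],
    ]
)

# Emit the accumulated field logs and run the final worksheet checks.
dwsh.report()
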
Source code in safedata_validator/field.py
class DataWorksheet:
    """Process the contents of safedata_validator formatted data table.

    This class is used to load and check the formatting and content of a data table
    using the safedata_validator format. It requires the containing Dataset as an
    argument to get access to Summary, Locations, Taxa and Extents attributes and the
    specific data table metadata from the Summary instance.

    The workflow for methods validating a data table is:

    1. Create a DataWorksheet instance using the summary metadata.
    2. Parse the rows at the top of the file containing field metadata and use
       `validate_field_meta` to validate the contents.
    3. Parse the data rows below the field metadata using the `validate_data_rows`
       method. This can be done repeatedly to handle chunked inputs.
    4. Use the report method to obtain all of the logging associated with data
       validation and run any final checks.

    This workflow is currently implemented for Excel format data worksheets in the
    `load_from_worksheet` method.

    Args:
        sheet_meta: The metadata dictionary for this worksheet from
            the Summary instance, containing the worksheet name, title and description
            and the names of any external files to be associated with this worksheet.
        dataset: A safedata_validator Dataset object, providing Taxa, Locations and
            Summary information.
    """

    def __init__(
        self,
        sheet_meta: dict,
        dataset: Dataset | None = None,
    ) -> None:
        # Set initial values

        # TODO - checks on sheetmeta
        self.name = sheet_meta["name"]
        self.description = sheet_meta["description"]
        self.title = sheet_meta["title"]
        # For sheet meta with an external file, add in the external file name
        self.external = sheet_meta.get("external")

        # Links to dataset for dataset level information, and taxa and
        # location validation lists.
        self.dataset = dataset

        # Create the field meta attributes, populated using validate_field_meta
        self.fields_loaded = False
        self.field_meta: list[dict] = []
        self.field_types = None
        self.n_fields: int = 0
        self.n_trailing_empty_fields: int = 0
        self.n_descriptors: int
        self.descriptors: list = []
        self.fields: list[EmptyField | BaseField] = []
        self.taxa_fields: list = []

        # Keep track of row numbering
        self.n_row = 0
        self.current_row = 1
        self.row_numbers_sequential = True
        self.row_numbers_missing = False
        self.row_numbers_noninteger = False
        self.blank_rows = False
        self.trailing_blank_rows = False
        self.start_errors = get_handler().counters["ERROR"]
        self.n_errors = 0

    @loggerinfo_push_pop("Validating field metadata")
    def validate_field_meta(self, field_meta: dict) -> None:
        """Validate the field metadata for a data table.

        This method is used to add and validate field metadata. The field meta can
        include trailing empty fields - these will be checked when data rows are added.
        The method also then sets up a list of field validators, matching the order of
        the fields in the table, which are used for data validation.

        Note:
            This relies on the maintenance of the dictionary insertion order, which
            required OrderedDict in the past but is guaranteed for the standard dict in
            Python >= 3.7.
        """

        # Clean off descriptor whitespace padding?
        clean_descriptors = IsNotPadded(field_meta.keys())
        if not clean_descriptors:
            # Report whitespace padding and clean up tuples
            LOGGER.error(
                "Whitespace padding in descriptor names: ",
                extra={"join": clean_descriptors.failed},
            )

            # Order preserved in dict and validator
            cleaned_entries = [
                (ky, val) for ky, val in zip(clean_descriptors, field_meta.values())
            ]
            field_meta = dict(cleaned_entries)

        self.descriptors = list(field_meta.keys())
        self.n_descriptors = len(self.descriptors)

        # Checking field_meta structure - should be an equal length and non-zero tuple
        # of values for each descriptor. If not, return without setting fields_loaded.
        descriptor_tuple_lengths = {len(tp) for tp in field_meta.values()}
        if len(descriptor_tuple_lengths) > 1 or descriptor_tuple_lengths == {0}:
            LOGGER.error("Cannot load unequal length or empty field metadata")
            return

        # * Expected descriptors - do _not_ preclude user defined descriptors
        #   but warn about them to alert to typos. Missing descriptors vary
        #   with field type so are handled in BaseField.__init__()
        unknown_descriptors = set(field_meta.keys()).difference(
            set(MANDATORY_DESCRIPTORS).union(OPTIONAL_DESCRIPTORS)
        )
        if unknown_descriptors:
            LOGGER.warning(
                "Unknown field descriptors:", extra={"join": unknown_descriptors}
            )

        # * Check for field name as _last_ descriptor. This is primarily to
        #   make it easy to read the dataframe - just skip the other descriptors
        if list(field_meta.keys())[-1] != "field_name":
            LOGGER.error("Field_name row is not the last descriptor")

        # Repackage field metadata into a list of per field descriptor
        # dictionaries, _importantly_ preserving the column order.
        field_meta_list = [
            dict(zip(field_meta.keys(), val)) for val in zip(*field_meta.values())
        ]

        # Check provided field names are unique. This doesn't cause as many
        # problems as duplications in Taxa and Locations, which expect certain
        # fields, so warn and continue. Ignore empty mandatory values - these
        # are handled in field checking.
        field_names = [
            fld["field_name"]
            for fld in field_meta_list
            if fld["field_name"] is not None
        ]
        dupes = HasDuplicates(field_names)
        if dupes:
            LOGGER.error("Field names duplicated: ", extra={"join": dupes.duplicated})

        # Lowercase the field types
        for fld in field_meta_list:
            fld["field_type"] = (
                None if fld["field_type"] is None else fld["field_type"].lower()
            )

        # Populate the instance variables
        self.fields_loaded = True
        self.field_meta = field_meta_list
        self.n_fields = len(field_meta_list)
        self.n_trailing_empty_fields = 0

        # get taxa field names for cross checking observation and trait data
        self.taxa_fields = [
            fld["field_name"] for fld in self.field_meta if fld["field_type"] == "taxa"
        ]

        # Now initialise the Field objects using the mapping of field types
        # to the BaseField subclasses. This needs to handle trailing fields
        # containing _no_ metadata (e.g. from inaccurate Excel sheet bounds or
        # CSV comments) but also fields with missing field_type (but other
        # metadata).
        #
        # - trailing empty fields are assumed to be blank columns
        #   and validate_data_rows should test for content.
        # - otherwise, these are assumed to be fields and should
        #   trigger warnings about descriptors

        # Get logical flags for empty and trailing empty metadata
        field_meta_empty = [set(vl.values()) == {None} for vl in self.field_meta]
        trailing_empty = [
            all(field_meta_empty[-(n + 1) :]) for n in range(0, len(field_meta_empty))
        ]
        trailing_empty.reverse()
        self.n_trailing_empty_fields = sum(trailing_empty)

        # Get the type map and field list
        field_subclass_map = BaseField.field_type_map()
        unknown_field_types = set()

        for col_idx, (tr_empty, fd_empty, fmeta) in enumerate(
            zip(trailing_empty, field_meta_empty, self.field_meta)
        ):
            fmeta["col_idx"] = col_idx + 1

            # Consider cases
            if tr_empty:
                # Hopefully empty trailing field
                self.fields.append(EmptyField(fmeta))
            elif fd_empty:
                # Empty field within other fields
                self.fields.append(BaseField(fmeta, dwsh=self, dataset=self.dataset))
            elif fmeta["field_type"] is None:
                # Non-empty field of None type - use BaseField for basic checks
                self.fields.append(BaseField(fmeta, dwsh=self, dataset=self.dataset))
            elif fmeta["field_type"] not in field_subclass_map:
                #  Non-empty field of unknown type - use BaseField for basic checks
                unknown_field_types.add(fmeta["field_type"])
                self.fields.append(BaseField(fmeta, dwsh=self, dataset=self.dataset))
            else:
                # Known field type.
                fld_class = field_subclass_map[fmeta["field_type"]]
                self.fields.append(fld_class(fmeta, dwsh=self, dataset=self.dataset))

        if unknown_field_types:
            LOGGER.error("Unknown field types: ", extra={"join": unknown_field_types})

    def validate_data_rows(self, data_rows: list) -> None:
        """Validate the contents of data rows from a data table.

        This method is used to pass a list of rows of data from a data table into the
        appropriate field checkers created by the validate_field_meta method. Each row
        is represented as a list, with the row number as the first entry in the list.

        Args:
            data_rows: A list of rows containing data from a data table.
        """

        if not self.fields_loaded:
            LOGGER.critical("No fields defined - use validate_field_meta")
            return

        # Check the lengths of the rows - this should log critical
        # because it is likely to be a programming error, not a user error.
        if len(data_rows) == 0:
            LOGGER.critical("Empty data_rows passed to validate_data_rows")
            return

        row_lengths = {len(rw) for rw in data_rows}

        if len(row_lengths) != 1:
            LOGGER.critical("Data rows of unequal length - cannot validate")
            return
        elif row_lengths.pop() != (self.n_fields + 1):
            LOGGER.critical(
                "Data rows not of same length as field metadata - cannot validate"
            )
            return

        # Handle empty rows.
        blank_set = {None}
        blank_row = [set(vals) == blank_set for vals in data_rows]

        trailing_blank_row = [
            all(blank_row[-(n + 1) :]) for n in range(0, len(blank_row))
        ]
        trailing_blank_row.reverse()

        internal_blank_row = [
            blnk & ~trail for blnk, trail in zip(blank_row, trailing_blank_row)
        ]

        # Internals within a set of row are automatically bad, but trailing blank
        # rows are only bad if more data is loaded below them.
        if any(internal_blank_row):
            self.blank_rows = True

        if not self.trailing_blank_rows and any(trailing_blank_row):
            self.trailing_blank_rows = True
        elif self.trailing_blank_rows and not all(trailing_blank_row):
            self.blank_rows = True

        # Drop blank rows for further processing
        data_rows = [drw for drw, is_blank in zip(data_rows, blank_row) if not is_blank]

        # Check there is any remaining data, then convert the values into columns
        # and extract the row numbers
        n_row = len(data_rows)
        if n_row == 0:
            return
        else:
            self.n_row += n_row

        data_cols = list(zip(*data_rows))
        row_numbers = data_cols.pop(0)

        # Check for bad values (blanks, non integers) in row numbers
        valid_row_numbers = IsNotBlank(row_numbers, keep_failed=False)

        if not valid_row_numbers:
            self.row_numbers_missing = True

        row_numbers = valid_row_numbers.values

        if any([not isinstance(vl, int) for vl in row_numbers]):
            self.row_numbers_noninteger = True

        # Check the row numbering - skip if this has already failed or
        # the row numbers have already included None or non-integers
        if self.row_numbers_sequential and not (
            self.row_numbers_noninteger or self.row_numbers_missing
        ):
            expected_numbers = list(
                range(self.current_row, self.current_row + len(row_numbers))
            )
            self.current_row += len(row_numbers)

            if row_numbers != expected_numbers:
                self.row_numbers_sequential = False
        else:
            self.current_row += len(row_numbers)

        # Now feed the sets of values into the Field validation
        for data, field_inst in zip(data_cols, self.fields):
            field_inst.validate_data(list(data))

    def report(self) -> None:
        """Report data validation for a data table.

        As the validate_data_rows method works on chunks of rows, the DataWorksheet
        instance accumulates logging messages across individual fields. These are not
        reported until after all the data rows have been validated, when aggregated
        reports for each field can be made. This method causes these field logs to be
        emitted and then carries out final checks and reporting across all of the field
        data.
        """

        if not self.n_row:
            if self.external is None:
                LOGGER.error("No data passed for validation.")
            else:
                LOGGER.info(
                    "Data table description associated with "
                    f"external file {self.external}"
                )
        else:
            LOGGER.info("Validating field data")
            FORMATTER.push()

            for fld in self.fields:
                fld.report()
            FORMATTER.pop()

        # Report on row numbering
        if self.row_numbers_missing:
            LOGGER.error("Missing row numbers in data")

        if self.row_numbers_noninteger:
            LOGGER.error("Row numbers contain non-integer values")

        if (
            not (self.row_numbers_noninteger or self.row_numbers_missing)
            and not self.row_numbers_sequential
        ):
            LOGGER.error("Row numbers not consecutive or do not start with 1")

        # Internal blank rows?
        if self.blank_rows:
            LOGGER.error("Data contains empty rows")

        # report on detected size
        LOGGER.info(
            f"Worksheet '{self.name}' contains {self.n_descriptors} descriptors, "
            f"{self.n_row} data rows and "
            f"{self.n_fields - self.n_trailing_empty_fields} fields"
        )

        # reporting
        self.n_errors = get_handler().counters["ERROR"] - self.start_errors
        if self.n_errors > 0:
            LOGGER.info(f"Dataframe contains {self.n_errors} errors")
        else:
            LOGGER.info("Dataframe formatted correctly")

    def load_from_worksheet(
        self, worksheet: worksheet, row_chunk_size: int = 1000
    ) -> None:
        """Populate a Dataworksheet instance from an openpyxl worksheet.

        This method takes a newly initialised DataWorksheet instance and populates the
        details using the contents of a safedata_validator formatted data table in an
        Excel spreadsheet.

        To keep memory requirements low, data in the worksheet is validated by loading
        sets of rows containing at most `row_chunk_size` rows. Note that this will only
        actually reduce memory use for openpyxl ReadOnlyWorksheets, as these load data
        on demand, but will still work on standard Worksheets.

        Args:
            worksheet: An openpyxl worksheet instance
            row_chunk_size: The number of rows of data to load in each chunk
        """

        # Not using @loggerinfo_push_pop to provide access to WS name
        LOGGER.info(f"Checking worksheet '{self.name}'")
        FORMATTER.push()

        if self.fields_loaded:
            LOGGER.critical("Field metadata already loaded - use fresh instance.")
            FORMATTER.pop()
            return

        # get the data dimensions
        max_row = worksheet.max_row

        # trap completely empty worksheets
        if max_row == 0:
            LOGGER.error("Worksheet is empty")
            FORMATTER.pop()
            return

        # Read the field metadata first:
        # - There should be rows starting with a string and then rows starting with
        #   a number, or a StopIteration at the end of the row iterator.
        ws_rows = worksheet.iter_rows(values_only=True)
        field_meta = []

        # While there are rows and while they start with a string cell, collect
        # field metadata
        for row in ws_rows:
            if isinstance(row[0], str):
                field_meta.append(row)
            else:
                break

        # Convert field meta to dict and validate
        field_meta_dict = {rw[0]: rw[1:] for rw in field_meta}
        self.validate_field_meta(field_meta_dict)

        if self.fields_loaded:
            # Get an iterator on the data rows
            data_rows = worksheet.iter_rows(
                min_row=len(field_meta_dict) + 1, values_only=True
            )

            # Load and validate chunks of data
            n_chunks = ((max_row - self.n_descriptors) // row_chunk_size) + 1
            for chunk in range(n_chunks):
                # itertools.islice handles generators and StopIteration, and also
                # trap empty slices
                data = list(islice(data_rows, row_chunk_size))
                if data:
                    self.validate_data_rows(data)

        # Finish up
        self.report()
        FORMATTER.pop()

    def to_dict(self) -> dict:
        """Return a dictionary representation of a DataWorksheet instance."""

        # Update the fields to account for EmptyFields that have been validated: they
        # have no field metadata and there is no content in the rows below. EmptyFields
        # are always trailing empty fields.

        fields = [
            field_to_dict(fld, idx + 1)
            for idx, fld in enumerate(self.fields)
            if not isinstance(fld, EmptyField)
        ]

        output = dict(
            taxa_fields=self.taxa_fields,
            max_row=self.n_row + self.n_descriptors,
            max_col=self.n_fields - self.n_trailing_empty_fields,
            name=self.name,
            title=self.title,
            description=self.description,
            descriptors=self.descriptors,
            external=self.external,
            fields=fields,
            field_name_row=self.n_descriptors,
            n_data_row=self.n_row,
        )

        return output

load_from_worksheet(worksheet, row_chunk_size=1000)

Populate a Dataworksheet instance from an openpyxl worksheet.

This method takes a newly initialised DataWorksheet instance and populates the details using the contents of a safedata_validator formatted data table in an Excel spreadsheet.

To keep memory requirements low, data in the worksheet is validated by loading sets of rows containing at most row_chunk_size rows. Note that this will only actually reduce memory use for openpyxl ReadOnlyWorksheets, as these load data on demand, but will still work on standard Worksheets.

Parameters:

    worksheet (worksheet): An openpyxl worksheet instance. Required.
    row_chunk_size (int): The number of rows of data to load in each chunk. Default: 1000.
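
A sketch of driving this method directly with openpyxl, reusing the sheet_meta and dataset names from the workflow sketch above; the workbook and worksheet names are illustrative:

from openpyxl import load_workbook

from safedata_validator.field import DataWorksheet

# read_only streams rows on demand to keep memory use low; data_only returns
# calculated cell values rather than formulae.
wb = load_workbook("my_dataset.xlsx", read_only=True, data_only=True)

dwsh = DataWorksheet(sheet_meta, dataset)
dwsh.load_from_worksheet(wb["MyData"], row_chunk_size=1000)
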
Source code in safedata_validator/field.py
def load_from_worksheet(
    self, worksheet: worksheet, row_chunk_size: int = 1000
) -> None:
    """Populate a Dataworksheet instance from an openpyxl worksheet.

    This method takes a newly initialised DataWorksheet instance and populates the
    details using the contents of a safedata_validator formatted data table in an
    Excel spreadsheet.

    To keep memory requirements low, data in the worksheet is validated by loading
    sets of rows containing at most `row_chunk_size` rows. Note that this will only
    actually reduce memory use for openpyxl ReadOnlyWorksheets, as these load data
    on demand, but will still work on standard Worksheets.

    Args:
        worksheet: An openpyxl worksheet instance
        row_chunk_size: The number of rows of data to load in each chunk
    """

    # Not using @loggerinfo_push_pop to provide access to WS name
    LOGGER.info(f"Checking worksheet '{self.name}'")
    FORMATTER.push()

    if self.fields_loaded:
        LOGGER.critical("Field metadata already loaded - use fresh instance.")
        FORMATTER.pop()
        return

    # get the data dimensions
    max_row = worksheet.max_row

    # trap completely empty worksheets
    if max_row == 0:
        LOGGER.error("Worksheet is empty")
        FORMATTER.pop()
        return

    # Read the field metadata first:
    # - There should be rows starting with a string and then rows starting with
    #   a number, or a StopIteration at the end of the row iterator.
    ws_rows = worksheet.iter_rows(values_only=True)
    field_meta = []

    # While there are rows and while they start with a string cell, collect
    # field metadata
    for row in ws_rows:
        if isinstance(row[0], str):
            field_meta.append(row)
        else:
            break

    # Convert field meta to dict and validate
    field_meta_dict = {rw[0]: rw[1:] for rw in field_meta}
    self.validate_field_meta(field_meta_dict)

    if self.fields_loaded:
        # Get an iterator on the data rows
        data_rows = worksheet.iter_rows(
            min_row=len(field_meta_dict) + 1, values_only=True
        )

        # Load and validate chunks of data
        n_chunks = ((max_row - self.n_descriptors) // row_chunk_size) + 1
        for chunk in range(n_chunks):
            # itertools.islice handles generators and StopIteration; the check
            # below also traps empty slices
            data = list(islice(data_rows, row_chunk_size))
            if data:
                self.validate_data_rows(data)

    # Finish up
    self.report()
    FORMATTER.pop()
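
A usage sketch (not part of the documented API): the file name and worksheet title below are hypothetical, and dwsh is assumed to be a freshly created DataWorksheet instance for the sheet being checked.

    from openpyxl import load_workbook

    # Read-only mode loads rows on demand, which is what the chunked
    # validation above is designed to exploit.
    workbook = load_workbook("my_dataset.xlsx", read_only=True, data_only=True)
    worksheet = workbook["DataSheet1"]

    dwsh.load_from_worksheet(worksheet, row_chunk_size=500)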

report()

Report data validation for a data table.

As the validate_data_rows method works on chunks of rows, the DataWorksheet instance accumulates logging messages across individual fields. These are not reported until after all the data rows have been validated, when aggregated reports for each field can be made. This method causes these field logs to be emitted and then carries out final checks and reporting across all of the field data.

Source code in safedata_validator/field.py (lines 751-807)
def report(self) -> None:
    """Report data validation for a data table.

    As the validate_data_rows method works on chunks of rows, the DataWorksheet
    instance accumulates logging messages across individual fields. These are not
    reported until after all the data rows have been validated, when aggregated
    reports for each field can be made. This method causes these field logs to be
    emitted and then carries out final checks and reporting across all of the field
    data.
    """

    if not self.n_row:
        if self.external is None:
            LOGGER.error("No data passed for validation.")
        else:
            LOGGER.info(
                "Data table description associated with "
                f"external file {self.external}"
            )
    else:
        LOGGER.info("Validating field data")
        FORMATTER.push()

        for fld in self.fields:
            fld.report()
        FORMATTER.pop()

    # Report on row numbering
    if self.row_numbers_missing:
        LOGGER.error("Missing row numbers in data")

    if self.row_numbers_noninteger:
        LOGGER.error("Row numbers contain non-integer values")

    if (
        not (self.row_numbers_noninteger or self.row_numbers_missing)
        and not self.row_numbers_sequential
    ):
        LOGGER.error("Row numbers not consecutive or do not start with 1")

    # Internal blank rows?
    if self.blank_rows:
        LOGGER.error("Data contains empty rows")

    # report on detected size
    LOGGER.info(
        f"Worksheet '{self.name}' contains {self.n_descriptors} descriptors, "
        f"{self.n_row} data rows and "
        f"{self.n_fields - self.n_trailing_empty_fields} fields"
    )

    # reporting
    self.n_errors = get_handler().counters["ERROR"] - self.start_errors
    if self.n_errors > 0:
        LOGGER.info(f"Dataframe contains {self.n_errors} errors")
    else:
        LOGGER.info("Dataframe formatted correctly")

to_dict()

Return a dictionary representation of a DataWorksheet instance.

Source code in safedata_validator/field.py (lines 883-910)
def to_dict(self) -> dict:
    """Return a dictionary representation of a DataWorksheet instance."""

    # Update the fields to account for EmptyFields that have been validated: they
    # have no field metadata and there is no content in the rows below. EmptyFields
    # are always trailing empty fields.

    fields = [
        field_to_dict(fld, idx + 1)
        for idx, fld in enumerate(self.fields)
        if not isinstance(fld, EmptyField)
    ]

    output = dict(
        taxa_fields=self.taxa_fields,
        max_row=self.n_row + self.n_descriptors,
        max_col=self.n_fields - self.n_trailing_empty_fields,
        name=self.name,
        title=self.title,
        description=self.description,
        descriptors=self.descriptors,
        external=self.external,
        fields=fields,
        field_name_row=self.n_descriptors,
        n_data_row=self.n_row,
    )

    return output
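
A brief sketch of using this output, assuming dwsh is a DataWorksheet instance already populated via load_from_worksheet; the keys follow the dictionary built above.

    ws_metadata = dwsh.to_dict()
    # For example, report the detected table dimensions for the worksheet.
    print(ws_metadata["name"], ws_metadata["n_data_row"], ws_metadata["max_col"])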

validate_data_rows(data_rows)

Validate the contents of data rows from a data table.

This method is used to pass a list of rows of data from a data table into the appropriate field checkers created by the validate_field_meta method. Each row is represented as a list, with the row number as the first entry in the list.

Parameters:

Name Type Description Default
data_rows list

A list of rows containing data from a data table.

required
Source code in safedata_validator/field.py (lines 652-749)
def validate_data_rows(self, data_rows: list) -> None:
    """Validate the contents of data rows from a data table.

    This method is used to pass a list of rows of data from a data table into the
    appropriate field checkers created by the validate_field_meta method. Each row
    is represented as a list, with the row number as the first entry in the list.

    Args:
        data_rows: A list of rows containing data from a data table.
    """

    if not self.fields_loaded:
        LOGGER.critical("No fields defined - use validate_field_meta")
        return

    # Check the lengths of the rows - this should log critical
    # because it is likely to be a programming error, not a user error.
    if len(data_rows) == 0:
        LOGGER.critical("Empty data_rows passed to validate_data_rows")
        return

    row_lengths = {len(rw) for rw in data_rows}

    if len(row_lengths) != 1:
        LOGGER.critical("Data rows of unequal length - cannot validate")
        return
    elif row_lengths.pop() != (self.n_fields + 1):
        LOGGER.critical(
            "Data rows not of same length as field metadata - cannot validate"
        )
        return

    # Handle empty rows.
    blank_set = {None}
    blank_row = [set(vals) == blank_set for vals in data_rows]

    trailing_blank_row = [
        all(blank_row[-(n + 1) :]) for n in range(0, len(blank_row))
    ]
    trailing_blank_row.reverse()

    internal_blank_row = [
        blnk & ~trail for blnk, trail in zip(blank_row, trailing_blank_row)
    ]

    # Internal blank rows within a chunk are automatically bad, but trailing blank
    # rows are only bad if more data is loaded below them.
    if any(internal_blank_row):
        self.blank_rows = True

    if not self.trailing_blank_rows and any(trailing_blank_row):
        self.trailing_blank_rows = True
    elif self.trailing_blank_rows and not all(trailing_blank_row):
        self.blank_rows = True

    # Drop blank rows for further processing
    data_rows = [drw for drw, is_blank in zip(data_rows, blank_row) if not is_blank]

    # Check there is any remaining data, then convert the values into columns
    # and extract the row numbers
    n_row = len(data_rows)
    if n_row == 0:
        return
    else:
        self.n_row += n_row

    data_cols = list(zip(*data_rows))
    row_numbers = data_cols.pop(0)

    # Check for bad values (blanks, non integers) in row numbers
    valid_row_numbers = IsNotBlank(row_numbers, keep_failed=False)

    if not valid_row_numbers:
        self.row_numbers_missing = True

    row_numbers = valid_row_numbers.values

    if any([not isinstance(vl, int) for vl in row_numbers]):
        self.row_numbers_noninteger = True

    # Check the row numbering - skip if this has already failed or
    # the row numbers have already included None or non-integers
    if self.row_numbers_sequential and not (
        self.row_numbers_noninteger or self.row_numbers_missing
    ):
        expected_numbers = list(
            range(self.current_row, self.current_row + len(row_numbers))
        )
        self.current_row += len(row_numbers)

        if row_numbers != expected_numbers:
            self.row_numbers_sequential = False
    else:
        self.current_row += len(row_numbers)

    # Now feed the sets of values into the Field validation
    for data, field_inst in zip(data_cols, self.fields):
        field_inst.validate_data(list(data))
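
A minimal sketch of the expected row structure, assuming field metadata for three fields has already been loaded with validate_field_meta. The values are hypothetical; the first entry of each row is the row number and the remaining entries follow the field order.

    import datetime

    rows = [
        [1, datetime.date(2024, 5, 1), 12.4, "first plot"],
        [2, datetime.date(2024, 5, 2), 9.7, None],
    ]
    dwsh.validate_data_rows(rows)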

validate_field_meta(field_meta)

Validate the field metadata for a data table.

This method is used to add and validate field metadata. The field meta can include trailing empty fields - these will be checked when data rows are added. The method also then sets up a list of field validators, matching the order of the fields in the table, which are used for data validation.

Note

This relies on the maintenance of the dictionary insertion order, which required OrderedDict in the past but is guaranteed for the standard dict in Python >= 3.7.

Source code in safedata_validator/field.py (lines 510-650)
@loggerinfo_push_pop("Validating field metadata")
def validate_field_meta(self, field_meta: dict) -> None:
    """Validate the field metadata for a data table.

    This method is used to add and validate field metadata. The field meta can
    include trailing empty fields - these will be checked when data rows are added.
    The method also then sets up a list of field validators, matching the order of
    the fields in the table, which are used for data validation.

    Note:
        This relies on the maintenance of the dictionary insertion order, which
        required OrderedDict in the past but is guaranteed for the standard dict in
        Python >= 3.7.
    """

    # Clean off descriptor whitespace padding?
    clean_descriptors = IsNotPadded(field_meta.keys())
    if not clean_descriptors:
        # Report whitespace padding and clean up tuples
        LOGGER.error(
            "Whitespace padding in descriptor names: ",
            extra={"join": clean_descriptors.failed},
        )

        # Order preserved in dict and validator
        cleaned_entries = [
            (ky, val) for ky, val in zip(clean_descriptors, field_meta.values())
        ]
        field_meta = dict(cleaned_entries)

    self.descriptors = list(field_meta.keys())
    self.n_descriptors = len(self.descriptors)

    # Checking field_meta structure - should be an equal length and non-zero tuple
    # of values for each descriptor. If not, return without setting fields_loaded.
    descriptor_tuple_lengths = {len(tp) for tp in field_meta.values()}
    if len(descriptor_tuple_lengths) > 1 or descriptor_tuple_lengths == {0}:
        LOGGER.error("Cannot load unequal length or empty field metadata")
        return

    # * Expected descriptors - do _not_ preclude user defined descriptors
    #   but warn about them to alert to typos. Missing descriptors vary
    #   with field type so are handled in BaseField.__init__()
    unknown_descriptors = set(field_meta.keys()).difference(
        set(MANDATORY_DESCRIPTORS).union(OPTIONAL_DESCRIPTORS)
    )
    if unknown_descriptors:
        LOGGER.warning(
            "Unknown field descriptors:", extra={"join": unknown_descriptors}
        )

    # * Check for field name as _last_ descriptor. This is primarily to
    #   make it easy to read the dataframe - just skip the other descriptors
    if list(field_meta.keys())[-1] != "field_name":
        LOGGER.error("Field_name row is not the last descriptor")

    # Repackage field metadata into a list of per field descriptor
    # dictionaries, _importantly_ preserving the column order.
    field_meta_list = [
        dict(zip(field_meta.keys(), val)) for val in zip(*field_meta.values())
    ]

    # Check provided field names are unique. This doesn't cause as many
    # problems as duplications in Taxa and Locations, which expect certain
    # fields, so warn and continue. Ignore empty mandatory values - these
    # are handled in field checking.
    field_names = [
        fld["field_name"]
        for fld in field_meta_list
        if fld["field_name"] is not None
    ]
    dupes = HasDuplicates(field_names)
    if dupes:
        LOGGER.error("Field names duplicated: ", extra={"join": dupes.duplicated})

    # Lowercase the field types
    for fld in field_meta_list:
        fld["field_type"] = (
            None if fld["field_type"] is None else fld["field_type"].lower()
        )

    # Populate the instance variables
    self.fields_loaded = True
    self.field_meta = field_meta_list
    self.n_fields = len(field_meta_list)
    self.n_trailing_empty_fields = 0

    # get taxa field names for cross checking observation and trait data
    self.taxa_fields = [
        fld["field_name"] for fld in self.field_meta if fld["field_type"] == "taxa"
    ]

    # Now initialise the Field objects using the mapping of field types
    # to the BaseField subclasses. This needs to handle trailing fields
    # containing _no_ metadata (e.g. from inaccurate Excel sheet bounds or
    # CSV comments) but also fields with missing field_type (but other
    # metadata).
    #
    # - trailing empty fields are assumed to be blank columns
    #   and validate_data_rows should test for content.
    # - otherwise, these are assumed to be fields and should
    #   trigger warnings about descriptors

    # Get logical flags for empty and trailing empty metadata
    field_meta_empty = [set(vl.values()) == {None} for vl in self.field_meta]
    trailing_empty = [
        all(field_meta_empty[-(n + 1) :]) for n in range(0, len(field_meta_empty))
    ]
    trailing_empty.reverse()
    self.n_trailing_empty_fields = sum(trailing_empty)

    # Get the type map and field list
    field_subclass_map = BaseField.field_type_map()
    unknown_field_types = set()

    for col_idx, (tr_empty, fd_empty, fmeta) in enumerate(
        zip(trailing_empty, field_meta_empty, self.field_meta)
    ):
        fmeta["col_idx"] = col_idx + 1

        # Consider cases
        if tr_empty:
            # Hopefully empty trailing field
            self.fields.append(EmptyField(fmeta))
        elif fd_empty:
            # Empty field within other fields
            self.fields.append(BaseField(fmeta, dwsh=self, dataset=self.dataset))
        elif fmeta["field_type"] is None:
            # Non-empty field of None type - use BaseField for basic checks
            self.fields.append(BaseField(fmeta, dwsh=self, dataset=self.dataset))
        elif fmeta["field_type"] not in field_subclass_map:
            #  Non-empty field of unknown type - use BaseField for basic checks
            unknown_field_types.add(fmeta["field_type"])
            self.fields.append(BaseField(fmeta, dwsh=self, dataset=self.dataset))
        else:
            # Known field type.
            fld_class = field_subclass_map[fmeta["field_type"]]
            self.fields.append(fld_class(fmeta, dwsh=self, dataset=self.dataset))

    if unknown_field_types:
        LOGGER.error("Unknown field types: ", extra={"join": unknown_field_types})

Fields


BaseField

Validating metadata and data for a field.

This class provides an implementation for checking field metadata for a field type and then testing whether the data contents are compatible with that metadata. The class is largely used for checking fields in data table worksheets, but can also be used for checking other tabular data.

The base class defines the core checking methods and then subclasses are used to extend those methods to implement specific checking for particular data types. The main workflow for the class is:

  • Instantiate a class object using the field metadata to check the metadata.
  • The base class provides _check_meta, _check_taxon_meta and _check_interaction_meta to provide common shared components of metadata validation in extended __init__ methods in subclasses.
  • Data is then validated using the validate_data method - it ingests chunks of rows, to avoid having to read all the data in a table from file at once. The data validation process accumulates logging messages in a stack via the _log method.
  • The report method is then used to run final checks and emit the message stack, once all of the data has been ingested.

The base class also has a much more complex signature than the base functionality requires. Various subclasses need access to:

  • Dataset level information - extents, taxa and locations.
  • Dataworksheet level information - taxon fields

Rather than having complex inheritance with changing subclass signatures and kwargs, the BaseField class makes all information available to all subclasses.

Parameters:

Name Type Description Default
meta dict

A dictionary of field metadata

required
dwsh DataWorksheet | None

A DataWorksheet instance, used to pass worksheet level information, which at the moment is just taxa fields.

None
dataset Dataset | None

A Dataset instance, used to provide dataset level information, such as taxa, locations and summary, and update dataset level attributes such as data extents.

None
Class Attributes

field_types

A list of the field types - as entered in data table metadata - that are handled by a class based on BaseField.

required_descriptors

A list of field metadata names that are required by a class based on BaseField.

check_taxon_meta

A boolean flag indicating whether a class based on BaseField should check taxonomic metadata.

check_interaction_meta

A similar boolean flag for interaction metadata.

Attributes:

Name Type Description
meta

The metadata dictionary for the field

dwsh

The DataWorksheet object in which the field is contained.

taxa Taxa | None

The Taxa object for the Dataset or None.

locations

The Locations object for the Dataset or None.

summary

The Summary object for the Dataset or None.

log_stack list

A list of tuples containing logging messages to be emitted.

n_rows

The number of rows of data in the field.

n_na

The number of NA values in the field data.

n_blank

The number of blank cells in the field data.

n_excel_error

The number of Excel error codes in the field data.

bad_values list

A list of values that are invalid for a class based on BaseField.

bad_rows list

A list of row numbers containing bad values.

field_name

The provided field name.

Source code in safedata_validator/field.py (lines 913-1447)
class BaseField:
    """Validating metadata and data for a field.

    This class provides an implementation for checking field metadata for a field type
    and then testing whether the data contents are compatible with that metadata. The
    class is largely used for checking fields in data table worksheets, but can also
    be used for checking other tabular data.

    The base class defines the core checking methods and then subclasses are used to
    extend those methods to implement specific checking for particular data types. The
    main workflow for the class is:

    * Instantiate a class object using the field metadata to check the metadata.
    * The base class provides `_check_meta`, `_check_taxon_meta` and
      `_check_interaction_meta` to provide common shared components of metadata
      validation in extended `__init__` methods in subclasses.
    * Data is then validated using the `validate_data` method - it ingests chunks of
      rows, to avoid having to read all the data in a table from file at once. The data
      validation process accumulates logging messages in a stack via the `_log` method.
    * The `report` method is then used to run final checks and emit the message stack,
      once all of the data has been ingested.

    The base class also has a much more complex signature than the base
    functionality requires. Various subclasses need access to:

    * Dataset level information - extents, taxa and locations.
    * Dataworksheet level information - taxon fields

    Rather than having complex inheritance with changing subclass signatures
    and kwargs, the BaseField class makes all information available to all
    subclasses.

    Args:
        meta: A dictionary of field metadata
        dwsh: A DataWorksheet instance, used to pass worksheet level
            information, which at the moment is just taxa fields.
        dataset: A Dataset instance, used to provide dataset level
            information, such as taxa, locations and summary, and update
            dataset level attributes such as data extents.

    Class Attributes:
        field_types: A list of the field types - as entered in data table metadata -
            that are handled by a class based on BaseField.
        required_descriptors: A list of field metadata names that are required by a
            class based on BaseField.
        check_taxon_meta: A boolean flag indicating whether a class based on BaseField
            should check taxonomic metadata.
        check_interaction_meta: A similar boolean flag for interaction metadata.

    Attributes:
        meta: The metadata dictionary for the field
        dwsh: The DataWorksheet object in which the field is contained.
        taxa: The Taxa object for the Dataset or None.
        locations: The Locations object for the Dataset or None.
        summary: The Summary object for the Dataset or None.
        log_stack: A list of tuples containing logging messages to be emitted.
        n_rows: The number of rows of data in the field.
        n_na: The number of NA values in the field data.
        n_blank: The number of blank cells in the field data.
        n_excel_error: The number of Excel error codes in the field data.
        bad_values: A list of values that are invalid for a class based on BaseField.
        bad_rows: A list of row numbers containing bad values.
        field_name: The provided field name.

    """

    # These class attributes describe the field_types handled by the class and set the
    # required descriptors for those field_types. The no_validation attribute is used
    # to suppress data validation for use on e.g. comments fields.

    field_types: tuple[str, ...] | None = None
    required_descriptors = MANDATORY_DESCRIPTORS
    check_taxon_meta = False
    check_interaction_meta = False

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        self.meta = meta
        self.dwsh = dwsh

        self.taxa: Taxa | None = None
        self.locations = None
        self.summary = None

        # Shortcuts to main components
        self.dataset = dataset
        if self.dataset:
            self.taxa = getattr(self.dataset, "taxa")
            self.locations = getattr(self.dataset, "locations")
            self.summary = getattr(self.dataset, "summary")

        # Attributes
        self.log_stack: list = []
        self.n_rows = 0
        self.n_na = 0
        self.n_blank = 0
        self.n_excel_error = 0
        self.bad_values: list = []
        self.bad_rows: list = []  # TODO check on implementation of this

        # Get a field name - either from the field_meta, or a column letter
        # from col_idx or 'Unknown'
        self.field_name = self.meta.get("field_name")

        if self.field_name is None:
            idx = self.meta.get("col_idx")
            if idx is None:
                self.field_name = "Unknown"
            else:
                self.field_name = f"Column_{get_column_letter(idx)}"

        # Now check required descriptors - values are present, non-blank and
        # are unpadded strings (currently all descriptors are strings).
        for dsc in self.required_descriptors:
            self._check_meta(dsc)

        # Specific check that field name is valid - the column letter codes are
        # always valid, so missing names won't trigger this.
        # _check_meta has already tested and logged non-string errors, so
        # only test here for strings that fail.
        if isinstance(self.field_name, str) and not valid_r_name(self.field_name):
            self._log(
                f"Field name is not valid: {repr(self.field_name)}. "
                "Common errors are spaces and non-alphanumeric "
                "characters other than underscore and full stop"
            )

        # TODO - rethink implementation? Quite specific behaviour unique to
        #        some fields, so could implement as overloaded extra stub.
        if self.check_taxon_meta:
            self._check_taxon_meta()

        if self.check_interaction_meta:
            self._check_interaction_meta()

    def _log(self, msg: str, level: int = ERROR, extra: dict | None = None) -> None:
        """Adds messages to the field log stack.

        Rather than directly emitting a log message, field processing accumulates a
        stack of messages, which are all emitted when the
        [safedata_validator.field.BaseField.report][] method is called.

        Args:
            msg: The log message to add
            level: The logging level to use.
            extra: A dictionary of extra information to pass to the logging.log method.
        """
        self.log_stack.append((level, msg, extra))

    def _check_meta(self, descriptor: str) -> bool:
        """Check required field metadata.

        A standardised checker to see if a required descriptor is present for a field
        and that it isn't simply empty or whitespace. Messages are added to the
        log_stack if required. It returns a boolean showing whether checks pass or not -
        the actual causes of failure are added to the field message log.

        Args:
            descriptor: The name of the descriptor to check.
        """

        if descriptor not in self.meta:
            self._log(f"{descriptor} descriptor missing")
            return False

        val = self.meta[descriptor]

        if blank_value(val):
            self._log(f"{descriptor} descriptor is blank")
            self.meta[descriptor] = None  # standardise whitestring to None
            return False
        elif not isinstance(val, str):
            self._log(f"{descriptor} descriptor is not a string: {repr(val)}")
            return False
        elif val != val.strip():
            self._log(f"{descriptor} descriptor has whitespace padding: {repr(val)}")
            self.meta[descriptor] = val.strip()
            return False
        else:
            return True

    def _check_taxon_meta(self) -> bool:
        """Check taxon field metadata.

        Checks the taxonomic metadata of abundance and trait fields. This is more
        involved than the simple _check_meta(), because of the option to provide
        taxon_name or taxon_field descriptors. It returns a boolean showing whether
        checks pass or not - the actual causes of failure are added to the field message
        log.
        """

        # Are taxon_name and taxon_field provided and not blank: note use of
        # 'and' rather than '&' to allow missing descriptors to short cut

        tx_nm = self.meta.get("taxon_name")
        tx_fd = self.meta.get("taxon_field")
        tx_nm_prov = not blank_value(tx_nm)
        tx_fd_prov = not blank_value(tx_fd)

        if tx_nm_prov and tx_fd_prov:
            self._log("Taxon name and taxon field both provided, use one only")
            return False
        elif not tx_nm_prov and not tx_fd_prov:
            self._log("One of taxon name or taxon field must be provided")
            return False
        elif tx_nm_prov:
            if self.taxa is None:
                self._log(
                    "Taxon name provided but no Taxa instance available", CRITICAL
                )
                return False
            elif self.taxa.is_empty:
                self._log("Taxon name provided but no taxa loaded")
                return False
            elif tx_nm not in self.taxa.taxon_names:
                self._log("Taxon name not found in the Taxa worksheet")
                return False
            else:
                self.taxa.taxon_names_used.add(tx_nm)
                return True
        elif tx_fd_prov:
            if self.dwsh is None:
                self._log(
                    f"Taxon field provided but no dataworksheet provided for this "
                    f"field: {tx_fd}",
                    CRITICAL,
                )
                return False
            elif tx_fd not in self.dwsh.taxa_fields:
                self._log(f"Taxon field not found in this worksheet: {tx_fd}")
                return False
            else:
                return True
        else:
            return True

    def _check_interaction_meta(self) -> bool:
        """Check interaction metadata.

        Checks the taxonomic metadata of interaction fields. This is more involved than
        the simple _check_meta(), because of the option to provide taxon_name or
        taxon_field descriptors describing at least two taxonomic identifiers. It
        returns a boolean showing whether checks pass or not - the actual causes of
        failure are added to the field message log.
        """

        # TODO - currently no checking for descriptions being present in
        #        the interaction information. Not sure how many old datasets
        #        would be affected, but this would be good to include.

        # Are interaction_name and/or interaction_field provided and not blank
        iact_nm = self.meta.get("interaction_name")
        iact_fd = self.meta.get("interaction_field")
        iact_nm_prov = not blank_value(iact_nm)
        iact_fd_prov = not blank_value(iact_fd)

        if not iact_nm_prov and not iact_fd_prov:
            self._log(
                "At least one of interaction name or interaction field must be provided"
            )
            return False

        if iact_nm_prov:
            # Check that self.taxa has actually been populated
            if self.taxa is None:
                self._log(
                    "Interaction name provided but no Taxa instance available", CRITICAL
                )
                return False
            elif self.taxa.is_empty:
                self._log("Interaction name provided but no taxa loaded")
                return False

            # get the taxon names and descriptions from interaction name providers
            iact_nm_lab, iact_nm_desc = self._parse_levels(str(iact_nm))

            # add names to used taxa
            self.taxa.taxon_names_used.update(iact_nm_lab)

            # check they are found
            iact_nm_lab_in_set = IsInSet(iact_nm_lab, tuple(self.taxa.taxon_names))

            if not iact_nm_lab_in_set:
                self._log(
                    "Unknown taxa in interaction_name descriptor",
                    extra={"join": iact_nm_lab_in_set.failed},
                )
                nm_check = False
            else:
                nm_check = True

            iact_nm_lab = iact_nm_lab_in_set.values
        else:
            iact_nm_lab = []
            iact_nm_desc = ()
            nm_check = True

        if iact_fd_prov:
            # Check that self.dwsh has been populated properly
            if self.dwsh is None:
                self._log(
                    f"Interaction field provided but no dataworksheet provided for this"
                    f" field: {iact_fd}",
                    CRITICAL,
                )
                return False
            # check any field labels match to known taxon fields
            iact_fd_lab, iact_fd_desc = self._parse_levels(str(iact_fd))

            iact_fd_lab_in_set = IsInSet(iact_fd_lab, tuple(self.dwsh.taxa_fields))
            if not iact_fd_lab_in_set:
                self._log(
                    "Unknown taxon fields in interaction_field descriptor",
                    extra={"join": iact_fd_lab_in_set.failed},
                )
                fd_check = False
            else:
                fd_check = True

            iact_fd_lab = iact_fd_lab_in_set.values
        else:
            iact_fd_lab = []
            iact_fd_desc = ()
            fd_check = True

        if len(iact_nm_lab + iact_fd_lab) < 2:
            self._log(
                "At least two interacting taxon labels or fields must be identified"
            )
            num_check = False
        else:
            num_check = True

        all_desc = iact_nm_desc + iact_fd_desc
        if None in all_desc:
            self._log(
                "Label descriptions for interacting taxa incomplete or missing", WARNING
            )

        if nm_check and fd_check and num_check:
            return True
        else:
            return False

    def _parse_levels(self, txt: str) -> tuple[list[str], tuple[str | None, ...]]:
        """Parse categorical variable level descriptions.

        Splits up category information formatted as label:desc;label:desc, which is used
        in both levels for categorical data and interaction descriptors.

        Args:
            txt: The text string to parse

        Returns:
            A tuple containing a list of level labels and a tuple of the level
            descriptions.
        """
        # remove terminal semi-colon, if used.
        if txt.endswith(";"):
            txt = txt[:-1]

        # - split the text up by semi-colon
        levels = txt.split(";")

        # - split descriptions and standardize
        parsed_n_parts: set[int] = set()
        levels_parsed: list[list] = []

        for each_level in levels:
            # Split off any description
            level_parts = each_level.split(":")

            # Check for descriptions and build levels_parsed
            level_n_parts = len(level_parts)
            if level_n_parts == 1:
                levels_parsed.append([level_parts[0], None])
            else:
                levels_parsed.append(level_parts[0:2])

            parsed_n_parts.add(level_n_parts)

        # simple formatting checks
        if any([np > 2 for np in parsed_n_parts]):
            self._log("Extra colons in level description.")

        if len(parsed_n_parts) > 1:
            self._log("Provide descriptions for either all or none of the categories")

        level_labels, level_desc = zip(*levels_parsed)

        # - repeated labels?
        if len(set(level_labels)) < len(level_labels):
            self._log("Repeated level labels")

        # - check for numeric level names: integers would be more common
        # but don't let floats sneak through either!
        level_labels_not_numeric = IsNotNumericString(level_labels)
        if not level_labels_not_numeric:
            self._log("Numeric level names not permitted")

        # Remove white space around the labels: simple spacing in the text
        # makes it easier to read and insisting on no space is unnecessary
        level_labels_clean = [vl.strip() for vl in level_labels_not_numeric]

        return level_labels_clean, level_desc

    @classmethod
    def field_type_map(cls):
        """Returns a map of field types to BaseField subclasses.

        This class method iterates over the existing subclasses of BaseField and
        returns a dictionary mapping the field types handled by each subclass to the
        subclass type. This allows the field types listed in field metadata to be
        quickly matched to the correct BaseField subclass.
        """

        field_type_map = {}

        # Create a stack starting with the calling class
        classes = cls.__subclasses__()

        # Pop subclasses while the stack is not empty
        while classes:
            # Get a field class
            current_field_class = classes.pop(0)

            # Add any subclasses from that class
            classes.extend(current_field_class.__subclasses__())

            # Extend the field map
            for ftype in current_field_class.field_types:
                field_type_map.update({ftype: current_field_class})

        return field_type_map

    def validate_data(self, data: list) -> None:
        """Standard method to validate data.

        Most classes that inherit from BaseField will overwrite this to include more
        specific validation rules.
        """

        data = self.run_common_validation(data)

    def run_common_validation(self, data: list) -> list:
        """Validates a list of data provided for a field.

        This base class method runs the common shared validation steps for input data
        for a given field. It checks for:

        * Explicit missing data, using the NA value.
        * Empty cells, which is an error.
        * Excel cell error codes (such as `#VALUE!`)

        Subclasses should also carry out field specific testing. To ensure that only the
        data that passes the common checks is subjected to extra testing, within their
        data validation methods subclasses should use:

            data = self.run_common_validation(data)

        Args:
            data: a set of values from a data table for the field.

        Returns:
            A list of the cleaned input values, excluding blanks, NA and any Excel
            errors.
        """

        # Unless any input is ok (comments fields, basically) filter out
        # missing and blank data. Also check for Excel formula errors.
        self.n_rows += len(data)

        # Look for NAs
        data_no_na = IsNotNA(data, keep_failed=False)
        if not data_no_na:
            self.n_na += len(data_no_na.failed)

        # Look for blank data
        data_no_blanks = IsNotBlank(data_no_na, keep_failed=False)
        if not data_no_blanks:
            self.n_blank += len(data_no_blanks.failed)

        # Look for formula errors:

        # With openpyxl, using data_only=False loads function cells with
        # data_type = 'f', but using data_only=True returns cells with a
        # data_type for the resulting value _or_ an 'e' code for a function
        # error. Using value_only=True in iter_row for speed loses that cue,
        # so the values simply come back as strings that have to be matched
        # to the set of Excel error codes. See also:
        #
        # https://groups.google.com/g/openpyxl-users/c/iNi1MKSP-Bc/m/2q_7cHavBQAJ

        data_no_excel_errors = IsNotExcelError(data_no_blanks, keep_failed=False)
        if not data_no_excel_errors:
            self.n_excel_error += len(data_no_excel_errors.failed)

        # Return cleaned data for use in further checking.
        return data_no_excel_errors.values

    def report(self) -> None:
        """Report on field creation and data validation.

        This method emits any messages queued in the log stack during the creation
        of a field instance and data validation. The method also contains any final
        checking of data values across all validated data.
        """

        LOGGER.info(f"Checking field {self.field_name}")

        FORMATTER.push()

        # Emit the accumulated messages
        for log_level, msg, extra in self.log_stack:
            LOGGER.log(log_level, msg, extra=extra)

        # Run any final logging
        # - warn about NAs
        if self.n_na:
            LOGGER.warning(f"{self.n_na} / {self.n_rows} values missing")

        # We don't tolerate blank cells
        if self.n_blank:
            LOGGER.error(
                f"{self.n_blank} cells are blank or contain only whitespace text"
            )

        # Report on excel error codes
        if self.n_excel_error:
            LOGGER.error(f"{self.n_excel_error} cells contain Excel formula errors")

        FORMATTER.pop()
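
A minimal sketch of the instantiate / validate_data / report workflow described above, using BaseField directly with hypothetical metadata; subclasses follow the same pattern but add type-specific checks.

    meta = {
        "field_type": "comments",
        "description": "Free text notes",
        "field_name": "notes",
    }
    fld = BaseField(meta)

    # Data can be passed in one or more chunks; messages accumulate in the log stack.
    fld.validate_data(["Dry weather", "Light rain", "NA"])

    # Emit the accumulated messages and the summary checks for this field.
    fld.report()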

field_type_map() classmethod

Returns a map of field types to BaseField subclasses.

This class method iterates over the existing subclasses of BaseField and returns a dictionary mapping the field types handled by each subclass to the subclass type. This allows the field types listed in field metadata to be quickly matched to the correct BaseField subclass.

Source code in safedata_validator/field.py (lines 1322-1349)
@classmethod
def field_type_map(cls):
    """Returns a map of field types to BaseField subclasses.

    This class method iterates over the existing subclasses of BaseField and
    returns a dictionary mapping the field types handled by each subclass to the
    subclass type. This allows the field types listed in field metadata to be
    quickly matched to the correct BaseField subclass.
    """

    field_type_map = {}

    # Create a stack starting with the calling class
    classes = cls.__subclasses__()

    # Pop subclasses while the stack is not empty
    while classes:
        # Get a field class
        current_field_class = classes.pop(0)

        # Add any subclasses from that class
        classes.extend(current_field_class.__subclasses__())

        # Extend the field map
        for ftype in current_field_class.field_types:
            field_type_map.update({ftype: current_field_class})

    return field_type_map
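
A quick sketch of inspecting the resulting mapping; the exact set of keys depends on the BaseField subclasses defined in the module.

    type_map = BaseField.field_type_map()
    print(type_map["categorical"])   # the CategoricalField class
    print(sorted(type_map))          # all recognised field_type strings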

report()

Report on field creation and data validation.

This method emits any messages queued in the log stack during the creation of a field instance and data validation. The method also contains any final checking of data values across all validated data.

Source code in safedata_validator/field.py (lines 1416-1447)
def report(self) -> None:
    """Report on field creation and data validation.

    This method emits any messages queued in the log stack during the creation
    of a field instance and data validation. The method also contains any final
    checking of data values across all validated data.
    """

    LOGGER.info(f"Checking field {self.field_name}")

    FORMATTER.push()

    # Emit the accumulated messages
    for log_level, msg, extra in self.log_stack:
        LOGGER.log(log_level, msg, extra=extra)

    # Run any final logging
    # - warn about NAs
    if self.n_na:
        LOGGER.warning(f"{self.n_na} / {self.n_rows} values missing")

    # We don't tolerate blank cells
    if self.n_blank:
        LOGGER.error(
            f"{self.n_blank} cells are blank or contain only whitespace text"
        )

    # Report on excel error codes
    if self.n_excel_error:
        LOGGER.error(f"{self.n_excel_error} cells contain Excel formula errors")

    FORMATTER.pop()

run_common_validation(data)

Validates a list of data provided for a field.

This base class method runs the common shared validation steps for input data for a given field. It checks for:

  • Explicit missing data, using the NA value.
  • Empty cells, which is an error.
  • Excel cell error codes (such as #VALUE!)

Subclasses should also carry out field specific testing. To ensure that only the data that passes the common checks is subjected to extra testing, within their data validation methods subclasses should use:

data = self.run_common_validation(data)

Parameters:

Name Type Description Default
data list

a set of values from a data table for the field.

required

Returns:

Type Description
list

A list of the cleaned input values, excluding blanks, NA and any Excel errors.

Source code in safedata_validator/field.py (lines 1360-1414)
def run_common_validation(self, data: list) -> list:
    """Validates a list of data provided for a field.

    This base class method runs the common shared validation steps for input data
    for a given field. It checks for:

    * Explicit missing data, using the NA value.
    * Empty cells, which is an error.
    * Excel cell error codes (such as `#VALUE!`)

    Subclasses should also carry out field specific testing. To ensure that only the
    data that passes the common checks is subjected to extra testing, within their
    data validation methods subclasses should use:

        data = self.run_common_validation(data)

    Args:
        data: a set of values from a data table for the field.

    Returns:
        A list of the cleaned input values, excluding blanks, NA and any Excel
        errors.
    """

    # Unless any input is ok (comments fields, basically) filter out
    # missing and blank data. Also check for Excel formula errors.
    self.n_rows += len(data)

    # Look for NAs
    data_no_na = IsNotNA(data, keep_failed=False)
    if not data_no_na:
        self.n_na += len(data_no_na.failed)

    # Look for blank data
    data_no_blanks = IsNotBlank(data_no_na, keep_failed=False)
    if not data_no_blanks:
        self.n_blank += len(data_no_blanks.failed)

    # Look for formula errors:

    # With openpyxl, using data_only=False loads function cells with
    # data_type = 'f', but using data_only=True returns cells with a
    # data_type for the resulting value _or_ an 'e' code for a function
    # error. Using value_only=True in iter_row for speed loses that cue,
    # so the values simply come back as strings that have to be matched
    # to the set of Excel error codes. See also:
    #
    # https://groups.google.com/g/openpyxl-users/c/iNi1MKSP-Bc/m/2q_7cHavBQAJ

    data_no_excel_errors = IsNotExcelError(data_no_blanks, keep_failed=False)
    if not data_no_excel_errors:
        self.n_excel_error += len(data_no_excel_errors.failed)

    # Return cleaned data for use in further checking.
    return data_no_excel_errors.values
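
A minimal sketch of a hypothetical subclass layering an extra check on top of the shared validation; the 'uppercase' field type and the check itself are invented purely for illustration.

    class UpperCaseField(BaseField):
        field_types = ("uppercase",)

        def validate_data(self, data: list) -> None:
            # Only values passing the common checks reach the extra test.
            data = self.run_common_validation(data)
            bad = [val for val in data if isinstance(val, str) and not val.isupper()]
            if bad:
                self._log("Values are not upper case", extra={"join": bad})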

validate_data(data)

Standard method to validate data.

Most classes that inherit from BaseField will overwrite this to include more specific validation rules.

Source code in safedata_validator/field.py (lines 1351-1358)
def validate_data(self, data: list) -> None:
    """Standard method to validate data.

    Most classes that inherit from BaseField will overwrite this to include more
    specific validation rules.
    """

    data = self.run_common_validation(data)

field_to_dict(fld, col_idx)

Convert an object inheriting from BaseField into a dictionary.

A function to return a dictionary representation of a field object. This would more naturally be a method of BaseField, but needs to include the extended attributes of subclasses of BaseField.

TODO - the BaseField should define all these attributes and then this becomes a method

Parameters:

Name Type Description Default
fld EmptyField | BaseField

An instance inheriting from BaseField

required
col_idx int

The column index of the field

required

Returns:

Type Description
dict

A dictionary representation of the field attributes.

Source code in safedata_validator/field.py (lines 1450-1502)
def field_to_dict(fld: EmptyField | BaseField, col_idx: int) -> dict:
    """Convert a object inheriting from BaseField into a dictionary.

    A function to return a dictionary representation of a field object. This would more
    naturally be a method of BaseField, but needs to include the extended attributes of
    subclasses of BaseField.

    TODO - the BaseField should define all these attributes and then this becomes
    a method

    Args:
        fld: An instance inheriting from BaseField
        col_idx: The column index of the field

    Returns:
        A dictionary representation of the field attributes.
    """

    output = dict(
        field_name=fld.field_name,
        description=fld.meta["description"],
        field_type=fld.meta["field_type"],
        units=None,
        method=None,
        levels=None,
        taxon_field=None,
        taxon_name=None,
        interaction_field=None,
        interaction_name=None,
        range=None,
        col_idx=col_idx,
    )

    if hasattr(fld, "units"):
        output["units"] = getattr(fld, "units")
        output["method"] = getattr(fld, "method")

    if hasattr(fld, "levels"):
        output["levels"] = getattr(fld, "levels")

    if hasattr(fld, "taxon_field"):
        output["levels"] = getattr(fld, "taxon_field")

    if hasattr(fld, "interaction_name"):
        output["interaction_name"] = getattr(fld, "interaction_name")

    if hasattr(fld, "interaction_field"):
        output["interaction_field"] = getattr(fld, "interaction_field")

    if hasattr(fld, "range"):
        output["range"] = getattr(fld, "range")

    return output
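
A brief usage sketch, assuming fld is a field instance taken from a checked worksheet.

    field_record = field_to_dict(fld, col_idx=1)
    # Keys that are not relevant to the field's type remain None.
    print(field_record["field_name"], field_record["field_type"], field_record["range"])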

CategoricalField

Bases: BaseField

A BaseField subclass for categorical and ordered categorical fields.

Extends the BaseField __init__ method to store the level labels and track the reported levels during data validation.

Source code in safedata_validator/field.py (lines 1560-1632)
class CategoricalField(BaseField):
    """A BaseField subclass for categorical and ordered categorical fields.

    Extends the [BaseField `__init__` method][safedata_validator.field.BaseField]
    to store the level labels and track the reported levels during data
    validation.
    """

    field_types: tuple[str, ...] = ("categorical", "ordered categorical")
    required_descriptors = MANDATORY_DESCRIPTORS + ["levels"]

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        # Additional code to validate the levels metadata and store a set of
        # values
        self.level_labels = set()
        self.reported_levels: set[str] = set()
        levels = meta.get("levels")

        if levels is not None and isinstance(levels, str) and not levels.isspace():
            level_labels, level_desc = self._parse_levels(levels)
            self.level_labels = set(level_labels)

    def validate_data(self, data: list) -> None:
        """Validate categorical field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method and also checks that string values are provided and populates the set of
        reported levels.
        """
        data = self.run_common_validation(data)

        # Now look for consistency: get the unique values reported in the
        # data, convert to unicode to handle checking of numeric labels and
        # then check the reported levels are a subset of the descriptors.
        # XLRD reads all numbers as floats, so coerce floats back to int
        data_as_string = IsString(data, keep_failed=False)
        if not data_as_string:
            self._log("Cells contain non-text values")

        self.reported_levels.update(data_as_string)

    def report(self) -> None:
        """Report on field creation and data validation for categorical fields.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to handle undeclared or unused level labels.
        """
        super().report()
        FORMATTER.push()

        extra = self.reported_levels.difference(self.level_labels)
        unused = self.level_labels.difference(self.reported_levels)

        if extra:
            LOGGER.error(
                "Categories found in data missing from levels descriptor: ",
                extra={"join": extra},
            )
        if unused:
            LOGGER.error(
                "Categories found in levels descriptor not used in data: ",
                extra={"join": unused},
            )

        FORMATTER.pop()

report()

Report on field creation and data validation for categorical fields.

Extends the BaseField report method to handle undeclared or unused level labels.

Source code in safedata_validator/field.py
def report(self) -> None:
    """Report on field creation and data validation for categorical fields.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to handle undeclared or unused level labels.
    """
    super().report()
    FORMATTER.push()

    extra = self.reported_levels.difference(self.level_labels)
    unused = self.level_labels.difference(self.reported_levels)

    if extra:
        LOGGER.error(
            "Categories found in data missing from levels descriptor: ",
            extra={"join": extra},
        )
    if unused:
        LOGGER.error(
            "Categories found in levels descriptor not used in data: ",
            extra={"join": unused},
        )

    FORMATTER.pop()

validate_data(data)

Validate categorical field data.

Runs the BaseField run_common_validation method and also checks that string values are provided and populates the set of reported levels.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate categorical field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method and also checks that string values are provided and populates the set of
    reported levels.
    """
    data = self.run_common_validation(data)

    # Now look for consistency: get the unique values reported in the
    # data, convert to unicode to handle checking of numeric labels and
    # then check the reported levels are a subset of the descriptors.
    # XLRD reads all numbers as floats, so coerce floats back to int
    data_as_string = IsString(data, keep_failed=False)
    if not data_as_string:
        self._log("Cells contain non-text values")

    self.reported_levels.update(data_as_string)

CategoricalInteractionField

Bases: CategoricalField

A CategoricalField subclass for categorical interaction fields.

Further extends the CategoricalField subclass to also apply checking of the interaction metadata for categorical interaction data.

Source code in safedata_validator/field.py
class CategoricalInteractionField(CategoricalField):
    """A CategoricalField subclass for categorical trait fields.

    Further extends the [NumericField][safedata_validator.field.NumericField] subclass
    to also apply checking of the interaction metadata for numeric traits and abundance
    data.
    """

    field_types = ("categorical interaction",)
    check_interaction_meta = True

CommentField

Bases: BaseField

A BaseField subclass for comments fields.

This subclass overrides the base data validation to allow any content in comments fields. See safedata_validator.field.BaseField for details.

Source code in safedata_validator/field.py
class CommentField(BaseField):
    """A BaseField subclass for comments fields.

    This subclass overrides the base data validation to allow any content in comments
    fields. See [safedata_validator.field.BaseField][] for details.
    """

    field_types = ("comments",)

    def validate_data(self, data: list) -> None:
        """Validate data in comment fields.

        This overrides the BaseField
        [validate_data][safedata_validator.field.BaseField.validate_data] method to
        remove any checking on comments fields.
        """

        return

validate_data(data)

Validate data in comment fields.

This overrides the BaseField validate_data method to remove any checking on comments fields.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate data in comment fields.

    This overrides the BaseField
    [validate_data][safedata_validator.field.BaseField.validate_data] method to
    remove any checking on comments fields.
    """

    return

DatetimeField

Bases: BaseField

A BaseField subclass for date and datetime fields.

Extends the BaseField __init__ method to validate date and datetime data. This is primarily about checking the consistency of the provided data. The function accepts values either as ISO strings in a consistent datetime or date format or as datetime.datetime objects. The parsing of time strings follows ISO 8601, which includes quite a range of valid inputs.

One problem here is that - when stored as Excel datetime cells - dates and times in Excel are poorly typed: essentially as a number with a meaning that depends in part on the cell format. The openpyxl package returns datetime.datetime objects for cells with datetime or date formats.
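
String values are parsed with dateutil's ISO 8601 parser, as in the source below. A minimal sketch of that behaviour, assuming dateutil is installed and using invented values:

from dateutil import parser
import datetime

values = ["2024-05-01", "2024-05-01T13:30:00"]
parsed = [parser.isoparse(v) for v in values]

# A 'date' field is expected to contain only midnight time components; any
# non-midnight time in a 'date' field is reported as an error, and a
# 'datetime' field containing only midnight values triggers a warning.
midnight = datetime.time(0, 0)
all_midnight = all(p.time() == midnight for p in parsed)   # False for this example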

Source code in safedata_validator/field.py
class DatetimeField(BaseField):
    """A BaseField subclass for date and datetime fields.

    Extends the [BaseField `__init__` method][safedata_validator.field.BaseField]
    to validate date and datetime data. This is primarily about checking the
    consistency of the provided data. The function accepts values either as ISO
    strings in a consistent datetime or date format or as datetime.datetime
    objects. The parsing of time strings follows ISO 8601, which includes quite
    a range of valid inputs.

    One problem here is that - when stored as Excel datetime cells - dates and
    times in Excel are poorly typed: essentially as a number with a meaning that
    depends in part on the cell format. The openpyxl package returns
    datetime.datetime objects for cells with datetime or date formats.
    """

    field_types = ("datetime", "date")

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        if self.dataset is None:
            self._log("No dataset object provided - cannot update extents")

        # Defaults
        self.first_data_class_set: set | None = None
        self.consistent_class = True
        self.expected_class = True
        self.all_midnight = True

        self.bad_strings: list[str] = []
        self.min = None
        self.max = None

    def validate_data(self, data: list) -> None:
        """Validate date and datetime field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method and also checks that the data are consistently formatted and contain
        valid date or datetime values.
        """

        data = self.run_common_validation(data)

        # TODO: report row number of issues - problem of mismatch between
        # self.n_rows (_all_ rows) and index in data (non-NA rows)

        # Check for consistent class formatting, using first row. Do not try and
        # validate further when data is not consistently formatted.
        if self.consistent_class and self.expected_class:
            cell_types = [type(dt) for dt in data]
            cell_type_set = set(cell_types)

            # Are all the values of expected types?
            if not {datetime.datetime, str}.issuperset(cell_type_set):
                self.expected_class = False
                return

            # Are the values internally consistent and consistent with
            # previously loaded data
            if len(cell_type_set) == 0:
                # If data is empty skip consistency checking
                return
            elif len(cell_type_set) > 1:
                self.consistent_class = False
                return
            elif self.first_data_class_set is None:
                self.first_data_class_set = cell_type_set
            elif self.first_data_class_set != cell_type_set:
                self.consistent_class = False
                return
        else:
            return

        # Value checking only happens while data are consistently formatted
        # Look for string formatting, but also presence/absence of non-zero
        # time components.
        midnight = datetime.time(0, 0)

        if self.first_data_class_set == {str}:
            parsed_strings = []
            for val in data:
                try:
                    val_parse = parser.isoparse(val)
                    if val_parse.time() != midnight:
                        self.all_midnight = False
                    parsed_strings.append(val_parse)
                except ValueError:
                    self.bad_strings.append(val)

            # standardised to datetime objects
            data = parsed_strings

        elif self.first_data_class_set == {datetime.datetime}:
            for val in data:
                if val.time() != midnight:
                    self.all_midnight = False

        # Check that data isn't empty
        if data:
            # Update range for extents
            if self.min is None:
                self.min = min(data)
            else:
                self.min = min([self.min] + data)

            if self.max is None:
                self.max = max(data)
            else:
                self.max = max([self.max] + data)

    def report(self):
        """Report on field creation and data validation for date and datetime fields.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to flag inconsistent date and datetime formatting and invalid data.
        """
        super().report()
        FORMATTER.push()

        # Inconsistent and bad data classes
        if not self.expected_class:
            LOGGER.error(
                "Date or datetime data include values neither ISO string nor time "
                "formatted"
            )
            return

        if not self.consistent_class:
            LOGGER.error(
                "Date or datetime data mixes ISO string and time formatted rows"
            )
            return

        # Bad string formats
        if self.bad_strings:
            LOGGER.error(
                "ISO datetime strings contain badly formatted values: e.g.",
                extra={"join": self.bad_strings[:5]},
            )

        # Datetime vs Date checking - Datetimes _could_ all be at midnight, so not an
        # error but does look odd.
        if self.meta["field_type"] == "datetime" and self.all_midnight:
            LOGGER.warning("Field is of type datetime, but only reports dates")
        if self.meta["field_type"] == "date" and not self.all_midnight:
            LOGGER.error("Field is of type date, but includes time data")

        # Update extent if possible - note that datetime.datetime inherits from
        # datetime.date, so the isinstance check in extent.Extent cannot exclude
        # datetime values from the declared datetime.date datatype; values are
        # converted with .date() here.
        if not (self.dataset is None or self.min is None or self.max is None):
            self.dataset.temporal_extent.update([self.min.date(), self.max.date()])

        FORMATTER.pop()

report()

Report on field creation and data validation for date and datetime fields.

Extends the BaseField report method to flag inconsistent date and datetime formatting and invalid data.

Source code in safedata_validator/field.py
def report(self):
    """Report on field creation and data validation for date and datetime fields.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to flag inconsistent date and datetime formatting and invalid data.
    """
    super().report()
    FORMATTER.push()

    # Inconsistent and bad data classes
    if not self.expected_class:
        LOGGER.error(
            "Date or datetime data include values neither ISO string nor time "
            "formatted"
        )
        return

    if not self.consistent_class:
        LOGGER.error(
            "Date or datetime data mixes ISO string and time formatted rows"
        )
        return

    # Bad string formats
    if self.bad_strings:
        LOGGER.error(
            "ISO datetime strings contain badly formatted values: e.g.",
            extra={"join": self.bad_strings[:5]},
        )

    # Datetime vs Date checking - Datetimes _could_ all be at midnight, so not an
    # error but does look odd.
    if self.meta["field_type"] == "datetime" and self.all_midnight:
        LOGGER.warning("Field is of type datetime, but only reports dates")
    if self.meta["field_type"] == "date" and not self.all_midnight:
        LOGGER.error("Field is of type date, but includes time data")

    # Update extent if possible - note that datetime.datetime inherits from
    # datetime.date, so the isinstance check in extent.Extent cannot exclude
    # datetime values from the declared datetime.date datatype; values are
    # converted with .date() here.
    if not (self.dataset is None or self.min is None or self.max is None):
        self.dataset.temporal_extent.update([self.min.date(), self.max.date()])

    FORMATTER.pop()

validate_data(data)

Validate date and datetime field data.

Runs the BaseField run_common_validation method and also checks that the data are consistently formatted and contain valid date or datetime values.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate date and datetime field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method and also checks that the data are consistently formatted and contain
    valid date or datetime values.
    """

    data = self.run_common_validation(data)

    # TODO: report row number of issues - problem of mismatch between
    # self.n_rows (_all_ rows) and index in data (non-NA rows)

    # Check for consistent class formatting, using first row. Do not try and
    # validate further when data is not consistently formatted.
    if self.consistent_class and self.expected_class:
        cell_types = [type(dt) for dt in data]
        cell_type_set = set(cell_types)

        # Are all the values of expected types?
        if not {datetime.datetime, str}.issuperset(cell_type_set):
            self.expected_class = False
            return

        # Are the values internally consistent and consistent with
        # previously loaded data
        if len(cell_type_set) == 0:
            # If data is empty skip consistency checking
            return
        elif len(cell_type_set) > 1:
            self.consistent_class = False
            return
        elif self.first_data_class_set is None:
            self.first_data_class_set = cell_type_set
        elif self.first_data_class_set != cell_type_set:
            self.consistent_class = False
            return
    else:
        return

    # Value checking only happens while data are consistently formatted
    # Look for string formatting, but also presence/absence of non-zero
    # time components.
    midnight = datetime.time(0, 0)

    if self.first_data_class_set == {str}:
        parsed_strings = []
        for val in data:
            try:
                val_parse = parser.isoparse(val)
                if val_parse.time() != midnight:
                    self.all_midnight = False
                parsed_strings.append(val_parse)
            except ValueError:
                self.bad_strings.append(val)

        # standardised to datetime objects
        data = parsed_strings

    elif self.first_data_class_set == {datetime.datetime}:
        for val in data:
            if val.time() != midnight:
                self.all_midnight = False

    # Check that data isn't empty
    if data:
        # Update range for extents
        if self.min is None:
            self.min = min(data)
        else:
            self.min = min([self.min] + data)

        if self.max is None:
            self.max = max(data)
        else:
            self.max = max([self.max] + data)

EmptyField

A BaseField-like interface for empty fields.

This class mocks the interface of BaseField and is only used to check trailing empty fields in loaded field data. These are columns which have no field metadata, but which are included in the data table. The data rows should be empty.

This class does not inherit from BaseField, so is never included in the BaseField.field_type_map.

Parameters:

Name Type Description Default
meta dict

A dictionary of field metadata. All values should be None with the exception of the column index number (col_idx).

required
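
A minimal sketch of the trailing-column check, using a hypothetical is_blank helper in place of the module's blank_value function and invented cell values:

def is_blank(val):
    # Hypothetical stand-in for safedata_validator's blank_value helper
    return val is None or (isinstance(val, str) and not val.strip())

trailing_column = [None, "", "   ", "oops"]
column_is_empty = all(is_blank(v) for v in trailing_column)   # False: the field is reported as containing data
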
Source code in safedata_validator/field.py
class EmptyField:
    """A BaseField-like interface for empty fields.

    This class mocks the interface of BaseField and is only used to check trailing empty
    fields in loaded field data. These are columns which have no field metadata, but
    which are included in the data table. The data rows should be empty.

    This class does not inherit from BaseField, so is never included in the
    BaseField.field_type_map.

    Args:
        meta: A dictionary of field metadata. All values should be None with the
            exception of the column index number (`col_idx`).
    """

    def __init__(self, meta: dict) -> None:
        self.meta = meta
        self.empty = True
        # Get a field name - either a column letter from col_idx if set or 'Unknown'
        idx = self.meta.get("col_idx")
        if idx is None:
            self.field_name = "Unknown"
        else:
            self.field_name = f"Column_{get_column_letter(idx)}"

    def validate_data(self, data) -> None:
        """Validates that empty fields contain no data."""
        blank = [blank_value(val) for val in data]

        if not all(blank):
            self.empty = False

    def report(self):
        """Report on an empty field."""

        if not self.empty:
            LOGGER.info(f"Checking field {self.field_name}")
            FORMATTER.push()
            LOGGER.error("Trailing field with no descriptors contains data.")
            FORMATTER.pop()

report()

Report on an empty field.

Source code in safedata_validator/field.py
def report(self):
    """Report on an empty field."""

    if not self.empty:
        LOGGER.info(f"Checking field {self.field_name}")
        FORMATTER.push()
        LOGGER.error("Trailing field with no descriptors contains data.")
        FORMATTER.pop()

validate_data(data)

Validates that empty fields contain no data.

Source code in safedata_validator/field.py
def validate_data(self, data) -> None:
    """Validates that empty fields contain no data."""
    blank = [blank_value(val) for val in data]

    if not all(blank):
        self.empty = False

FileField

Bases: BaseField

A BaseField subclass for file fields.

Extends the BaseField subclass to check file fields. The data values need to match an external file declared in the Summary, or can be contained within an archive file provided in the 'file_container' descriptor.
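
A minimal sketch of the filename check applied in validate_data below, using invented file names: when no 'file_container' is provided, every value in the field must appear among the external files declared in the Summary.

external_names = {"site_photos.zip", "raw_counts.csv"}   # from the Summary external files
field_values = ["raw_counts.csv", "notes.txt"]

unknown_files = set(field_values) - external_names        # {'notes.txt'}: reported as not provided in the Summary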

Source code in safedata_validator/field.py
class FileField(BaseField):
    """A BaseField subclass for time fields.

    Extends the [BaseField][safedata_validator.field.BaseField] subclass to check file
    fields. The data values need to match to an external file or can be contained within
    an archive file provided in the 'file_container' descriptor.
    """

    field_types = ("file",)

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        self.unknown_file_names: set[str] = set()

        # Check whether filename testing is possible
        if self.summary is None:
            self._log(
                "No Summary instance provided - cannot check file fields",
                level=CRITICAL,
            )
            return
        if self.summary.external_files is None:
            self._log("No external files listed in Summary")
            return

        self.external_names = {ex["file"] for ex in self.summary.external_files}

        # Look for a file container metadata value, pointing to a single file
        # containing all of the listed values (zip etc)
        self.file_container = self.meta.get("file_container")

        if self.file_container is not None:
            if self.file_container not in self.external_names:
                self._log(
                    f"Field file_container value not found in external files:"
                    f" {self.file_container}"
                )

    def validate_data(self, data: list):
        """Validate file field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method, and also checks for unknown files given in a file field.
        """

        data = self.run_common_validation(data)

        # If the files are listed in external and not provided in a file
        # container then check they are all present
        if not self.file_container:
            unknown_files = set(data) - self.external_names
            if unknown_files:
                self.unknown_file_names.update(unknown_files)

    def report(self):
        """Report on field creation and data validation for file fields.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to flag unknown files.
        """
        super().report()
        FORMATTER.push()

        if self.unknown_file_names:
            LOGGER.error(
                "Field contains external files not provided in Summary: ",
                extra={"join": self.unknown_file_names},
            )

        FORMATTER.pop()

report()

Report on field creation and data validation for file fields.

Extends the BaseField report method to flag unknown files.

Source code in safedata_validator/field.py
def report(self):
    """Report on field creation and data validation for file fields.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to flag unknown files.
    """
    super().report()
    FORMATTER.push()

    if self.unknown_file_names:
        LOGGER.error(
            "Field contains external files not provided in Summary: ",
            extra={"join": self.unknown_file_names},
        )

    FORMATTER.pop()

validate_data(data)

Validate file field data.

Runs the BaseField run_common_validation method, and also checks for unknown files given in a file field.

Source code in safedata_validator/field.py
def validate_data(self, data: list):
    """Validate file field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method, and also checks for unknown files given in a file field.
    """

    data = self.run_common_validation(data)

    # If the files are listed in external and not provided in a file
    # container then check they are all present
    if not self.file_container:
        unknown_files = set(data) - self.external_names
        if unknown_files:
            self.unknown_file_names.update(unknown_files)

GeoField

Bases: BaseField

A BaseField subclass for latitude and longitude fields.

Extends the BaseField __init__ method to check latitude and longitude data.
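
As the source below shows, coordinate values must be decimal degrees: non-numeric values are flagged, values matching a degrees-minutes-seconds pattern (e.g. 12°24'32"W) trigger a specific warning, and the numeric range is collated to update the dataset's geographic extent. A small illustration with invented values:

values = [4.65, 4.72, 4.58]                      # decimal degree latitudes
field_min, field_max = min(values), max(values)
# field_min and field_max are later pushed into the dataset's latitudinal
# or longitudinal extent, depending on the field_type.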

Source code in safedata_validator/field.py
class GeoField(BaseField):
    """A BaseField subclass for latitude and longitude fields.

    Extends the [BaseField `__init__` method][safedata_validator.field.BaseField]
    to check latitude and longitude data.
    """

    field_types = ("latitude", "longitude")

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        if self.dataset is None:
            self._log("No dataset object provided - cannot update extents")

        self.min = None
        self.max = None

    def validate_data(self, data: list) -> None:
        """Validate latitude and longitude data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method, and also checks for non-decimal formatting (e.g. 12°24'32"W) and
        collates the data range.
        """
        data = self.run_common_validation(data)

        data_as_number = IsNumber(data, keep_failed=False)

        if not data_as_number:
            self._log("Field contains non-numeric data")

            if any([RE_DMS.search(str(dt)) for dt in data_as_number.failed]):
                self._log(
                    "Possible degrees minutes and seconds formatting? Use decimal "
                    "degrees",
                    WARNING,
                )

        if data_as_number.values:
            self.min = (
                min(data_as_number.values + [self.min])
                if self.min is not None
                else min(data_as_number.values)
            )
            self.max = (
                max(data_as_number.values + [self.max])
                if self.max is not None
                else max(data_as_number.values)
            )

    def report(self) -> None:
        """Report on field creation and data validation for latitude and longitude data.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to check the coordinate range against the dataset geographic extents.
        """
        super().report()
        FORMATTER.push()

        if self.min is None or self.max is None:
            return

        # Note that self.min and self.max can be None here if all data is invalid, but
        # the update process handles None.
        if self.dataset is not None:
            if self.meta["field_type"] == "latitude":
                self.dataset.latitudinal_extent.update([self.min, self.max])
            elif self.meta["field_type"] == "longitude":
                self.dataset.longitudinal_extent.update([self.min, self.max])

        FORMATTER.pop()

report()

Report on field creation and data validation for latitude and longitude data.

Extends the BaseField report method to check the coordinate range against the dataset geographic extents.

Source code in safedata_validator/field.py
def report(self) -> None:
    """Report on field creation and data validation for latitude and longitude data.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to check the coordinate range against the dataset geographic extents.
    """
    super().report()
    FORMATTER.push()

    if self.min is None or self.max is None:
        return

    # Note that self.min and self.max can be None here if all data is invalid, but
    # the update process handles None.
    if self.dataset is not None:
        if self.meta["field_type"] == "latitude":
            self.dataset.latitudinal_extent.update([self.min, self.max])
        elif self.meta["field_type"] == "longitude":
            self.dataset.longitudinal_extent.update([self.min, self.max])

    FORMATTER.pop()

validate_data(data)

Validate latitude and longitude data.

Runs the BaseField run_common_validation method, and also checks for non-decimal formatting (e.g. 12°24'32"W) and collates the data range.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate latitude and longitude data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method, and also checks for non-decimal formatting (e.g. 12°24'32"W) and
    collates the data range.
    """
    data = self.run_common_validation(data)

    data_as_number = IsNumber(data, keep_failed=False)

    if not data_as_number:
        self._log("Field contains non-numeric data")

        if any([RE_DMS.search(str(dt)) for dt in data_as_number.failed]):
            self._log(
                "Possible degrees minutes and seconds formatting? Use decimal "
                "degrees",
                WARNING,
            )

    if data_as_number.values:
        self.min = (
            min(data_as_number.values + [self.min])
            if self.min is not None
            else min(data_as_number.values)
        )
        self.max = (
            max(data_as_number.values + [self.max])
            if self.max is not None
            else max(data_as_number.values)
        )

LocationsField

Bases: BaseField

A BaseField subclass for location fields.

Extends the BaseField __init__ method to check that all the values provided in a locations field are found in the locations data for the dataset.

Source code in safedata_validator/field.py
class LocationsField(BaseField):
    """A BaseField subclass for location fields.

    Extends the [BaseField `__init__` method][safedata_validator.field.BaseField]
    to check that all the values provided in a locations field are found in the
    locations data for the dataset.
    """

    field_types = ("locations", "location")

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        if self.locations is None:
            self._log("No location details provided for dataset")

        self.locations_found: set[str] = set()

    def validate_data(self, data: list) -> None:
        """Validate location field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method and also checks that location names are all known.
        """
        data = self.run_common_validation(data)

        data_locs = IsLocName(data, keep_failed=False)

        if not data_locs:
            self._log("Cells contain invalid location values")

        # Now should be strings and integer codes - convert to string
        # representations as used in the locations
        data = [str(v) for v in data_locs]

        if self.locations is not None:
            self.locations_found.update(data)
            self.locations.locations_used.update(data)

    def report(self) -> None:
        """Report on field creation and data validation for categorical fields.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to emit undeclared location names.
        """
        super().report()
        FORMATTER.push()

        # TODO - not sure about this - no other fields test for emptiness?
        if self.locations_found == set():
            LOGGER.error("No locations loaded")
            return

        if self.locations is not None:
            extra_locs = self.locations_found.difference(self.locations.locations)

            if extra_locs:
                LOGGER.error(
                    "Includes unreported locations: ", extra={"join": extra_locs}
                )

            # add the found locations to the set of locations used
            self.locations.locations_used.update(self.locations_found)

        FORMATTER.pop()

report()

Report on field creation and data validation for location fields.

Extends the BaseField report method to emit undeclared location names.

Source code in safedata_validator/field.py
def report(self) -> None:
    """Report on field creation and data validation for categorical fields.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to emit undeclared location names.
    """
    super().report()
    FORMATTER.push()

    # TODO - not sure about this - no other fields test for emptiness?
    if self.locations_found == set():
        LOGGER.error("No locations loaded")
        return

    if self.locations is not None:
        extra_locs = self.locations_found.difference(self.locations.locations)

        if extra_locs:
            LOGGER.error(
                "Includes unreported locations: ", extra={"join": extra_locs}
            )

        # add the found locations to the set of locations used
        self.locations.locations_used.update(self.locations_found)

    FORMATTER.pop()

validate_data(data)

Validate location field data.

Runs the BaseField run_common_validation method and also checks that location names are all known.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate location field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method and also checks that location names are all known.
    """
    data = self.run_common_validation(data)

    data_locs = IsLocName(data, keep_failed=False)

    if not data_locs:
        self._log("Cells contain invalid location values")

    # Now should be strings and integer codes - convert to string
    # representations as used in the locations
    data = [str(v) for v in data_locs]

    if self.locations is not None:
        self.locations_found.update(data)
        self.locations.locations_used.update(data)

NumericField

Bases: BaseField

A BaseField subclass for numeric fields.

Extends BaseField to validate numeric data fields.

Source code in safedata_validator/field.py
class NumericField(BaseField):
    """A BaseField subclass for numeric fields.

    Extends [BaseField][safedata_validator.field.BaseField] to validate numeric data
    fields.
    """

    field_types: tuple[str, ...] = ("numeric",)
    required_descriptors = MANDATORY_DESCRIPTORS + ["method", "units"]

    def validate_data(self, data: list) -> None:
        """Validate numeric field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method and also ensures that data values are numeric.
        """
        data = self.run_common_validation(data)

        numeric = IsNumber(data)

        if not numeric:
            self._log("Cells contain non-numeric values")

validate_data(data)

Validate numeric field data.

Runs the BaseField run_common_validation method and also ensures that data values are numeric.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate numeric field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method and also ensures that data values are numeric.
    """
    data = self.run_common_validation(data)

    numeric = IsNumber(data)

    if not numeric:
        self._log("Cells contain non-numeric values")

NumericInteractionField

Bases: NumericField

A NumericField subclass for numeric interaction fields.

Further extends the NumericField subclass to also apply checking of the interaction metadata for numeric interaction data.

Source code in safedata_validator/field.py
class NumericInteractionField(NumericField):
    """A NumericField subclass for numeric interaction fields.

    Further extends the [NumericField][safedata_validator.field.NumericField] subclass
    to also apply checking of the interaction metadata for numeric interaction data.
    """

    field_types = ("numeric interaction",)
    check_interaction_meta = True

ReplicateField

Bases: BaseField

A BaseField subclass for Replicate and ID fields.

Applies the base data checking needed on Replicate and ID fields. See safedata_validator.field.BaseField for details.

Source code in safedata_validator/field.py
class ReplicateField(BaseField):
    """A BaseField subclass for Replicate and ID fields.

    Applies the base data checking needed on Replicate and ID fields. See
    [safedata_validator.field.BaseField][] for details.
    """

    field_types = ("replicate", "id")

TaxaField

Bases: BaseField

A BaseField subclass for taxa fields.

Extends BaseField to track the reported taxa in the field during data validation.
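
The reporting step compares the taxon names used in the field against the taxon names declared for the dataset (taxa.taxon_names), in the same way as the categorical level check. A short sketch with invented names:

declared_taxa = {"Aporosa falcifera", "Macaranga hypoleuca"}   # taxon names declared for the dataset
reported_taxa = {"Aporosa falcifera", "Macaranga gigantea"}    # names used in this field

unknown_taxa = reported_taxa - declared_taxa   # {'Macaranga gigantea'}: reported as unreported taxa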

Source code in safedata_validator/field.py
class TaxaField(BaseField):
    """A BaseField subclass for taxa fields.

    Extends [BaseField][safedata_validator.field.BaseField] to track the reported
    taxa in the field during data validation.
    """

    field_types = ("taxa",)

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        if self.taxa is None:
            self._log("No taxon details provided for dataset")

        self.taxa_found: set[str] = set()

    def validate_data(self, data: list) -> None:
        """Validate taxa field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method and also looks for non-string values and tracks unused or unknown taxon
        names.
        """
        data = self.run_common_validation(data)

        data_as_string = IsString(data, keep_failed=False)

        if not data_as_string:
            self._log("Cells contain non-string values")

        if self.taxa is not None:
            self.taxa_found.update(data_as_string)
            self.taxa.taxon_names_used.update(data_as_string)

    def report(self) -> None:
        """Report on field creation and data validation for taxa fields.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to emit unused or unknown taxon names.
        """
        super().report()
        FORMATTER.push()

        # TODO - not sure about this - no other fields test for emptiness?
        if self.taxa_found == set():
            LOGGER.error("No taxa loaded")
            return

        if self.taxa is not None:
            extra_taxa = self.taxa_found.difference(self.taxa.taxon_names)

            if extra_taxa:
                LOGGER.error("Includes unreported taxa: ", extra={"join": extra_taxa})

            # add the found taxa to the list of taxa used
            self.taxa.taxon_names_used.update(self.taxa_found)

        FORMATTER.pop()

report()

Report on field creation and data validation for taxa fields.

Extends the BaseField report method to emit unused or unknown taxon names.

Source code in safedata_validator/field.py
def report(self) -> None:
    """Report on field creation and data validation for taxa fields.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to emit unused or unknown taxon names.
    """
    super().report()
    FORMATTER.push()

    # TODO - not sure about this - no other fields test for emptiness?
    if self.taxa_found == set():
        LOGGER.error("No taxa loaded")
        return

    if self.taxa is not None:
        extra_taxa = self.taxa_found.difference(self.taxa.taxon_names)

        if extra_taxa:
            LOGGER.error("Includes unreported taxa: ", extra={"join": extra_taxa})

        # add the found taxa to the list of taxa used
        self.taxa.taxon_names_used.update(self.taxa_found)

    FORMATTER.pop()

validate_data(data)

Validate taxa field data.

Runs the BaseField run_common_validation method and also looks for non-string values and tracks unused or unknown taxon names.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate taxa field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method and also looks for non-string values and tracks unused or unknown taxon
    names.
    """
    data = self.run_common_validation(data)

    data_as_string = IsString(data, keep_failed=False)

    if not data_as_string:
        self._log("Cells contain non-string values")

    if self.taxa is not None:
        self.taxa_found.update(data_as_string)
        self.taxa.taxon_names_used.update(data_as_string)

TimeField

Bases: BaseField

A BaseField subclass for time fields.

Extends the BaseField subclass to validate data in time fields. Time field validation is primarily about checking the consistency of the provided data. The function accepts values either as ISO strings in a consistent time format or as datetime.time objects. The parsing of time strings follows ISO 8601, which includes quite a range of valid inputs: '11:12:13' and '111213' are both valid.

One problem here is that - when stored as Excel datetime cells - dates and times in Excel are poorly typed: essentially as a number with a meaning that depends in part on the cell format. The openpyxl package returns datetime.time objects for cells with time formats.
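
String values are checked with dateutil's ISO time parser, as in the source below. A minimal sketch, assuming dateutil is installed and using invented values:

from dateutil import parser

iso = parser.isoparser()
for val in ["11:12:13", "111213", "11:70:00"]:
    try:
        _ = iso.parse_isotime(val)   # the first two parse as valid ISO 8601 times
    except ValueError:
        print(f"Badly formatted time string: {val}")   # '11:70:00' is collected as a bad string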

Source code in safedata_validator/field.py
class TimeField(BaseField):
    """A BaseField subclass for time fields.

    Extends the [BaseField][safedata_validator.field.BaseField] subclass to validate
    data in time fields. Time field validation is primarily about checking the
    consistency of the provided data. The function accepts values either as ISO
    strings in a consistent time format or as datetime.time objects. The parsing of
    time strings follows ISO 8601, which includes quite a range of valid inputs:
    '11:12:13' and '111213' are both valid.

    One problem here is that - when stored as Excel datetime cells - dates and times in
    Excel are poorly typed: essentially as a number with a meaning that depends in part
    on the cell format. The openpyxl package returns datetime.time objects for cells
    with time formats.
    """

    field_types = ("time",)

    def __init__(
        self,
        meta: dict,
        dwsh: DataWorksheet | None = None,
        dataset: Dataset | None = None,
    ) -> None:
        super().__init__(meta, dwsh=dwsh, dataset=dataset)

        # Defaults
        self.first_data_class_set: set | None = None
        self.consistent_class = True
        self.expected_class = True

        self.bad_strings: list[str] = []
        self.min = None
        self.max = None

    def validate_data(self, data: list) -> None:
        """Validate time field data.

        Runs the BaseField
        [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
        method and also checks that time data is consistently formatted and contains
        valid time values.
        """
        data = self.run_common_validation(data)

        # TODO: report row number of issues - problem of mismatch between
        # self.n_rows (_all_ rows) and index in data (non-NA rows)

        # Check for consistent class formatting, using first row. Do not try and
        # validate further when data is not consistently formatted.
        if self.consistent_class and self.expected_class:
            cell_types = [type(dt) for dt in data]
            cell_type_set = set(cell_types)

            # Are all the values of expected types?
            if not {datetime.time, str}.issuperset(cell_type_set):
                self.expected_class = False
                return

            # Are the values internally consistent and consistent with
            # previously loaded data
            if len(cell_type_set) == 0:
                # If data is empty skip consistency checking
                return
            elif len(cell_type_set) > 1:
                self.consistent_class = False
                return
            elif self.first_data_class_set is None:
                self.first_data_class_set = cell_type_set
            elif self.first_data_class_set != cell_type_set:
                self.consistent_class = False
                return
        else:
            return

        # Value checking only happens while data are consistently formatted
        # There is no need to check time objects passed in, just time formatted
        # strings
        if self.first_data_class_set == {str}:
            for val in data:
                try:
                    _ = parser.isoparser().parse_isotime(val)
                except ValueError:
                    self.bad_strings.append(val)

    def report(self):
        """Report on field creation and data validation for time fields.

        Extends the [BaseField report method][safedata_validator.field.BaseField.report]
        to flag inconsistent time formatting and invalid data.
        """
        super().report()
        FORMATTER.push()

        if not self.expected_class:
            LOGGER.error(
                "Time data include values neither ISO string nor time formatted"
            )
            return

        if not self.consistent_class:
            LOGGER.error("Time data mixes ISO string and time formatted rows")
            return

        if self.bad_strings:
            LOGGER.error(
                "ISO time strings contain badly formatted values: e.g.",
                extra={"join": self.bad_strings[:5]},
            )

        FORMATTER.pop()

report()

Report on field creation and data validation for time fields.

Extends the BaseField report method to flag inconsistent time formatting and invalid data.

Source code in safedata_validator/field.py
def report(self):
    """Report on field creation and data validation for time fields.

    Extends the [BaseField report method][safedata_validator.field.BaseField.report]
    to flag inconsistent time formatting and invalid data.
    """
    super().report()
    FORMATTER.push()

    if not self.expected_class:
        LOGGER.error(
            "Time data include values neither ISO string nor time formatted"
        )
        return

    if not self.consistent_class:
        LOGGER.error("Time data mixes ISO string and time formatted rows")
        return

    if self.bad_strings:
        LOGGER.error(
            "ISO time strings contain badly formatted values: e.g.",
            extra={"join": self.bad_strings[:5]},
        )

    FORMATTER.pop()

validate_data(data)

Validate time field data.

Runs the BaseField run_common_validation method and also checks that time data is consistently formatted and contains valid time values.

Source code in safedata_validator/field.py
def validate_data(self, data: list) -> None:
    """Validate time field data.

    Runs the BaseField
    [run_common_validation][safedata_validator.field.BaseField.run_common_validation]
    method and also checks that time data is consistently formatted and contains
    valid time values.
    """
    data = self.run_common_validation(data)

    # TODO: report row number of issues - problem of mismatch between
    # self.n_rows (_all_ rows) and index in data (non-NA rows)

    # Check for consistent class formatting, using first row. Do not try and
    # validate further when data is not consistently formatted.
    if self.consistent_class and self.expected_class:
        cell_types = [type(dt) for dt in data]
        cell_type_set = set(cell_types)

        # Are all the values of expected types?
        if not {datetime.time, str}.issuperset(cell_type_set):
            self.expected_class = False
            return

        # Are the values internally consistent and consistent with
        # previously loaded data
        if len(cell_type_set) == 0:
            # If data is empty skip consistency checking
            return
        elif len(cell_type_set) > 1:
            self.consistent_class = False
            return
        elif self.first_data_class_set is None:
            self.first_data_class_set = cell_type_set
        elif self.first_data_class_set != cell_type_set:
            self.consistent_class = False
            return
    else:
        return

    # Value checking only happens while data are consistently formatted
    # There is no need to check time objects passed in, just time formatted
    # strings
    if self.first_data_class_set == {str}:
        for val in data:
            try:
                _ = parser.isoparser().parse_isotime(val)
            except ValueError:
                self.bad_strings.append(val)