Skip to content

The locations submodule

This module provides the Locations class, which is used to validate a set of known and/or new location information, formatted in the safedata_validator style. An instance can then also be used to track which of the validated set of locations is used in the rest of the dataset.

Locations

An interface for Location metadata.

A Locations instance is initialised using a Resources instance that provides data on known valid locations. The instance validates location names provided in the Locations table of a Dataset and then validates and updates the latitudinal and longitudinal extent of those locations. The instance can then be used to track the use of location names across data tables in the dataset.

Parameters:

Name Type Description Default
resources Resources

A Resources instance, used to provide information about known locations

required
latitudinal_extent Extent | None

An Extent instance tracking latitudinal extents.

None
longitudinal_extent Extent | None

An Extent instance tracking longitudinal extents.

None

Attributes:

Name Type Description
locations set

The set of location names that have been added to the instance.

location_index list
locations_used set
valid_locations
location_aliases
known_loc_names
latitudinal_extent
longitudinal_extent
Source code in safedata_validator/locations.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
class Locations:
    """An interface for Location metadata.

    A Locations instance is initialised using a Resources instance that provides data on
    known valid locations. The instance validates location names provided in the
    Locations table of a Dataset and then validates and updates the latitudinal and
    longitudinal extent of those locations. The instance can then be used to track the
    use of location names across data tables in the dataset.

    Args:
        resources: A Resources instance, used to provide information about
            known locations
        latitudinal_extent: An Extent instance tracking latitudinal extents.
        longitudinal_extent: An Extent instance tracking longitudinal extents.

    Attributes:
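        locations: The set of location names that have been added to the instance.
        location_index: A list of (name, is_new, geometry) tuples, one per location.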
        locations_used:
        valid_locations:
        location_aliases:
        known_loc_names:
        latitudinal_extent:
        longitudinal_extent:
    """

    def __init__(
        self,
        resources: Resources,
        latitudinal_extent: Extent | None = None,
        longitudinal_extent: Extent | None = None,
    ) -> None:
        """Create an empty Locations instance backed by known location data.

        Args:
            resources: A Resources instance providing the known valid locations,
                the location aliases and the hard and soft extent bounds.
            latitudinal_extent: An existing Extent instance to be updated with
                latitudes. If None, a new Extent is created from the bounds in
                ``resources``.
            longitudinal_extent: An existing Extent instance to be updated with
                longitudes. If None, a new Extent is created from the bounds in
                ``resources``.
        """
        # Error count from the most recent call to the load method.
        self.n_errors = 0
        # Names added so far, the (name, is_new, geometry) index entries for them
        # and a set initialised empty here for recording location usage.
        self.locations: set = set()
        self.location_index: list = []
        self.locations_used: set = set()

        # Known location data from the resources: a name is "known" if it is
        # either a primary location name or an alias.
        self.valid_locations = resources.valid_locations
        self.location_aliases = resources.location_aliases
        self.known_loc_names = set(resources.valid_locations.keys()) | set(
            resources.location_aliases.keys()
        )

        # Attach or create extents
        if latitudinal_extent is None:
            self.latitudinal_extent = Extent(
                "latitudinal extent",
                (float, int),
                hard_bounds=resources.extents.latitudinal_hard_extent,
                soft_bounds=resources.extents.latitudinal_soft_extent,
            )
        else:
            self.latitudinal_extent = latitudinal_extent

        if longitudinal_extent is None:
            # The label must name the longitudinal axis: it was previously a
            # copy of the latitudinal label.
            self.longitudinal_extent = Extent(
                "longitudinal extent",
                (float, int),
                hard_bounds=resources.extents.longitudinal_hard_extent,
                soft_bounds=resources.extents.longitudinal_soft_extent,
            )
        else:
            self.longitudinal_extent = longitudinal_extent

    @loggerinfo_push_pop("Loading Locations worksheet")
    def load(self, worksheet: Worksheet) -> None:
        """Populate a Locations instance from an Excel Worksheet.

        Validates the contents of a locations table stored in an Excel Worksheet and
        then updates the geographic extent of the locations used. Rows are split into
        known and new locations using the optional ``new`` column and passed to
        ``add_known_locations`` and ``add_new_locations`` respectively.

        The ``n_errors`` attribute is set to the number of errors logged during the
        call, including when loading stops early because the worksheet cannot be
        parsed.

        Args:
            worksheet: An openpyxl Worksheet instance containing the formatted set of
                locations used within a Dataset.
        """
        handler = get_handler()
        start_errors = handler.counters["ERROR"]

        def _record_errors() -> None:
            """Store the number of errors logged since this load call started."""
            self.n_errors = handler.counters["ERROR"] - start_errors

        # Load the locations data frame - which runs header checks
        dframe = GetDataFrame(worksheet)

        if not dframe.data_columns:
            LOGGER.error("No data or only headers in Locations worksheet")
            _record_errors()
            return

        # Dupe headers likely cause serious issues, so stop
        if "duplicated" in dframe.bad_headers:
            LOGGER.error("Cannot parse locations with duplicated headers")
            _record_errors()
            return

        # Reduce to lower case
        # TODO - not trapping dupes that are only case
        #        differences. Do that in GetDataFrame?
        headers = IsLower(dframe.headers).values

        # Check location names are available
        if "location name" not in headers:
            LOGGER.error("Location name column not found")
            _record_errors()
            return

        # Get dictionaries of values for each row
        locs = [dict(zip(headers, rw)) for rw in zip(*dframe.data_columns)]

        # Split up old and new if there are there any new ones?
        if "new" in headers:
            # Check the New column is just yes, no
            new_vals_with_blanks = IsLower([rw["new"] for rw in locs])
            new_vals = IsNotBlank(new_vals_with_blanks, keep_failed=False)
            if not new_vals:
                LOGGER.error("Missing values in 'new' field")

            # check only yes or no entries
            valid_new = {"yes", "no"}
            bad_new = set(new_vals) - valid_new
            if bad_new:
                LOGGER.error(
                    "Values other than yes and no in 'new' field: ",
                    extra={"join": bad_new},
                )

            # Parse locations that can be assigned to new or known. Rows with a
            # blank, non-string or unrecognised value are reported above and are
            # not assigned to either group.
            known_locs = [
                rw
                for rw in locs
                if isinstance(rw["new"], str) and rw["new"].lower() == "no"
            ]
            new_locs = [
                rw
                for rw in locs
                if isinstance(rw["new"], str) and rw["new"].lower() == "yes"
            ]
        else:
            # Without a 'new' column, every row is treated as a known location
            new_locs = []
            known_locs = locs

        if known_locs:
            known_loc_names = [lc["location name"] for lc in known_locs]
            self.add_known_locations(known_loc_names)

        if new_locs:
            self.add_new_locations(new_locs)

        # summary of processing
        _record_errors()

        if self.n_errors > 0:
            LOGGER.info(f"Locations contains {self.n_errors} errors")
        else:
            LOGGER.info(f"{len(self.locations)} locations loaded correctly")

    @loggerinfo_push_pop("Checking new locations")
    def add_new_locations(self, locs: list[dict]) -> None:
        """Add new locations to a Locations instance.

        This method takes a list of dictionaries giving the details of new locations to
        be added to the instance. These are user-defined locations that are not included
        in the set of known locations loaded from the instance resources. These
        dictionaries  should contain keys `location name` and `type` and then at least
        one of:

        * `latitude` _and_ `longitude` as float values
        * `wkt` providing a WellKnownText geometry for the location.

        Either of these options _can_ be 'NA' to show that the location coordinates are
        not known, but they must be provided.

        All dictionaries must share the same set of keys, although the order of the
        keys within each dictionary does not matter. An empty list is accepted and
        leaves the instance unchanged. If `type` is missing, the input dictionaries
        are modified in place to add a `type` of "MISSING".

        Args:
            locs: The list of dictionaries of user-defined locations.
        """

        # Validation - TODO check locs is a list of dicts

        # Nothing to add: return before trying to extract the shared keys, which
        # would otherwise fail on an empty set.
        if not locs:
            return

        # - Do all the dicts have the same keys. Use frozensets so that rows with
        #   the same keys in a different order are not reported as inconsistent.
        loc_keys = {frozenset(k.keys()) for k in locs}
        if len(loc_keys) > 1:
            LOGGER.critical("Inconsistent keys in add_new_locations")
            return

        # - Do they provide location names...
        location_keys = loc_keys.pop()
        if "location name" not in location_keys:
            LOGGER.error("No location name entries in add_new_locations")
            return

        # - ... and are any of those names blank ...
        loc_names_with_blanks = [itm["location name"] for itm in locs]
        loc_names = IsNotBlank(loc_names_with_blanks, keep_failed=False)
        if not loc_names:
            LOGGER.error("Location names contains empty cells or whitespace text")

        # - ... or not strings. Only allow strings for new locations - known
        #   locations get a pass for integer site codes but not here.
        loc_names_as_str = IsString(loc_names)
        if not loc_names_as_str:
            LOGGER.error(
                "New location names include non-string values: ",
                extra={"join": loc_names_as_str.failed},
            )

        # Look for duplicated names in inputs - this includes all types and so
        # could give messy information but can't detect duplicates later on
        # cleaned names because we're dealing with sets by that point and there
        # can be no duplication in sets
        dupes = HasDuplicates(loc_names_as_str)
        if dupes:
            LOGGER.error(
                "New location names contain duplicated values: ",
                extra={"join": dupes.duplicated},
            )

        # Look for new names that duplicate known names
        duplicates_existing = [
            rw["location name"]
            for rw in locs
            if rw["location name"] in self.known_loc_names
        ]

        if duplicates_existing:
            LOGGER.error(
                "New location names duplicate known names and aliases: ",
                extra={"join": duplicates_existing},
            )

        # Type is required - used to indicate the kind of location in the absence
        # of any actual geodata
        if "type" not in location_keys:
            LOGGER.error("New locations do not provide the location type")
            for this_loc in locs:
                this_loc["type"] = "MISSING"
        else:
            # get lowercase types
            geo_types_with_blanks = set(IsLower([vl["type"] for vl in locs]))

            # Handle blanks
            geo_types = IsNotBlank(geo_types_with_blanks, keep_failed=False)
            if not geo_types:
                LOGGER.error(
                    "Types for new locations contains blank or whitespace entries."
                )

            # Handle unknown geo types
            valid_geotypes = {"point", "linestring", "polygon", "transect", "area"}
            bad_geo_types = set(geo_types) - valid_geotypes
            if bad_geo_types:
                LOGGER.error(
                    "New locations include unknown location types: ",
                    extra={"join": bad_geo_types},
                )

        # Geometry information
        # Record which geom columns are present: having only one of latitude and
        # longitude is an error.
        if ("latitude" in location_keys) ^ ("longitude" in location_keys):
            LOGGER.error(
                "New locations should either latitude _and_ longitude or neither"
            )

        lonlat_provided = "latitude" in location_keys and "longitude" in location_keys
        wkt_provided = "wkt" in location_keys

        # TODO - supplying both is not an error, and probably shouldn't be. WKT
        # takes priority in the index when both are present, but not testing for
        # congruence at present

        # Look for geographic data (even if it is just the explicit statement
        # that none is available using NA)
        if not (lonlat_provided or wkt_provided):
            # NOTE(review): the two literals below concatenate without a space
            # ("WKT,using") - confirm before changing the message text.
            LOGGER.error(
                "New locations reported: you must provide Lat/Long or WKT,"
                "using NA explicitly when this data is missing."
            )
        else:
            # Check Lat Long and WKT

            if lonlat_provided:
                LOGGER.info("Validating lat / long data")
                FORMATTER.push()
                for axs, ext_attr in [
                    ("latitude", "latitudinal_extent"),
                    ("longitude", "longitudinal_extent"),
                ]:
                    # Allow NAs for unknown location points
                    axs_vals_with_blanks = [vl[axs] for vl in locs if vl[axs] != "NA"]
                    axs_vals = IsNotBlank(axs_vals_with_blanks, keep_failed=False)
                    if not axs_vals:
                        LOGGER.error(f"Blank {axs} values for new locations: use NA.")

                    # Check for data types _here_ to keep interpretable errors
                    # TODO - maybe simplify Extent objects to _assume_ types.
                    #        Probably no but there is some duplication of effort here
                    axs_vals_as_number = IsNumber(axs_vals, keep_failed=False)
                    if not axs_vals_as_number:
                        LOGGER.error(
                            f"Non-numeric {axs} values for new locations: ",
                            extra={"join": axs_vals_as_number.failed},
                        )

                    # Update extent instances
                    if axs_vals_as_number.values:
                        ext = getattr(self, ext_attr)
                        ext.update(axs_vals_as_number)

                FORMATTER.pop()

            if wkt_provided:
                LOGGER.info("Validating WKT data")
                FORMATTER.push()

                # Location names collected by the kind of WKT problem found, so
                # that each problem is reported once for all affected rows.
                blank_wkt = []
                non_string_wkt = []
                bad_wkt = []
                bounds = []

                for this_new_loc in locs:
                    if this_new_loc["wkt"] is None:
                        blank_wkt.append(this_new_loc["location name"])
                    elif not isinstance(this_new_loc["wkt"], str):
                        non_string_wkt.append(this_new_loc["location name"])
                    elif this_new_loc["wkt"].isspace():
                        blank_wkt.append(this_new_loc["location name"])
                    elif this_new_loc["wkt"] == "NA":
                        # Explicit statement that the geometry is not known
                        pass

                    else:
                        # Run the potential WKT through the parser
                        try:
                            this_new_geom = wkt.loads(this_new_loc["wkt"])
                        except WKTReadingError:
                            bad_wkt.append(this_new_loc["location name"])
                        else:
                            # Is it a valid 2D geom
                            if not this_new_geom.is_valid or this_new_geom.has_z:
                                bad_wkt.append(this_new_loc["location name"])
                            # Store the extents to check for sensible coordinates:
                            # this is done for every parsed geometry, including
                            # those reported as invalid or 3D.
                            bounds.append(this_new_geom.bounds)

                if blank_wkt:
                    LOGGER.error("Blank WKT values for new locations: use NA.")

                if non_string_wkt:
                    LOGGER.error(
                        "WKT values for new location not a string: ",
                        extra={"join": non_string_wkt},
                    )

                if bad_wkt:
                    LOGGER.error(
                        "WKT information badly formatted, not geometrically valid or "
                        "3D: ",
                        extra={"join": bad_wkt},
                    )

                if bounds:
                    # Extract from bound tuples to lists of lats and longs
                    # (indices 0 and 2 are used as longitudes, 1 and 3 as latitudes)
                    lat_bnds = [bnd[1] for bnd in bounds] + [bnd[3] for bnd in bounds]
                    lng_bnds = [bnd[0] for bnd in bounds] + [bnd[2] for bnd in bounds]
                    self.latitudinal_extent.update(lat_bnds)
                    self.longitudinal_extent.update(lng_bnds)

                FORMATTER.pop()

        # new location names
        # - test for duplicated names to already added values
        duped_names = [lc for lc in loc_names if lc in self.locations]
        if duped_names:
            LOGGER.error(
                "Location names already added to Location instance: ",
                extra={"join": duped_names},
            )

        self.locations.update(loc_names)

        # Add an index entry for every row, flagged as a new location. WKT takes
        # priority over latitude and longitude when both give a value.
        for this_new_loc in locs:
            if wkt_provided and this_new_loc["wkt"] != "NA":
                geom = this_new_loc["wkt"]
            elif lonlat_provided and (
                (this_new_loc["latitude"] != "NA")
                and (this_new_loc["longitude"] != "NA")
            ):
                geom = "Point({longitude} {latitude})".format(**this_new_loc)
            else:
                geom = None

            self.location_index.append((this_new_loc["location name"], True, geom))

    @loggerinfo_push_pop("Checking known locations")
    def add_known_locations(self, loc_names: list) -> None:
        """Add known locations to a Locations instance.

        This method takes a list of values and tries to validate those values against
        known locations from the loaded resources. The values are expected to be
        strings or integer codes: accepted values are converted to strings before
        being matched against the known location names and aliases.

        Names that match an alias are accepted with a warning. The latitudinal and
        longitudinal extents are updated from the stored bounds of the matched
        locations, and all accepted names are added to the instance and its index.

        Args:
            loc_names: A list of known location names.
        """

        # Check for blanks
        loc_names_no_blanks = IsNotBlank(loc_names, keep_failed=False)
        if not loc_names_no_blanks:
            LOGGER.error("Location names contains empty cells or whitespace text")

        # Look for duplicated values in names - this includes all types and so
        # could give messy information but can't detect duplicates later on
        # cleaned names because we're dealing with sets by that point and there
        # can be no duplication
        dupes = HasDuplicates(loc_names_no_blanks)
        if dupes:
            LOGGER.error(
                "Added names contain duplicated values: ",
                extra={"join": dupes.duplicated},
            )

        # Validate and standardise types - strings or integer codes.
        loc_names_standardised = IsLocName(loc_names_no_blanks, keep_failed=False)

        if not loc_names_standardised:
            LOGGER.error(
                "Location names contains values that are not strings or integers: ",
                extra={"join": loc_names_standardised.failed},
            )

        # Enforce strings and check loc names exist
        loc_names_as_str = {str(v) for v in loc_names_standardised}
        unknown = loc_names_as_str - self.known_loc_names
        if unknown:
            LOGGER.error(
                "Unknown locations found: ",
                extra={
                    "join": unknown,
                },
            )

        # are aliases being used?
        aliased = loc_names_as_str & set(self.location_aliases.keys())
        if aliased:
            LOGGER.warning(
                "Locations aliases used. Maybe change to primary location names: ",
                extra={"join": aliased},
            )

        # Get the bounding box of known locations and aliased locations: aliases
        # are replaced by the name they map to in location_aliases.
        bbox_keys = (loc_names_as_str - (unknown | aliased)) | {
            self.location_aliases[ky] for ky in aliased
        }

        # get the extents of the matched locations
        if bbox_keys:
            bbox = [vl for ky, vl in self.valid_locations.items() if ky in bbox_keys]

            # An alias can map to a name with no entry in valid_locations, which
            # leaves nothing to unpack: only update the extents when at least one
            # bounding box was found.
            if bbox:
                # Transpose to one tuple per bounding box coordinate
                # (presumably min x, min y, max x, max y - confirm with Resources)
                bbox = list(zip(*bbox))
                self.longitudinal_extent.update((min(bbox[0]), max(bbox[2])))
                self.latitudinal_extent.update((min(bbox[1]), max(bbox[3])))

        # Update location names and index
        # - test for duplicated names to already added values
        duped_names = [lc for lc in loc_names_as_str if lc in self.locations]
        if duped_names:
            LOGGER.error(
                "Location names already added to Location instance: ",
                extra={"join": duped_names},
            )

        # Known locations are indexed as not new and without a geometry
        self.locations.update(loc_names_as_str)
        index_entries = [(lc, False, None) for lc in loc_names_as_str]
        self.location_index.extend(index_entries)

    @property
    def is_empty(self) -> bool:
        """Return True when no location names have been added to the instance."""
        return not self.locations

add_known_locations(loc_names)

Add known locations to a Locations instance.

This method takes a list of values and tries to validate those values against known locations from the loaded resources. The values are expected to be strings.

Parameters:

Name Type Description Default
loc_names list

A list of known location names.

required
Source code in safedata_validator/locations.py
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
@loggerinfo_push_pop("Checking known locations")
def add_known_locations(self, loc_names: list) -> None:
    """Add known locations to a Locations instance.

    This method takes a list of values and tries to validate those values against
    known locations from the loaded resources. The values are expected to be
    strings or integer codes: accepted values are converted to strings before
    being matched against the known location names and aliases.

    Names that match an alias are accepted with a warning. The latitudinal and
    longitudinal extents are updated from the stored bounds of the matched
    locations, and all accepted names are added to the instance and its index.

    Args:
        loc_names: A list of known location names.
    """

    # Check for blanks
    loc_names_no_blanks = IsNotBlank(loc_names, keep_failed=False)
    if not loc_names_no_blanks:
        LOGGER.error("Location names contains empty cells or whitespace text")

    # Look for duplicated values in names - this includes all types and so
    # could give messy information but can't detect duplicates later on
    # cleaned names because we're dealing with sets by that point and there
    # can be no duplication
    dupes = HasDuplicates(loc_names_no_blanks)
    if dupes:
        LOGGER.error(
            "Added names contain duplicated values: ",
            extra={"join": dupes.duplicated},
        )

    # Validate and standardise types - strings or integer codes.
    loc_names_standardised = IsLocName(loc_names_no_blanks, keep_failed=False)

    if not loc_names_standardised:
        LOGGER.error(
            "Location names contains values that are not strings or integers: ",
            extra={"join": loc_names_standardised.failed},
        )

    # Enforce strings and check loc names exist
    loc_names_as_str = {str(v) for v in loc_names_standardised}
    unknown = loc_names_as_str - self.known_loc_names
    if unknown:
        LOGGER.error(
            "Unknown locations found: ",
            extra={
                "join": unknown,
            },
        )

    # are aliases being used?
    aliased = loc_names_as_str & set(self.location_aliases.keys())
    if aliased:
        LOGGER.warning(
            "Locations aliases used. Maybe change to primary location names: ",
            extra={"join": aliased},
        )

    # Get the bounding box of known locations and aliased locations: aliases
    # are replaced by the name they map to in location_aliases.
    bbox_keys = (loc_names_as_str - (unknown | aliased)) | {
        self.location_aliases[ky] for ky in aliased
    }

    # get the extents of the matched locations
    if bbox_keys:
        bbox = [vl for ky, vl in self.valid_locations.items() if ky in bbox_keys]

        # An alias can map to a name with no entry in valid_locations, which
        # leaves nothing to unpack: only update the extents when at least one
        # bounding box was found.
        if bbox:
            # Transpose to one tuple per bounding box coordinate
            # (presumably min x, min y, max x, max y - confirm with Resources)
            bbox = list(zip(*bbox))
            self.longitudinal_extent.update((min(bbox[0]), max(bbox[2])))
            self.latitudinal_extent.update((min(bbox[1]), max(bbox[3])))

    # Update location names and index
    # - test for duplicated names to already added values
    duped_names = [lc for lc in loc_names_as_str if lc in self.locations]
    if duped_names:
        LOGGER.error(
            "Location names already added to Location instance: ",
            extra={"join": duped_names},
        )

    # Known locations are indexed as not new and without a geometry
    self.locations.update(loc_names_as_str)
    index_entries = [(lc, False, None) for lc in loc_names_as_str]
    self.location_index.extend(index_entries)

add_new_locations(locs)

Add new locations to a Locations instance.

This method takes a list of dictionaries giving the details of new locations to be added to the instance. These are user-defined locations that are not included in the set of known locations loaded from the instance resources. These dictionaries should contain keys location name and type and then at least one of:

  • latitude and longitude as float values
  • wkt providing a WellKnownText geometry for the location.

Either of these options can be 'NA' to show that the location coordinates are not known, but they must be provided.

Parameters:

Name Type Description Default
locs list[dict]

The list of dictionaries of user-defined locations.

required
Source code in safedata_validator/locations.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
@loggerinfo_push_pop("Checking new locations")
def add_new_locations(self, locs: list[dict]) -> None:
    """Add new locations to a Locations instance.

    This method takes a list of dictionaries giving the details of new locations to
    be added to the instance. These are user-defined locations that are not included
    in the set of known locations loaded from the instance resources. These
    dictionaries should contain keys `location name` and `type` and then at least
    one of:

    * `latitude` _and_ `longitude` as float values
    * `wkt` providing a WellKnownText geometry for the location.

    Either of these options _can_ be set to 'NA' to show that the location
    coordinates are not known, but the fields themselves must be provided.

    Validation problems are reported through the module logger rather than raised.
    The method returns early, without adding anything, when `locs` is empty, when
    the dictionaries do not all share the same keys, or when there is no
    `location name` key. Otherwise the names are added to `self.locations`, one
    `(name, True, geometry)` entry per row is appended to `self.location_index`
    and the latitudinal and longitudinal extents are updated from any usable
    coordinates, even if other validation errors were logged along the way.

    Args:
        locs: The list of dictionaries of user-defined locations.
    """

    # Nothing to validate or add. Without this guard the key set built below is
    # empty and `loc_keys.pop()` raises a KeyError.
    if not locs:
        return

    # Validation - TODO check locs is a list of dicts

    # - Do all the dicts have the same keys
    loc_keys = {tuple(k.keys()) for k in locs}
    if len(loc_keys) > 1:
        LOGGER.critical("Inconsistent keys in add_new_locations")
        return

    # - Do they provide location names...
    location_keys = loc_keys.pop()
    if "location name" not in location_keys:
        LOGGER.error("No location name entries in add_new_locations")
        return

    # - ... and are any of those names blank ...
    loc_names_with_blanks = [itm["location name"] for itm in locs]
    loc_names = IsNotBlank(loc_names_with_blanks, keep_failed=False)
    if not loc_names:
        LOGGER.error("Location names contains empty cells or whitespace text")

    # - ... or not strings. Only allow strings for new locations - known
    #   locations get a pass for integer site codes but not here.
    loc_names_as_str = IsString(loc_names)
    if not loc_names_as_str:
        LOGGER.error(
            "New location names include non-string values: ",
            extra={"join": loc_names_as_str.failed},
        )

    # Look for duplicated names in inputs - this includes all types and so
    # could give messy information but can't detect duplicates late on
    # cleaned names because we're dealing with sets by that point and there
    # can be no duplication in sets
    dupes = HasDuplicates(loc_names_as_str)
    if dupes:
        LOGGER.error(
            "New location names contain duplicated values: ",
            extra={"join": dupes.duplicated},
        )

    # Look for new names that duplicate known names
    duplicates_existing = [
        rw["location name"]
        for rw in locs
        if rw["location name"] in self.known_loc_names
    ]

    if duplicates_existing:
        LOGGER.error(
            "New location names duplicate known names and aliases: ",
            extra={"join": duplicates_existing},
        )

    # Type is required - used to indicate the kind of location in the absence
    # of any actual geodata
    if "type" not in location_keys:
        LOGGER.error("New locations do not provide the location type")
        for this_loc in locs:
            this_loc["type"] = "MISSING"
    else:
        # get lowercase types
        geo_types_with_blanks = set(IsLower([vl["type"] for vl in locs]))

        # Handle blanks
        geo_types = IsNotBlank(geo_types_with_blanks, keep_failed=False)
        if not geo_types:
            LOGGER.error(
                "Types for new locations contains blank or whitespace entries."
            )

        # Handle unknown geo types
        valid_geotypes = {"point", "linestring", "polygon", "transect", "area"}
        bad_geo_types = set(geo_types) - valid_geotypes
        if bad_geo_types:
            LOGGER.error(
                "New locations include unknown location types: ",
                extra={"join": bad_geo_types},
            )

    # Geometry information
    # Record which geom columns are present: having only one of latitude and
    # longitude is an error and is then treated as having neither.
    if ("latitude" in location_keys) ^ ("longitude" in location_keys):
        LOGGER.error(
            "New locations should either latitude _and_ longitude or neither"
        )

    lonlat_provided = "latitude" in location_keys and "longitude" in location_keys
    wkt_provided = "wkt" in location_keys

    # TODO - supplying both is not an error, and probably shouldn't be. WKT
    # takes priority in the index when both are present, but not testing for
    # congruence at present

    # Look for geographic data (even if it is just the explicit statement
    # that none is available using NA)
    if not (lonlat_provided or wkt_provided):
        # The trailing space keeps the two adjacent literals from running
        # together as "WKT,using" in the emitted message.
        LOGGER.error(
            "New locations reported: you must provide Lat/Long or WKT, "
            "using NA explicitly when this data is missing."
        )
    else:
        # Check Lat Long and WKT

        if lonlat_provided:
            LOGGER.info("Validating lat / long data")
            FORMATTER.push()
            for axs, ext_attr in [
                ("latitude", "latitudinal_extent"),
                ("longitude", "longitudinal_extent"),
            ]:
                # Allow NAs for unknown location points
                axs_vals_with_blanks = [vl[axs] for vl in locs if vl[axs] != "NA"]
                axs_vals = IsNotBlank(axs_vals_with_blanks, keep_failed=False)
                if not axs_vals:
                    LOGGER.error(f"Blank {axs} values for new locations: use NA.")

                # Check for data types _here_ to keep interpretable errors
                # TODO - maybe simplify Extent objects to _assume_ types.
                #        Probably no but there is some duplication of effort here
                axs_vals_as_number = IsNumber(axs_vals, keep_failed=False)
                if not axs_vals_as_number:
                    LOGGER.error(
                        f"Non-numeric {axs} values for new locations: ",
                        extra={"join": axs_vals_as_number.failed},
                    )

                # Update extent instances
                if axs_vals_as_number.values:
                    ext = getattr(self, ext_attr)
                    ext.update(axs_vals_as_number)

            FORMATTER.pop()

        if wkt_provided:
            LOGGER.info("Validating WKT data")
            FORMATTER.push()

            # Names of locations failing each WKT check, plus the bounds of
            # every geometry that could be parsed.
            blank_wkt = []
            non_string_wkt = []
            bad_wkt = []
            bounds = []

            for this_new_loc in locs:
                loc_name = this_new_loc["location name"]
                wkt_value = this_new_loc["wkt"]

                if wkt_value is None:
                    blank_wkt.append(loc_name)
                elif not isinstance(wkt_value, str):
                    non_string_wkt.append(loc_name)
                elif not wkt_value.strip():
                    # Covers whitespace-only text and also the empty string,
                    # which str.isspace() does not report as blank.
                    blank_wkt.append(loc_name)
                elif wkt_value == "NA":
                    pass

                else:
                    # Run the potential WKT through the parser
                    try:
                        this_new_geom = wkt.loads(wkt_value)
                    except WKTReadingError:
                        bad_wkt.append(loc_name)
                    else:
                        # Is it a valid 2D geom
                        if not this_new_geom.is_valid or this_new_geom.has_z:
                            bad_wkt.append(loc_name)
                        # Store the extents to check for sensible coordinates.
                        # Empty geometries (e.g. "POINT EMPTY") are skipped as
                        # they do not provide four usable bound values.
                        if not this_new_geom.is_empty:
                            bounds.append(this_new_geom.bounds)

            if blank_wkt:
                LOGGER.error("Blank WKT values for new locations: use NA.")

            if non_string_wkt:
                LOGGER.error(
                    "WKT values for new location not a string: ",
                    extra={"join": non_string_wkt},
                )

            if bad_wkt:
                LOGGER.error(
                    "WKT information badly formatted, not geometrically valid or "
                    "3D: ",
                    extra={"join": bad_wkt},
                )

            if bounds:
                # Extract from bound tuples to lists of lats and longs: the
                # tuples are indexed here as (min lng, min lat, max lng, max lat)
                lat_bnds = [bnd[1] for bnd in bounds] + [bnd[3] for bnd in bounds]
                lng_bnds = [bnd[0] for bnd in bounds] + [bnd[2] for bnd in bounds]
                self.latitudinal_extent.update(lat_bnds)
                self.longitudinal_extent.update(lng_bnds)

            FORMATTER.pop()

    # new location names
    # - test for duplicated names to already added values
    duped_names = [lc for lc in loc_names if lc in self.locations]
    if duped_names:
        LOGGER.error(
            "Location names already added to Location instance: ",
            extra={"join": duped_names},
        )

    self.locations.update(loc_names)

    # Index every new location: WKT takes priority over lat / long, and the
    # geometry is None when no coordinates are available.
    for this_new_loc in locs:
        if wkt_provided and this_new_loc["wkt"] != "NA":
            geom = this_new_loc["wkt"]
        elif lonlat_provided and (
            (this_new_loc["latitude"] != "NA")
            and (this_new_loc["longitude"] != "NA")
        ):
            geom = "Point({longitude} {latitude})".format(**this_new_loc)
        else:
            geom = None

        self.location_index.append((this_new_loc["location name"], True, geom))

is_empty: bool property

Reports if any locations have been loaded in a Locations instance.

load(worksheet)

Populate a Locations instance from an Excel Worksheet.

Validates the contents of a locations table stored in an Excel Worksheet and then updates the geographic extent of the locations used.

Parameters:

Name Type Description Default
worksheet Worksheet

An openpyxl Worksheet instance containing the formatted set of locations used within a Dataset.

required
Source code in safedata_validator/locations.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
@loggerinfo_push_pop("Loading Locations worksheet")
def load(self, worksheet: Worksheet) -> None:
    """Populate a Locations instance from an Excel Worksheet.

    Validates the contents of a locations table stored in an Excel Worksheet and
    then updates the geographic extent of the locations used.

    Rows are split using the optional `new` column: rows marked `no` are passed
    to `add_known_locations` and rows marked `yes` to `add_new_locations`. When
    there is no `new` column, all rows are treated as known locations. Problems
    are reported through the module logger and the number of errors logged during
    the call is stored in `self.n_errors` on every exit path, including the early
    returns used when the worksheet cannot be parsed.

    Args:
        worksheet: An openpyxl Worksheet instance containing the formatted set of
            locations used within a Dataset.
    """
    handler = get_handler()
    start_errors = handler.counters["ERROR"]

    try:
        # Load the locations data frame - which runs header checks
        dframe = GetDataFrame(worksheet)

        if not dframe.data_columns:
            LOGGER.error("No data or only headers in Locations worksheet")
            return

        # Dupe headers likely cause serious issues, so stop
        if "duplicated" in dframe.bad_headers:
            LOGGER.error("Cannot parse locations with duplicated headers")
            return

        # Reduce to lower case
        # TODO - not trapping dupes that are only case
        #        differences. Do that in GetDataFrame?
        headers = IsLower(dframe.headers).values

        # Check location names are available
        if "location name" not in headers:
            LOGGER.error("Location name column not found")
            return

        # Get dictionaries of values for each row
        locs = [dict(zip(headers, rw)) for rw in zip(*dframe.data_columns)]

        # Split up old and new if there are there any new ones?
        if "new" in headers:
            # Check the New column is just yes, no
            new_vals_with_blanks = IsLower([rw["new"] for rw in locs])
            new_vals = IsNotBlank(new_vals_with_blanks, keep_failed=False)
            if not new_vals:
                LOGGER.error("Missing values in 'new' field")

            # check only yes or no entries
            valid_new = {"yes", "no"}
            bad_new = set(new_vals) - valid_new
            if bad_new:
                LOGGER.error(
                    "Values other than yes and no in 'new' field: ",
                    extra={"join": bad_new},
                )

            # Parse locations that can be assigned to new or known. Rows with
            # any other value in the 'new' field are dropped here, having
            # already been reported above.
            known_locs = [
                rw
                for rw in locs
                if isinstance(rw["new"], str) and rw["new"].lower() == "no"
            ]
            new_locs = [
                rw
                for rw in locs
                if isinstance(rw["new"], str) and rw["new"].lower() == "yes"
            ]
        else:
            new_locs = []
            known_locs = locs

        if known_locs:
            known_loc_names = [lc["location name"] for lc in known_locs]
            self.add_known_locations(known_loc_names)

        if new_locs:
            self.add_new_locations(new_locs)
    finally:
        # Record the error count however the method exits, so that the early
        # returns above do not leave n_errors stale or unset.
        self.n_errors = handler.counters["ERROR"] - start_errors

    # summary of processing - only reached when the worksheet could be parsed
    if self.n_errors > 0:
        LOGGER.info(f"Locations contains {self.n_errors} errors")
    else:
        LOGGER.info(f"{len(self.locations)} locations loaded correctly")