Skip to content

The taxondb submodule

This module contains functions to:

  1. download versions of the GBIF backbone taxonomy database, and
  2. build and index SQLite3 databases of this dataset for use in local taxon validation.

The functions also store the timestamp of each database version in the SQLite3 files, so that datasets can include a taxonomy timestamp.

download_gbif_backbone(outdir, timestamp, url)

Download the GBIF backbone database.

This function downloads the data for a GBIF backbone taxonomy version to a given location.

Parameters:

Name Type Description Default
outdir str

The location to download the files to

required
timestamp str

The timestamp for an available GBIF backbone version.

required
url str

The download URL for the provided version.

required

Returns:

Type Description
dict

A dictionary giving the paths to the downloaded files and timestamp of the version downloaded.

Source code in safedata_validator/taxondb.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def _content_length(response) -> int | None:
    """Return the Content-Length header of a response as an int, if present."""
    length = response.headers.get("Content-Length")
    return int(length) if length is not None else None


def download_gbif_backbone(outdir: str, timestamp: str, url: str) -> dict:
    """Download the GBIF backbone database.

    This function downloads the data for a GBIF backbone taxonomy version to a given
    location. The main backbone file is published as either 'simple.txt.gz' or, in
    earlier snapshots, 'backbone.txt.gz'. The 'simple-deleted.txt.gz' file is also
    downloaded when the version provides it.

    Args:
        outdir: The location to download the files to
        timestamp: The timestamp for an available GBIF backbone version.
        url: The download URL for the provided version. File names are appended
            directly to this value, so it needs to end with a path separator.

    Returns:
        A dictionary giving the paths to the downloaded files and timestamp of the
        version downloaded. The main backbone file is stored under the 'simple' key,
        whichever file name was used, and the deleted taxa file under 'deleted'.

    Raises:
        ValueError: Passed to log_and_raise when the version provides neither
            simple.txt.gz nor backbone.txt.gz.
        requests.HTTPError: If a file download request returns an error status.
    """

    LOGGER.info(f"Downloading {timestamp} GBIF data to: {outdir}")
    FORMATTER.push()

    # The try/finally ensures the logging indentation is restored on failure.
    try:
        return_dict = {"timestamp": timestamp}

        # Two possible names for the key backbone file: backbone in earlier snapshots.
        simple_head = requests.head(url + "simple.txt.gz")
        backbone_head = requests.head(url + "backbone.txt.gz")
        deleted_head = requests.head(url + "simple-deleted.txt.gz")

        if not (simple_head.ok or backbone_head.ok):
            log_and_raise(
                "Timestamp version does not provide simple.txt.gz or backbone.txt.gz",
                ValueError,
            )

        # Download files to target directory - alternative names for the simple
        # backbone dump, with simple.txt.gz preferred when both are available. The
        # file size is only used to scale the progress bar, so a missing
        # Content-Length header gives a size of None rather than an error.
        if simple_head.ok:
            targets = [("simple", "simple.txt.gz", _content_length(simple_head))]
        else:
            targets = [("simple", "backbone.txt.gz", _content_length(backbone_head))]

        if deleted_head.ok:
            targets.append(
                ("deleted", "simple-deleted.txt.gz", _content_length(deleted_head))
            )
        else:
            # Warn the user that deleted taxa information cannot be found, most
            # likely because the database snapshot is too old
            LOGGER.warning(
                "Information on deleted taxa could not be found. This is likely "
                "because you are building a database version older than 2021-11-26. "
                "The database will be built without any information on deleted taxa."
            )

        for key, file, fsize in targets:
            # Download the file with a TQDM progress bar
            LOGGER.info(f"Downloading {file}")
            out_path = os.path.join(outdir, file)

            with requests.get(url + file, stream=True) as file_req:
                # Fail before creating the output file, rather than saving an
                # error response body as if it were the data file.
                file_req.raise_for_status()

                with tqdm.wrapattr(
                    file_req.raw, "read", total=fsize
                ) as r_raw, open(out_path, "wb") as outf:
                    shutil.copyfileobj(r_raw, outf)

            # store the file path
            return_dict[key] = out_path
    finally:
        FORMATTER.pop()

    return return_dict

build_local_gbif(outfile, timestamp, simple, deleted=None, keep=False)

Create a local GBIF backbone database.

This function takes the paths to downloaded data files for the GBIF backbone taxonomy and builds a SQLite3 database file for use in local validation in the safedata_validator package. The location of this file then needs to be included in the package configuration to be used in validation.

The data files can be downloaded using the download_gbif_backbone function and two files can be used. The main data is in 'simple.txt.gz' but deleted taxa can also be included from 'simple-deleted.txt.gz'. Data is read automatically from the compressed files - they do not need to be extracted.

By default, the downloaded files are deleted after the database has been created, but the 'keep' argument can be used to retain them.

Parameters:

Name Type Description Default
outfile str

The filepath to use to create the SQLite file

required
timestamp str

The timestamp of the downloaded version.

required
simple str

The path to the simple.txt.gz file.

required
deleted str | None

The path to the simple-deleted.txt.gz file.

None
keep bool

Should the original datafiles be retained.

False
Source code in safedata_validator/taxondb.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
def _load_gbif_rows(
    con: sqlite3.Connection,
    path: str,
    insert_statement: str,
    drop_index: list[bool],
    status_index: int | None = None,
) -> None:
    """Insert the rows of a gzipped, tab-delimited GBIF file into the database.

    Args:
        con: An open connection to the database being built.
        path: The path to the gzip compressed data file.
        insert_statement: The parameterised INSERT statement to run for each row.
        drop_index: One boolean per field in the file, True for fields to discard.
        status_index: If set, the position in the cleaned row of a value that is
            overwritten with 'DELETED'.
    """

    with gzip.open(path, "rt", encoding="utf-8") as handle:
        # The files are tab delimited but the quoting is sometimes unclosed,
        # so turning off quoting - includes quotes in the fields where present
        reader = csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)

        # There is no obvious way of finding the number of rows without reading the
        # file and counting them. And that is a huge cost just to provide a progress
        # bar with real percentages, so just show a progress meter to show things
        # happening
        with tqdm(total=None) as pbar:
            for row in reader:
                # Discard the unwanted fields and convert \N markers to None
                row_clean = [
                    None if val == "\\N" else val
                    for val, drp in zip(row, drop_index)
                    if not drp
                ]

                if status_index is not None:
                    row_clean[status_index] = "DELETED"

                con.execute(insert_statement, row_clean)
                pbar.update()

    con.commit()


def build_local_gbif(
    outfile: str,
    timestamp: str,
    simple: str,
    deleted: str | None = None,
    keep: bool = False,
) -> None:
    """Create a local GBIF backbone database.

    This function takes the paths to downloaded data files for the GBIF backbone
    taxonomy and builds a SQLite3 database file for use in local validation in the
    safedata_validator package. The location of this file then needs to be included in
    the package configuration to be used in validation.

    The data files can be downloaded using the download_gbif_backbone function and two
    files can be used. The main data is in 'simple.txt.gz' but deleted taxa can also be
    included from 'simple-deleted.txt.gz'. Data is read automatically from the
    compressed files - they do not need to be extracted.

    By default, the downloaded files are deleted after the database has been created,
    but the 'keep' argument can be used to retain them. The downloaded files are
    only removed if the database build completes.

    Args:
        outfile: The filepath to use to create the SQLite file
        timestamp: The timestamp of the downloaded version.
        simple: The path to the simple.txt.gz file.
        deleted: The path to the simple-deleted.txt.gz file.
        keep: Should the original datafiles be retained.
    """

    # Guard against madly long fields: these are typically the issues and published in
    # fields that are dropped in building the database itself, but they can be so long
    # that they exceed the default CSV field read limit which explodes the csv reader.
    # For example, the 2022-11-23 version has a row with 231Kb of crab literature. This
    # increases that limit to 512Kb.
    csv.field_size_limit(524288)

    LOGGER.info(f"Building GBIF backbone database in: {outfile}")
    FORMATTER.push()

    con = None

    # The try/finally closes the database connection and restores the logging
    # indentation even when the build fails part way through.
    try:
        # Create the output file and turn off safety features for speed
        con = sqlite3.connect(outfile)
        con.execute("PRAGMA synchronous = OFF")

        # Write the timestamp into a table, binding the value as a parameter rather
        # than formatting it into the SQL text.
        con.execute("CREATE TABLE timestamp (timestamp date);")
        con.execute("INSERT INTO timestamp VALUES (?);", (timestamp,))
        con.commit()
        LOGGER.info("Timestamp table created")

        # Create the schema for the backbone table, using drop fields to remove
        # unwanted fields in the schema and the data tuples. The file_schema list
        # describes the full set of fields provided by GBIF
        file_schema = [
            ("id", "int PRIMARY KEY"),
            ("parent_key", "int"),
            ("basionym_key", "int"),
            ("is_synonym", "boolean"),
            ("status", "text"),
            ("rank", "text"),
            ("nom_status", "text[]"),
            ("constituent_key", "text"),
            ("origin", "text"),
            ("source_taxon_key", "int"),
            ("kingdom_key", "int"),
            ("phylum_key", "int"),
            ("class_key", "int"),
            ("order_key", "int"),
            ("family_key", "int"),
            ("genus_key", "int"),
            ("species_key", "int"),
            ("name_id", "int"),
            ("scientific_name", "text"),
            ("canonical_name", "text"),
            ("genus_or_above", "text"),
            ("specific_epithet", "text"),
            ("infra_specific_epithet", "text"),
            ("notho_type", "text"),
            ("authorship", "text"),
            ("year", "text"),
            ("bracket_authorship", "text"),
            ("bracket_year", "text"),
            ("name_published_in", "text"),
            ("issues", "text[]"),
        ]

        drop_fields = {"name_published_in", "issues"}

        # Get a logical index of which fields are being dropped
        drop_index = [name in drop_fields for name, _ in file_schema]

        # Create the final schema for the backbone table and insert statement
        kept_schema = [
            field for field, drop in zip(file_schema, drop_index) if not drop
        ]
        output_schema = ", ".join(" ".join(field) for field in kept_schema)
        output_schema = f"CREATE TABLE backbone ({output_schema})"

        insert_placeholders = ",".join(["?"] * len(kept_schema))
        insert_statement = f"INSERT INTO backbone VALUES ({insert_placeholders})"

        # The position of the status field within the retained fields, which is
        # overwritten for deleted taxa.
        status_index = [name for name, _ in kept_schema].index("status")

        # Create the table
        con.execute(output_schema)
        con.commit()
        LOGGER.info("Backbone table created")

        # Import data from the simple backbone and deleted taxa. Rows are cleaned
        # and inserted one at a time: unwanted fields have to be dropped up front,
        # as the table is created without them, and \N is converted to None on
        # the way in.
        LOGGER.info("Adding core backbone taxa")
        _load_gbif_rows(con, simple, insert_statement, drop_index)

        if deleted is not None:
            LOGGER.info("Adding deleted taxa")
            # replace the status with DELETED
            _load_gbif_rows(
                con, deleted, insert_statement, drop_index, status_index=status_index
            )

        # Create the indices
        LOGGER.info("Creating database indexes")

        con.execute(
            "CREATE INDEX backbone_name_rank ON backbone (canonical_name, rank);"
        )
        con.execute("CREATE INDEX backbone_id ON backbone (id);")
        con.commit()

        # Delete the downloaded files
        if not keep:
            LOGGER.info("Removing downloaded files")
            os.remove(simple)
            if deleted is not None:
                os.remove(deleted)
    finally:
        if con is not None:
            con.close()
        FORMATTER.pop()