Skip to content

Utilities

estimate_average_bounds

estimate_average_bounds(gdf, quantile=0.9)

Estimates the average size of all features in a GeoDataFrame.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame that contains all features (i.e. shapely.Geometry objects)

required
quantile float

what quantile will represent the feature population

0.9

Returns:

Type Description
Tuple[float, float]

a tuple of floats representing average width and height

Source code in geococo\utils.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def estimate_average_bounds(
    gdf: gpd.GeoDataFrame, quantile: float = 0.9
) -> Tuple[float, float]:
    """Estimates the average size of all features in a GeoDataFrame.

    :param gdf: GeoDataFrame that contains all features (i.e. shapely.Geometry objects)
    :param quantile: what quantile will represent the feature population
    :return: a tuple of floats representing average width and height
    """

    label_bounds = gdf.bounds
    label_widths = label_bounds.apply(lambda row: row["maxx"] - row["minx"], axis=1)
    label_heights = label_bounds.apply(lambda row: row["maxy"] - row["miny"], axis=1)
    average_width = np.nanquantile(label_widths, quantile).astype(float)
    average_height = np.nanquantile(label_heights, quantile).astype(float)

    return average_width, average_height

estimate_schema

estimate_schema(
    gdf,
    src,
    quantile=0.9,
    window_bounds=[(256, 256), (512, 512)],
)

Attempts to find a schema that is able to represent the average GeoDataFrame feature (i.e. sufficient overlap) but within the bounds given by window_bounds.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame that contains features that determine the degree of overlap

required
src DatasetReader

The rasterio DataSource associated with the resulting schema (i.e. bounds and pixelsizes)

required
quantile float

what quantile will represent the feature population

0.9
window_bounds List[Tuple[int, int]]

a list of possible limits for the window generators

[(256, 256), (512, 512)]

Returns:

Type Description
WindowSchema

(if found) a viable WindowSchema with sufficient overlap within the window_bounds

Source code in geococo\utils.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def estimate_schema(
    gdf: gpd.GeoDataFrame,
    src: DatasetReader,
    quantile: float = 0.9,
    window_bounds: List[Tuple[int, int]] = [(256, 256), (512, 512)],
) -> WindowSchema:
    """Attempts to find a schema that is able to represent the average GeoDataFrame
    feature (i.e. sufficient overlap) but within the bounds given by window_bounds.

    :param gdf: GeoDataFrame that contains features that determine the degree of overlap
    :param src: The rasterio DataSource associated with the resulting schema (i.e.
        bounds and pixelsizes)
    :param quantile: what quantile will represent the feature population
    :param window_bounds: a list of possible limits for the window generators
    :return: (if found) a viable WindowSchema with sufficient overlap within the
        window_bounds
    """

    # estimating the required overlap between windows for labels to be represented fully
    average_width, average_height = estimate_average_bounds(gdf=gdf, quantile=quantile)
    width_overlap = int(np.ceil(average_width / src.res[0]))
    height_overlap = int(np.ceil(average_height / src.res[1]))

    schema = None
    last_exception = None
    for i in range(len(window_bounds)):
        width, height = window_bounds[i]
        try:
            schema = WindowSchema(
                width_window=width,
                width_overlap=width_overlap,
                width_step=None,
                height_window=height,
                height_overlap=height_overlap,
                height_step=None,
            )
            break  # Break the loop as soon as a valid schema is found

        except ValueError as value_error:
            last_exception = value_error
            continue

    if schema is None:
        raise ValueError(
            "No WindowSchema objects could be created from the given "
            f"window_bounds {window_bounds}, raising last Exception.."
        ) from last_exception

    return schema

generate_window_offsets

generate_window_offsets(window, schema)

Computes an array of window offsets bound by a given window.

Parameters:

Name Type Description Default
window Window

the bounding window (i.e. offsets will be within its bounds)

required
schema WindowSchema

the parameters for the window generator

required

Returns:

Type Description
ndarray

an array of window offsets within the bounds of window

Source code in geococo\utils.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def generate_window_offsets(window: Window, schema: WindowSchema) -> np.ndarray:
    """Computes an array of window offsets bound by a given window.

    :param window: the bounding window (i.e. offsets will be within its bounds)
    :param schema: the parameters for the window generator
    :return: an array of window offsets within the bounds of window
    """

    col_range: np.ndarray = np.arange(
        np.max((0, window.col_off - schema.width_overlap)),
        window.width + window.col_off - schema.width_overlap,
        schema.width_step,
    )
    row_range: np.ndarray = np.arange(
        np.max((0, window.row_off - schema.height_overlap)),
        window.height + window.row_off - schema.height_overlap,
        schema.height_step,
    )

    window_offsets = np.array(np.meshgrid(col_range, row_range))
    window_offsets = window_offsets.T.reshape(-1, 2)

    return window_offsets

generate_window_polygon

generate_window_polygon(datasource, window)

Turns the spatial bounds of a given window to a shapely.Polygon object in a given dataset's CRS.

Parameters:

Name Type Description Default
datasource DatasetReader

a rasterio DatasetReader object that provides the affine transformation

required
window Window

bounds to represent as Polygon

required

Returns:

Type Description
Polygon

shapely Polygon representing the spatial bounds of a given window in a given CRS

Source code in geococo\utils.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def generate_window_polygon(datasource: DatasetReader, window: Window) -> Polygon:
    """Turns the spatial bounds of a given window to a shapely.Polygon object in a given
    dataset's CRS.

    :param datasource: a rasterio DatasetReader object that provides the affine
        transformation
    :param window: bounds to represent as Polygon
    :return: shapely Polygon representing the spatial bounds of a given window in a
        given CRS
    """

    window_transform = datasource.window_transform(window)
    window_bounds = array_bounds(window.height, window.width, window_transform)
    window_poly = box(*window_bounds)
    return window_poly

get_date_created

get_date_created(raster_source)

Get the creation date of the input image, represented as a datetime object. If no such information is available in the image's metadata, we return the date the file itself was last modified.

Parameters:

Name Type Description Default
raster_source DatasetReader

reader for input image

required

Returns:

Type Description
datetime

datetime object representing date_created

Source code in geococo\utils.py
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
def get_date_created(raster_source: DatasetReader) -> datetime:
    """
    Get the creation date of the input image, represented as a datetime object.
    If no such information is available in the image's metadata, we return the date
    the file itself was last modified.

    :param raster_source: reader for input image
    :return: datetime object representing date_created
    """

    # all tags representing datetime info and their format
    datetime_tags = {"TIFFTAG_DATETIME": "%Y-%m-%d %H:%M:%S"}

    date_created = None
    for datetime_tag, datetime_format in datetime_tags.items():
        try:
            timestamp = raster_source.tags()[datetime_tag]
            date_created = datetime.strptime(timestamp, datetime_format)
            break  # exit loop when valid datetime is found
        except ValueError:
            # datetime_tag in raster_source.tags() but does not match datetime_format
            try:
                date_created = parser.parse(timestamp)
                break  # exit loop when valid datetime is found
            except ParserError:
                # datetimestamp could not be parsed
                continue
        except KeyError:
            # datetime_tag not in raster_source.tags()
            continue

    # if no valid datetime objects could be parsed, we just get last_modified as date
    if not date_created:
        timestamp = pathlib.Path(raster_source.name).stat().st_ctime
        date_created = datetime.fromtimestamp(timestamp)

    return date_created

mask_label

mask_label(input_raster, label)

Masks out an label from input_raster and flattens it to a 2D binary array. If it doesn't overlap, the resulting mask will only consist of False bools.

Parameters:

Name Type Description Default
input_raster DatasetReader

open rasterio DatasetReader for the input raster

required
label Union[Polygon, MultiPolygon]

Polygon object representing the area to be masked (i.e. label)

required

Returns:

Type Description
ndarray

A 2D binary array representing the label

Source code in geococo\utils.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def mask_label(
    input_raster: DatasetReader, label: Union[Polygon, MultiPolygon]
) -> np.ndarray:
    """Masks out an label from input_raster and flattens it to a 2D binary array. If it
    doesn't overlap, the resulting mask will only consist of False bools.

    :param input_raster: open rasterio DatasetReader for the input raster
    :param label: Polygon object representing the area to be masked (i.e. label)
    :return: A 2D binary array representing the label
    """

    if input_raster.closed:
        raise ValueError("Attempted mask with a closed DatasetReader")

    label_mask, _ = riomask(
        dataset=input_raster, shapes=[label], all_touched=True, filled=False
    )
    label_mask = np.all(label_mask.mask, axis=0)
    label_mask = np.invert(label_mask)

    return label_mask

reshape_image

reshape_image(img_array, shape, padding_value=0)

Reshapes 3D numpy array to match given 3D shape, done through slicing or padding.

Parameters:

Name Type Description Default
img_array ndarray

the numpy array to be reshaped

required
shape Tuple[int, int, int]

the desired shape (bands, rows, cols)

required
padding_value int

what value to pad img_array with (if too small)

0

Returns:

Type Description
ndarray

numpy array in desired shape

Source code in geococo\utils.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def reshape_image(
    img_array: np.ndarray, shape: Tuple[int, int, int], padding_value: int = 0
) -> np.ndarray:
    """Reshapes 3D numpy array to match given 3D shape, done through slicing or padding.

    :param img_array: the numpy array to be reshaped
    :param shape: the desired shape (bands, rows, cols)
    :param padding_value: what value to pad img_array with (if too small)
    :return: numpy array in desired shape
    """

    if len(img_array.shape) != len(shape):
        raise ValueError(
            f"Number of dimensions have to match ({img_array.shape} != {shape})"
        )

    img_array = img_array[: shape[0], : shape[1], : shape[2]]
    img_pads = [(0, max(0, n - img_array.shape[i])) for i, n in enumerate(shape)]
    img_array = np.pad(
        img_array, img_pads, mode="constant", constant_values=padding_value
    )

    return img_array

update_labels

update_labels(
    labels,
    categories,
    id_attribute="category_id",
    name_attribute=None,
)

Updates labels with validated (super)category names and ids from given Category instances (i.e. source of truth created from current and previous labels). This ultimately just matches a given key (name or id) with keys in each Category instance and maps the associated (and validated) values to labels.

Parameters:

Name Type Description Default
labels GeoDataFrame

GeoDataFrame containing labels and category attributes (validated by validate_labels)

required
categories List[Category]

list of Category instances created from current and previous labels

required
id_attribute Optional[str]

Column name that holds category_id values

'category_id'
name_attribute Optional[str]

Column name that holds category_name values

None

Returns:

Type Description
GeoDataFrame

labels with name, id and supercategory attributes from all given Category instances

Source code in geococo\utils.py
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
def update_labels(
    labels: gpd.GeoDataFrame,
    categories: List[Category],
    id_attribute: Optional[str] = "category_id",
    name_attribute: Optional[str] = None,
) -> gpd.GeoDataFrame:
    """Updates labels with validated (super)category names and ids from given Category
    instances (i.e. source of truth created from current and previous labels). This
    ultimately just matches a given key (name or id) with keys in each Category instance
    and maps the associated (and validated) values to labels.

    :param labels: GeoDataFrame containing labels and category attributes (validated by
        validate_labels)
    :param categories: list of Category instances created from current and previous
        labels
    :param id_attribute: Column name that holds category_id values
    :param name_attribute: Column name that holds category_name values
    :return: labels with name, id and supercategory attributes from all given Category
        instances
    """

    # Loading all given Category instances as a single dataframe
    category_pd = pd.DataFrame(
        [category.dict() for category in categories],
        columns=Category.schema()["properties"].keys(),
    )

    # Finding indices for matching values of a given attribute (name or id)
    if id_attribute in labels.columns:
        indices = np.where(
            labels[id_attribute].values.reshape(-1, 1) == category_pd.id.values
        )[1]
    elif name_attribute in labels.columns:
        indices = np.where(
            labels[name_attribute].values.reshape(-1, 1) == category_pd.name.values
        )[1]
    else:
        raise AttributeError("At least one category attribute must be present")

    # Indexing and assigning values associated with COCO attributes (see Category def)
    labels["id"] = category_pd.iloc[indices].id.values
    labels["name"] = category_pd.iloc[indices].name.values
    labels["supercategory"] = category_pd.iloc[indices].supercategory.values

    return labels

validate_labels

validate_labels(
    labels,
    id_attribute="category_id",
    name_attribute=None,
    super_attribute=None,
)

Validates all necessary attributes for a geococo-viable GeoDataFrame. It also checks for the presence of either category_id or category_name values and ensures valid geometry.

Parameters:

Name Type Description Default
labels GeoDataFrame

GeoDataFrame containing labels and category attributes

required
id_attribute Optional[str]

Column name that holds category_id values

'category_id'
name_attribute Optional[str]

Column name that holds category_name values

None
super_attribute Optional[str]

Column name that holds supercategory values

None

Returns:

Type Description
GeoDataFrame

Validated GeoDataFrame with coerced dtypes

Source code in geococo\utils.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def validate_labels(
    labels: gpd.GeoDataFrame,
    id_attribute: Optional[str] = "category_id",
    name_attribute: Optional[str] = None,
    super_attribute: Optional[str] = None,
) -> gpd.GeoDataFrame:
    """Validates all necessary attributes for a geococo-viable GeoDataFrame. It also
    checks for the presence of either category_id or category_name values and ensures
    valid geometry.

    :param labels: GeoDataFrame containing labels and category attributes
    :param id_attribute: Column name that holds category_id values
    :param name_attribute: Column name that holds category_name values
    :param super_attribute: Column name that holds supercategory values
    :return: Validated GeoDataFrame with coerced dtypes
    """

    schema_dict = {
        "geometry": pa.Column(
            GeometryDtype(),
            pa.Check(lambda geoms: geoms.is_valid, error="Invalid geometry found"),
            nullable=False,
        ),
        id_attribute: pa.Column(
            int, pa.Check.greater_than(0), required=False, nullable=False, coerce=True
        ),
        name_attribute: pa.Column(str, required=False, nullable=False),
        super_attribute: pa.Column(str, required=False, nullable=False),
    }

    schema = pa.DataFrameSchema(schema_dict)
    validated_labels = schema.validate(labels)

    req_cols = np.array([id_attribute, name_attribute])
    if not np.isin(req_cols, validated_labels.columns).any():
        raise AttributeError("At least one category attribute must be present")

    return validated_labels

window_factory

window_factory(parent_window, schema, boundless=True)

Generator that produces rasterio.Window objects in predetermined steps, within the given Window.

:yield: a rasterio.Window used for windowed reading/writing

Parameters:

Name Type Description Default
parent_window Window

the window that provides the bounds for all child_window objects

required
schema WindowSchema

the parameters that determine the window steps

required
boundless bool

whether the child_window should be clipped by the parent_window or not

True
Source code in geococo\utils.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def window_factory(
    parent_window: Window, schema: WindowSchema, boundless: bool = True
) -> Generator[Window, None, None]:
    """Generator that produces rasterio.Window objects in predetermined steps, within
    the given Window.

    :param parent_window: the window that provides the bounds for all child_window
        objects
    :param schema: the parameters that determine the window steps
    :param boundless: whether the child_window should be clipped by the parent_window or
        not
    :yield: a rasterio.Window used for windowed reading/writing
    """

    window_offsets = generate_window_offsets(window=parent_window, schema=schema)

    for col_off, row_off in window_offsets:
        child_window = Window(
            col_off=int(col_off),
            row_off=int(row_off),
            width=int(schema.width_window),
            height=int(schema.height_window),
        )
        if not boundless:
            child_window = child_window.intersection(parent_window)

        yield child_window

window_intersect

window_intersect(input_raster, input_vector)

Generates a Rasterio Window from the intersecting extents of the input data. It also verifies if the input data share the same CRS and if they physically overlap.

Parameters:

Name Type Description Default
input_raster DatasetReader

rasterio dataset (i.e. input image)

required
input_vector GeoDataFrame

geopandas geodataframe (i.e. input labels)

required

Returns:

Type Description
Window

rasterio window that represent the intersection between input data extents

Source code in geococo\utils.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def window_intersect(
    input_raster: DatasetReader, input_vector: gpd.GeoDataFrame
) -> Window:
    """Generates a Rasterio Window from the intersecting extents of the input data. It
    also verifies if the input data share the same CRS and if they physically overlap.

    :param input_raster: rasterio dataset (i.e. input image)
    :param input_vector: geopandas geodataframe (i.e. input labels)
    :return: rasterio window that represent the intersection between input data extents
    """

    if input_vector.crs != input_raster.crs:
        raise ValueError(
            f"CRS of input raster ({input_raster.crs.to_epsg()}) and "
            "labels ({input_vector.crs.to_epsg()}) don't match, exiting.."
        )

    raster_bounds = input_raster.bounds
    vector_bounds = input_vector.total_bounds
    raster_window = from_bounds(*raster_bounds, transform=input_raster.transform)
    vector_window = from_bounds(*vector_bounds, transform=input_raster.transform)

    try:
        intersection_window = vector_window.intersection(raster_window)
    except WindowError as window_error:
        raise ValueError(
            "Extent of input raster and vector don't overlap"
        ) from window_error

    return intersection_window