
myco.utils

Helper functions used by multiple modules or that don't have a clear home.

AttributeDictionary

Bases: dict

Extends a dictionary to index keys as attributes.

Source code in myco/utils.py
class AttributeDictionary(dict):
    """Extends a dictionary to index keys as attributes."""

    def __init__(self, *args, **kwargs):
        super(AttributeDictionary, self).__init__(*args, **kwargs)
        self.__dict__ = self
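
Example: a minimal usage sketch; the `params` variable and its keys are hypothetical. Keys set at construction (or assigned later) are readable as attributes because the instance's __dict__ is the dict itself.

    >>> params = AttributeDictionary({"epochs": 10, "lr": 0.001})
    >>> params.epochs
    10
    >>> params["lr"] == params.lr
    True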

augment_feature_patch(feature)

Apply a fixed set of feature augmentations (flips and rotations) and stack them with the original patch.

Source code in myco/utils.py
def augment_feature_patch(feature: np.ndarray) -> np.ndarray:
    """Apply random feature augmentations (flipping and rotating)"""
    flip_lr = np.flip(feature, 2)
    flip_ud = np.flip(feature, 1)
    rot90 = np.rot90(feature, axes=(1, 2))
    rot180 = np.rot90(feature, k=2, axes=(1, 2))
    rot270 = np.rot90(feature, k=3, axes=(1, 2))
    aug_patches = np.vstack([feature, flip_lr, flip_ud, rot90, rot180, rot270])

    return aug_patches
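
Example: an illustrative sketch of the shape contract. Input is a batch of square patches shaped (n_samples, height, width, bands); the output stacks the original and five augmentations along the first axis. Patches must be square for the 90/270 degree rotations to stack with the flips.

    >>> patch = np.random.rand(1, 64, 64, 3)  # one hypothetical 64x64, 3-band patch
    >>> augment_feature_patch(patch).shape
    (6, 64, 64, 3)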

balance_samples(samples, class_weight)

Over- and undersamples classification data.

Parameters:

    samples (np.array, required): array of shape (n_samples, height, width,
        n_classes) to get samples from
    class_weight (dict, required): the proportional distribution of each class

Returns:

    list: a list of response data sample indices to include in model training

Source code in myco/utils.py
def balance_samples(samples: np.array, class_weight: dict) -> list:
    """Over and undersamples classification data.

    Args:
        samples: array of shape (height, width, n_classes) to get samples from
        class_weight: the proportional distribution of each class
        add_samples: ???

    Returns:
        a list of response data sample indices to include in model training
    """
    num_samples = samples.shape[0]
    samples_per_class = np.sum(samples, (1, 2))
    classes, weights = np.array(list(class_weight.keys())), np.array(
        list(class_weight.values())
    )

    # the indices in each class with at least one positive
    indices = [np.where(samples_per_class[:, i] > 0) for i in classes]

    # the labels that should be oversampled
    over_sample_classes = np.where(weights >= 1)[0]
    ind = {}
    for cl in over_sample_classes:
        ind[cl] = np.repeat(indices[cl], int(np.abs(weights[cl] + 1)))
        np.random.shuffle(ind[cl])

    # the labels that should be undersampled
    under_sample_classes = np.where(weights < 1)[0]
    for cl in under_sample_classes:
        np.random.shuffle(indices[cl][0])
        stop_ind = np.floor(indices[cl][0].shape[0] * np.abs(weights[cl]))
        ind[cl] = indices[cl][0][: int(stop_ind)]

    # stack the under- and oversampled indices and shuffle them
    sample_inxs = np.hstack(list(ind.values()))
    np.random.shuffle(sample_inxs)

    return sample_inxs[:num_samples]
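
Example: a hypothetical sketch. Classes with a weight >= 1 are oversampled (integer weights here, since the repeat count must be a whole number) and classes with a weight < 1 are undersampled by that fraction; the returned indices select patches for training.

    labels = np.random.randint(0, 3, (100, 32, 32))
    samples = np.eye(3)[labels]                      # one-hot, (100, 32, 32, 3)
    idx = balance_samples(samples, {0: 1, 1: 2, 2: 0.5})
    balanced = samples[idx]                          # rebalanced training patches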

compute_class_weights(response)

Computes weights per class.

Parameters:

    response (np.ndarray, required): one-hot encoded tensor of response data

Returns:

    dict: a dict mapping each class label to its weight

Source code in myco/utils.py
def compute_class_weights(response: np.ndarray) -> dict:
    """Computes weights per class.

    Args:
        response: one-hot encoded tensor of response data

    Returns:
        a dict mapping each class label to its weight
    """

    data_vals = np.argmax(response, axis=3).flatten()
    class_labels = np.unique(data_vals)
    weights = class_weight.compute_class_weight(
        "balanced", classes=class_labels, y=data_vals
    )
    return dict(zip(class_labels, weights))
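
Example: an illustrative sketch, assuming sklearn's class_weight module is imported in myco/utils.py (from sklearn.utils import class_weight). Rare classes receive proportionally larger weights.

    response = np.eye(3)[np.random.randint(0, 3, (10, 32, 32))]  # hypothetical one-hot tensor
    weights = compute_class_weights(response)                    # e.g. {0: 1.02, 1: 0.98, 2: 1.00}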

compute_sample_weights(class_weights, response)

Computes weights per sample.

Parameters:

    class_weights (np.ndarray, required): weights of each class
    response (np.ndarray, required): one-hot encoded tensor of response data

Returns:

    np.ndarray: 2D array of sample weights

Source code in myco/utils.py
def compute_sample_weights(
    class_weights: np.ndarray, response: np.ndarray
) -> np.ndarray:
    """Computes weights per sample.

    Args:
        class_weights: weights of each class
        response: one-hot encoded tensor of response data

    Returns:
        2D array of sample weights
    """
    assert len(response.shape) == 4, "Input data should have 4 dimensions"
    labels = np.argmax(response, axis=3)
    shape = labels.shape
    sample_weights = np.take(
        class_weights, labels.reshape(shape[0], shape[1] * shape[2])
    )

    return sample_weights
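
Example: a hypothetical sketch pairing this with compute_class_weights(). Because np.take indexes positionally, the class-weight dict is first converted to an array ordered by class label (assuming every class is present in the response).

    response = np.eye(3)[np.random.randint(0, 3, (10, 32, 32))]
    cw = compute_class_weights(response)                 # dict of label -> weight
    cw_arr = np.array([cw[c] for c in sorted(cw)])       # positional array for np.take
    compute_sample_weights(cw_arr, response).shape       # -> (10, 1024)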

create_classes(pred_score)

Reads the scores of one-hot encoded data and finds the class value.

Parameters:

    pred_score (np.ndarray, required): matrix of scores

Returns:

    np.ndarray: class-labeled array; the last axis is reduced from n_classes to 1

Source code in myco/utils.py
def create_classes(pred_score: np.ndarray) -> np.ndarray:
    """Reads the scores of one hot encoded data and finds the class value.

    Args:
        pred_score: matrix of scores

    Returns:
        class-labeled array; the last axis is reduced from n_classes to 1
    """
    y_pred = tf.math.argmax(pred_score, axis=-1)
    return np.expand_dims(y_pred, axis=-1)
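
Example: a minimal sketch (assumes TensorFlow eager execution); argmax over the last axis picks the winning class, and the output keeps a trailing singleton axis.

    scores = np.array([[[0.1, 0.7, 0.2]]])  # shape (1, 1, 3)
    create_classes(scores)                  # -> array([[[1]]]), shape (1, 1, 1)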

find_yaml_files(filepath, yaml_ext)

Search for yaml files affiliated with a numpy archive

Source code in myco/utils.py
def find_yaml_files(filepath: str, yaml_ext: str):
    """Search for yaml files affiliated with a numpy archive"""
    base, ext = os.path.splitext(filepath.rstrip(os.path.sep))
    yaml_files = glob(f"{base}*{yaml_ext}")
    yaml_files.sort()
    n_files = len(yaml_files)

    if n_files < 1:
        raise FileNotFoundError(
            f"No associated *.{yaml_ext} files found for {filepath}"
        )
    elif n_files > 1:
        nearest_ext = f"{base}.{yaml_ext}"
        next_nearest_ext = f"{filepath}.{yaml_ext}"
        if nearest_ext in yaml_files:
            yaml_file = nearest_ext
        elif next_nearest_ext in yaml_files:
            yaml_file = next_nearest_ext
        else:
            yaml_file = yaml_files[-1]
            warnings.warn(
                f"Multiple *.{yaml_ext} files found for {filepath}. Reading "
                f"{yaml_file}."
            )
    else:
        yaml_file = yaml_files[0]

    return yaml_file
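
Example: a hypothetical sketch. Given train_features.npy with a header file written alongside it, the nearest-named yaml file is returned.

    find_yaml_files("train_features.npy", "yml")   # -> 'train_features.yml'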

load_object(path, compressed=True)

Reads a python object that's been saved to disk or to GCS

Parameters:

    path (str, required): file path of the object to load
    compressed (bool, default True): whether the file was compressed prior to saving

Returns:

    obj (Any): the original python object (e.g., a fit NetworkScaler() object)

Source code in myco/utils.py
def load_object(path: str, compressed: bool = True) -> Any:
    """Reads a python object that's been saved to disk or to GCS

    Args:
        path: file path of the object to load
        compressed: whether the file was compressed prior to saving

    Returns:
        obj: the original python object (e.g., a fit NetworkScaler() object)
    """
    with tf.io.gfile.GFile(path, "rb") as f:
        obj = f.read()

    if compressed:
        obj = gzip.decompress(obj)

    return pickle.loads(obj)
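
Example: a hypothetical round trip with save_object() from this module; the path is illustrative and could equally be a gs:// URI. The compressed flag must match the compress flag used when saving.

    scaler = {"mean": 0.5, "std": 0.1}            # any picklable object
    save_object(scaler, "/tmp/scaler.pickle")     # hypothetical path, gzipped by default
    restored = load_object("/tmp/scaler.pickle")  # compressed=True matches the default
    assert restored == scaler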

n_digits(number)

Counts the number of significant integer digits of a number.

Parameters:

    number (Union[int, float], required): the number to evaluate.

Returns:

    order (int): the number of digits required to represent the number

Source code in myco/utils.py
def n_digits(number: Union[int, float]) -> int:
    """Counts the number of significant integer digits of a number.

    Args:
        number: the number to evaluate.

    Returns:
        order: number of digits required to represent a number
    """
    if number == 0:
        order = 1
    else:
        order = np.floor(np.log10(number)).astype(int) + 1

    return order
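
Example: a few worked values. Zero is special-cased to one digit; otherwise the count is floor(log10(number)) + 1, so values below 1 can yield zero or negative counts.

    n_digits(0)      # -> 1
    n_digits(42)     # -> 2
    n_digits(1000)   # -> 4
    n_digits(0.5)    # -> 0  (floor(log10(0.5)) + 1)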

read_archive_data(filepath, mode='r', ext='yml')

Reads a numpy memmap archive based on an associated header yaml file.

Parameters:

    filepath (str, required): path to the numpy archive to read. automatically
        searches for associated .yml files.
    mode (str, default 'r'): the mode for opening the memmap (one of 'r', 'r+',
        'w+'; see np.memmap).
    ext (str, default 'yml'): the filename extension to search for based on the
        input filepath.

Returns:

    np.ndarray: the numpy array as a memmap object.

Source code in myco/utils.py
def read_archive_data(filepath: str, mode: str = "r", ext: str = "yml") -> np.ndarray:
    """Reads a numpy memmap archive based on an associated header yaml file.

    Args:
        filepath: path to the numpy archive to read. automatically searches
            for associated .yml files.
        mode: the mode for opening the memmap (one of 'r', 'r+', 'w+'; see
            np.memmap).
        ext: the filename extension to search for based on the input filepath.

    Returns:
        The numpy array as a memmap object.
    """
    yaml_file = find_yaml_files(filepath, ext)
    config = read_yaml(yaml_file)
    memmap = np.memmap(
        filepath, dtype=config["dtype"], shape=tuple(config["shape"]), mode=mode
    )

    return memmap
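
Example: a hypothetical local round trip with write_archive_header(). The array is written as raw bytes (np.memmap expects a headerless binary file), and the shape/dtype keywords are passed explicitly here as plain yaml-serializable values.

    arr = np.zeros((4, 32, 32, 3), dtype="float32")
    arr.tofile("features.npy")                            # raw bytes, no .npy header
    write_archive_header("features.yml", shape=list(arr.shape), dtype=str(arr.dtype))
    data = read_archive_data("features.npy")              # finds features.yml automatically
    data.shape                                            # -> (4, 32, 32, 3)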

read_archive_header(filepath, ext='yml')

Reads archive metadata information from a yaml file.

Source code in myco/utils.py
def read_archive_header(filepath: str, ext: str = "yml") -> dict:
    """Reads archive metadata information from a yaml file."""
    yaml_file = find_yaml_files(filepath, ext)
    return read_yaml(yaml_file)

read_yaml(filepath)

Read a yaml file into memory as a dictionary.

Parameters:

    filepath (str, required): path to a local .yml/.yaml file.

Returns:

    dict: a parsed dictionary with the yaml file's contents.

Source code in myco/utils.py
def read_yaml(filepath: str) -> dict:
    """Read a yaml file into memory as a dictionary.

    Args:
        filepath: path to a local .yml/.yaml file.

    Returns:
        A parsed dictionary with the yaml file's contents.
    """
    yaml = YAML(typ="safe", pure=True)
    with tf.io.gfile.GFile(filepath, "r") as handler:
        configuration = yaml.load(handler)

    return configuration

revert_augmentation(pred)

Inverse transformation of the augment_feature_patch() function.

Source code in myco/utils.py
def revert_augmentation(pred: np.ndarray) -> np.ndarray:
    """Inverse transformation of the augment_feature_path() function."""
    pred[1] = np.flip(pred[1], 1)
    pred[2] = np.flip(pred[2], 0)
    pred[3] = np.rot90(pred[3], k=3, axes=(0, 1))
    pred[4] = np.rot90(pred[4], k=2, axes=(0, 1))
    pred[5] = np.rot90(pred[5], axes=(0, 1))

    return pred
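
Example: an illustrative round trip with augment_feature_patch(). Note the function mutates its argument in place as well as returning it, hence the copy.

    patch = np.random.rand(1, 64, 64, 3)
    aug = augment_feature_patch(patch)                  # (6, 64, 64, 3)
    reverted = revert_augmentation(aug.copy())          # copy: reverts happen in place
    np.allclose(reverted, np.repeat(patch, 6, axis=0))  # -> True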

save_object(obj, path, compress=True)

Writes a python object to disk or GCS for later access

Parameters:

    obj (object, required): a python object to be saved (e.g. a variable or a class)
    path (str, required): the output file path
    compress (bool, default True): flag to specify whether the file should be compressed

Source code in myco/utils.py
def save_object(obj: object, path: str, compress: bool = True) -> None:
    """Writes a python object to disk or GCS for later access

    Args:
        obj: a python object to be saved (e.g. a variable or a class)
        path: the output file path
        compress: flag to specify whether the file should be compressed
    """
    obj = pickle.dumps(obj)

    if compress:
        obj = gzip.compress(obj)

    with tf.io.gfile.GFile(path, "wb") as f:
        f.write(obj)

write_archive_header(filepath, archive=None, shape=None, dtype=None, nodata=None, tfrecords=[])

Write a numpy memmap archive header yaml file.

You can provide the shape/dtype info either by passing the array as the archive keyword or by passing shape and dtype explicitly.

Parameters:

    filepath (str, required): path to an output .yml file.
    archive (np.ndarray, default None): a numpy array to write the header for.
    shape (tuple, default None): dimensions of the numpy array
    dtype (str, default None): the numpy datatype of the array
    nodata (float, default None): the nodata value to ignore
    tfrecords (list, default []): list of tfrecord filepaths

Returns:

    None. Writes a yaml file to filepath.

Source code in myco/utils.py
def write_archive_header(
    filepath: str,
    archive: np.ndarray = None,
    shape: tuple = None,
    dtype: str = None,
    nodata: float = None,
    tfrecords: list = [],
) -> None:
    """Write a numpy memmap archive header yaml file.

    You can provide the shape/dtype info either by passing the array as the
        `archive` keyword or by passing `shape` and `dtype` as keywords.

    Args:
        filepath: path to an output .yml file.
        archive: a numpy array to write the header for.
        shape: dimensions of the numpy array
        dtype: the numpy datatype of the array
        nodata: the nodata value to ignore
        tfrecords: list of tfrecord filepaths

    Returns:
        None. Writes a yaml file to `filepath`.
    """

    # get required data from the archive or from keywords
    shape = archive.shape if archive is not None else shape
    dtype = str(archive.dtype) if archive is not None else dtype
    assert shape is not None, "archive or shape parameter must be set"
    assert dtype is not None, "archive or dtype parameter must be set"

    # avoid overwriting the archive if that filename is passed
    base, ext = os.path.splitext(filepath.rstrip(os.path.sep))
    if "npy" in ext.lower():
        filepath = f"{base}.yml"

    config = {
        "shape": shape,
        "dtype": dtype,
        "nodata": nodata,
        "tfrecords": tfrecords,
    }
    write_yaml(config, filepath)

write_yaml(data, filepath)

Write a dictionary to a yaml file.

Parameters:

    data (dict, required): dictionary of values to write.
    filepath (str, required): path to an output .yml/.yaml file.

Returns:

    None. Writes to disk.

Source code in myco/utils.py
def write_yaml(data: dict, filepath: str) -> None:
    """Write a dictionary to a yaml file.

    Args:
        data: dictionary of values to write.
        filepath: path to an output .yml/.yaml file.

    Returns:
        None. Writes to disk.
    """
    yaml = YAML()
    with tf.io.gfile.GFile(filepath, "w") as handler:
        yaml.dump(data, handler)
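
Example: a hypothetical round trip with read_yaml(); the path and contents are illustrative.

    config = {"shape": [4, 32, 32, 3], "dtype": "float32"}
    write_yaml(config, "/tmp/header.yml")      # hypothetical path
    read_yaml("/tmp/header.yml") == config     # -> True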