"""
Input/Output handling for raw Echoview files.
"""

# This file is part of Echofilter.
#
# Copyright (C) 2020-2022  Scott C. Lowe and Offshore Energy Research Association (OERA)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import datetime
import os
import textwrap
import warnings
from collections import OrderedDict

import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.ndimage
import skimage.measure

from . import utils
from ..ui import style


ROOT_DATA_DIR = "/data/dsforce/surveyExports"

TRANSECT_FIELD_TYPES = {
    "Ping_index": int,
    "Distance_gps": float,
    "Distance_vl": float,
    "Ping_date": str,
    "Ping_time": str,
    "Ping_milliseconds": float,
    "Latitude": float,
    "Longitude": float,
    "Depth_start": float,
    "Depth_stop": float,
    "Range_start": float,
    "Range_stop": float,
    "Sample_count": int,
}


def transect_reader(fname):
    """
    Create a generator which iterates through a survey csv file.

    Parameters
    ----------
    fname : str
        Path to survey CSV file.

    Returns
    -------
    generator
        Yields a tuple of `(metadata, data)`, where metadata is a dict,
        and data is a :class:`numpy.ndarray`. Each yield corresponds to
        a single row in the data. Every row (except for the header) is
        yielded.
    """
    metadata_header = []
    with open(fname, "rb") as hf:
        for i_row, row in enumerate(hf):
            try:
                row = row.decode("utf-8-sig" if i_row == 0 else "utf-8")
            except UnicodeDecodeError:
                if i_row == 0:
                    raise
                print(
                    "Row {} of {} contained a byte which is not in UTF-8"
                    " and will be skipped.".format(i_row, fname)
                )
                continue
            row = row.split(",")
            row = [entry.strip() for entry in row]
            if i_row == 0:
                metadata_header = row
                continue
            metadata = row[: len(metadata_header)]
            metadata_d = OrderedDict()
            for k, v in zip(metadata_header, metadata):
                if k in TRANSECT_FIELD_TYPES:
                    metadata_d[k] = TRANSECT_FIELD_TYPES[k](v)
                else:
                    metadata_d[k] = v
            data = np.array([float(x) for x in row[len(metadata_header) :]])
            yield metadata_d, data
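
# Example usage for transect_reader (a sketch; "transect_Sv_raw.csv" is a
# hypothetical path to an Echoview CSV export):
#
#     for metadata, row in transect_reader("transect_Sv_raw.csv"):
#         # metadata is an OrderedDict of typed header fields;
#         # row is a numpy array of the samples for this ping
#         print(metadata["Ping_date"], metadata["Sample_count"], row.shape)
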
def count_lines(filename):
    """
    Count the number of lines in a file.

    Parameters
    ----------
    filename : str
        Path to file.

    Returns
    -------
    int
        Number of lines in file.
    """
    with open(filename, "rb") as f:
        i = -1  # Ensure empty files return 0 instead of raising NameError
        for i, _ in enumerate(f):
            pass
    return i + 1
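
# Example usage for count_lines (hypothetical path):
#
#     n_rows = count_lines("transect_Sv_raw.csv") - 1  # excluding the header
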
def transect_loader(
    fname,
    skip_lines=0,
    warn_row_overflow=None,
    row_len_selector="mode",
):
    """
    Load an entire survey transect CSV.

    Parameters
    ----------
    fname : str
        Path to survey CSV file.
    skip_lines : int, optional
        Number of initial entries to skip. Default is 0.
    warn_row_overflow : bool or int, optional
        Whether to print a warning message if the number of elements in a
        row exceeds the expected number. If this is an int, this is the
        number of times to display the warnings before they are suppressed.
        If this is `True`, the number of outputs is unlimited. If `None`,
        the maximum number of underflow and overflow warnings differ:
        if `row_len_selector` is `"init"` or `"min"`, underflow always
        produces a message and the overflow messages stop at 2; otherwise
        the values are reversed. Default is `None`.
    row_len_selector : {"init", "min", "max", "median", "mode"}, optional
        The method used to determine which row length (number of depth
        samples) to use. Default is `"mode"`, the most common row length
        across all the measurement timepoints.

    Returns
    -------
    numpy.ndarray
        Timestamps for each row, in seconds. Note: not corrected for
        timezone (so make sure your timezones are internally consistent).
    numpy.ndarray
        Depth of each column, in metres.
    numpy.ndarray
        Survey signal (Sv, for instance). Units match that of the file.
    """
    row_len_selector = row_len_selector.lower()
    if row_len_selector in {"init", "min"}:
        expand_for_overflow = False
    else:
        expand_for_overflow = True

    if warn_row_overflow is True:
        warn_row_overflow = np.inf

    if warn_row_overflow is not None:
        warn_row_underflow = warn_row_overflow
    elif expand_for_overflow:
        warn_row_underflow = 2
        warn_row_overflow = np.inf
    else:
        warn_row_underflow = np.inf
        warn_row_overflow = 2

    # We remove one from the line count because of the header,
    # which is excluded from the output
    n_lines = count_lines(fname) - 1

    # Initialise output array
    for i_line, (meta, row) in enumerate(transect_reader(fname)):
        if i_line < min(n_lines, max(1, skip_lines)):
            continue
        n_depths_init = len(row)
        depth_start_init = meta["Depth_start"]
        depth_stop_init = meta["Depth_stop"]
        break

    n_depth_exp = n_depths_init
    data = np.empty((n_lines - skip_lines, n_depth_exp))
    data[:] = np.nan
    timestamps = np.empty((n_lines - skip_lines))
    timestamps[:] = np.nan

    row_lengths = np.empty((n_lines - skip_lines), dtype=int)
    row_depth_starts = np.empty((n_lines - skip_lines))
    row_depth_ends = np.empty((n_lines - skip_lines))

    n_warn_overflow = 0
    n_warn_underflow = 0

    n_entry = 0
    for i_line, (meta, row) in enumerate(transect_reader(fname)):
        if i_line < skip_lines:
            continue
        i_entry = i_line - skip_lines

        # Track the range of depths used in the row with this length
        row_lengths[i_entry] = len(row)
        row_depth_starts[i_entry] = meta["Depth_start"]
        row_depth_ends[i_entry] = meta["Depth_stop"]

        if len(row) > n_depth_exp:
            if n_warn_overflow < warn_row_overflow:
                print(
                    "Row {} of {} exceeds expected n_depth of {} with {}".format(
                        i_line, fname, n_depth_exp, len(row)
                    )
                )
                n_warn_overflow += 1
            if expand_for_overflow:
                data = np.pad(
                    data,
                    ((0, 0), (0, len(row) - n_depth_exp)),
                    mode="constant",
                    constant_values=np.nan,
                )
                n_depth_exp = len(row)

        if len(row) < n_depth_exp:
            if n_warn_underflow < warn_row_underflow:
                print(
                    "Row {} of {} shorter than expected n_depth_exp of {} with {}".format(
                        i_line, fname, n_depth_exp, len(row)
                    )
                )
                n_warn_underflow += 1
            data[i_entry, : len(row)] = row
        else:
            data[i_entry, :] = row[:n_depth_exp]

        timestamps[i_entry] = datetime.datetime.strptime(
            "{}T{}.{:06d}".format(
                meta["Ping_date"],
                meta["Ping_time"],
                int(1000 * float(meta["Ping_milliseconds"])),
            ),
            "%Y-%m-%dT%H:%M:%S.%f",
        ).timestamp()
        n_entry += 1

    # Turn the special values 9.9e+37 and -9.9e+37, which indicate
    # missing data, into NaNs
    # https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/Special_Export_Values.htm
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "invalid value encountered in less")
        warnings.filterwarnings("ignore", "invalid value encountered in greater")
        data[data < -1e37] = np.nan
        data[data > 1e37] = np.nan

    # Trim output arrays down to size
    timestamps = timestamps[:n_entry]
    data = data[:n_entry]
    row_lengths = row_lengths[:n_entry]
    row_depth_starts = row_depth_starts[:n_entry]
    row_depth_ends = row_depth_ends[:n_entry]

    # Work out what row length we should return
    if row_len_selector == "init":
        n_depth_use = n_depths_init
    elif row_len_selector == "min":
        n_depth_use = np.min(row_lengths)
    elif row_len_selector == "max":
        n_depth_use = np.max(row_lengths)
    elif row_len_selector == "median":
        n_depth_use = np.median(row_lengths)
        # If the median is half-way between two values, round up
        if n_depth_use not in row_lengths:
            n_depth_use = int(np.round(n_depth_use))
        # If the median is still not an observed row length, drop the last
        # value to make the array length odd, guaranteeing the median is an
        # observed value, not an intermediary.
        if n_depth_use not in row_lengths:
            n_depth_use = np.median(row_lengths[:-1])
    elif row_len_selector == "mode":
        n_depth_use = utils.mode(row_lengths)
    else:
        raise ValueError(
            "Unsupported row_len_selector value: {}".format(row_len_selector)
        )
    # Ensure the row length is an integer (np.median returns a float)
    n_depth_use = int(n_depth_use)

    # Use depths corresponding to those declared in the rows which had the
    # number of entries used.
    if row_len_selector == "median":
        d_start = np.median(row_depth_starts[row_lengths == n_depth_use])
        d_stop = np.median(row_depth_ends[row_lengths == n_depth_use])
    else:
        d_start = utils.mode(row_depth_starts[row_lengths == n_depth_use])
        d_stop = utils.mode(row_depth_ends[row_lengths == n_depth_use])
    depths = np.linspace(d_start, d_stop, n_depth_use)

    # Interpolate depths to get a consistent sampling grid
    interp_kwargs = dict(nan_threshold=0.3, assume_sorted=True)
    for i_entry, (nd, d0, d1) in enumerate(
        zip(row_lengths, row_depth_starts, row_depth_ends)
    ):
        if d0 < d1:
            data[i_entry, :n_depth_use] = utils.interp1d_preserve_nan(
                np.linspace(d0, d1, nd),
                data[i_entry, :nd],
                depths,
                **interp_kwargs,
            )
        else:
            data[i_entry, :n_depth_use] = utils.interp1d_preserve_nan(
                np.linspace(d1, d0, nd),
                data[i_entry, :nd][::-1],
                depths,
                **interp_kwargs,
            )

    # Crop the data down to size
    data = data[:, :n_depth_use]

    return timestamps, depths, data
def evl_reader(fname):
    """
    EVL file reader.

    Parameters
    ----------
    fname : str
        Path to .evl file.

    Returns
    -------
    generator
        A generator which yields the timestamp (in seconds), depth (in
        metres), and status (int) for each entry. Note that the timestamp
        is not corrected for timezone (so make sure your timezones are
        internally consistent).
    """
    with open(fname, "r") as hf:
        continuance = True
        for i_row, row in enumerate(csv.reader(hf, delimiter=" ")):
            if i_row == 0:
                continue
            if len(row) < 4:
                if not continuance:
                    raise ValueError("Trying to skip data after parsing began")
                continue
            continuance = False

            timestamp = datetime.datetime.strptime(
                row[0] + "T" + row[1],
                "%Y%m%dT%H%M%S%f",
            ).timestamp()

            if len(row[2]) > 0:
                raise ValueError("row[2] was non-empty: {}".format(row[2]))

            yield timestamp, float(row[3]), int(row[4])
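
# Example usage for evl_reader (hypothetical path):
#
#     for timestamp, depth, status in evl_reader("transect_bottom.evl"):
#         print(timestamp, depth, status)
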
def evl_loader(fname, special_to_nan=True, return_status=False):
    """
    EVL file loader.

    Parameters
    ----------
    fname : str
        Path to .evl file.
    special_to_nan : bool, optional
        Whether to replace the special value, `-10000.99`, which indicates
        no depth value, with NaN.
        https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/Special_Export_Values.htm
    return_status : bool, optional
        Whether to also return the status codes. Default is `False`.

    Returns
    -------
    numpy.ndarray of floats
        Timestamps, in seconds.
    numpy.ndarray of floats
        Depth, in metres.
    numpy.ndarray of ints, optional
        Status codes. Only returned if `return_status` is `True`.
    """
    timestamps = []
    values = []
    statuses = []
    for timestamp, value, status in evl_reader(fname):
        timestamps.append(timestamp)
        values.append(value)
        statuses.append(status)
    timestamps = np.array(timestamps)
    values = np.array(values)
    statuses = np.array(statuses)
    if special_to_nan:
        # Replace the special value -10000.99 with NaN
        # https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/Special_Export_Values.htm
        values[np.isclose(values, -10000.99)] = np.nan
    if return_status:
        return timestamps, values, statuses
    return timestamps, values
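
# Example usage for evl_loader (hypothetical path):
#
#     t_bottom, d_bottom = evl_loader("transect_bottom.evl")
#     t_bottom, d_bottom, statuses = evl_loader(
#         "transect_bottom.evl", return_status=True
#     )
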
def timestamp2evdtstr(timestamp):
    """
    Convert a timestamp into an Echoview-compatible datetime string.

    The output is in the format "CCYYMMDD HHmmSSssss", where:

    | CC: century
    | YY: year
    | MM: month
    | DD: day
    | HH: hour
    | mm: minute
    | SS: second
    | ssss: 0.1 milliseconds

    Parameters
    ----------
    timestamp : float
        Number of seconds since Unix epoch.

    Returns
    -------
    datetimestring : str
        Datetime string in the Echoview-compatible format
        "CCYYMMDD HHmmSSssss".
    """
    # Datetime must be in the format CCYYMMDD HHmmSSssss,
    # where ssss is in units of 0.1 milliseconds.
    # We have to manually determine the number of "0.1 milliseconds"
    # from the microsecond component.
    dt = datetime.datetime.fromtimestamp(timestamp)
    return "{}{:04d}".format(dt.strftime("%Y%m%d %H%M%S"), round(dt.microsecond / 100))
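
# Example: timestamp2evdtstr(0.0) returns "19700101 0000000000" when the
# local timezone is UTC (datetime.fromtimestamp uses local time).
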
def evl_writer(fname, timestamps, depths, status=1, line_ending="\r\n", pad=False):
    r"""
    EVL file writer.

    Parameters
    ----------
    fname : str
        Destination of output file.
    timestamps : array_like
        Timestamps for each node in the line.
    depths : array_like
        Depths (in metres) for each node in the line.
    status : 0, 1, 2, or 3; optional
        Status for the line.

        - `0` : none
        - `1` : unverified
        - `2` : bad
        - `3` : good

        Default is `1` (unverified). For more details on line status, see
        https://support.echoview.com/WebHelp/Using_Echoview/Echogram/Lines/About_Line_Status.htm
    line_ending : str, optional
        Line ending. Default is `"\r\n"`, the standard line ending on
        Windows/DOS, as per the specification for the file format.
        https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm
        Set to `"\n"` to get Unix-style line endings instead.
    pad : bool, optional
        Whether to pad the line with an extra datapoint half a pixel
        before the first and after the last given timestamp.
        Default is `False`.

    Notes
    -----
    For more details on the format specification, see
    https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm#Line_definition_file_format
    """
    if len(timestamps) != len(depths):
        raise ValueError(
            "Number of timestamps ({}) and depths ({}) are not equal".format(
                len(timestamps), len(depths)
            )
        )
    if pad and len(timestamps) > 1:
        timestamps = np.r_[
            timestamps[0] - (timestamps[1] - timestamps[0]) / 2,
            timestamps,
            timestamps[-1] + (timestamps[-1] - timestamps[-2]) / 2,
        ]
        depths = np.r_[depths[0], depths, depths[-1]]
    # The file object will automatically replace \n with our chosen line ending
    with open(fname, "w+", encoding="utf-8-sig", newline=line_ending) as hf:
        # Write header
        hf.write("EVBD 3 10.0.270.37090" + "\n")
        n_row = len(depths)
        hf.write(str(n_row) + "\n")
        # Write each row
        for timestamp, depth in zip(timestamps, depths):
            hf.write("{} {} {} \n".format(timestamp2evdtstr(timestamp), depth, status))
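
# Example usage for evl_writer (a sketch; all values are illustrative):
#
#     evl_writer(
#         "line.evl",
#         timestamps=[1577836800.0, 1577836801.0],  # seconds since Unix epoch
#         depths=[24.6, 24.9],  # metres
#         status=3,  # good
#     )
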
def evr_writer(
    fname,
    rectangles=[],
    contours=[],
    common_notes="",
    default_region_type=0,
    line_ending="\r\n",
):
    r"""
    EVR file writer.

    Writes regions to an Echoview region file.

    Parameters
    ----------
    fname : str
        Destination of output file.
    rectangles : list of dictionaries, optional
        Rectangle region definitions. Default is an empty list. Each
        rectangle region must implement fields `"depths"` and
        `"timestamps"`, which indicate the extent of the rectangle.
        Optionally, `"creation_type"`, `"region_name"`, `"region_type"`,
        and `"notes"` may be set. If these are not given, the default
        creation_type is 4 and region_type is set by `default_region_type`.
    contours : list of dictionaries, optional
        Contour region definitions. Default is an empty list. Each contour
        region must implement a `"points"` field containing a
        :class:`numpy.ndarray` shaped `(n, 2)` defining the co-ordinates of
        nodes along the (open) contour in units of timestamp and depth.
        Optionally, `"creation_type"`, `"region_name"`, `"region_type"`,
        and `"notes"` may be set. If these are not given, the default
        creation_type is 2 and region_type is set by `default_region_type`.
    common_notes : str, optional
        Notes to include for every region. Default is `""`, an empty
        string.
    default_region_type : int, optional
        The region type to use for rectangles and contours which do not
        define a `"region_type"` field. Possible region types are

        - `0` : bad (no data)
        - `1` : analysis
        - `2` : marker
        - `3` : fishtracks
        - `4` : bad (empty water)

        Default is `0`.
    line_ending : str, optional
        Line ending. Default is `"\r\n"`, the standard line ending on
        Windows/DOS, as per the specification for the file format.
        https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm
        Set to `"\n"` to get Unix-style line endings instead.

    Notes
    -----
    For more details on the format specification, see:
    https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/2D_Region_definition_file_format.htm
    """
    # Remove leading/trailing new lines, since we will join with our own
    # line ending
    common_notes = common_notes.strip("\r\n")
    # Standardize line endings to be \n, regardless of input
    common_notes = common_notes.replace("\r\n", "\n").replace("\r", "\n")
    if len(common_notes) == 0:
        n_lines_common_notes = 0
    else:
        n_lines_common_notes = 1 + common_notes.count("\n")
    n_regions = len(rectangles) + len(contours)
    i_region = 0
    # The file object will automatically replace \n with our chosen line ending
    with open(fname, "w+", encoding="utf-8-sig", newline=line_ending) as hf:
        # Write header
        hf.write("EVRG 7 10.0.283.37689" + "\n")
        hf.write(str(n_regions) + "\n")

        # Write each rectangle
        for region in rectangles:
            # Regions are indexed from 1, so increment the counter first
            i_region += 1
            hf.write("\n")  # Blank line separates regions
            # Determine extent of rectangle
            left = timestamp2evdtstr(np.min(region["timestamps"]))
            right = timestamp2evdtstr(np.max(region["timestamps"]))
            top = np.min(region["depths"])
            bottom = np.max(region["depths"])
            # Region header
            hf.write(
                "13 4 {i} 0 {type} -1 1 {left} {top} {right} {bottom}".format(
                    i=i_region,
                    type=region.get("creation_type", 4),
                    left=left,
                    right=right,
                    top=top,
                    bottom=bottom,
                )
                + "\n"
            )
            # Notes
            notes = region.get("notes", "")
            if len(notes) == 0:
                notes = common_notes
                n_lines_notes = n_lines_common_notes
            else:
                notes = notes.strip("\n")
                if len(common_notes) > 0:
                    notes += "\n" + common_notes
                n_lines_notes = 1 + notes.count("\n")
            hf.write(str(n_lines_notes) + "\n")  # Number of lines of notes
            if len(notes) > 0:
                hf.write(notes + "\n")
            # Detection settings
            hf.write("0" + "\n")  # Number of lines of detection settings
            # Region classification string
            hf.write("Unclassified regions" + "\n")
            # The points defining the region itself
            hf.write(
                "{left} {top} {left} {bottom} {right} {bottom} {right} {top} ".format(
                    left=left,
                    right=right,
                    top=top,
                    bottom=bottom,
                )  # Terminates with a space, not a new line
            )
            # Region type
            hf.write(str(region.get("region_type", default_region_type)) + "\n")
            # Region name
            hf.write(
                str(region.get("region_name", "Region {}".format(i_region))) + "\n"
            )

        # Write each contour
        for region in contours:
            # Regions are indexed from 1, so increment the counter first
            i_region += 1
            hf.write("\n")  # Blank line separates regions
            # Header line
            hf.write(
                "13 {n} {i} 0 {type} -1 1 {left} {top} {right} {bottom}".format(
                    n=region["points"].shape[0],
                    i=i_region,
                    type=region.get("creation_type", 2),
                    left=timestamp2evdtstr(np.min(region["points"][:, 0])),
                    right=timestamp2evdtstr(np.max(region["points"][:, 0])),
                    top=np.min(region["points"][:, 1]),
                    bottom=np.max(region["points"][:, 1]),
                )
                + "\n"
            )
            # Notes
            notes = region.get("notes", "")
            if len(notes) == 0:
                notes = common_notes
                n_lines_notes = n_lines_common_notes
            else:
                notes = notes.strip("\n")
                if len(common_notes) > 0:
                    notes += "\n" + common_notes
                n_lines_notes = 1 + notes.count("\n")
            hf.write(str(n_lines_notes) + "\n")  # Number of lines of notes
            if len(notes) > 0:
                hf.write(notes + "\n")
            # Detection settings
            hf.write("0" + "\n")  # Number of lines of detection settings
            # Region classification string
            hf.write("Unclassified regions" + "\n")
            # The region itself
            for point in region["points"]:
                hf.write("{} {} ".format(timestamp2evdtstr(point[0]), point[1]))
            # Region type
            hf.write(str(region.get("region_type", default_region_type)) + "\n")
            # Region name
            hf.write(
                str(region.get("region_name", "Region {}".format(i_region))) + "\n"
            )
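
# Example usage for evr_writer (a sketch; all values are illustrative):
#
#     rect = {
#         "timestamps": [1577836800.0, 1577836900.0],  # seconds since epoch
#         "depths": [5.0, 25.0],  # metres
#         "region_name": "Example rectangle",
#     }
#     evr_writer("regions.evr", rectangles=[rect])
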
def write_transect_regions(
    fname,
    transect,
    depth_range=None,
    passive_key="is_passive",
    removed_key="is_removed",
    patches_key="mask_patches",
    collate_passive_length=0,
    collate_removed_length=0,
    minimum_passive_length=0,
    minimum_removed_length=0,
    minimum_patch_area=0,
    name_suffix="",
    common_notes="",
    line_ending="\r\n",
    verbose=0,
    verbose_indent=0,
):
    r"""
    Convert a transect dictionary to a set of regions and write as an EVR file.

    Parameters
    ----------
    fname : str
        Destination of output file.
    transect : dict
        Transect dictionary.
    depth_range : array_like or None, optional
        The minimum and maximum depth extents (in any order) of the passive
        and removed block regions. If this is `None` (default), the minimum
        and maximum of `transect["depths"]` is used.
    passive_key : str, optional
        Field name to use for passive data identification. Default is
        `"is_passive"`.
    removed_key : str, optional
        Field name to use for removed blocks. Default is `"is_removed"`.
    patches_key : str, optional
        Field name to use for the mask of patch regions. Default is
        `"mask_patches"`.
    collate_passive_length : int, optional
        Maximum distance (in indices) over which passive regions should be
        merged together, closing small gaps between them. Default is `0`.
    collate_removed_length : int, optional
        Maximum distance (in indices) over which removed blocks should be
        merged together, closing small gaps between them. Default is `0`.
    minimum_passive_length : int, optional
        Minimum length (in indices) a passive region must have to be
        included in the output. Set to -1 to omit all passive regions from
        the output. Default is `0`.
    minimum_removed_length : int, optional
        Minimum length (in indices) a removed block must have to be
        included in the output. Set to -1 to omit all removed regions from
        the output. Default is `0`.
    minimum_patch_area : float, optional
        Minimum amount of area (in input pixel space) that a patch must
        occupy in order to be included in the output. Set to `0` to include
        all patches, no matter their area. Set to `-1` to omit all patches.
        Default is `0`.
    name_suffix : str, optional
        Suffix to append to variable names. Default is `""`, an empty
        string.
    common_notes : str, optional
        Notes to include for every region. Default is `""`, an empty
        string.
    line_ending : str, optional
        Line ending. Default is `"\r\n"`, the standard line ending on
        Windows/DOS, as per the specification for the file format.
        https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm
        Set to `"\n"` to get Unix-style line endings instead.
    verbose : int, optional
        Verbosity level. Default is `0`.
    verbose_indent : int, optional
        Level of indentation (number of preceding spaces) before verbosity
        messages. Default is `0`.
    """
    if depth_range is None:
        depth_range = transect["depths"]
    depth_range = [np.min(depth_range), np.max(depth_range)]
    rectangles = []
    contours = []
    # Regions around each period of passive data
    key = passive_key
    if key not in transect:
        key = "p_" + key
    if key not in transect:
        raise ValueError("Keys {} and {} not found in transect.".format(key[2:], key))
    is_passive = transect[key] > 0.5
    is_passive = ~utils.squash_gaps(~is_passive, collate_passive_length)
    passive_starts, passive_ends = utils.get_indicator_onoffsets(is_passive)
    i_passive = 1
    n_passive_skipped = 0
    for start_index, end_index in zip(passive_starts, passive_ends):
        start_index -= 0.5
        end_index += 0.5
        if minimum_passive_length == -1:
            # No passive regions
            break
        if end_index - start_index <= minimum_passive_length:
            n_passive_skipped += 1
            continue
        region = {}
        region["region_name"] = "Passive{} {}".format(name_suffix, i_passive)
        region["creation_type"] = 4
        region["region_type"] = 0
        region["depths"] = depth_range
        region["timestamps"] = scipy.interpolate.interp1d(
            np.arange(len(transect["timestamps"])),
            transect["timestamps"],
            fill_value="extrapolate",
        )([start_index, end_index])
        region["notes"] = textwrap.dedent(
            """
            Passive data
            Length in pixels: {}
            Duration in seconds: {}
            """.format(
                end_index - start_index,
                region["timestamps"][1] - region["timestamps"][0],
            )
        )
        rectangles.append(region)
        i_passive += 1
    # Regions around each period of removed data
    key = removed_key
    if key not in transect:
        key = "p_" + key
    if key not in transect:
        raise ValueError("Keys {} and {} not found in transect.".format(key[2:], key))
    is_removed = transect[key] > 0.5
    is_removed = ~utils.squash_gaps(~is_removed, collate_removed_length)
    removed_starts, removed_ends = utils.get_indicator_onoffsets(is_removed)
    i_removed = 1
    n_removed_skipped = 0
    for start_index, end_index in zip(removed_starts, removed_ends):
        start_index -= 0.5
        end_index += 0.5
        if minimum_removed_length == -1:
            # No removed regions
            break
        if end_index - start_index <= minimum_removed_length:
            n_removed_skipped += 1
            continue
        region = {}
        region["region_name"] = "Removed block{} {}".format(name_suffix, i_removed)
        region["creation_type"] = 4
        region["region_type"] = 0
        region["depths"] = depth_range
        region["timestamps"] = scipy.interpolate.interp1d(
            np.arange(len(transect["timestamps"])),
            transect["timestamps"],
            fill_value="extrapolate",
        )([start_index, end_index])
        region["notes"] = textwrap.dedent(
            """
            Removed data block
            Length in pixels: {}
            Duration in seconds: {}
            """.format(
                end_index - start_index,
                region["timestamps"][1] - region["timestamps"][0],
            )
        )
        rectangles.append(region)
        i_removed += 1
    # Contours around each removed patch
    if patches_key not in transect:
        raise ValueError("Key {} not found in transect.".format(patches_key))
    patches = transect[patches_key]
    patches = scipy.ndimage.binary_fill_holes(patches > 0.5)
    contours_coords = skimage.measure.find_contours(patches, 0.5)
    contour_dicts = []
    i_contour = 1
    n_contour_skipped = 0
    for contour in contours_coords:
        if minimum_patch_area == -1:
            # No patches
            break
        area = utils.integrate_area_of_contour(
            contour[:, 0], contour[:, 1], closed=False
        )
        if area < minimum_patch_area:
            n_contour_skipped += 1
            continue
        region = {}
        region["region_name"] = "Removed patch{} {}".format(name_suffix, i_contour)
        region["creation_type"] = 2
        region["region_type"] = 0
        x = scipy.interpolate.interp1d(
            np.arange(len(transect["timestamps"])),
            transect["timestamps"],
            fill_value="extrapolate",
        )(contour[:, 0])
        y = scipy.interpolate.interp1d(
            np.arange(len(transect["depths"])),
            transect["depths"],
            fill_value="extrapolate",
        )(contour[:, 1])
        region["points"] = np.stack([x, y], axis=-1)
        region["notes"] = textwrap.dedent(
            """
            Removed patch
            Area in pixels: {}
            Area in meter-seconds: {}
            """.format(
                area,
                utils.integrate_area_of_contour(x, y, closed=False),
            )
        )
        contour_dicts.append(region)
        i_contour += 1
    if verbose >= 1:
        print(
            " " * verbose_indent
            + "Outputting {} region{}:"
            " {} passive, {} removed blocks, {} removed patches".format(
                len(rectangles) + len(contour_dicts),
                "" if len(rectangles) + len(contour_dicts) == 1 else "s",
                i_passive - 1,
                i_removed - 1,
                i_contour - 1,
            )
        )
        n_skipped = n_passive_skipped + n_removed_skipped + n_contour_skipped
        if n_skipped > 0:
            print(
                " " * verbose_indent
                + style.skip_fmt(
                    "There {} {} skipped (too small) region{}:"
                    " {} passive, {} removed blocks, {} removed patches".format(
                        "was" if n_skipped == 1 else "were",
                        n_skipped,
                        "" if n_skipped == 1 else "s",
                        n_passive_skipped,
                        n_removed_skipped,
                        n_contour_skipped,
                    )
                )
            )
    # Write the output
    return evr_writer(
        fname,
        rectangles=rectangles,
        contours=contour_dicts,
        common_notes=common_notes,
        line_ending=line_ending,
    )
def load_transect_data(transect_pth, dataset="mobile", root_data_dir=ROOT_DATA_DIR):
    """
    Load all data for one transect.

    Parameters
    ----------
    transect_pth : str
        Relative path to transect, excluding `"_Sv_raw.csv"`.
    dataset : str, optional
        Name of dataset. Default is `"mobile"`.
    root_data_dir : str
        Path to root directory where data is located.

    Returns
    -------
    timestamps : numpy.ndarray
        Timestamps (in seconds since Unix epoch), with each entry
        corresponding to each row in the `signals` data.
    depths : numpy.ndarray
        Depths from the surface (in metres), with each entry corresponding
        to each column in the `signals` data.
    signals : numpy.ndarray
        Echogram Sv data, shaped (num_timestamps, num_depths).
    turbulence : numpy.ndarray
        Depth of turbulence line, shaped (num_timestamps, ).
    bottom : numpy.ndarray
        Depth of bottom line, shaped (num_timestamps, ).
    """
    dirname = os.path.join(root_data_dir, dataset)
    raw_fname = os.path.join(dirname, transect_pth + "_Sv_raw.csv")
    bottom_fname = os.path.join(dirname, transect_pth + "_bottom.evl")
    turbulence_fname = os.path.join(dirname, transect_pth + "_turbulence.evl")

    timestamps, depths, signals = transect_loader(raw_fname)
    t_bottom, d_bottom = evl_loader(bottom_fname)
    t_turbulence, d_turbulence = evl_loader(turbulence_fname)

    return (
        timestamps,
        depths,
        signals,
        np.interp(timestamps, t_turbulence, d_turbulence),
        np.interp(timestamps, t_bottom, d_bottom),
    )
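
# Example usage for load_transect_data (a sketch; the transect path is
# hypothetical, and is resolved relative to root_data_dir/dataset):
#
#     ts, depths, Sv, turbulence, bottom = load_transect_data(
#         "some_survey/some_transect", dataset="mobile"
#     )
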
def get_partition_data(
    partition,
    dataset="mobile",
    partitioning_version="firstpass",
    root_data_dir=ROOT_DATA_DIR,
):
    """
    Load partition metadata.

    Parameters
    ----------
    partition : str
        Name of the partition to load.
    dataset : str, optional
        Name of dataset. Default is `"mobile"`.
    partitioning_version : str, optional
        Name of partitioning method. Default is `"firstpass"`.
    root_data_dir : str
        Path to root directory where data is located.

    Returns
    -------
    pandas.DataFrame
        Metadata for all transects in the partition. Each row is a single
        sample.
    """
    dirname = os.path.join(root_data_dir, dataset, "sets", partitioning_version)
    fname_partition = os.path.join(dirname, partition + ".txt")
    fname_header = os.path.join(dirname, "header" + ".txt")

    with open(fname_header, "r") as hf:
        for row in csv.reader(hf):
            header = [entry.strip() for entry in row]
            break

    df = pd.read_csv(fname_partition, header=None, names=header)
    return df
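
# Example usage for get_partition_data (a sketch; "train" is a hypothetical
# partition name):
#
#     df = get_partition_data("train", dataset="mobile")
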
def remove_trailing_slash(s):
    """
    Remove trailing forward slashes from a string.

    Parameters
    ----------
    s : str
        String representing a path, possibly with trailing slashes.

    Returns
    -------
    str
        Same as `s`, but without trailing forward slashes.
    """
    while s[-1] == "/" or s[-1] == os.path.sep:
        s = s[:-1]
    return s
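
# Example: remove_trailing_slash("foo/bar/") returns "foo/bar".
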
def list_from_file(fname):
    """
    Get a list from a file.

    Parameters
    ----------
    fname : str
        Path to file.

    Returns
    -------
    list
        Contents of the file, one line per entry in the list. Trailing
        whitespace is removed from each end of each line.
    """
    with open(fname, "r") as hf:
        contents = hf.readlines()
    contents = [x.strip() for x in contents]
    return contents
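
# Example: list_from_file on a file containing "a\nb\n" returns ["a", "b"].
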
def get_partition_list(
    partition,
    dataset="mobile",
    full_path=False,
    partitioning_version="firstpass",
    root_data_dir=ROOT_DATA_DIR,
    sharded=False,
):
    """
    Get a list of transects in a single partition.

    Parameters
    ----------
    partition : str
        Name of the partition to load.
    dataset : str, optional
        Name of dataset. Default is `"mobile"`.
    full_path : bool, optional
        Whether to return the full path to the sample. If `False`, only
        the relative path (from the dataset directory) is returned.
        Default is `False`.
    partitioning_version : str, optional
        Name of partitioning method. Default is `"firstpass"`.
    root_data_dir : str, optional
        Path to root directory where data is located.
    sharded : bool, optional
        Whether to return path to sharded version of data. Default is
        `False`.

    Returns
    -------
    list
        Path for each sample in the partition.
    """
    if dataset == "mobile":
        df = get_partition_data(
            partition,
            dataset=dataset,
            partitioning_version=partitioning_version,
            root_data_dir=root_data_dir,
        )
        fnames = df["Filename"]
        fnames = [os.path.join(f.split("_")[0], f.strip()) for f in fnames]
    else:
        partition_file = os.path.join(
            root_data_dir,
            dataset,
            "sets",
            partitioning_version,
            partition + ".txt",
        )
        fnames = list_from_file(partition_file)
    fnames = [f.replace("_Sv_raw.csv", "") for f in fnames]
    if full_path and sharded:
        root_data_dir = remove_trailing_slash(root_data_dir)
        fnames = [os.path.join(root_data_dir + "_sharded", dataset, f) for f in fnames]
    elif full_path:
        fnames = [os.path.join(root_data_dir, dataset, f) for f in fnames]
    return fnames
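
# Example usage for get_partition_list (a sketch; "train" is a hypothetical
# partition name):
#
#     fnames = get_partition_list(
#         "train", dataset="mobile", full_path=True, sharded=True
#     )
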