Source code for echofilter.raw.loader

"""
Input/Output handling for raw Echoview files.
"""

# This file is part of Echofilter.
#
# Copyright (C) 2020-2022  Scott C. Lowe and Offshore Energy Research Association (OERA)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import csv
import datetime
import os
import textwrap
import warnings
from collections import OrderedDict

import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.ndimage
import skimage.measure

from ..ui import style
from . import utils

ROOT_DATA_DIR = "/data/dsforce/surveyExports"

TRANSECT_FIELD_TYPES = {
    "Ping_index": int,
    "Distance_gps": float,
    "Distance_vl": float,
    "Ping_date": str,
    "Ping_time": str,
    "Ping_milliseconds": float,
    "Latitude": float,
    "Longitude": float,
    "Depth_start": float,
    "Depth_stop": float,
    "Range_start": float,
    "Range_stop": float,
    "Sample_count": int,
}


def transect_reader(fname):
    """
    Create a generator which iterates through a survey csv file.

    Parameters
    ----------
    fname : str
        Path to survey CSV file.

    Returns
    -------
    generator
        Yields a tuple of `(metadata, data)`, where metadata is a dict,
        and data is a :class:`numpy.ndarray`. Each yield corresponds to
        a single row in the data. Every row (except for the header) is
        yielded.
    """
    metadata_header = []
    with open(fname, "rb") as hf:
        for i_row, row in enumerate(hf):
            try:
                row = row.decode("utf-8-sig" if i_row == 0 else "utf-8")
            except Exception:
                if i_row == 0:
                    raise
                print(
                    "Row {} of {} contained a byte which is not in UTF-8"
                    " and will be skipped.".format(i_row, fname)
                )
                continue
            row = row.split(",")
            row = [entry.strip() for entry in row]
            if i_row == 0:
                metadata_header = row
                continue
            metadata = row[: len(metadata_header)]
            metadata_d = OrderedDict()
            for k, v in zip(metadata_header, metadata):
                if k in TRANSECT_FIELD_TYPES:
                    metadata_d[k] = TRANSECT_FIELD_TYPES[k](v)
                else:
                    metadata_d[k] = v
            data = np.array([float(x) for x in row[len(metadata_header) :]])
            yield metadata_d, data


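# Example usage (illustrative sketch, not part of the original module; the
# path "transect_Sv_raw.csv" is a hypothetical Echoview CSV export):
#
#     for metadata, data in transect_reader("transect_Sv_raw.csv"):
#         print(metadata["Ping_date"], metadata["Ping_time"], len(data))
#         break  # inspect only the first ping

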
def count_lines(filename):
    """
    Count the number of lines in a file.

    Parameters
    ----------
    filename : str
        Path to file.

    Returns
    -------
    int
        Number of lines in file.
    """
    # Initialise the counter so an empty file yields 0 instead of an error
    _i = -1
    with open(filename, "rb") as f:
        for _i, _ in enumerate(f):
            pass
    return _i + 1


def transect_loader(
    fname,
    skip_lines=0,
    warn_row_overflow=None,
    row_len_selector="mode",
):
    """
    Load an entire survey transect CSV.

    Parameters
    ----------
    fname : str
        Path to survey CSV file.
    skip_lines : int, optional
        Number of initial entries to skip. Default is 0.
    warn_row_overflow : bool or int, optional
        Whether to print a warning message if the number of elements in a
        row exceeds the expected number. If this is an int, this is the
        number of times to display the warnings before they are suppressed.
        If this is ``True``, the number of outputs is unlimited. If ``None``,
        the maximum number of underflow and overflow warnings differ: if
        ``row_len_selector`` is ``"init"`` or ``"min"``, underflow always
        produces a message and the overflow messages stop at 2; otherwise
        the values are reversed. Default is ``None``.
    row_len_selector : {"init", "min", "max", "median", "mode"}, optional
        The method used to determine which row length (number of depth
        samples) to use. Default is ``"mode"``, the most common row length
        across all the measurement timepoints.

    Returns
    -------
    numpy.ndarray
        Timestamps for each row, in seconds. Note: not corrected for
        timezone (so make sure your timezones are internally consistent).
    numpy.ndarray
        Depth of each column, in metres.
    numpy.ndarray
        Survey signal (Sv, for instance). Units match that of the file.
    """
    row_len_selector = row_len_selector.lower()
    if row_len_selector in {"init", "min"}:
        expand_for_overflow = False
    else:
        expand_for_overflow = True

    if warn_row_overflow is True:
        warn_row_overflow = np.inf
    if warn_row_overflow is not None:
        warn_row_underflow = warn_row_overflow
    elif expand_for_overflow:
        warn_row_underflow = 2
        warn_row_overflow = np.inf
    else:
        warn_row_underflow = np.inf
        warn_row_overflow = 2

    # We remove one from the line count because of the header,
    # which is excluded from output
    n_lines = count_lines(fname) - 1

    # Initialise output array
    for i_line, (_, row) in enumerate(transect_reader(fname)):
        if i_line < min(n_lines, max(1, skip_lines)):
            continue
        n_depths_init = len(row)
        break

    n_depth_exp = n_depths_init
    data = np.empty((n_lines - skip_lines, n_depth_exp))
    data[:] = np.nan
    timestamps = np.empty((n_lines - skip_lines))
    timestamps[:] = np.nan
    row_lengths = np.empty((n_lines - skip_lines), dtype=int)
    row_depth_starts = np.empty((n_lines - skip_lines))
    row_depth_ends = np.empty((n_lines - skip_lines))

    n_warn_overflow = 0
    n_warn_underflow = 0

    n_entry = 0
    for i_line, (meta, row) in enumerate(transect_reader(fname)):
        if i_line < skip_lines:
            continue
        i_entry = i_line - skip_lines

        # Track the range of depths used in the row with this length
        row_lengths[i_entry] = len(row)
        row_depth_starts[i_entry] = meta["Depth_start"]
        row_depth_ends[i_entry] = meta["Depth_stop"]

        if len(row) > n_depth_exp:
            if n_warn_overflow < warn_row_overflow:
                print(
                    "Row {} of {} exceeds expected n_depth of {} with {}".format(
                        i_line, fname, n_depth_exp, len(row)
                    )
                )
                n_warn_overflow += 1
            if expand_for_overflow:
                data = np.pad(
                    data,
                    ((0, 0), (0, len(row) - n_depth_exp)),
                    mode="constant",
                    constant_values=np.nan,
                )
                n_depth_exp = len(row)

        if len(row) < n_depth_exp:
            if n_warn_underflow < warn_row_underflow:
                print(
                    "Row {} of {} shorter than expected n_depth_exp of {} with {}".format(
                        i_line, fname, n_depth_exp, len(row)
                    )
                )
                n_warn_underflow += 1
            data[i_entry, : len(row)] = row
        else:
            data[i_entry, :] = row[:n_depth_exp]

        timestamps[i_entry] = datetime.datetime.strptime(
            "{}T{}.{:06d}".format(
                meta["Ping_date"],
                meta["Ping_time"],
                int(1000 * float(meta["Ping_milliseconds"])),
            ),
            "%Y-%m-%dT%H:%M:%S.%f",
        ).timestamp()
        n_entry += 1

    # Turn NaNs into NaNs (instead of extremely negative number)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "invalid value encountered in less")
        warnings.filterwarnings("ignore", "invalid value encountered in greater")
        # 9.9e+37 and -9.9e+37 are special values indicating missing data
        # https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/Special_Export_Values.htm
        data[data < -1e37] = np.nan
        data[data > 1e37] = np.nan

    # Trim timestamps dimension down to size
    timestamps = timestamps[:n_entry]
    data = data[:n_entry]
    row_lengths = row_lengths[:n_entry]
    row_depth_starts = row_depth_starts[:n_entry]
    row_depth_ends = row_depth_ends[:n_entry]

    # Work out what row length we should return
    if row_len_selector == "init":
        n_depth_use = n_depths_init
    elif row_len_selector == "min":
        n_depth_use = np.min(row_lengths)
    elif row_len_selector == "max":
        n_depth_use = np.max(row_lengths)
    elif row_len_selector == "median":
        n_depth_use = np.median(row_lengths)
        # If the median is half-way between two values, round up
        if n_depth_use not in row_lengths:
            n_depth_use = int(np.round(n_depth_use))
        # If the median is still not an observed value, drop the last entry
        # to make the number of rows odd, guaranteeing the median is an
        # observed value, not an intermediary.
        if n_depth_use not in row_lengths:
            n_depth_use = np.median(row_lengths[:-1])
    elif row_len_selector == "mode":
        n_depth_use = utils.mode(row_lengths)
    else:
        raise ValueError(
            "Unsupported row_len_selector value: {}".format(row_len_selector)
        )

    # Use depths corresponding to that declared in the rows which had the
    # number of entries used.
    if row_len_selector == "median":
        d_start = np.median(row_depth_starts[row_lengths == n_depth_use])
        d_stop = np.median(row_depth_ends[row_lengths == n_depth_use])
    else:
        d_start = utils.mode(row_depth_starts[row_lengths == n_depth_use])
        d_stop = utils.mode(row_depth_ends[row_lengths == n_depth_use])
    depths = np.linspace(d_start, d_stop, n_depth_use)

    # Interpolate depths to get a consistent sampling grid
    interp_kwargs = dict(nan_threshold=0.3, assume_sorted=True)
    for i_entry, (nd, d0, d1) in enumerate(
        zip(row_lengths, row_depth_starts, row_depth_ends)
    ):
        if d0 < d1:
            data[i_entry, :n_depth_use] = utils.interp1d_preserve_nan(
                np.linspace(d0, d1, nd),
                data[i_entry, :nd],
                depths,
                **interp_kwargs,
            )
        else:
            data[i_entry, :n_depth_use] = utils.interp1d_preserve_nan(
                np.linspace(d1, d0, nd),
                data[i_entry, :nd][::-1],
                depths,
                **interp_kwargs,
            )

    # Crop the data down to size
    data = data[:, :n_depth_use]

    return timestamps, depths, data


def evl_reader(fname):
    """
    EVL file reader.

    Parameters
    ----------
    fname : str
        Path to .evl file.

    Returns
    -------
    generator
        A generator which yields the timestamp (in seconds), depth (in
        metres), and status (int) for each entry. Note that the timestamp
        is not corrected for timezone (so make sure your timezones are
        internally consistent).
    """
    with open(fname, "r") as hf:
        continuance = True
        for i_row, row in enumerate(csv.reader(hf, delimiter=" ")):
            if i_row == 0:
                continue
            if len(row) < 4:
                if not continuance:
                    raise ValueError("Trying to skip data after parsing began")
                continue
            continuance = False

            timestamp = evdtstr2timestamp(row[0], row[1])

            if len(row[2]) > 0:
                raise ValueError("row[2] was non-empty: {}".format(row[2]))

            yield timestamp, float(row[3]), int(row[4])


def evl_loader(fname, special_to_nan=True, return_status=False):
    """
    EVL file loader.

    Parameters
    ----------
    fname : str
        Path to .evl file.
    special_to_nan : bool, optional
        Whether to replace the special value, ``-10000.99``, which indicates
        no depth value, with NaN.
        https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/Special_Export_Values.htm
    return_status : bool, optional
        Whether to also return the status codes. Default is ``False``.

    Returns
    -------
    numpy.ndarray of floats
        Timestamps, in seconds.
    numpy.ndarray of floats
        Depth, in metres.
    numpy.ndarray of ints, optional
        Status codes.
    """
    timestamps = []
    values = []
    statuses = []
    for timestamp, value, status in evl_reader(fname):
        timestamps.append(timestamp)
        values.append(value)
        statuses.append(status)
    timestamps = np.array(timestamps)
    values = np.array(values)
    statuses = np.array(statuses)
    if special_to_nan:
        # Replace the special value -10000.99 with NaN
        # https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/Special_Export_Values.htm
        values[np.isclose(values, -10000.99)] = np.nan
    if return_status:
        return timestamps, values, statuses
    return timestamps, values


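# Example usage (illustrative sketch; "bottom.evl" is a hypothetical line
# export). Special no-data values are converted to NaN by default.
#
#     t_bot, d_bot = evl_loader("bottom.evl")
#     t_bot, d_bot, status = evl_loader("bottom.evl", return_status=True)

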
def timestamp2evdtstr(timestamp):
    """
    Convert a timestamp into an Echoview-compatible datetime string.

    The output is in the format "CCYYMMDD HHmmSSssss", where:

    | CC: century
    | YY: year
    | MM: month
    | DD: day
    | HH: hour
    | mm: minute
    | SS: second
    | ssss: 0.1 milliseconds

    Parameters
    ----------
    timestamp : float
        Number of seconds since Unix epoch.

    Returns
    -------
    datetimestring : str
        Datetime string in the Echoview-compatible format
        "CCYYMMDD HHmmSSssss".
    """
    # Datetime must be in the format CCYYMMDD HHmmSSssss
    # where ssss = 0.1 milliseconds.
    # We have to manually determine the number of "0.1 milliseconds"
    # from the microsecond component.
    dt = datetime.datetime.fromtimestamp(timestamp)
    return "{}{:04d}".format(
        dt.strftime("%Y%m%d %H%M%S"), round(dt.microsecond / 100)
    )


def evdtstr2timestamp(datestr, timestr=None):
    """
    Convert an Echoview-compatible datetime string into a Unix epoch timestamp.

    Parameters
    ----------
    datestr : str
        Datetime string in the Echoview-compatible format
        ``"CCYYMMDD HHmmSSssss"``, or (if timestr is also provided) just the
        date part, ``"CCYYMMDD"``.
    timestr : str, optional
        Time string in the Echoview-compatible format "HHmmSSssss".

    Returns
    -------
    timestamp : float
        Number of seconds since Unix epoch.
    """
    if timestr:
        datestr = datestr + " " + timestr
    return datetime.datetime.strptime(datestr, "%Y%m%d %H%M%S%f").timestamp()


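# Round-trip example (illustrative sketch): timestamp2evdtstr and
# evdtstr2timestamp invert each other to within the 0.1 ms resolution of the
# Echoview datetime format (both interpret the datetime in local time).
#
#     ts = 1577882096.5
#     dtstr = timestamp2evdtstr(ts)
#     abs(evdtstr2timestamp(dtstr) - ts) < 1e-3  # True

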
def evr_reader(fname, parse_echofilter_regions=True):
    """
    Echoview region file (EVR) reader.

    Parameters
    ----------
    fname : str
        Path to .evr file.
    parse_echofilter_regions : bool, default=True
        Whether to separate out echofilter generated regions (passive,
        removed vbands, and removed patches) from other regions.

    Returns
    -------
    regions_passive : list of tuples, optional
        Start and end timestamps for passive regions.
    regions_removed : list of tuples, optional
        Start and end timestamps for removed vertical bands.
    regions_patch : list of lists, optional
        Start and end timestamps for bad data patches.
    regions_other : list of dicts
        Dictionary mapping creation type to points defining each region.
    """
    regions_passive = []
    regions_removed = []
    regions_patch = []
    regions_other = []

    with open(fname, "r") as hf:
        # Line 1: EVRG file version header
        line = hf.readline()
        if "EVRG" not in line:
            raise EnvironmentError("This is not an EVR/EVRG file")
        # Line 2: Number of regions
        line = hf.readline().strip("\n\r")
        n_regions = int(line)

        for _ in range(n_regions):
            # Line 3: Intentionally left blank
            line = hf.readline().strip("\n\r")
            if len(line):
                print(
                    "Badly formatted EVRG file."
                    " Separating line is not blank: {}".format(line)
                )
            # Individual region
            region_is_passive = False
            region_is_removed = False
            region_is_patch = False
            # Region header
            line = hf.readline().strip("\n\r")
            line = line.replace("  ", " ")
            lparts = line.split(" ")
            if lparts[0] != "13":
                raise EnvironmentError(
                    "Only Region structure version 13 is supported"
                    " ({} given).".format(lparts[0])
                )
            n_points = int(lparts[1])
            _region_id = int(lparts[2])  # noqa: F841
            # Selected indicator: lparts[3] == 0
            region_ctype = int(lparts[4])
            # Dummy: lparts[5] == -1
            _has_bbox = bool(int(lparts[6]))  # noqa: F841
            _left = evdtstr2timestamp(lparts[7], lparts[8])  # noqa: F841
            _top = float(lparts[9])  # noqa: F841
            _right = evdtstr2timestamp(lparts[10], lparts[11])  # noqa: F841
            _bottom = float(lparts[12])  # noqa: F841
            # Notes
            line = hf.readline().strip("\n\r")
            n_notes = int(line)
            notes = "\n".join(hf.readline() for _ in range(n_notes))
            if notes.startswith("Passive data"):
                region_is_passive = True
            if notes.startswith("Removed data block"):
                region_is_removed = True
            if notes.startswith("Removed patch"):
                region_is_patch = True
            # Detection settings
            line = hf.readline().strip("\n\r")
            n_detset = int(line)
            for _ in range(n_detset):
                hf.readline()
            # Region classification
            _region_classification = hf.readline().strip("\n\r")  # noqa: F841
            # Points
            points = hf.readline().strip().replace("  ", " ").split(" ")
            _region_status = int(points.pop())  # noqa: F841
            if len(points) % 3 != 0:
                print("Points not composed correctly")
            if len(points) // 3 != n_points:
                print(
                    "Different number of points to expected: {} vs {}".format(
                        len(points), n_points
                    )
                )
            points = [
                (
                    evdtstr2timestamp(points[i * 3], points[i * 3 + 1]),
                    float(points[i * 3 + 2]),
                )
                for i in range(n_points)
            ]
            _region_name = hf.readline().strip("\n\r")  # noqa: F841

            # Add region to list
            if not parse_echofilter_regions:
                regions_other.append({region_ctype: points})
                continue
            if region_is_passive:
                if region_ctype != 4:
                    print(
                        "Warning: Region creation type is {} not 4".format(
                            region_ctype
                        )
                    )
                regions_passive.append((points[0][0], points[-1][0]))
            elif region_is_removed:
                if region_ctype != 4:
                    print(
                        "Warning: Region creation type is {} not 4".format(
                            region_ctype
                        )
                    )
                regions_removed.append((points[0][0], points[-1][0]))
            elif region_is_patch:
                if region_ctype != 2:
                    print(
                        "Warning: Region creation type is {} not 2".format(
                            region_ctype
                        )
                    )
                regions_patch.append(points)
            else:
                regions_other.append({region_ctype: points})

    if not parse_echofilter_regions:
        return regions_other
    return regions_passive, regions_removed, regions_patch, regions_other


def regions2mask(
    timestamps,
    depths,
    regions_passive=None,
    regions_removed=None,
    regions_patch=None,
    regions_other=None,
):
    """
    Convert regions to mask.

    Takes the output from :func:`evr_reader` and returns a set of masks.

    Parameters
    ----------
    timestamps : array_like
        Timestamps for each node in the line.
    depths : array_like
        Depths (in meters) for each node in the line.
    regions_passive : list of tuples, optional
        Start and end timestamps for passive regions.
    regions_removed : list of tuples, optional
        Start and end timestamps for removed vertical bands.
    regions_patch : list of lists, optional
        Start and end timestamps for bad data patches.
    regions_other : list of dicts
        Dictionary mapping creation type to points defining each region.

    Returns
    -------
    transect : dict
        A dictionary with keys:

        - "is_passive" : numpy.ndarray
            Logical array showing whether a timepoint is of passive data.
            Shaped ``(num_timestamps, )``. All passive recording data should
            be excluded by the mask.
        - "is_removed" : numpy.ndarray
            Logical array showing whether a timepoint is entirely removed
            by the mask. Shaped ``(num_timestamps, )``.
        - "mask_patches" : numpy.ndarray
            Logical array indicating which datapoints are inside a patch
            from regions_patch (``True``) and should be excluded by the
            mask. Shaped ``(num_timestamps, num_depths)``.
        - "mask" : numpy.ndarray
            Logical array indicating which datapoints should be kept
            (``True``) and which are marked as removed (``False``) by one
            of the other three outputs.
            Shaped ``(num_timestamps, num_depths)``.
    """
    if regions_other is not None:
        raise NotImplementedError("Other regions are not yet supported.")

    is_passive = np.zeros(timestamps.shape, dtype=bool)
    is_removed = np.zeros(timestamps.shape, dtype=bool)
    for ts_start, ts_end in regions_passive:
        is_passive[(ts_start <= timestamps) & (timestamps <= ts_end)] = 1
    for ts_start, ts_end in regions_removed:
        is_removed[(ts_start <= timestamps) & (timestamps <= ts_end)] = 1

    # Create an empty image to store the masked array
    mask_patches = np.zeros((len(timestamps), len(depths)), dtype=bool)
    dt = np.abs(timestamps[-1] - timestamps[-2])
    dd = np.abs(depths[-1] - depths[-2])
    for patch in regions_patch:
        patch = np.asarray(patch)
        # Find closest indices to the contour coordinates given
        x = np.searchsorted(timestamps + dt / 2, patch[:, 0], side="left")
        y = np.searchsorted(depths + dd / 2, patch[:, 1], side="left")
        # Make a mask for this patch
        mask_i = np.zeros_like(mask_patches, dtype="bool")
        # Create a contour image by using the contour coordinates rounded
        # to their nearest integer value
        mask_i[x, y] = 1
        # Fill in the hole created by the contour boundary
        mask_i = scipy.ndimage.binary_fill_holes(mask_i)
        # Add this patch to the overall mask for all patches
        mask_patches |= mask_i

    # mask_patches shows where bad patches are; mask shows where good data is
    mask = ~mask_patches
    mask[is_passive | is_removed] = 0

    transect = {
        "is_passive": is_passive,
        "is_removed": is_removed,
        "mask": mask,
        "mask_patches": mask_patches,
    }
    return transect


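# Example usage (illustrative sketch; "regions.evr" is a hypothetical region
# export, and timestamps/depths are assumed to come from transect_loader).
# This chains evr_reader with regions2mask to build boolean masks on the
# timestamps-by-depths grid.
#
#     passive, removed, patches, _ = evr_reader("regions.evr")
#     masks = regions2mask(
#         timestamps,
#         depths,
#         regions_passive=passive,
#         regions_removed=removed,
#         regions_patch=patches,
#     )
#     keep = masks["mask"]  # True where data should be kept

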
def evl_writer(fname, timestamps, depths, status=1, line_ending="\r\n", pad=False):
    r"""
    EVL file writer.

    Parameters
    ----------
    fname : str
        Destination of output file.
    timestamps : array_like
        Timestamps for each node in the line.
    depths : array_like
        Depths (in meters) for each node in the line.
    status : 0, 1, 2, or 3; optional
        Status for the line.

        - ``0`` : none
        - ``1`` : unverified
        - ``2`` : bad
        - ``3`` : good

        Default is ``1`` (unverified). For more details on line status, see
        https://support.echoview.com/WebHelp/Using_Echoview/Echogram/Lines/About_Line_Status.htm
    pad : bool, optional
        Whether to pad the line with an extra datapoint half a pixel before
        the first and after the last given timestamp. Default is ``False``.
    line_ending : str, optional
        Line ending. Default is ``"\r\n"``, the standard line ending on
        Windows/DOS, as per the specification for the file format.
        https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm
        Set to ``"\n"`` to get Unix-style line endings instead.

    Notes
    -----
    For more details on the format specification, see
    https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm#Line_definition_file_format
    """
    if len(timestamps) != len(depths):
        raise ValueError(
            "Number of timestamps ({}) and depths ({}) are not equal".format(
                len(timestamps), len(depths)
            )
        )
    if pad and len(timestamps) > 1:
        timestamps = timestamps[:]
        timestamps = np.r_[
            timestamps[0] - (timestamps[1] - timestamps[0]) / 2,
            timestamps,
            timestamps[-1] + (timestamps[-1] - timestamps[-2]) / 2,
        ]
        depths = np.r_[depths[0], depths, depths[-1]]
    # The file object will automatically replace \n with our chosen line ending
    with open(fname, "w+", encoding="utf-8-sig", newline=line_ending) as hf:
        # Write header
        hf.write("EVBD 3 10.0.270.37090\n")
        n_row = len(depths)
        hf.write(str(n_row) + "\n")
        # Write each row
        for timestamp, depth in zip(timestamps, depths):
            # Datetime must be in the format CCYYMMDD HHmmSSssss
            # where ssss = 0.1 milliseconds.
            # We have to manually determine the number of "0.1 milliseconds"
            # from the microsecond component.
            hf.write(
                "{} {} {} \n".format(timestamp2evdtstr(timestamp), depth, status)
            )


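# Example usage (illustrative sketch; "line.evl" is a hypothetical output
# path, and the timestamp/depth arrays are assumed to exist). Writes a depth
# line with one node per timestamp, marked as "good" status.
#
#     evl_writer("line.evl", timestamps, line_depths, status=3)

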
def evr_writer(
    fname,
    rectangles=None,
    contours=None,
    common_notes="",
    default_region_type=0,
    line_ending="\r\n",
):
    r"""
    EVR file writer.

    Writes regions to an Echoview region file.

    Parameters
    ----------
    fname : str
        Destination of output file.
    rectangles : list of dictionaries, optional
        Rectangle region definitions. Default is an empty list. Each
        rectangle region must implement fields ``"depths"`` and
        ``"timestamps"``, which indicate the extent of the rectangle.
        Optionally, ``"creation_type"``, ``"region_name"``, ``"region_type"``,
        and ``"notes"`` may be set. If these are not given, the default
        creation_type is 4 and region_type is set by ``default_region_type``.
    contours : list of dictionaries, optional
        Contour region definitions. Default is an empty list. Each contour
        region must implement a ``"points"`` field containing a
        :class:`numpy.ndarray` shaped `(n, 2)` defining the co-ordinates of
        nodes along the (open) contour in units of timestamp and depth.
        Optionally, ``"creation_type"``, ``"region_name"``, ``"region_type"``,
        and ``"notes"`` may be set. If these are not given, the default
        creation_type is 2 and region_type is set by ``default_region_type``.
    common_notes : str, optional
        Notes to include for every region. Default is ``""``, an empty string.
    default_region_type : int, optional
        The region type to use for rectangles and contours which do not
        define a ``"region_type"`` field. Possible region types are

        - ``0`` : bad (no data)
        - ``1`` : analysis
        - ``2`` : marker
        - ``3`` : fishtracks
        - ``4`` : bad (empty water)

        Default is ``0``.
    line_ending : str, optional
        Line ending. Default is ``"\r\n"``, the standard line ending on
        Windows/DOS, as per the specification for the file format.
        https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm
        Set to ``"\n"`` to get Unix-style line endings instead.

    Notes
    -----
    For more details on the format specification, see:
    https://support.echoview.com/WebHelp/Reference/File_formats/Export_file_formats/2D_Region_definition_file_format.htm
    """
    if rectangles is None:
        rectangles = []
    if contours is None:
        contours = []
    # Remove leading/trailing new lines, since we will join with our own
    # line ending
    common_notes = common_notes.strip("\r\n")
    # Standardize line endings to be \n, regardless of input
    common_notes = common_notes.replace("\r\n", "\n").replace("\r", "\n")
    if len(common_notes) == 0:
        n_lines_common_notes = 0
    else:
        # Count lines using "\n", since the notes were standardized above
        n_lines_common_notes = 1 + common_notes.count("\n")
    n_regions = len(rectangles) + len(contours)

    i_region = 0
    # The file object will automatically replace \n with our chosen line ending
    with open(fname, "w+", encoding="utf-8-sig", newline=line_ending) as hf:
        # Write header
        hf.write("EVRG 7 10.0.283.37689\n")
        hf.write(str(n_regions) + "\n")

        # Write each rectangle
        for region in rectangles:
            # Regions are indexed from 1, so increment the counter first
            i_region += 1
            hf.write("\n")  # Blank line separates regions
            # Determine extent of rectangle
            left = timestamp2evdtstr(np.min(region["timestamps"]))
            right = timestamp2evdtstr(np.max(region["timestamps"]))
            top = np.min(region["depths"])
            bottom = np.max(region["depths"])
            # Region header
            hf.write(
                "13 4 {i} 0 {type} -1 1 {left} {top} {right} {bottom}".format(
                    i=i_region,
                    type=region.get("creation_type", 4),
                    left=left,
                    right=right,
                    top=top,
                    bottom=bottom,
                )
                + "\n"
            )
            # Notes
            notes = region.get("notes", "")
            if len(notes) == 0:
                notes = common_notes
                n_lines_notes = n_lines_common_notes
            else:
                notes = notes.strip("\n")
                if len(common_notes) > 0:
                    notes += "\n" + common_notes
                n_lines_notes = 1 + notes.count("\n")
            hf.write(str(n_lines_notes) + "\n")  # Number of lines of notes
            if len(notes) > 0:
                hf.write(notes + "\n")
            # Detection settings
            hf.write("0\n")  # Number of lines of detection settings
            # Region classification string
            hf.write("Unclassified regions\n")
            # The points defining the region itself
            hf.write(
                "{left} {top} {left} {bottom} {right} {bottom} {right} {top} ".format(
                    left=left,
                    right=right,
                    top=top,
                    bottom=bottom,
                )  # Terminates with a space, not a new line
            )
            # Region type
            hf.write(str(region.get("region_type", default_region_type)) + "\n")
            # Region name
            hf.write(
                str(region.get("region_name", "Region {}".format(i_region))) + "\n"
            )

        # Write each contour
        for region in contours:
            # Regions are indexed from 1, so increment the counter first
            i_region += 1
            hf.write("\n")  # Blank line separates regions
            # Header line
            hf.write(
                "13 {n} {i} 0 {type} -1 1 {left} {top} {right} {bottom}".format(
                    n=region["points"].shape[0],
                    i=i_region,
                    type=region.get("creation_type", 2),
                    left=timestamp2evdtstr(np.min(region["points"][:, 0])),
                    right=timestamp2evdtstr(np.max(region["points"][:, 0])),
                    top=np.min(region["points"][:, 1]),
                    bottom=np.max(region["points"][:, 1]),
                )
                + "\n"
            )
            # Notes
            notes = region.get("notes", "")
            if len(notes) == 0:
                notes = common_notes
                n_lines_notes = n_lines_common_notes
            else:
                notes = notes.strip("\n")
                if len(common_notes) > 0:
                    notes += "\n" + common_notes
                n_lines_notes = 1 + notes.count("\n")
            hf.write(str(n_lines_notes) + "\n")  # Number of lines of notes
            if len(notes) > 0:
                hf.write(notes + "\n")
            # Detection settings
            hf.write("0\n")  # Number of lines of detection settings
            # Region classification string
            hf.write("Unclassified regions\n")
            # The region itself
            for point in region["points"]:
                hf.write("{} {} ".format(timestamp2evdtstr(point[0]), point[1]))
            # Region type
            hf.write(str(region.get("region_type", default_region_type)) + "\n")
            # Region name
            hf.write(
                str(region.get("region_name", "Region {}".format(i_region))) + "\n"
            )


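# Example usage (illustrative sketch; "regions.evr" and the region contents
# are hypothetical). A rectangle region only needs its timestamp and depth
# extents; a contour region needs an (n, 2) array of points instead.
#
#     rect = {
#         "timestamps": np.array([ts_start, ts_end]),
#         "depths": np.array([2.0, 15.0]),
#         "region_name": "Example rectangle",
#     }
#     evr_writer("regions.evr", rectangles=[rect], common_notes="Example notes")

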
def write_transect_regions(
    fname,
    transect,
    depth_range=None,
    passive_key="is_passive",
    removed_key="is_removed",
    patches_key="mask_patches",
    collate_passive_length=0,
    collate_removed_length=0,
    minimum_passive_length=0,
    minimum_removed_length=0,
    minimum_patch_area=0,
    name_suffix="",
    common_notes="",
    line_ending="\r\n",
    verbose=0,
    verbose_indent=0,
):
    r"""
    Convert a transect dictionary to a set of regions and write as an EVR file.

    Parameters
    ----------
    fname : str
        Destination of output file.
    transect : dict
        Transect dictionary.
    depth_range : array_like or None, optional
        The minimum and maximum depth extents (in any order) of the passive
        and removed block regions. If this is ``None`` (default), the minimum
        and maximum of ``transect["depths"]`` is used.
    passive_key : str, optional
        Field name to use for passive data identification. Default is
        ``"is_passive"``.
    removed_key : str, optional
        Field name to use for removed blocks. Default is ``"is_removed"``.
    patches_key : str, optional
        Field name to use for the mask of patch regions. Default is
        ``"mask_patches"``.
    collate_passive_length : int, optional
        Maximum distance (in indices) over which passive regions should be
        merged together, closing small gaps between them. Default is ``0``.
    collate_removed_length : int, optional
        Maximum distance (in indices) over which removed blocks should be
        merged together, closing small gaps between them. Default is ``0``.
    minimum_passive_length : int, optional
        Minimum length (in indices) a passive region must have to be included
        in the output. Set to -1 to omit all passive regions from the output.
        Default is ``0``.
    minimum_removed_length : int, optional
        Minimum length (in indices) a removed block must have to be included
        in the output. Set to -1 to omit all removed regions from the output.
        Default is ``0``.
    minimum_patch_area : float, optional
        Minimum amount of area (in input pixel space) that a patch must
        occupy in order to be included in the output. Set to ``0`` to include
        all patches, no matter their area. Set to ``-1`` to omit all patches.
        Default is ``0``.
    name_suffix : str, optional
        Suffix to append to variable names. Default is ``""``, an empty
        string.
    common_notes : str, optional
        Notes to include for every region. Default is ``""``, an empty string.
    line_ending : str, optional
        Line ending. Default is ``"\r\n"``, the standard line ending on
        Windows/DOS, as per the specification for the file format,
        https://support.echoview.com/WebHelp/Using_Echoview/Exporting/Exporting_data/Exporting_line_data.htm
        Set to ``"\n"`` to get Unix-style line endings instead.
    verbose : int, optional
        Verbosity level. Default is ``0``.
    verbose_indent : int, optional
        Level of indentation (number of preceding spaces) before verbosity
        messages. Default is ``0``.
    """
    if depth_range is None:
        depth_range = transect["depths"]
    depth_range = [np.min(depth_range), np.max(depth_range)]
    rectangles = []

    # Regions around each period of passive data
    key = passive_key
    if key not in transect:
        key = "p_" + key
    if key not in transect:
        raise ValueError(
            "Key {} and {} not found in transect.".format(key[2:], key)
        )
    is_passive = transect[key] > 0.5
    is_passive = ~utils.squash_gaps(~is_passive, collate_passive_length)
    passive_starts, passive_ends = utils.get_indicator_onoffsets(is_passive)
    i_passive = 1
    n_passive_skipped = 0
    for start_index, end_index in zip(passive_starts, passive_ends):
        start_index -= 0.5
        end_index += 0.5
        if minimum_passive_length == -1:
            # No passive regions
            break
        if end_index - start_index <= minimum_passive_length:
            n_passive_skipped += 1
            continue
        region = {}
        region["region_name"] = "Passive{} {}".format(name_suffix, i_passive)
        region["creation_type"] = 4
        region["region_type"] = 0
        region["depths"] = depth_range
        region["timestamps"] = scipy.interpolate.interp1d(
            np.arange(len(transect["timestamps"])),
            transect["timestamps"],
            fill_value="extrapolate",
        )([start_index, end_index])
        region["notes"] = textwrap.dedent(
            """
            Passive data
            Length in pixels: {}
            Duration in seconds: {}
            """.format(
                end_index - start_index,
                region["timestamps"][1] - region["timestamps"][0],
            )
        )
        rectangles.append(region)
        i_passive += 1

    # Regions around each period of removed data
    key = removed_key
    if key not in transect:
        key = "p_" + key
    if key not in transect:
        raise ValueError(
            "Key {} and {} not found in transect.".format(key[2:], key)
        )
    is_removed = transect[key] > 0.5
    is_removed = ~utils.squash_gaps(~is_removed, collate_removed_length)
    removed_starts, removed_ends = utils.get_indicator_onoffsets(is_removed)
    i_removed = 1
    n_removed_skipped = 0
    for start_index, end_index in zip(removed_starts, removed_ends):
        start_index -= 0.5
        end_index += 0.5
        if minimum_removed_length == -1:
            # No removed regions
            break
        if end_index - start_index <= minimum_removed_length:
            n_removed_skipped += 1
            continue
        region = {}
        region["region_name"] = "Removed block{} {}".format(name_suffix, i_removed)
        region["creation_type"] = 4
        region["region_type"] = 0
        region["depths"] = depth_range
        region["timestamps"] = scipy.interpolate.interp1d(
            np.arange(len(transect["timestamps"])),
            transect["timestamps"],
            fill_value="extrapolate",
        )([start_index, end_index])
        region["notes"] = textwrap.dedent(
            """
            Removed data block
            Length in pixels: {}
            Duration in seconds: {}
            """.format(
                end_index - start_index,
                region["timestamps"][1] - region["timestamps"][0],
            )
        )
        rectangles.append(region)
        i_removed += 1

    # Contours around each removed patch
    if patches_key not in transect:
        raise ValueError("Key {} not found in transect.".format(patches_key))
    patches = transect[patches_key]
    patches = scipy.ndimage.binary_fill_holes(patches > 0.5)
    contours_coords = skimage.measure.find_contours(patches, 0.5)
    contour_dicts = []
    i_contour = 1
    n_contour_skipped = 0
    for contour in contours_coords:
        if minimum_patch_area == -1:
            # No patches
            break
        area = utils.integrate_area_of_contour(
            contour[:, 0], contour[:, 1], closed=False
        )
        if area < minimum_patch_area:
            n_contour_skipped += 1
            continue
        region = {}
        region["region_name"] = "Removed patch{} {}".format(name_suffix, i_contour)
        region["creation_type"] = 2
        region["region_type"] = 0
        x = scipy.interpolate.interp1d(
            np.arange(len(transect["timestamps"])),
            transect["timestamps"],
            fill_value="extrapolate",
        )(contour[:, 0])
        y = scipy.interpolate.interp1d(
            np.arange(len(transect["depths"])),
            transect["depths"],
            fill_value="extrapolate",
        )(contour[:, 1])
        region["points"] = np.stack([x, y], axis=-1)
        region["notes"] = textwrap.dedent(
            """
            Removed patch
            Area in pixels: {}
            Area in meter-seconds: {}
            """.format(
                area,
                utils.integrate_area_of_contour(x, y, closed=False),
            )
        )
        contour_dicts.append(region)
        i_contour += 1

    if verbose >= 1:
        print(
            " " * verbose_indent
            + "Outputting {} region{}:"
            " {} passive, {} removed blocks, {} removed patches".format(
                len(rectangles) + len(contour_dicts),
                "" if len(rectangles) + len(contour_dicts) == 1 else "s",
                i_passive - 1,
                i_removed - 1,
                i_contour - 1,
            )
        )
        n_skipped = n_passive_skipped + n_removed_skipped + n_contour_skipped
        if n_skipped > 0:
            print(
                " " * verbose_indent
                + style.skip_fmt(
                    "There {} {} skipped (too small) region{}:"
                    " {} passive, {} removed blocks, {} removed patches".format(
                        "was" if n_skipped == 1 else "were",
                        n_skipped,
                        "" if n_skipped == 1 else "s",
                        n_passive_skipped,
                        n_removed_skipped,
                        n_contour_skipped,
                    )
                )
            )

    # Write the output
    return evr_writer(
        fname,
        rectangles=rectangles,
        contours=contour_dicts,
        common_notes=common_notes,
        line_ending=line_ending,
    )


def load_transect_data(transect_pth, dataset="mobile", root_data_dir=ROOT_DATA_DIR):
    """
    Load all data for one transect.

    Parameters
    ----------
    transect_pth : str
        Relative path to transect, excluding ``"_Sv_raw.csv"``.
    dataset : str, optional
        Name of dataset. Default is ``"mobile"``.
    root_data_dir : str
        Path to root directory where data is located.

    Returns
    -------
    timestamps : numpy.ndarray
        Timestamps (in seconds since Unix epoch), with each entry
        corresponding to each row in the ``signals`` data.
    depths : numpy.ndarray
        Depths from the surface (in metres), with each entry corresponding
        to each column in the ``signals`` data.
    signals : numpy.ndarray
        Echogram Sv data, shaped (num_timestamps, num_depths).
    turbulence : numpy.ndarray
        Depth of turbulence line, shaped (num_timestamps, ).
    bottom : numpy.ndarray
        Depth of bottom line, shaped (num_timestamps, ).
    """
    dirname = os.path.join(root_data_dir, dataset)
    raw_fname = os.path.join(dirname, transect_pth + "_Sv_raw.csv")
    bottom_fname = os.path.join(dirname, transect_pth + "_bottom.evl")
    turbulence_fname = os.path.join(dirname, transect_pth + "_turbulence.evl")

    timestamps, depths, signals = transect_loader(raw_fname)
    t_bottom, d_bottom = evl_loader(bottom_fname)
    t_turbulence, d_turbulence = evl_loader(turbulence_fname)

    return (
        timestamps,
        depths,
        signals,
        np.interp(timestamps, t_turbulence, d_turbulence),
        np.interp(timestamps, t_bottom, d_bottom),
    )


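# Example usage (illustrative sketch; the transect path is hypothetical and
# must exist under root_data_dir/dataset with matching _Sv_raw.csv and .evl
# files). The turbulence and bottom lines are interpolated onto the Sv
# timestamps.
#
#     ts, depths, Sv, turbulence, bottom = load_transect_data(
#         "survey01/transect01", dataset="mobile"
#     )

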
def get_partition_data(
    partition,
    dataset="mobile",
    partitioning_version="firstpass",
    root_data_dir=ROOT_DATA_DIR,
):
    """
    Load partition metadata.

    Parameters
    ----------
    partition : str
        Name of the partition to load.
    dataset : str, optional
        Name of dataset. Default is ``"mobile"``.
    partitioning_version : str, optional
        Name of partitioning method.
    root_data_dir : str
        Path to root directory where data is located.

    Returns
    -------
    pandas.DataFrame
        Metadata for all transects in the partition. Each row is a single
        sample.
    """
    dirname = os.path.join(root_data_dir, dataset, "sets", partitioning_version)
    fname_partition = os.path.join(dirname, partition + ".txt")
    fname_header = os.path.join(dirname, "header.txt")

    with open(fname_header, "r") as hf:
        for row in csv.reader(hf):
            header = [entry.strip() for entry in row]
            break

    df = pd.read_csv(fname_partition, header=None, names=header)
    return df


def remove_trailing_slash(s):
    """
    Remove trailing forward slashes from a string.

    Parameters
    ----------
    s : str
        String representing a path, possibly with trailing slashes.

    Returns
    -------
    str
        Same as ``s``, but without trailing forward slashes.
    """
    while s[-1] == "/" or s[-1] == os.path.sep:
        s = s[:-1]
    return s


def list_from_file(fname):
    """
    Get a list from a file.

    Parameters
    ----------
    fname : str
        Path to file.

    Returns
    -------
    list
        Contents of the file, one line per entry in the list.
        Whitespace is stripped from both ends of each line.
    """
    with open(fname, "r") as hf:
        contents = hf.readlines()
    contents = [x.strip() for x in contents]
    return contents


def get_partition_list(
    partition,
    dataset="mobile",
    full_path=False,
    partitioning_version="firstpass",
    root_data_dir=ROOT_DATA_DIR,
    sharded=False,
):
    """
    Get a list of transects in a single partition.

    Parameters
    ----------
    partition : str
        Name of the partition to load.
    dataset : str, optional
        Name of dataset. Default is ``"mobile"``.
    full_path : bool, optional
        Whether to return the full path to the sample. If ``False``, only
        the relative path (from the dataset directory) is returned.
        Default is ``False``.
    partitioning_version : str, optional
        Name of partitioning method.
    root_data_dir : str, optional
        Path to root directory where data is located.
    sharded : bool, optional
        Whether to return path to sharded version of data. Default is
        ``False``.

    Returns
    -------
    list
        Path for each sample in the partition.
    """
    if dataset == "mobile":
        df = get_partition_data(
            partition,
            dataset=dataset,
            partitioning_version=partitioning_version,
            root_data_dir=root_data_dir,
        )
        fnames = df["Filename"]
        fnames = [os.path.join(f.split("_")[0], f.strip()) for f in fnames]
    else:
        partition_file = os.path.join(
            root_data_dir,
            dataset,
            "sets",
            partitioning_version,
            partition + ".txt",
        )
        fnames = list_from_file(partition_file)
        fnames = [f.replace("_Sv_raw.csv", "") for f in fnames]

    if full_path and sharded:
        root_data_dir = remove_trailing_slash(root_data_dir)
        fnames = [
            os.path.join(root_data_dir + "_sharded", dataset, f) for f in fnames
        ]
    elif full_path:
        fnames = [os.path.join(root_data_dir, dataset, f) for f in fnames]
    return fnames
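

# Example usage (illustrative sketch; the partition name "train" and the
# partitioning version are assumptions and must match files present under
# root_data_dir/<dataset>/sets/). Returns one path per transect in the
# partition.
#
#     fnames = get_partition_list(
#         "train", dataset="mobile", full_path=True, sharded=True
#     )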