Commit be21a955 authored by Oliver Kirsebom's avatar Oliver Kirsebom

Refactor: move config parsing into src/parsing, rename console scripts to boat-preprocess/boat-detect, bump ketos to 1.0.9 and package version to 0.0.4

parent 5df4f2aa
# Read more about scikit-learn outlier detection here:
# http://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection
import os
import numpy as np
import pandas as pd
import argparse
import json
from enum import Enum
from collections import namedtuple
import datetime
from pint import UnitRegistry
import math
from ketos.utils import detect_peaks, get_member
from ketos.data_handling.parsing import str2bool
import src.parsing as pa
from src.parsing import Detector
import matplotlib
viz = os.environ.get('DISABLE_VIZ')
if viz is not None:
if int(viz) == 1:
matplotlib.use('Agg')
import matplotlib.pyplot as plt
def extract_time_res(df):
time_res = math.inf
for i in range(1,len(df.index)):
t0 = pa.parse_time(df.index[i-1])
t1 = pa.parse_time(df.index[i])
delta = (t1 - t0).total_seconds()  # use the full interval, not just the microsecond component
if delta < time_res and delta > 0:
time_res = delta
return time_res
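# Illustrative sketch (hypothetical data, not from the original commit): for a
# frame indexed by time strings spaced 0.2 s apart, the smallest positive
# spacing found in the index is returned:
#
#   >>> df = pd.DataFrame({'x': [0, 1, 2]},
#   ...                   index=['2018-01-01 12:00:00.000',
#   ...                          '2018-01-01 12:00:00.200',
#   ...                          '2018-01-01 12:00:00.400'])
#   >>> extract_time_res(df)
#   0.2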
# map scikit-learn outlier labels (+1 inlier, -1 outlier) to 0/1
def zeros_and_ones(x):
x = ((-1)*x + 1) / 2
x = x.astype(int)
return x
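# Usage sketch: scikit-learn's outlier detectors label inliers +1 and outliers
# -1; zeros_and_ones maps this to 0 (normal) / 1 (outlier):
#
#   >>> zeros_and_ones(np.array([1, -1, 1, -1]))
#   array([0, 1, 0, 1])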
def parse_args():
# configure parser
parser = argparse.ArgumentParser(description='Perform outlier- and peak analysis of time-series data.')
parser.add_argument('-c', '--config_file', type=str, help='path to json configuration file.', default='settings.json')
parser.add_argument('-i', '--input_file', type=str, help='.csv file containing time-series data to be analyzed.', default='output.csv')
parser.add_argument('-o', '--output_file', type=str, help='.csv file where the analysis report will be written.', default='detections.csv')
parser.add_argument('-S', '--show_graph', action='store_true', help='Show time-series data')
# parse command-line args
args = parser.parse_args()
return args
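# Example invocation (illustrative), via the boat-detect console script
# defined in setup.py:
#
#   boat-detect -c settings.json -i output.csv -o detections.csv -S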
def main():
# parse command-line args
@@ -172,15 +65,21 @@ def main():
input_file = args.input_file
output_file = args.output_file
show_graph = args.show_graph
# time table
i = input_file.rfind('.')
time_table = input_file[:i] + '_time_table.csv'
# read input data into pandas dataframe
df0 = pd.read_csv(input_file)
assert df0.shape[0] >= 2, "Input data should have at least two rows of data"
# parse settings.json
_, _, detector_config_file = pa.parse_settings(config_file)
# read configuration file
data_series, detectors, configs, outlier_fraction, min_sep = pa.parse_detect_config(detector_config_file)
# extract relevant columns
data_series.append('time')
@@ -257,9 +156,9 @@ def main():
sep = 0
times = list()
for i in range(0,n):
tnow = pa.parse_time(df.index[i])
if i > 0:
tprev = pa.parse_time(df.index[i-1])
delta = (tnow - tprev).total_seconds()
sep += delta
if (s[i] > 0 and (sep > min_sep or i == 0)):
@@ -277,7 +176,7 @@ def main():
times = list()
for t in df_out['time']:
for _, row in df_tt.iterrows():
t0 = pa.parse_time(row['time'])
f = row['file']
dt = (t0 - t).total_seconds()
if (dt < 0):
@@ -289,9 +188,8 @@ def main():
# save detections file
df_out.to_csv(output_file)
print(' {0} boats detected'.format(len(times)))
print(' Detection report saved to: {0}'.format(output_file))
print('')
# plot
......
import datetime
import numpy as np
import pandas as pd
import os
import argparse
import time
from ketos.data_handling.parsing import WinFun
import src.parsing as pa
from ketos.audio_processing.audio import AudioSignal, TimeStampedAudioSignal
from ketos.audio_processing.spectrogram import MagSpectrogram
from ketos.data_handling.data_handling import AudioSequenceReader
batch_no = 0
def make_spec(signal, config):
hamming = False
if config.window_function == WinFun.HAMMING:
hamming = True
# make spectrogram
spec = MagSpectrogram(audio_signal=signal, winlen=config.window_size, winstep=config.step_size,\
hamming=hamming, timestamp=signal.begin(), decibel=True)
return spec
def apply_filters(spec, filters):
# apply filters
for f in filters:
#print(' -',f.name)
f.apply(spec)
# dataframe for output data
t = spec.time_labels()
f = spec.frequency_labels()
df = pd.DataFrame({'time': t})
for i in range(len(f)):
df[f[i]] = spec.image[:,i]
# use date-time column as index
df = df.set_index('time')
df = df.sort_index(ascending=True)
return df
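# Sketch of the returned layout (illustrative values; band names follow the
# frequency_config entries in the detector config): one row per spectrogram
# time step, one column per frequency label, indexed and sorted by time:
#
#   time                     31.2Hz  62.5Hz  ...
#   2018-01-01 12:00:00.0     -3.2     1.7   ...
#   2018-01-01 12:00:00.2     -2.9     1.5   ...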
def process(signal, config, filters):
global batch_no
batch_no += 1
# make spectrogram
spec = make_spec(signal=signal, config=config)
# apply filters
filtered_data = apply_filters(spec=spec, filters=filters)
return filtered_data
def parse_args():
# configure parser
parser = argparse.ArgumentParser(description='Split audio signal into frequency bands and produce a time series of the noise magnitude in each band.')
parser.add_argument('-c', '--config_file', type=str, help='path to json configuration file.', default='settings.json')
parser.add_argument('-i', '--input', type=str, help='path to the wav file to be analyzed or directory containing multiple wav files.', default='./')
parser.add_argument('-o', '--output_file', type=str, help='path to output csv file', default='output.csv')
parser.add_argument('-r', '--recursive_search', action='store_true', help='Include subdirectories in search for wav files')
parser.add_argument('-v', '--verbose', action='store_true', help='Print progress updates during processing')
# parse command-line args
args = parser.parse_args()
return args
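# Example invocation (illustrative), via the boat-preprocess console script
# defined in setup.py:
#
#   boat-preprocess -c settings.json -i ./wav_files -o output.csv -r -v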
def main():
start_time = time.time()
# parse command-line args
args = parse_args()
config_file = args.config_file
input_dir = args.input
output_file = args.output_file
recursive = args.recursive_search
verbose = args.verbose
# parse settings.json
fmt, batch_size, detector_config_file = pa.parse_settings(config_file)
# parse detector configuration file
spectr_config, filters = pa.parse_preproc_config(detector_config_file)
# create reader
reader = AudioSequenceReader(source=input_dir, recursive_search=recursive, rate=spectr_config.rate, datetime_fmt=fmt, verbose=verbose)
if verbose:
print(" Found {0} files".format(len(reader.files)))
# loop over batches
outputs = list()
filtered_data = None
while not reader.finished():
if verbose:
global batch_no
print(" Processing batch #{0} ...".format(batch_no+1))
batch = reader.next(size=batch_size) # read next chunk of data
o = process(batch, spectr_config, filters) # process data
outputs.append(o) # collect output
# log of file names and times
time_table = reader.log()
# concatenate
if filtered_data is None:
filtered_data = pd.concat(outputs, ignore_index=False)
else:
filtered_data = pd.concat([filtered_data, outputs[-1]], ignore_index=False)
# save to csv files
rounded = filtered_data.round(3)
rounded.to_csv(output_file)
tt_file = output_file[:output_file.rfind('.')] + '_time_table.csv'
time_table.to_csv(tt_file)
print(" Processed data saved to: {0}".format(output_file))
print(" Time table saved to: {0}".format(tt_file))
# end script
elapsed_time = time.time() - start_time
print(time.strftime(" Elapsed time: %H:%M:%S", time.gmtime(elapsed_time)))
if __name__ == '__main__':
main()
{
"date_time_format": "HMS_%H_%M_%S__DMY_%d_%m_%y",
"batch_size": "5E6",
"filters": ["CROPPING", "AVERAGE", "FAV_THRESHOLD"],
"crop_config": {
"min_frequency": "4 Hz",
@@ -15,6 +13,16 @@
"window_size": "1.0 s",
"step_size": "0.2 s",
"window_function": "HAMMING"
},
"data_series": ["num_peaks"],
"detectors": ["PEAK_FINDING"],
"median_subtraction": "False",
"anomaly_separation": "2 minute",
"peak_finding_config": {
"separation": "2 minutes",
"prominence": "2.0",
"multiplicity": "1",
"height": "2"
}
}
{
"filters": ["FREQUENCY", "MEDIAN", "MEDIAN_SUBTRACTION"],
"frequency_config": [
{
"name": "31.2Hz",
"range": ["22.1Hz", "44.2Hz"]
},
{
"name": "62.5Hz",
"range": ["44.2Hz", "88.4Hz"]
},
{
"name": "125Hz",
"range": ["88.4Hz", "176.8Hz"]
},
{
"name": "250Hz",
"range": ["176.8Hz", "353.6Hz"]
},
{
"name": "500Hz",
"range": ["353.6Hz", "707.1Hz"]
},
{
"name": "1kHz",
"range": ["707.1Hz", "1414.2Hz"]
},
{
"name": "2kHz",
"range": ["1414.2Hz", "2828.4Hz"]
}
],
"median_config": {
"window_size": "1 min",
"step_size": "1 min"
},
"median_subtraction_config": {
"window_size": "1 hour"
},
"spectrogram": {
"rate": "4000 Hz",
"window_size": "0.1 s",
"step_size": "0.025 s",
"window_function": "HAMMING"
},
"data_series": ["125Hz", "250Hz", "500Hz"],
"detectors": ["PEAK_FINDING"],
"median_subtraction": "True",
"anomaly_separation": "1 minute",
"peak_finding_config": {
"separation": "2 minutes",
"prominence": "3.0",
"multiplicity": "2"
}
}
\ No newline at end of file
{
"date_time_format": "HMS_%H_%M_%S__DMY_%d_%m_%y",
"batch_size": "5E6",
"detector": "detectors/fav.json"
}
from setuptools import setup, find_packages
setup(name='boat_detector',
version='0.0.4',
description="Python scripts for detecting noise made by boats in broadband hydrophone data",
url='https://gitlab.meridian.cs.dal.ca/data_analytics_dal/packages/boat_detector',
author='Oliver Kirsebom, Fabio Frazao',
@@ -9,8 +9,8 @@ setup(name='boat_detector',
license='GNU General Public License v3.0',
packages=find_packages(),
install_requires=[
'ketos==1.0.9',
],
entry_points = {"console_scripts": ["audio-filterer=bin.audio_filterer:main", "anomaly-detector=bin.anomaly_detector:main"]},
entry_points = {"console_scripts": ["boat-preprocess=bin.preproc:main", "boat-detect=bin.detect:main"]},
include_package_data=True,
zip_safe=False)
import datetime
import numpy as np
import pandas as pd
import os
import matplotlib
viz = os.environ.get('DISABLE_VIZ')
if viz is not None:
if int(viz) == 1:
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import json
import argparse
import math
import ketos.data_handling.parsing as pa
from ketos.audio_processing.audio import AudioSignal, TimeStampedAudioSignal
from ketos.audio_processing.spectrogram import MagSpectrogram
from ketos.data_handling.data_handling import AudioSequenceReader
import time
from collections import namedtuple
from pint import UnitRegistry # SI units
import ketos.audio_processing.spectrogram_filters as sf
from ketos.utils import get_member
from enum import Enum
ureg = UnitRegistry()
PeakFindingConfig = namedtuple('PeakFindingConfig', 'separation size multiplicity height')
PeakFindingConfig.__new__.__defaults__ = (60, 3.0, 1, 0)
PeakFindingConfig.__doc__ = '''\
Configuration of peak finding algorithm
separation - Minimum temporal separation between neighboring peaks in seconds
size - Minimum peak height relative to baseline given in multiples of the signal standard deviation (float)
multiplicity - Minimum number of data series in which peak occurs
height - Minimum absolute height of peak'''
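# Illustrative example (hypothetical values): a config requiring peaks at least
# 120 s apart, 3 standard deviations above baseline, and present in at least
# two data series; unset fields fall back to the defaults above:
#
#   >>> cfg = PeakFindingConfig(separation=120, size=3.0, multiplicity=2)
#   >>> cfg.height
#   0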
SVMConfig = namedtuple('SVMConfig', 'nu kernel gamma degree training_data')
SVMConfig.__new__.__defaults__ = (0.01, "poly", 0.001, 2, "None")
SVMConfig.__doc__ = '''\
Configuration of One-class SVM model
See: http://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html
'''
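# Illustrative example (hypothetical values): nu, kernel, gamma and degree map
# directly onto sklearn.svm.OneClassSVM keyword arguments, while training_data
# is a path handled by this package rather than by scikit-learn:
#
#   >>> cfg = SVMConfig(nu=0.05, kernel="rbf", gamma=0.01)
#   >>> cfg.degree
#   2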
class Detector(Enum):
PEAK_FINDING = 1
ELLIPTIC_ENVELOPE = 2
LOCAL_OUTLIER_FACTOR = 3
ISOLATION_FOREST = 4
ONE_CLASS_SVM = 5
def parse_settings(path):
# default values
fmt = '*HMS_%H_%M_%S__DMY_%d_%m_%y*'
batch_size = 5E6
detector = 'FAV'  # default; the other bundled option is 'OBI'
with open(path, "r") as read_file:
# load json file
data = json.load(read_file)
# date-time format
if data.get('date_time_format') is not None:
fmt = data['date_time_format']
# ensure the format string has asterisks on both sides
if fmt[0] != '*':
fmt = '*' + fmt
if fmt[-1] != '*':
fmt = fmt + '*'
# max batch size
if data.get('batch_size') is not None:
batch_size = int(float(data['batch_size']))
# detector config file
if data.get('detector') is not None:
detector = data['detector']
return fmt, batch_size, detector
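# Worked example (based on the settings.json in this commit): parse_settings
# returns the date-time format wrapped in asterisks, the batch size as an int,
# and the path to the detector config file:
#
#   fmt        = '*HMS_%H_%M_%S__DMY_%d_%m_%y*'
#   batch_size = 5000000
#   detector   = 'detectors/fav.json'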
def parse_preproc_config(path):
with open(path, "r") as read_file:
data = json.load(read_file)
# filters
filters = list()
@@ -53,39 +82,39 @@ def parse_config(path):
for x in data['filters']:
if x == 'FREQUENCY':
bands, names = parse_frequency_config(data)
f = sf.FrequencyFilter(bands=bands, names=names)
filters.append(f)
elif x == 'MEDIAN':
window_size, step_size = parse_window_config(data, 'median_config')
f = sf.WindowFilter(window_size=window_size, step_size=step_size, filter_func=np.ma.median)
filters.append(f)
elif x == 'MEDIAN_SUBTRACTION':