diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7039905e77d697993b192cd108aa378fce06f0cf --- /dev/null +++ b/.gitignore @@ -0,0 +1,146 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# DotEnv configuration +.env + +# Database +*.db +*.rdb + +# Pycharm +.idea + +# VS Code +.vscode/ + +# Spyder +.spyproject/ + +# Jupyter NB Checkpoints +.ipynb_checkpoints/ + +# exclude data from source control by default +#data/ + +# Mac OS-specific storage files +.DS_Store + +# Editor temporary/working/backup files # +######################################### +.#* +[#]*# +*~ +*$ +*.bak +*.diff +.idea/ +*.iml +*.ipr +*.iws +*.org +.project +pmip +*.rej +.settings/ +.*.sw[nop] +.sw[nop] +*.tmp +*.vim +tags +cscope.out +# gnu global +GPATH +GRTAGS +GSYMS +GTAGS + +# Compiled source # +################### +*.a +*.com +*.class +*.dll +*.exe +*.o +*.py[ocd] +*.so + +# Packages # +############ +# it's better to unpack these files and commit the raw source +# git has its own built in compression methods +*.7z +*.bz2 +*.bzip2 +*.dmg +*.gz +*.iso +*.jar +*.rar +*.tar +*.tbz2 +*.tgz +*.zip +*.pkl +*.npz + +# Things specific to this project +################################### + + diff --git a/README.md b/README.md index 2ec21ed40c122fd269318841a866f578402a13a5..bb82752c0d2786ae7aa7e62895c60ecd8eb3144a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ DEEP Open Catalogue: Speech to Text ==================================== -[![Build Status](https://jenkins.indigo-datacloud.eu:8080/buildStatus/icon?job=Pipeline-as-code/DEEP-OC-org/speech-to-text-tf/master)](https://jenkins.indigo-datacloud.eu:8080/job/Pipeline-as-code/job/DEEP-OC-org/job/speech-to-text-tf/job/master/) +[![Build Status](https://jenkins.indigo-datacloud.eu/buildStatus/icon?job=Pipeline-as-code/DEEP-OC-org/speech-to-text-tf/master)](https://jenkins.indigo-datacloud.eu/job/Pipeline-as-code/job/DEEP-OC-org/job/speech-to-text-tf/job/master/) **Author:** [Lara Lloret Iglesias](https://github.com/laramaktub) (CSIC) @@ -9,11 +9,12 @@ DEEP Open Catalogue: Speech to Text **Project:** This work is part of the [DEEP Hybrid-DataCloud](https://deep-hybrid-datacloud.eu/) project that has received funding from the European Union’s Horizon 2020 research and innovation programme under grant agreement No 777435. This is a plug-and-play tool to train and evaluate a speech to text tool using deep neural networks. The network architecture is based in one of the tutorials provided by Tensorflow (https://www.tensorflow.org/tutorials/sequences/audio_recognition). 
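As a rough, standalone illustration of the preprocessing idea described in the architecture overview below (short audio windows converted into spectrogram "images"), the following sketch uses `scipy` and `numpy`, which are assumptions for illustration only and not dependencies of this module; the actual pipeline performs the equivalent steps with TensorFlow's audio ops.

```python
import numpy as np
from scipy.io import wavfile
from scipy.signal import spectrogram

# Illustrative only: the real feature extraction happens inside the TensorFlow graph,
# and "sample.wav" is a placeholder file name.
sample_rate, samples = wavfile.read("sample.wav")            # e.g. a 16 kHz mono clip
freqs, times, spec = spectrogram(samples, fs=sample_rate,
                                 nperseg=480, noverlap=320)  # ~30 ms windows, ~10 ms hop
log_spec = np.log(spec + 1e-10)                              # compress the dynamic range
print(log_spec.shape)  # (frequency_bins, time_frames): treated as a single-channel image
```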
+ +<p align="center"> <img src="./reports/figures/speech-to-text.png"> </p> + The architecture used in this tutorial is based on some described in the paper [Convolutional Neural Networks for Small-footprint Keyword Spotting](https://static.googleusercontent.com/media/research.google.com/es//pubs/archive/43969.pdf). It was chosen because it's comparatively simple, quick to train, and easy to understand, rather than being state of the art. There are lots of different approaches to building neural network models to work with audio, including recurrent networks or dilated (atrous) convolutions. This tutorial is based on the kind of convolutional network that will feel very familiar to anyone who's worked with image recognition. That may seem surprising at first though, since audio is inherently a one-dimensional continuous signal across time, not a 2D spatial problem. We define a window of time we believe our spoken words should fit into, and converting the audio signal in that window into an image. This is done by grouping the incoming audio samples into short segments, just a few milliseconds long, and calculating the strength of the frequencies across a set of bands. Each set of frequency strengths from a segment is treated as a vector of numbers, and those vectors are arranged in time order to form a two-dimensional array. This array of values can then be treated like a single-channel image, and is known as a spectrogram. An example of what one of these spectrograms looks like: -<p align="center"> -<img src="./reports/figures/spectrogram.png" alt="spectrogram" width="400"> -</p> +<p align="center"> <img src="./reports/figures/spectrogram.png" width="400"> </p> To start using this framework run: @@ -27,7 +28,6 @@ pip install -e . - This project has been tested in Ubuntu 18.04 with Python 3.6.5. Further package requirements are described in the `requirements.txt` file. - It is a requirement to have [Tensorflow>=1.12.0 installed](https://www.tensorflow.org/install/pip) (either in gpu or cpu mode). This is not listed in the `requirements.txt` as it [breaks GPU support](https://github.com/tensorflow/tensorflow/issues/7166). -- Run `python -c 'import cv2'` to check that you installed correctly the `opencv-python` package (sometimes [dependencies are missed](https://stackoverflow.com/questions/47113029/importerror-libsm-so-6-cannot-open-shared-object-file-no-such-file-or-directo) in `pip` installations). ## Project Organization @@ -55,9 +55,9 @@ pip install -e . │ generated with `pip freeze > requirements.txt` ├── test-requirements.txt <- The requirements file for the test environment │ - ├── setup.py <- makes project pip installable (pip install -e .) so imgclas can be imported + ├── setup.py <- makes project pip installable (pip install -e .) so speechclas can be imported ├── speechclas <- Source code for use in this project. - │ ├── __init__.py <- Makes imgclas a Python module + │ ├── __init__.py <- Makes speechclas a Python module │ │ │ ├── dataset <- Scripts to download or generate data │ │ └── make_dataset.py @@ -93,25 +93,17 @@ Please use wav files. ### 2. Train the classifier -Before training the classifier you can customize the default parameters of the configuration file. To have an idea of what parameters you can change, you can explore them using the [dataset exploration notebook](./notebooks/1.0-Dataset_exploration.ipynb). This step is optional and training can be launched with the default configurarion parameters and still offer reasonably good results. 
- -Once you have customized the configuration parameters in the `./etc/config.yaml` file you can launch the training running `./imgclas/train_runfile.py`. You can monitor the training status using Tensorboard. +Before training the classifier you can customize the default parameters of the configuration file. Once you have customized the configuration parameters in the `./etc/config.yaml` file you can launch the training by running `./speechclas/train_runfile.py`. After training you can check the training statistics and the logs, where you will find the standard output produced during training together with the confusion matrix computed once training has finished. Since this type of model is usually deployed in mobile phone applications, the training generates the model in .pb format, which makes it easy to perform inference from a mobile phone app. + ### 3. Test the classifier You can test the classifier on a number of tasks: predict a single local wav file (or url) or predict multiple wavs (or urls). - -You can also make and store the predictions of the `test.txt` file (if you provided one). Once you have done that you can visualize the statistics of the predictions like popular metrics (accuracy, recall, precision, f1-score), the confusion matrix, etc by running the [predictions statistics notebook](./notebooks/3.1-Prediction_statistics.ipynb). - - -Finally you can launch a simple web page to use the trained classifier to predict audios (both local and urls) on your favorite brownser. - - ## Launching the full API #### Preliminaries for prediction diff --git a/etc/config.yaml b/etc/config.yaml index 69d3f92dd744f05701388476fde298f4b8d9dd0a..0a50fb77a41a1469f51a269ddf2e91ccb29c0b3d 100644 --- a/etc/config.yaml +++ b/etc/config.yaml @@ -24,7 +24,6 @@ general: directory. If path is relative it will be appended to the package path. 
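Relating to the "Test the classifier" and "Launching the full API" sections above, a minimal sketch of how the module could be exercised directly from Python once a model has been trained; the URL is a placeholder and would need to point to a real wav file served with an `audio/x-wav` content type, and exactly one of `urls`/`files` may be given, as enforced by `predict()` in `speechclas/api.py`:

```python
from speechclas import api

# Assumes a trained model already exists under ./models (this module ships no pretrained model).
results = api.predict(urls="http://example.org/sample.wav", files=None)
print(results)  # {"status": "ok", "predictions": [...]}
```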
- data_dir: value: "data/dataset_files" type: "str" @@ -38,10 +37,8 @@ general: You can indicate here the URL to the tar.gz containing your training audio files - model_settings: - wanted_words: type: "str" value: "yes,no,up,down,left,right,on,off,stop,go" @@ -104,7 +101,6 @@ audio_processor: help: > Range to randomly shift the training audio by in time - sample_rate: type: "int" value: 16000 @@ -144,14 +140,12 @@ training_parameters: help: > Save model checkpoint every save_steps - output_file: type: "str" value: "model.pb" help: > Model file in pb format - model_architecture: type: "str" value: "conv" @@ -194,7 +188,6 @@ training_parameters: help: > If specified, restore this pretrained model before any training - how_many_training_steps: type: "str" value: "10,10" diff --git a/reports/figures/demo-saliency.png b/reports/figures/demo-saliency.png deleted file mode 100644 index 5610f40e5d5d27dc31214af196cf385f6921b71b..0000000000000000000000000000000000000000 Binary files a/reports/figures/demo-saliency.png and /dev/null differ diff --git a/reports/figures/predict.png b/reports/figures/predict.png deleted file mode 100644 index d1b636cdc7129f14e23e8312c2db0b4318b29f6d..0000000000000000000000000000000000000000 Binary files a/reports/figures/predict.png and /dev/null differ diff --git a/reports/figures/speech-to-text.png b/reports/figures/speech-to-text.png new file mode 100644 index 0000000000000000000000000000000000000000..0b6f32137b23b15a28aa8973690760ce76c7a14a Binary files /dev/null and b/reports/figures/speech-to-text.png differ diff --git a/reports/figures/webpage.png b/reports/figures/webpage.png deleted file mode 100644 index 556e6dcf87390d0f406027e571b2369fc690212d..0000000000000000000000000000000000000000 Binary files a/reports/figures/webpage.png and /dev/null differ diff --git a/requirements.txt b/requirements.txt index 2af96e7eea1806025d56b559a12e8f48cf6e3878..23a225958426f56d258ecc388adc96b5ff06ad78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,15 +3,9 @@ Sphinx python-dotenv>=0.5.1 numpy>=1.14.5 tensorboardX>=1.4 -opencv-python>=3.4.1.4 -albumentations>=0.1.8 tqdm>=4.25.0 requests>=2.18.4 PyYAML>=3.12 -matplotlib>=2.2.2 -Jinja2>=2.10 -Flask>=1.0.2 -Werkzeug>=0.14.1 -gunicorn>=19.9.0 -seaborn>=0.8.1 -scikit-learn>=0.19.1 \ No newline at end of file +webargs>=5.5.2 +aiohttp>=3.6.2 +deepaas>=1.0.0 diff --git a/setup.cfg b/setup.cfg index d42c1d79094fb6178ae37ce25a10e6c883127fcc..0925fb8f2389ed188474b1fd185a4e06411e69d5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,11 +1,11 @@ [metadata] name = speechclas -summary = This is a plug-and-play tool to train and evaluate an image classifier on a custom dataset using deep neural networks. -author = Ignacio Heredia (CSIC) -author-email = iheredia@ifca.unican.es +summary = This is a plug-and-play tool to train and evaluate a speech classifier +author = Lara Lloret +author-email = lloret@ifca.unican.es version = 0.1.0 license = Apache -home-page = http://github.com/indigo-dc/speechclas +home-page = http://github.com/deephdc/speech-to-text-tf classifier = Intended Audience :: Information Technology Intended Audience :: System Administrators @@ -23,6 +23,6 @@ packages = speechclas [entry_points] -deepaas.model = +deepaas.v2.model = speechclas = speechclas.api diff --git a/speechclas/api.py b/speechclas/api.py index c0b39a70928039f90ca79a99cc81dae416f38f38..ad15878e28dd5c47060d852db42c7ae7dc662861 100644 --- a/speechclas/api.py +++ b/speechclas/api.py @@ -19,26 +19,23 @@ gevent, uwsgi. 
import json import os -import tempfile -import warnings from datetime import datetime import pkg_resources import builtins import re -import urllib.request +from collections import OrderedDict -import numpy as np +import urllib.request import requests -from werkzeug.exceptions import BadRequest -import tensorflow as tf -from tensorflow.keras.models import load_model from tensorflow.keras import backend as K +from webargs import fields +from aiohttp.web import HTTPBadRequest from speechclas import paths, utils, config, label_wav -from speechclas.data_utils import load_class_names, load_class_info, mount_nextcloud -from speechclas.test_utils import predict +from speechclas.data_utils import mount_nextcloud from speechclas.train_runfile import train_fn + # Mount NextCloud folders (if NextCloud is available) try: mount_nextcloud('ncplants:/data/dataset_files', paths.get_splits_dir()) @@ -70,9 +67,9 @@ def load_inference_model(): # Set the timestamp timestamps = next(os.walk(paths.get_models_dir()))[1] if not timestamps: - raise BadRequest( - """You have no models in your `./models` folder to be used for inference. - Therefore the API can only be used for training.""") + raise Exception( + "You have no models in your `./models` folder to be used for inference. " + "This module does not come with a pretrained model so you have to train a model to use it for prediction.") else: if 'api' in timestamps: TIMESTAMP = 'api' @@ -81,13 +78,13 @@ def load_inference_model(): paths.timestamp = TIMESTAMP print('Using TIMESTAMP={}'.format(TIMESTAMP)) - # Set the checkpoint model to use to make the prediction ckpts = os.listdir(paths.get_checkpoints_dir()) if not ckpts: - raise BadRequest( - """You have no checkpoints in your `./models/{}/ckpts` folder to be used for inference. - Therefore the API can only be used for training.""".format(TIMESTAMP)) + raise Exception( + "You have no checkpoints in your `./models/{}/ckpts` folder to be used for inference. 
".format( + TIMESTAMP) + + "Therefore the API can only be used for training.") else: if 'model.pb' in ckpts: MODEL_NAME = 'model.pb' @@ -101,32 +98,51 @@ def load_inference_model(): LABELS_FILE = sorted([name for name in ckpts if name.endswith('*.txt')])[-1] print('Using LABELS_FILE={}'.format(LABELS_FILE)) - # Clear the previous loaded model K.clear_session() # Load the class names and info ckpts_dir = paths.get_checkpoints_dir() - MODEL_NAME= os.path.join(ckpts_dir, MODEL_NAME ) - LABELS_FILE= os.path.join(ckpts_dir, LABELS_FILE ) - + MODEL_NAME = os.path.join(ckpts_dir, MODEL_NAME ) + LABELS_FILE = os.path.join(ckpts_dir, LABELS_FILE ) # Load training configuration conf_path = os.path.join(paths.get_conf_dir(), 'conf.json') with open(conf_path) as f: conf = json.load(f) - # Set the model as loaded loaded = True +def update_with_query_conf(user_args): + """ + Update the default YAML configuration with the user's input args from the API query + """ + # Update the default conf with the user input + CONF = config.CONF + for group, val in sorted(CONF.items()): + for g_key, g_val in sorted(val.items()): + if g_key in user_args: + g_val['value'] = json.loads(user_args[g_key]) + + # Check and save the configuration + config.check_conf(conf=CONF) + config.conf_dict = config.get_conf_dict(conf=CONF) + + +# Disable warm because the model does not come with any pretrained model +# def warm(): +# if not loaded: +# load_inference_model() + + def catch_error(f): def wrap(*args, **kwargs): try: return f(*args, **kwargs) except Exception as e: - raise e + raise HTTPBadRequest(reason=e) return wrap @@ -135,93 +151,92 @@ def catch_url_error(url_list): url_list=url_list['urls'] # Error catch: Empty query if not url_list: - raise BadRequest('Empty query') + raise ValueError('Empty query') for i in url_list: # Error catch: Inexistent url try: url_type = requests.head(i).headers.get('content-type') except: - raise BadRequest("""Failed url connection: - Check you wrote the url address correctly.""") + raise ValueError("Failed url connection: " + "Check you wrote the url address correctly.") # Error catch: Wrong formatted urls if url_type != 'audio/x-wav': - raise BadRequest("""Url wav format error: - Some urls were not in wav format.""") + raise ValueError("Url wav format error: " + "Some urls were not in wav format.") def catch_localfile_error(file_list): # Error catch: Empty query if not file_list[0].filename: - raise BadRequest('Empty query') - - # Error catch: Image format error - for f in file_list: - extension = f.split('.')[-1] - if extension not in allowed_extensions: - raise BadRequest("""Local image format error: - At least one file is not in a standard image format (jpg|jpeg|png).""") + raise ValueError('Empty query') @catch_error -def predict_url(urls, merge=True): - """ - Function to predict an url - """ - catch_url_error(urls) +def predict(**args): - if not loaded: - load_inference_model() - urllib.request.urlretrieve(urls['urls'][0], '/tmp/file.wav') - pred_lab, pred_prob =label_wav.predict('/tmp/file.wav', LABELS_FILE, MODEL_NAME, "wav_data:0","labels_softmax:0", 3) - return format_prediction(pred_lab, pred_prob) + if (not any([args['urls'], args['files']]) or + all([args['urls'], args['files']])): + raise Exception("You must provide either 'url' or 'data' in the payload") + if args['files']: + args['files'] = [args['files']] # patch until list is available + return predict_data(args) + elif args['urls']: + args['urls'] = [args['urls']] # patch until list is available + return predict_url(args) 
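When the API is served with DEEPaaS, the parsed query ends up in the `args` dict consumed by the dispatcher above. A hedged sketch of such a request, assuming the DEEPaaS V2 route layout and the default `deepaas-run` host and port (check the Swagger page of your deepaas version for the exact path; the file name is a placeholder):

```python
import requests

url = "http://localhost:5000/v2/models/speechclas/predict/"

# Upload a local wav as the multipart "data" field declared in get_predict_args().
with open("sample.wav", "rb") as f:
    resp = requests.post(url, files={"data": ("sample.wav", f, "audio/x-wav")})

print(resp.status_code)
print(resp.json())  # {"status": "ok", "predictions": [...]} as built by format_prediction()
```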
-@catch_error -def predict_file(filenames, merge=True): +def predict_url(args): """ - Function to predict a local image + Function to predict an url """ - catch_localfile_error(filenames) + # # Check user configuration + # update_with_query_conf(args) + # conf = config.conf_dict + + catch_url_error(args['urls']) + # Load model if needed if not loaded: load_inference_model() - with graph.as_default(): - pred_lab, pred_prob = predict(model=model, - X=filenames, - conf=conf, - top_K=top_K, - filemode='local', - merge=merge) - if merge: - pred_lab, pred_prob = np.squeeze(pred_lab), np.squeeze(pred_prob) + # Download the url + urllib.request.urlretrieve(args['urls'][0], '/tmp/file.wav') + pred_lab, pred_prob = label_wav.predict('/tmp/file.wav', + LABELS_FILE, + MODEL_NAME, + "wav_data:0", + "labels_softmax:0", + 3) return format_prediction(pred_lab, pred_prob) -@catch_error -def predict_data(audios, merge=True): +def predict_data(args): """ Function to predict an audio file """ + # # Check user configuration + # update_with_query_conf(args) + # conf = config.conf_dict + if not loaded: load_inference_model() - if not isinstance(audios, list): - audios = [audios] - filenames = [] - for audio in audios: - thename=audio['files'].filename - thefile="/tmp/"+thename - audio['files'].save(thefile) + # Create a list with the path to the audios + filenames = [f.filename for f in args['files']] - pred_lab, pred_prob =label_wav.predict(thefile, LABELS_FILE, MODEL_NAME, "wav_data:0","labels_softmax:0", 3) - return format_prediction(pred_lab, pred_prob) + pred_lab, pred_prob = label_wav.predict(filenames[0], + LABELS_FILE, + MODEL_NAME, + "wav_data:0", + "labels_softmax:0", + 3) + return format_prediction(pred_lab, pred_prob) def format_prediction(labels, probabilities): @@ -229,7 +244,7 @@ def format_prediction(labels, probabilities): "status": "ok", "predictions": [], } - class_names=conf["model_settings"]["wanted_words"] + class_names = conf["model_settings"]["wanted_words"] for label_id, prob in zip(labels, probabilities): name = label_id @@ -261,44 +276,15 @@ def wikipedia_link(pred_lab): return link -def metadata(): - d = { - "author": None, - "description": None, - "url": None, - "license": None, - "version": None, - } - return d - - -@catch_error -def train(user_conf): +def train(**args): """ - Parameters - ---------- - user_conf : dict - Json dict (created with json.dumps) with the user's configuration parameters that will replace the defaults. 
- Must be loaded with json.loads() - For example: - user_conf={'num_classes': 'null', 'lr_step_decay': '0.1', 'lr_step_schedule': '[0.7, 0.9]', 'use_early_stopping': 'false'} + Train a speech classifier """ - CONF = config.CONF - - # Update the conf with the user input - for group, val in sorted(CONF.items()): - for g_key, g_val in sorted(val.items()): - g_val['value'] = json.loads(user_conf[g_key]) - - # Check the configuration - try: - config.check_conf(conf=CONF) - except Exception as e: - raise BadRequest(e) - - CONF = config.conf_dict(conf=CONF) + # print('#####################') + # raise Exception('error') + update_with_query_conf(user_args=args) + CONF = config.conf_dict timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S') - config.print_conf_table(CONF) K.clear_session() # remove the model loaded for prediction train_fn(TIMESTAMP=timestamp, CONF=CONF) @@ -310,21 +296,10 @@ def train(user_conf): print(e) -@catch_error -def get_train_args(): +def populate_parser(parser, default_conf): """ - Returns a dict of dicts with the following structure to feed the deepaas API parser: - { 'arg1' : {'default': '1', #value must be a string (use json.dumps to convert Python objects) - 'help': '', #can be an empty string - 'required': False #bool - }, - 'arg2' : {... - }, - ... - } + Returns an argparse-like parser. """ - train_args = {} - default_conf = config.CONF for group, val in default_conf.items(): for g_key, g_val in val.items(): gg_keys = g_val.keys() @@ -335,29 +310,67 @@ def get_train_args(): choices = g_val['choices'] if ('choices' in gg_keys) else None # Additional info in help string - help += '\n' + "Group name: **{}**".format(str(group)) - if choices: help += '\n' + "Choices: {}".format(str(choices)) - if type: help += '\n' + "Type: {}".format(g_val['type']) + help += '\n' + "<font color='#C5576B'> Group name: **{}**".format(str(group)) + if choices: + help += '\n' + "Choices: {}".format(str(choices)) + if type: + help += '\n' + "Type: {}".format(g_val['type']) + help += "</font>" + + # Create arg dict + opt_args = {'missing': json.dumps(g_val['value']), + 'description': help, + 'required': False, + } + if choices: + opt_args['enum'] = [json.dumps(i) for i in choices] + + parser[g_key] = fields.Str(**opt_args) + + return parser + + +def get_train_args(): + + parser = OrderedDict() + default_conf = config.CONF + default_conf = OrderedDict([('general', default_conf['general']), + ('model_settings', default_conf['model_settings']), + ('audio_processor', default_conf['audio_processor']), + ('training_parameters', default_conf['training_parameters'])]) + + return populate_parser(parser, default_conf) + + +def get_predict_args(): - opt_args = {'default': json.dumps(g_val['value']), - 'help': help, - 'required': False} - # if type: opt_args['type'] = type # this breaks the submission because the json-dumping - # => I'll type-check args inside the train_fn + parser = OrderedDict() - train_args[g_key] = opt_args - return train_args + # Add data and url fields + parser['files'] = fields.Field(required=False, + missing=None, + type="file", + data_key="data", + location="form", + description="Select the audio file (wav) you want to classify.") + + # Use field.String instead of field.Url because I also want to allow uploading of base 64 encoded data strings + parser['urls'] = fields.String(required=False, + missing=None, + description="Select a URL of the audio file you want to classify.") + + # missing action="append" --> append more than one url + + return parser @catch_error -def get_metadata(): +def 
get_metadata(distribution_name='speechclas'): """ Function to read metadata """ - module = __name__.split('.', 1) - - pkg = pkg_resources.get_distribution(module[0]) + pkg = pkg_resources.get_distribution(distribution_name) meta = { 'Name': None, 'Version': None, diff --git a/speechclas/config.py b/speechclas/config.py index 0bcda989e02900a8dcb7fecb3350d9a4181f4575..836d9ad9c5a7d51c41a4bfb76a214af556b95690 100644 --- a/speechclas/config.py +++ b/speechclas/config.py @@ -46,12 +46,11 @@ def check_conf(conf=CONF): if (g_val['range'][1] != 'None') and (g_val['range'][1] < g_val['value']): raise ValueError('The selected value for {} is higher than the maximal possible value.'.format(g_key)) - - check_conf() -def conf_dict(conf=CONF): + +def get_conf_dict(conf=CONF): """ Return configuration as dict """ @@ -62,7 +61,9 @@ def conf_dict(conf=CONF): conf_d[group][g_key] = g_val['value'] return conf_d -conf_d = conf_dict() + +conf_dict = get_conf_dict() + def print_full_conf(conf=CONF): """ @@ -84,7 +85,7 @@ def print_full_conf(conf=CONF): print('\n') -def print_conf_table(conf=conf_d): +def print_conf_table(conf=conf_dict): """ Print configuration parameters in a table """ diff --git a/speechclas/data_utils.py b/speechclas/data_utils.py index ff44f04a0c1bed99722f0a64d44b2a4b027ef15e..5dea14955655da3660d9ef8ba7b6e7ff22532ff0 100755 --- a/speechclas/data_utils.py +++ b/speechclas/data_utils.py @@ -7,60 +7,9 @@ Email: iheredia@ifca.unican.es Github: ignacioheredia """ -import os -import threading -from multiprocessing import Pool -import queue -from urllib.request import urlopen import subprocess import warnings -import numpy as np -from tqdm import tqdm -from tensorflow.keras.utils import to_categorical, Sequence -import cv2 -import albumentations -from albumentations.augmentations import transforms -from albumentations.imgaug import transforms as imgaug_transforms - - -def load_data_splits(splits_dir, im_dir, split_name='train'): - """ - Load the data arrays from the [train/val/test].txt files. - Lines of txt files have the following format: - 'relative_path_to_image' 'image_label_number' - - Parameters - ---------- - im_dir : str - Absolute path to the image folder. - split_name : str - Name of the data split to load - - Returns - ------- - X : Numpy array of strs - First colunm: Contains 'absolute_path_to_file' to images. 
- y : Numpy array of int32 - Image label number - """ - if '{}.txt'.format(split_name) not in os.listdir(splits_dir): - raise ValueError("Invalid value for the split_name parameter: there is no `{}.txt` file in the `{}` " - "directory.".format(split_name, splits_dir)) - - # Loading splits - print("Loading {} data...".format(split_name)) - split = np.genfromtxt(os.path.join(splits_dir, '{}.txt'.format(split_name)), dtype='str', delimiter=' ') - X = np.array([os.path.join(im_dir, i) for i in split[:, 0]]) - - #TODO Check this part of the code - if len(split.shape) == 2: - y = split[:, 1].astype(np.int32) - else: # maybe test file has not labels - y = None - - return X, y - def mount_nextcloud(frompath, topath): """ @@ -74,556 +23,3 @@ def mount_nextcloud(frompath, topath): return output, error -def load_class_names(splits_dir): - """ - Load list of class names - - Returns - ------- - Numpy array of shape (N) containing strs with class names - """ - print("Loading class names...") - class_names = np.genfromtxt(os.path.join(splits_dir, 'classes.txt'), dtype='str', delimiter='/n') - return class_names - - -def load_class_info(splits_dir): - """ - Load list of class names - - Returns - ------- - Numpy array of shape (N) containing strs with class names - """ - print("Loading class info...") - class_info = np.genfromtxt(os.path.join(splits_dir, 'info.txt'), dtype='str', delimiter='/n') - return class_info - - -def load_image(filename, filemode='local'): - """ - Function to load a local image path (or an url) into a numpy array. - - Parameters - ---------- - filename : str - Path or url to the image - filemode : {'local','url'} - - 'local': filename is absolute path in local disk. - - 'url': filename is internet url. - - Returns - ------- - A numpy array - """ - if filemode == 'local': - image = cv2.imread(filename, cv2.IMREAD_COLOR) - if image is None: - raise ValueError('The local path does not exist or does not correspond to an image: \n {}'.format(filename)) - - elif filemode == 'url': - try: - data = urlopen(filename).read() - data = np.frombuffer(data, np.uint8) - image = cv2.imdecode(data, cv2.IMREAD_COLOR) - except: - raise ValueError('Incorrect url path: \n {}'.format(filename)) - - else: - raise ValueError('Invalid value for filemode.') - - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # change from default BGR OpenCV format to Python's RGB format - return image - - -def preprocess_batch(batch, mean_RGB, std_RGB, mode='tf', channels_first=False): - """ - Standardize batch to feed the net. Adapted from [1] to take replace the default imagenet mean and std. - [1] https://github.com/keras-team/keras-applications/blob/master/keras_applications/imagenet_utils.py - - Parameters - ---------- - batch : list of numpy arrays - mean_RGB, std_RGB : list of floats, len=3 - Mean/std RGB values for your dataset. 
- channels_first : bool - Use batch of shape (N, C, H, W) instead of (N, H, W, C) - - Returns - ------- - Numpy array - """ - assert type(batch) is list, "Your batch must be a list of numpy arrays" - - mean_RGB, std_RGB = np.array(mean_RGB), np.array(std_RGB) - batch = np.array(batch) - mean_RGB[None, None, None, :] # mean centering - - if mode == 'caffe': - batch = batch[:, :, :, ::-1] # switch from RGB to BGR - if mode == 'tf': - batch /= 127.5 # scaling between [1, -1] - if mode == 'torch': - batch /= std_RGB - if channels_first: - batch = batch.transpose(0, 3, 1, 2) # shape(N, 3, 224, 224) - return batch.astype(np.float32) - - -def augment(im, params=None): - """ - Perform data augmentation on some image using the albumentations package. - - Parameters - ---------- - im : Numpy array - params : dict or None - Contains the data augmentation parameters - Mandatory keys: - - h_flip ([0,1] float): probability of performing an horizontal left-right mirroring. - - v_flip ([0,1] float): probability of performing an vertical up-down mirroring. - - rot ([0,1] float): probability of performing a rotation to the image. - - rot_lim (int): max degrees of rotation. - - stretch ([0,1] float): probability of randomly stretching an image. - - crop ([0,1] float): randomly take an image crop. - - zoom ([0,1] float): random zoom applied to crop_size. - --> Therefore the effective crop size at each iteration will be a - random number between 1 and crop*(1-zoom). For example: - * crop=1, zoom=0: no crop of the image - * crop=1, zoom=0.1: random crop of random size between 100% image and 90% of the image - * crop=0.9, zoom=0.1: random crop of random size between 90% image and 80% of the image - * crop=0.9, zoom=0: random crop of always 90% of the image - Image size refers to the size of the shortest side. - - blur ([0,1] float): probability of randomly blurring an image. - - pixel_noise ([0,1] float): probability of randomly adding pixel noise to an image. - - pixel_sat ([0,1] float): probability of randomly using HueSaturationValue in the image. - - cutout ([0,1] float): probability of using cutout in the image. 
- - Returns - ------- - Numpy array - """ - - ## 1) Crop the image - effective_zoom = np.random.rand() * params['zoom'] - crop = params['crop'] - effective_zoom - - ly, lx, channels = im.shape - crop_size = int(crop * min([ly, lx])) - rand_x = np.random.randint(low=0, high=lx - crop_size + 1) - rand_y = np.random.randint(low=0, high=ly - crop_size + 1) - - crop = transforms.Crop(x_min=rand_x, - y_min=rand_y, - x_max=rand_x + crop_size, - y_max=rand_y + crop_size) - - im = crop(image=im)['image'] - - ## 2) Now add the transformations for augmenting the image pixels - transform_list = [] - - # Add random stretching - if params['stretch']: - transform_list.append( - imgaug_transforms.IAAPerspective(scale=0.1, p=params['stretch']) - ) - - # Add random rotation - if params['rot']: - transform_list.append( - transforms.Rotate(limit=params['rot_lim'], p=params['rot']) - ) - - # Add horizontal flip - if params['h_flip']: - transform_list.append( - transforms.HorizontalFlip(p=params['h_flip']) - ) - - # Add vertical flip - if params['v_flip']: - transform_list.append( - transforms.VerticalFlip(p=params['v_flip']) - ) - - # Add some blur to the image - if params['blur']: - transform_list.append( - albumentations.OneOf([ - transforms.MotionBlur(blur_limit=7, p=1.), - transforms.MedianBlur(blur_limit=7, p=1.), - transforms.Blur(blur_limit=7, p=1.), - ], p=params['blur']) - ) - - # Add pixel noise - if params['pixel_noise']: - transform_list.append( - albumentations.OneOf([ - transforms.CLAHE(clip_limit=2, p=1.), - imgaug_transforms.IAASharpen(p=1.), - imgaug_transforms.IAAEmboss(p=1.), - transforms.RandomBrightnessContrast(contrast_limit=0, p=1.), - transforms.RandomBrightnessContrast(brightness_limit=0, p=1.), - transforms.RGBShift(p=1.), - transforms.RandomGamma(p=1.)#, - # transforms.JpegCompression(), - # transforms.ChannelShuffle(), - # transforms.ToGray() - ], p=params['pixel_noise']) - ) - - # Add pixel saturation - if params['pixel_sat']: - transform_list.append( - transforms.HueSaturationValue(p=params['pixel_sat']) - ) - - # Remove randomly remove some regions from the image - if params['cutout']: - ly, lx, channels = im.shape - scale_low, scale_high = 0.05, 0.25 # min and max size of the squares wrt the full image - scale = np.random.uniform(scale_low, scale_high) - transform_list.append( - transforms.Cutout(num_holes=8, max_h_size=int(scale*ly), max_w_size=int(scale*lx), p=params['cutout']) - ) - - # Compose all image transformations and augment the image - augmentation_fn = albumentations.Compose(transform_list) - im = augmentation_fn(image=im)['image'] - - return im - - -def resize_im(im, height, width): - resize_fn = transforms.Resize(height=height, width=width) - return resize_fn(image=im)['image'] - - -def data_generator(inputs, targets, batch_size, mean_RGB, std_RGB, preprocess_mode, aug_params, num_classes, - im_size=224, shuffle=True): - """ - Generator to feed Keras fit function - - Parameters - ---------- - inputs : Numpy array, shape (N, H, W, C) - targets : Numpy array, shape (N) - batch_size : int - shuffle : bool - aug_params : dict - im_size : int - Final image size to feed the net's input (eg. 224 for Resnet). 
- - Returns - ------- - Generator of inputs and labels - """ - assert len(inputs) == len(targets) - assert len(inputs) >= batch_size - - # Create list of indices - idxs = np.arange(len(inputs)) - if shuffle: - np.random.shuffle(idxs) - - # # Reshape targets to the correct shape - # if len(targets.shape) == 1: - # print('reshaping targets') - # targets = targets.reshape(-1, 1) - - for start_idx in range(0, len(inputs) - batch_size + 1, batch_size): - excerpt = idxs[start_idx:start_idx + batch_size] - batch_X = [] - for i in excerpt: - im = load_image(inputs[i], filemode='local') - im = augment(im, params=aug_params) - im = resize_im(im, height=im_size, width=im_size) - batch_X.append(im) # shape (N, 224, 224, 3) - batch_X = preprocess_batch(batch=batch_X, mean_RGB=mean_RGB, std_RGB=std_RGB, mode=preprocess_mode) - batch_y = to_categorical(targets[excerpt], num_classes=num_classes) - - yield batch_X, batch_y - - -def buffered_generator(source_gen, buffer_size=10): - """ - Generator that runs a slow source generator in a separate thread. Beware of the GIL! - Author: Benanne (github-kaggle/benanne/ndsb) - - Parameters - ---------- - source_gen : generator - buffer_size: the maximal number of items to pre-generate (length of the buffer) - - Returns - ------- - Buffered generator - """ - if buffer_size < 2: - raise RuntimeError("Minimal buffer size is 2!") - - buffer = queue.Queue(maxsize=buffer_size - 1) - # the effective buffer size is one less, because the generation process - # will generate one extra element and block until there is room in the buffer. - - def _buffered_generation_thread(source_gen, buffer): - for data in source_gen: - buffer.put(data, block=True) - buffer.put(None) # sentinel: signal the end of the iterator - - thread = threading.Thread(target=_buffered_generation_thread, args=(source_gen, buffer)) - thread.daemon = True - thread.start() - - for data in iter(buffer.get, None): - yield data - - -class data_sequence(Sequence): - """ - Instance of a Keras Sequence that is safer to use with multiprocessing than a standard generator. 
- Check https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly - - TODO: Add sample weights on request - """ - - def __init__(self, inputs, targets, batch_size, mean_RGB, std_RGB, preprocess_mode, aug_params, num_classes, - im_size=224, shuffle=True): - """ - Parameters are the same as in the data_generator function - """ - assert len(inputs) == len(targets) - assert len(inputs) >= batch_size - - self.inputs = inputs - self.targets = targets - self.batch_size = batch_size - self.mean_RGB = mean_RGB - self.std_RGB = std_RGB - self.preprocess_mode = preprocess_mode - self.aug_params = aug_params - self.num_classes = num_classes - self.im_size = im_size - self.shuffle = shuffle - self.on_epoch_end() - - def __len__(self): - return int(np.ceil(len(self.inputs) / float(self.batch_size))) - - def __getitem__(self, idx): - batch_idxs = self.indexes[idx*self.batch_size: (idx+1)*self.batch_size] - batch_X = [] - for i in batch_idxs: - im = load_image(self.inputs[i]) - if self.aug_params: - im = augment(im, params=self.aug_params) - im = resize_im(im, height=self.im_size, width=self.im_size) - batch_X.append(im) # shape (N, 224, 224, 3) - batch_X = preprocess_batch(batch=batch_X, mean_RGB=self.mean_RGB, std_RGB=self.std_RGB, mode=self.preprocess_mode) - batch_y = to_categorical(self.targets[batch_idxs], num_classes=self.num_classes) - return batch_X, batch_y - - def on_epoch_end(self): - """Updates indexes after each epoch""" - self.indexes = np.arange(len(self.inputs)) - if self.shuffle: - np.random.shuffle(self.indexes) - - -def standard_tencrop_batch(im, crop_prop=0.9): - """ - Returns an ordered ten crop batch of images from an original image (corners, center + mirrors). - - Parameters - ---------- - im : numpy array, type np.uint8 - crop_prop: float, [0, 1] - Size of the crop with respect to the whole image - - Returns - ------- - List of 10 numpy arrays - """ - batch = [] - - min_side = np.amin(im.shape[:2]) - im = resize_im(im, height=min_side, width=min_side) # resize to shorter border - h, w = min_side, min_side # height, width (square) - crop_size = int(crop_prop * min_side) - - # Crops - c1 = transforms.Crop(x_min=0, - y_min=0, - x_max=crop_size, - y_max=crop_size)(image=im)['image'] # top-left - - c2 = transforms.Crop(x_min=0, - y_min=h-crop_size, - x_max=crop_size, - y_max=h)(image=im)['image'] # bottom-left - - c3 = transforms.Crop(x_min=w-crop_size, - y_min=0, - x_max=w, - y_max=crop_size)(image=im)['image'] # top-right - - c4 = transforms.Crop(x_min=w-crop_size, - y_min=h-crop_size, - x_max=w, - y_max=h)(image=im)['image'] # bottom-right - - c5 = transforms.Crop(x_min=np.round((w-crop_size)/2).astype(int), - y_min=np.round((h-crop_size)/2).astype(int), - x_max=np.round((w+crop_size)/2).astype(int), - y_max=np.round((h+crop_size)/2).astype(int))(image=im)['image'] # center - - # Save crop and its mirror - lr_aug = albumentations.HorizontalFlip(p=1) - for image in [c1, c2, c3, c4, c5]: - batch.append(image) - batch.append(lr_aug(image=image)['image']) - - return batch - - -class k_crop_data_sequence(Sequence): - """ - Data sequence generator for test time to feed to predict_generator. - Each batch delivered is composed by multiple crops (default=10) of the same image. 
- """ - - def __init__(self, inputs, mean_RGB, std_RGB, preprocess_mode, aug_params, crop_number=10, crop_mode='random', - filemode='local', im_size=224): - """ - Parameters are the same as in the data_generator function except for: - - Parameters - ---------- - crop_number : int - Number of crops of each image to take. - mode :str, {'random', 'standard'} - If 'random' data augmentation is performed randomly. - If 'standard' we take the standard 10 crops (corners +center + mirrors) - filemode : {'local','url'} - - 'local': filename is absolute path in local disk. - - 'url': filename is internet url. - """ - self.inputs = inputs - self.mean_RGB = mean_RGB - self.std_RGB = std_RGB - self.preprocess_mode = preprocess_mode - self.aug_params = aug_params - self.crop_number = crop_number - self.crop_mode = crop_mode - self.filemode = filemode - self.im_size = im_size - - def __len__(self): - return len(self.inputs) - - def __getitem__(self, idx): - batch_X = [] - im = load_image(self.inputs[idx], filemode=self.filemode) - - if self.crop_mode == 'random': - for _ in range(self.crop_number): - if self.aug_params: - im_aug = augment(im, params=self.aug_params) - else: - im_aug = np.copy(im) - im_aug = resize_im(im_aug, height=self.im_size, width=self.im_size) - batch_X.append(im_aug) # shape (N, 224, 224, 3) - - if self.crop_mode == 'standard': - batch_X = standard_tencrop_batch(im) - - batch_X = preprocess_batch(batch=batch_X, mean_RGB=self.mean_RGB, std_RGB=self.std_RGB, mode=self.preprocess_mode) - return batch_X - - -def im_stats(filename): - """ - Helper for function compute_meanRGB - """ - im = load_image(filename, filemode='local') - mean = np.mean(im, axis=(0, 1)) - std = np.std(im, axis=(0, 1)) - return mean.tolist(), std.tolist() - - -def compute_meanRGB(im_list, verbose=False, workers=4): - """ - Returns the mean and std RGB values for the whole dataset. - For example in the plantnet dataset we have: - mean_RGB = np.array([ 107.59348955, 112.1047813 , 80.9982362 ]) - std_RGB = np.array([ 52.78326119, 50.56163087, 50.86486131]) - - Parameters - ---------- - im_list : array of strings - Array where the first column is image_path (or image_url). Shape (N,). - verbose : bool - Show progress bar - workers: int - Numbers of parallel workers to perform the computation with. - - References - ---------- - https://stackoverflow.com/questions/41920124/multiprocessing-use-tqdm-to-display-a-progress-bar - """ - - print('Computing mean RGB pixel with {} workers...'.format(workers)) - - with Pool(workers) as p: - r = list(tqdm(p.imap(im_stats, im_list), - total=len(im_list), - disable=verbose)) - - r = np.asarray(r) - mean, std = r[:, 0], r[:, 1] - mean, std = np.mean(mean, axis=0), np.mean(std, axis=0) - - print('Mean RGB pixel: {}'.format(mean.tolist())) - print('Standard deviation of RGB pixel: {}'.format(std.tolist())) - - return mean.tolist(), std.tolist() - - -def compute_classweights(labels, max_dim=None, mode='balanced'): - """ - Compute the class weights for a set of labels to account for label imbalance. - - Parameters - ---------- - labels : numpy array, type (ints), shape (N) - max_dim : int - Maximum number of classes. Default is the max value in labels. 
- mode : str, {'balanced', 'log'} - - Returns - ------- - Numpy array, type (float32), shape (N) - """ - if mode is None: - return None - - weights = np.bincount(labels) - weights = np.sum(weights) / weights - - # Fill the count if some high number labels are not present in the sample - if max_dim is not None: - diff = max_dim - len(weights) - if diff != 0: - weights = np.pad(weights, pad_width=(0, diff), mode='constant', constant_values=0) - - # Transform according to different modes - if mode == 'balanced': - pass - elif mode == 'log': - # do not use --> produces numerical instabilities at inference when transferring weights trained on GPU to CPU - weights = np.log(weights) # + 1 - else: - raise ValueError('{} is not a valid option for parameter "mode"'.format(mode)) - - return weights.astype(np.float32) diff --git a/speechclas/freeze.py b/speechclas/freeze.py index 9f99446820b6819a1f651c098826dd45f5dc4fc6..be9c8c070e05faafc46b7f01d1472c99bef41938 100644 --- a/speechclas/freeze.py +++ b/speechclas/freeze.py @@ -51,7 +51,7 @@ from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio from speechclas import input_data, models, paths from tensorflow.python.framework import graph_util -CONF = config.conf_dict() +CONF = config.get_conf_dict() timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S') def create_inference_graph(wanted_words, sample_rate, clip_duration_ms, diff --git a/speechclas/model_utils.py b/speechclas/model_utils.py index ece816c41d3b704eed5955f6791d6f6407baede0..b3d45ce2ce15e10d650835b1ff55036e48e1fce6 100755 --- a/speechclas/model_utils.py +++ b/speechclas/model_utils.py @@ -10,57 +10,14 @@ Github: ignacioheredia import os import json -from tensorflow.keras import applications -from tensorflow.keras import regularizers from tensorflow.keras import backend as K -from tensorflow.keras.models import load_model, Model from tensorflow.python.saved_model import builder as saved_model_builder from tensorflow.python.saved_model.signature_def_utils import predict_signature_def from tensorflow.python.saved_model import tag_constants -from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Flatten from speechclas import paths -model_modes = {'DenseNet121': 'torch', 'DenseNet169': 'torch', 'DenseNet201': 'torch', - 'InceptionResNetV2': 'tf', 'InceptionV3': 'tf', 'MobileNet': 'tf', - 'NASNetLarge': 'tf', 'NASNetMobile': 'tf', 'Xception': 'tf', - 'ResNet50': 'caffe', 'VGG16': 'caffe', 'VGG19': 'caffe'} - - -def create_model(CONF): - """ - Parameters - ---------- - CONF : dict - Contains relevant configuration parameters of the model - """ - architecture = getattr(applications, CONF['model']['modelname']) - - # create the base pre-trained model - img_width, img_height = CONF['model']['image_size'], CONF['model']['image_size'] - base_model = architecture(weights='imagenet', include_top=False, input_shape = (img_width, img_height, 3)) - - # Add custom layers at the top to adapt it to our problem - x = base_model.output - x = GlobalAveragePooling2D()(x) - # x = Flatten()(x) #might work better on large dataset than GlobalAveragePooling https://github.com/keras-team/keras/issues/8470 - x = Dense(1024, - activation='relu')(x) - predictions = Dense(CONF['model']['num_classes'], - activation='softmax')(x) - - # Full model - model = Model(inputs=base_model.input, outputs=predictions) - - # Add L2 reguralization for all the layers in the whole model - if CONF['training']['l2_reg']: - for layer in model.layers: - layer.kernel_regularizer = 
regularizers.l2(CONF['training']['l2_reg']) - - return model, base_model - - def save_to_pb(keras_model, export_path): """ Save keras model to protobuf for Tensorflow Serving. @@ -90,19 +47,6 @@ def save_to_pb(keras_model, export_path): builder.save() -def export_h5_to_pb(path_to_h5, export_path): - """ - Transform Keras model to protobuf - - Parameters - ---------- - path_to_h5 - export_path - """ - model = load_model(path_to_h5) - save_to_pb(model, export_path) - - def save_conf(conf): """ Save CONF to a txt file to ease the reading and to a json file to ease the parsing. diff --git a/speechclas/paths.py b/speechclas/paths.py index 76cf801289e271f9d7424411ae1bdfabcc032246..46d13ed29b594b571f1f8bb1fc60b08a45eabcd7 100644 --- a/speechclas/paths.py +++ b/speechclas/paths.py @@ -15,7 +15,7 @@ from speechclas import config homedir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -CONF = config.conf_dict() +CONF = config.get_conf_dict() timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S') def get_timestamp(): diff --git a/speechclas/test_utils.py b/speechclas/test_utils.py deleted file mode 100644 index f538115ef8fd3e1b19da6059192c82123428e7b4..0000000000000000000000000000000000000000 --- a/speechclas/test_utils.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Miscellaneous functions for test time. - -Date: July 2019 -Author: Lara Lloret Iglesias -Email: lloret@ifca.unican.es -Github: laramaktub -""" - -import numpy as np - -from speechclas.data_utils import k_crop_data_sequence - - -def predict(model, X, conf, top_K=None, crop_num=10, filemode='local', merge=False): - """ - Predict function. - - Parameters - ---------- - model: keras model instance - X : str or list - List of images paths of length N. If providing a list of urls, be sure to set correctly the 'filemode' parameter. - If a str is provided it will be understood as a single image to predict. - conf: dict - Configuration parameters. The data augmentation parameters that will be used in the inference can be changed in - conf['augmentation']['val_mode']. - top_k : int - Number of top predictions to return. If None, all predictions will be returned. - crop_num: int - Number of crops to use for test. Default is 10. - filemode : str, {'local','url'} - - 'local': filename is absolute path in local disk. - - 'url': filename is internet url. - merge: Merge the predictions of all the images in the list. This value is tipically set to True when you pass - multiple images of the same observation. 
- - Returns - ------- - pred_lab: np.array, shape (N, top_k) - Array of predicted labels - pred_prob: np.array, shape (N, top_k) - Array of predicted probabilities - """ - - if top_K is None: - top_K = conf['model']['num_classes'] - if type(X) is str: #if not isinstance(X, list): - X = [X] - - data_gen = k_crop_data_sequence(inputs=X, - im_size=conf['model']['image_size'], - mean_RGB=conf['dataset']['mean_RGB'], - std_RGB=conf['dataset']['std_RGB'], - preprocess_mode=conf['model']['preprocess_mode'], - aug_params=conf['augmentation']['val_mode'], - crop_mode='random', - crop_number=crop_num, - filemode=filemode) - - output = model.predict_generator(generator=data_gen, verbose=1, max_queue_size=10, workers=4, use_multiprocessing=True) - - output = output.reshape(len(X), -1, output.shape[-1]) # reshape to (N, crop_number, num_classes) - output = np.mean(output, axis=1) # take the mean across the crops - - if merge: - output = np.mean(output, axis=0) # take the mean across the images - lab = np.argsort(output)[::-1] # sort labels in descending prob - lab = lab[:top_K] # keep only top_K labels - lab = np.expand_dims(lab, axis=0) # add extra dimension to make to output have a shape (1, top_k) - prob = output[lab] - else: - lab = np.argsort(output, axis=1)[:, ::-1] # sort labels in descending prob - lab = lab[:, :top_K] # keep only top_K labels - prob = output[np.repeat(np.arange(len(lab)), lab.shape[1]), - lab.flatten()].reshape(lab.shape) # retrieve corresponding probabilities - - return lab, prob - - -def topK_accuracy(true_lab, pred_lab, K=1): - """ - Compute the top_K accuracy - - Parameters - ---------- - true_lab: np.array, shape (N) - Array with ground truth labels - pred_lab: np.array, shape (N, M) - Array with predicted labels. M should be bigger than K. - K: int - Accuracy type to compute - """ - assert K<= pred_lab.shape[1], 'K is bigger than your number of predictions' - mask = [lab in pred_lab[i, :K] for i, lab in enumerate(true_lab)] - return np.mean(mask) diff --git a/speechclas/train_runfile.py b/speechclas/train_runfile.py index ceb1af9cf304b2b18be2e8a3b80db70c0c916702..70ef2a872828070914efd0a2048725d13ebfff8b 100644 --- a/speechclas/train_runfile.py +++ b/speechclas/train_runfile.py @@ -84,7 +84,7 @@ from tensorflow.python.platform import gfile from speechclas import paths, config, input_data, models, freeze, utils, model_utils from tensorflow.python.framework import graph_util -CONF = config.conf_dict() +CONF = config.get_conf_dict() timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S') @@ -98,7 +98,6 @@ def train_fn(TIMESTAMP, CONF): utils.create_dir_tree() #Activate only if you want to make a backup of the splits used for the training #utils.backup_splits() - # logging.set_verbosity(logging.INFO) logging.basicConfig(filename=paths.get_logs_dir()+'/train_info.log',level=logging.DEBUG) @@ -203,6 +202,7 @@ def train_fn(TIMESTAMP, CONF): f.write('\n'.join(audio_processor.words_list)) # Training loop. + print('Start training ...') training_steps_max = np.sum(training_steps_list) for training_step in xrange(start_step, training_steps_max + 1): # Figure out what the current learning rate is. 
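The training loop shown above selects its learning rate from comma-separated phase lists in the configuration (for example `how_many_training_steps: "10,10"`), following the TensorFlow speech-commands tutorial this code is based on. A standalone sketch of that lookup, with illustrative learning-rate values that are not taken from this repository:

```python
def lr_for_step(training_step, steps_str="10,10", rates_str="0.001,0.0001"):
    """Return the learning rate of the training phase that `training_step` falls into."""
    training_steps_list = [int(s) for s in steps_str.split(",")]
    learning_rates_list = [float(r) for r in rates_str.split(",")]
    steps_sum = 0
    for phase, steps in enumerate(training_steps_list):
        steps_sum += steps
        if training_step <= steps_sum:
            return learning_rates_list[phase]
    return learning_rates_list[-1]

print(lr_for_step(5))   # 0.001  -> first training phase
print(lr_for_step(15))  # 0.0001 -> second training phase
```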
diff --git a/speechclas/utils.py b/speechclas/utils.py index 084e7582583452e191042e499fbd1567d09ab054..0bb8882d28c81e31c47828565b9357360418f733 100644 --- a/speechclas/utils.py +++ b/speechclas/utils.py @@ -8,16 +8,8 @@ Github: ignacioheredia """ import os -import subprocess -from distutils.dir_util import copy_tree -from multiprocessing import Process - -import numpy as np -from tensorflow.keras import callbacks -from tensorflow.keras import backend as K from speechclas import paths -#from speechclas.optimizers import customSGD, customAdam, customAdamW def create_dir_tree(): @@ -38,123 +30,3 @@ def remove_empty_dirs(): d_path = os.path.join(basedir, d) if not os.listdir(d_path): os.rmdir(d_path) - - -def backup_splits(): - """ - Save the data splits used during training to the timestamped dir. - """ - src = paths.get_splits_dir() - dst = paths.get_ts_splits_dir() - copy_tree(src, dst) - - -def get_custom_objects(): - return {'customSGD': customSGD, - 'customAdam': customAdam, - 'customAdamW': customAdamW} - - -class LR_scheduler(callbacks.LearningRateScheduler): - """ - Custom callback to decay the learning rate. Schedule follows a 'step' decay. - - Reference - --------- - https://github.com/keras-team/keras/issues/898#issuecomment-285995644 - """ - def __init__(self, lr_decay=0.1, epoch_milestones=[]): - self.lr_decay = lr_decay - self.epoch_milestones = epoch_milestones - super().__init__(schedule=self.schedule) - - def schedule(self, epoch): - current_lr = K.eval(self.model.optimizer.lr) - if epoch in self.epoch_milestones: - new_lr = current_lr * self.lr_decay - print('Decaying the learning rate to {}'.format(new_lr)) - else: - new_lr = current_lr - return new_lr - - -class LRHistory(callbacks.Callback): - """ - Custom callback to save the learning rate history - - Reference - --------- - https://stackoverflow.com/questions/49127214/keras-how-to-output-learning-rate-onto-tensorboard - """ - def __init__(self): # add other arguments to __init__ if needed - super().__init__() - - def on_epoch_end(self, epoch, logs=None): - logs.update({'lr': K.eval(self.model.optimizer.lr).astype(np.float64)}) - super().on_epoch_end(epoch, logs) - - -def launch_tensorboard(port=6006): - subprocess.call(['tensorboard', - '--logdir', '{}'.format(paths.get_logs_dir()), - '--port', '{}'.format(port), - '--host', '0.0.0.0']) - - -def get_callbacks(CONF, use_lr_decay=True): - """ - Get a callback list to feed fit_generator. - #TODO Use_remote callback needs proper configuration - #TODO Add ReduceLROnPlateau callback? 
- - Parameters - ---------- - CONF: dict - - Returns - ------- - List of callbacks - """ - - calls = [] - - # Add mandatory callbacks - calls.append(callbacks.TerminateOnNaN()) - calls.append(LRHistory()) - - # Add optional callbacks - if use_lr_decay: - milestones = np.array(CONF['training']['lr_step_schedule']) * CONF['training']['epochs'] - milestones = milestones.astype(np.int) - calls.append(LR_scheduler(lr_decay=CONF['training']['lr_step_decay'], - epoch_milestones=milestones.tolist())) - - if CONF['monitor']['use_tensorboard']: - calls.append(callbacks.TensorBoard(log_dir=paths.get_logs_dir(), write_graph=False)) - - # # Let the user launch Tensorboard - # print('Monitor your training in Tensorboard by executing the following comand on your console:') - # print(' tensorboard --logdir={}'.format(paths.get_logs_dir())) - # Run Tensorboard on a separate Thread/Process on behalf of the user - port = os.getenv('monitorPORT', 6006) - port = int(port) if len(str(port)) >= 4 else 6006 - subprocess.run(['fuser', '-k', '{}/tcp'.format(port)]) # kill any previous process in that port - p = Process(target=launch_tensorboard, args=(port,), daemon=True) - p.start() - - if CONF['monitor']['use_remote']: - calls.append(callbacks.RemoteMonitor()) - - if CONF['training']['use_validation'] and CONF['training']['use_early_stopping']: - calls.append(callbacks.EarlyStopping(patience=int(0.1 * CONF['training']['epochs']))) - - if CONF['training']['ckpt_freq'] is not None: - calls.append(callbacks.ModelCheckpoint( - os.path.join(paths.get_checkpoints_dir(), 'epoch-{epoch:02d}.hdf5'), - verbose=1, - period=max(1, int(CONF['training']['ckpt_freq'] * CONF['training']['epochs'])))) - - if not calls: - calls = None - - return calls