Source code for spykes.io.datasets

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import zipfile

import numpy as np
import requests
import scipy.io

from .. import config


def _urlretrieve(url, filename):
    '''Convenience function for downloading a file with requests.

    Args:
        url (str): The URL of the file to download.
        filename (str): The path to save the file.
    '''
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)


def _load_file(fpath):
    '''Checks whether a file is a .mat or .npy file and loads it.

    This is a convenience method for the other loading functions.

    Args:
        fpath (str): The exact path of where data is located.

    Returns:
        dict or numpy array: The loaded dataset (a dict for :data:`.mat`
        files, a numpy array for :data:`.npy` files).
    '''
    if fpath.endswith('.mat'):
        data = scipy.io.loadmat(fpath)
    elif fpath.endswith('.npy'):
        data = np.load(fpath)
    else:
        raise ValueError('Invalid file type: {}'.format(fpath))
    return data
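
# A minimal sketch of how these two helpers fit together, assuming a
# hypothetical URL and destination path (neither is part of this module):
#
#     url = 'https://example.com/data.npy'    # hypothetical source
#     fpath = '/tmp/data.npy'                 # hypothetical destination
#     _urlretrieve(url, fpath)                # streams the file to disk
#     arr = _load_file(fpath)                 # dispatches on the extension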


def load_spikefinder_data(dir_name='spikefinder'):
    '''Downloads and returns a dataset of paired calcium recordings.

    This dataset was used for the Spikefinder competition
    (DOI: 10.1101/177956), and consists of datasets of paired calcium traces
    and spike trains collected from multiple sources.

    Args:
        dir_name (str): Specifies the directory to which the data files
            should be downloaded. This is concatenated with the user-set
            data directory.

    Returns:
        tuple: Paths to the downloaded training and testing datasets. Each
        dataset is a CSV which can be loaded using Pandas,
        :data:`pd.read_csv(path)`.

        * :data:`train_data`: List of pairs of strings, where each pair
          consists of the path to the calcium data (inputs) and the path to
          the spike data (ground truth) for that dataset pair.
        * :data:`test_data`: List of strings, where each string is the path
          to a testing dataset.
    '''
    dpath = os.path.join(config.get_data_directory(), dir_name)
    if not os.path.exists(dpath):
        os.makedirs(dpath)

    url_template = (
        'https://s3.amazonaws.com/neuro.datasets/'
        'challenges/spikefinder/spikefinder.{version}.zip'
    )

    # Downloads and unzips a single dataset split.
    def _download(version):
        zipname = os.path.join(dpath, '{}.zip'.format(version))
        if not os.path.exists(zipname):
            url = url_template.format(version=version)
            _urlretrieve(url, zipname)

        # Unzips the associated files.
        unzip_path = os.path.join(dpath, 'spikefinder.{}'.format(version))
        if not os.path.exists(unzip_path):
            zipref = zipfile.ZipFile(zipname, 'r')
            zipref.extractall(dpath)
            zipref.close()

        return unzip_path

    # Downloads the two datasets.
    train_path, test_path = _download('train'), _download('test')
    train_template = os.path.join(train_path, '{index}.train.{mode}.csv')
    test_template = os.path.join(test_path, '{index}.test.calcium.csv')

    # Converts each dataset to a file path.
    train_paths = [(
        train_template.format(index=i, mode='calcium'),
        train_template.format(index=i, mode='spikes'),
    ) for i in range(1, 11)]
    test_paths = [test_template.format(index=i) for i in range(1, 6)]

    # Checks that all of the files exist.
    assert all(
        os.path.exists(i) and os.path.exists(j) for i, j in train_paths
    )
    assert all(os.path.exists(i) for i in test_paths)

    return train_paths, test_paths
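
# A minimal usage sketch for the Spikefinder loader, assuming the default
# data directory and a working network connection; pandas is not imported by
# this module:
#
#     import pandas as pd
#     train_paths, test_paths = load_spikefinder_data()
#     calcium_path, spikes_path = train_paths[0]   # first training pair
#     calcium = pd.read_csv(calcium_path)          # calcium traces (inputs)
#     spikes = pd.read_csv(spikes_path)            # spike trains (ground truth)
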
def load_reward_data(dir_name='reward'):
    '''Downloads and returns the data for the PopVis example.

    This is the data for the Neural Coding Reward example as well as the
    PopVis example. The dataset comes from Ramkumar et al.'s "Premotor and
    Motor Cortices Encode Reward" paper.

    Args:
        dir_name (str): Specifies the directory to which the data files
            should be downloaded. This is concatenated with the user-set
            data directory.

    Returns:
        tuple: The two downloaded files.

        * :data:`sess_one_mat`: :data:`.mat` file for Monkey M, Session 1.
        * :data:`sess_four_mat`: :data:`.mat` file for Monkey M, Session 4.
    '''
    dpath = os.path.join(config.get_data_directory(), dir_name)
    if not os.path.exists(dpath):
        os.makedirs(dpath)

    def download_mat(fname, url):
        '''Helper function for downloading the existing MAT files.'''
        fpath = os.path.join(dpath, fname)
        if not os.path.exists(fpath):  # Checks the full download path.
            _urlretrieve(url, fpath)
        return _load_file(fpath)

    # Downloads sess_one_mat.
    sess_one_mat = download_mat(
        fname='Mihili_07112013.mat',
        url='https://ndownloader.figshare.com/files/5652051',
    )

    # Downloads sess_four_mat.
    sess_four_mat = download_mat(
        fname='Mihili_08062013.mat',
        url='https://ndownloader.figshare.com/files/5652060',
    )

    return sess_one_mat, sess_four_mat
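
# A minimal usage sketch for the reward dataset, assuming the files can be
# fetched from figshare; the available keys depend on the contents of the
# MAT files themselves:
#
#     sess_one_mat, sess_four_mat = load_reward_data()
#     print(sorted(sess_one_mat.keys()))   # scipy.io.loadmat returns a dict
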
def load_neuropixels_data(dir_name='neuropixels'):
    '''Downloads and returns data for the Neuropixels example.

    The dataset comes from `UCL's Cortex Lab
    <http://data.cortexlab.net/dualPhase3/data/>`_.

    Args:
        dir_name (str): Specifies the directory to which the data files
            should be downloaded. This is concatenated with the user-set
            data directory.

    Returns:
        dict: A dictionary where each key corresponds to a needed file.
    '''
    dpath = os.path.join(config.get_data_directory(), dir_name)
    if not os.path.exists(dpath):
        os.makedirs(dpath)

    base_url = 'http://data.cortexlab.net/dualPhase3/data/'
    file_dict = dict()

    parent_fnames = [
        'experiment1stimInfo.mat',
        'experiment2stimInfo.mat',
        'experiment3stimInfo.mat',
        'timeCorrection.mat',
        'timeCorrection.npy',
    ]
    parent_dir = [
        'frontal/',
        'posterior/',
    ]
    subdir_fnames = [
        'spike_clusters.npy',
        'spike_templates.npy',
        'spike_times.npy',
        'templates.npy',
        'whitening_mat_inv.npy',
        'cluster_groups.csv',
        'channel_positions.npy',
    ]

    for name in parent_fnames:
        fname = os.path.join(dpath, name)
        url = os.path.join(base_url, name)
        if not os.path.exists(fname):
            _urlretrieve(url, fname)
        file_dict[name] = _load_file(fname)

    for directory in parent_dir:
        if not os.path.exists(os.path.join(dpath, directory)):
            os.makedirs(os.path.join(dpath, directory))
        for subdir in subdir_fnames:
            fname = os.path.join(dpath, directory, subdir)
            url = os.path.join(base_url, directory, subdir)
            if not os.path.exists(fname):
                _urlretrieve(url, fname)
            key = os.path.join(directory, subdir)
            if subdir == 'cluster_groups.csv':
                file_dict[key] = np.recfromcsv(fname, delimiter='\t')
            else:
                file_dict[key] = _load_file(fname)

    return file_dict
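
# A minimal usage sketch for the Neuropixels loader; keys are the file names
# listed above, with the 'frontal/' or 'posterior/' prefix for files that
# live in a subdirectory:
#
#     data = load_neuropixels_data()
#     spike_times = data['frontal/spike_times.npy']      # one entry per spike
#     clusters = data['frontal/spike_clusters.npy']      # cluster id per spike
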
def load_reaching_data(dir_name='reaching'):
    '''Downloads and returns data for the Reaching Dataset example.

    The dataset is publicly available `here <http://goo.gl/eXeUz8>`_. Because
    this is hosted on DropBox, you have to manually visit the link, then
    download it to the appropriate location (usually
    :data:`~/.spykes/reaching/reaching_dataset.h5`).

    Args:
        dir_name (str): Specifies the directory to which the data files
            should be downloaded. This is concatenated with the user-set
            data directory.

    Returns:
        deep dish dataset: The dataset, loaded using
        :meth:`deepdish.io.load`.
    '''
    # Import is performed here so that deepdish is not required for all of
    # the "datasets" functions.
    import deepdish

    dpath = os.path.join(config.get_data_directory(), dir_name)
    if not os.path.exists(dpath):
        os.makedirs(dpath)

    # Downloads the file if it doesn't exist already.
    fpath = os.path.join(dpath, 'reaching_dataset.h5')
    if not os.path.exists(fpath):
        url = 'http://goo.gl/eXeUz8'
        _urlretrieve(url, fpath)

    data = deepdish.io.load(fpath)
    return data
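
# A minimal usage sketch for the reaching dataset, assuming the HDF5 file has
# already been placed at ~/.spykes/reaching/reaching_dataset.h5 (see the note
# about the DropBox link above):
#
#     data = load_reaching_data()
#     print(list(data.keys()))   # e.g. 'events', 'features', 'neurons_M1', ...
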
def load_reaching_xy(event='goCueTime', feature='endpointOfReach',
                     neuron='M1', window_min=0., window_max=500.,
                     threshold=10., dir_name='reaching'):
    '''Extracts the reach direction and M1 spikes from the reaching dataset.

    Args:
        event (str): Event to which to align each trial; :data:`goCueTime`,
            :data:`targetOnTime` or :data:`rewardTime`.
        feature (str): The feature to get; :data:`endpointOfReach` or
            :data:`reward`.
        neuron (str): The neuron response to use, either :data:`M1` or
            :data:`PMd`.
        window_min (double): The lower window value around the align cue
            from which to get spike counts, in milliseconds.
        window_max (double): The upper window value around the align cue
            from which to get spike counts, in milliseconds.
        threshold (double): The threshold for selecting high-firing neurons,
            representing the minimum firing rate in Hz.
        dir_name (str): Specifies the directory to which the data files
            should be downloaded. This is concatenated with the user-set
            data directory.

    Returns:
        tuple: The :data:`x` and :data:`y` features of the dataset.

        * :data:`x`: Array with shape :data:`(num_samples, num_features)`
        * :data:`y`: Array with shape :data:`(num_samples, num_neurons)`
    '''
    # Loads the formatted data, if it has already been processed.
    fname = '{}.npz'.format('_'.join('{}'.format(i) for i in [
        event, feature, neuron, window_min, window_max, threshold
    ]))
    fpath = os.path.join(config.get_data_directory(), dir_name, fname)
    if os.path.exists(fpath):
        with open(fpath, 'rb') as f:
            data = np.load(f)
            return data['x'], data['y']

    # Loads the reaching data normally.
    reaching_data = load_reaching_data(dir_name)
    events = list(reaching_data['events'].keys())
    features = list(reaching_data['features'].keys())

    # Checks the input arguments, throwing helpful error messages if needed.
    if event not in events:
        raise ValueError('Invalid align event: "{}". Must be one of {}.'
                         .format(event, events))
    if feature not in features:
        raise ValueError('Invalid feature: "{}". Must be one of {}.'
                         .format(feature, features))
    if neuron not in ('M1', 'PMd'):
        raise ValueError('Invalid neuron type: "{}". Must be either "M1" or '
                         '"PMd".'.format(neuron))

    neuron_key = 'neurons_{}'.format(neuron)
    spike_times = np.asarray([
        np.squeeze(np.sort(s)) for s in reaching_data[neuron_key]
    ])
    spike_freqs = np.asarray([len(t) / (t[-1] - t[0]) for t in spike_times])

    # Applies the cutoff threshold.
    thresh_idxs = np.where(spike_freqs > threshold)[0]
    spike_times = spike_times[thresh_idxs]
    spike_freqs = spike_freqs[thresh_idxs]

    # Gets the reach angle, in radians.
    x = reaching_data['features'][feature] * np.pi / 180.0
    x = np.arctan2(np.sin(x), np.cos(x))

    # Gets the spike responses.
    event_data = reaching_data['events'][event]

    def _get_spikecounts(n):
        return np.asarray([
            np.sum(np.all((
                n >= e + 1e-3 * window_min,
                n <= e + 1e-3 * window_max,
            ), axis=0)) for e in event_data
        ])

    y = np.stack([_get_spikecounts(n) for n in spike_times]).transpose(1, 0)

    # Saves the dataset after processing it.
    with open(fpath, 'wb') as f:
        np.savez(f, x=x, y=y)

    return x, y
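
# A minimal usage sketch for the processed reaching data, using the default
# alignment event and neuron type:
#
#     x, y = load_reaching_xy(event='goCueTime', neuron='M1')
#     # x: reach angles in radians; y: spike counts per trial and neuron.
#     print(x.shape, y.shape)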