Source code for dataTools

## MyDataPy
## Data processing shelf
## Simon Lebastard - Nov 2018

## External requirements ###########################

import numpy as np
import pandas as pd

# Identification with Google account to access data
#from google.colab import auth
#auth.authenticate_user()
#
## This shelf requires gspread. To install:
## !pip install --upgrade -q gspread
#import gspread
#from oauth2client.client import GoogleCredentials
#
#gc = gspread.authorize(GoogleCredentials.get_application_default())

## Internal requirements ##########################


###################################################

###############################
# Data loading and formatting #
###############################

def load_data(dsID, set_type='tr', folder_name='data'):
    """
    Loads a dataset from a folder name and a dataset number

    Parameters
    ----------
    dsID : int
        the dataset number. Your input data should be stored in files that
        look like 'X<set_type>k.csv', where k=dsID (e.g. 'Xtr0.csv')
    set_type : str
        tag inserted between the 'X'/'Y' prefix and the dataset number in
        the file names (default 'tr'). The matching 'Y' file is only read
        when set_type is 'tr'
    folder_name : str
        folder where your data is stored

    Returns
    -------
    pandas dataframe
        a 'Sequence' column read from the X file. When set_type is 'tr', the
        columns of the Y file are concatenated to it, aligned on the index

    ToDo
    ----
    allow for this function to take as input any file name, with a default
    convention name
    """
    file_suffix = set_type + str(dsID) + '.csv'

    # `str` replaces the `np.unicode_` alias, which was removed in NumPy 2.0
    Xdata_file = folder_name + '/X' + file_suffix
    X = pd.read_csv(Xdata_file, header=None, names=['Sequence'],
                    dtype={'Sequence': str})
    if set_type != 'tr':
        return X

    Ydata_file = folder_name + '/Y' + file_suffix
    Y = pd.read_csv(Ydata_file, index_col=0, dtype={'Bound': np.dtype(bool)})
    # Shift the Y index by 1000*dsID so that it can align with the 0-based
    # index of X (assumes the ids of dataset k start at 1000*k -- TODO confirm)
    Y.index = Y.index - 1000*dsID
    return pd.concat([X, Y], axis=1)
def format_preds(preds):
    """
    Translates signed predictions into 0/1 predictions

    Parameters
    ----------
    preds : numpy array
        signed predictions (-1/1, or signed with an amplitude standing for
        confidence)

    Returns
    -------
    integer predictions: 1 for strictly positive inputs, 0 for negative
    inputs. An input of exactly 0 maps to 0.5 before the integer cast, hence
    to 0
    """
    signs = np.sign(preds)
    # Map {-1, 0, 1} onto {0, 0.5, 1}; the integer cast then truncates 0.5 to 0
    unit_interval = (signs + 1) * 0.5
    return unit_interval.astype(int)
def data_normalization(data, offset_column=False):
    """
    Performs data normalization (zero mean, unit standard deviation per column)

    Parameters
    ----------
    data : numpy array
        samples along axis 0, features along axis 1
    offset_column : boolean
        true if you want a column of ones appended to the right of your
        normalized data

    Returns
    -------
    numpy array
        normalized, and optionally offset. Constant columns (zero standard
        deviation) are centered to 0 instead of producing NaN
    """
    d_mean = np.mean(data, axis=0)
    d_std = np.std(data, axis=0)
    # A constant feature has zero std; dividing by it would yield NaN (0/0).
    # Scale such features by 1 so they simply normalize to 0.
    d_std = np.where(d_std == 0, 1.0, d_std)
    data = (data - d_mean)/d_std
    if offset_column:
        data = np.hstack((data, np.ones((len(data), 1))))
    return data
#####################################
# Weighting different classifiers   #
# to potentially do better than all #
#####################################
def voting(preds, wghts, stochastic=False):
    """
    Produces a label prediction from many predictors

    Parameters
    ----------
    preds : array of predictions
        the weighted average is taken along axis 1, so as written each row
        is combined into one output label and `wghts` must have one entry
        per column (presumably one column per predictor -- confirm against
        callers)
    wghts : float array
        confidence weights given to the respective predictors
    stochastic : boolean
        if you set this to be True, the consensus prediction is drawn from a
        Bernoulli distribution whose parameter is the weighted vote

    Returns
    -------
    array of integer label predictions, one per row of `preds`. In the
    deterministic case a vote strictly above 0.5 gives 1, anything else
    (including an exact tie at 0.5) gives 0
    """
    consensus = np.average(preds, weights=wghts, axis=1)

    if not stochastic:
        # sign(vote - 0.5) is in {-1, 0, 1}; rescaled to {0, 0.5, 1} and
        # truncated by the integer cast, so a tie falls to 0
        side = np.sign(consensus - 0.5)
        return ((side + 1) * 0.5).astype(int)

    draws = np.random.binomial(1, p=consensus)
    return draws.astype(int)
##########################################
# Mutual-information based dim reduction #
##########################################
def get_MI(data, labels, word_idx, bins):
    """
    Returns the mutual information between a word and a binary label

    Parameters
    ----------
    data : numpy array
        2D array, one row per sample and one column per word
    labels : numpy array of booleans
        binary labels (booleans or 0/1), one per row of data
    word_idx : int
        the index corresponding to the word you wish to compute MI for.
        You must have defined a table mapping word_idxs to words before you
        can use this function
    bins : list
        discretization bins for probability computation. Each element is
        passed to np.isin, so it is either a single value or a collection
        of the values belonging to that bin

    Returns
    -------
    float
        mutual information (in nats) between word and binary label. It is
        0.0 when the labels hold a single class or when data has no rows
    """
    data = np.asarray(data)
    # Flatten so that the boolean masks line up with the rows of `data`
    labels = np.asarray(labels).ravel()

    n = data.shape[0]
    bound_mask = labels == 1
    n_b = int(np.count_nonzero(bound_mask))
    n_ub = n - n_b
    # With a single class (or no sample) the label carries no information;
    # returning early also avoids dividing by an empty class count below
    if n_b == 0 or n_ub == 0:
        return 0.0

    p_b = n_b*1.0/n
    p_ub = 1.0 - p_b
    word_values = data[:, word_idx]

    MI = 0.0
    for abin in bins:
        in_bin = np.isin(word_values, abin)
        n_cond = int(np.count_nonzero(in_bin))
        if n_cond == 0:
            # Empty bin: contributes nothing to the sum
            continue
        n_bin_b = int(np.count_nonzero(in_bin & bound_mask))
        n_bin_ub = n_cond - n_bin_b

        # P(bin | label) for each class
        b_cond = n_bin_b*1.0/n_b
        ub_cond = n_bin_ub*1.0/n_ub
        # P(label | bin) for each class
        cond_b = n_bin_b*1.0/n_cond
        cond_ub = 1.0 - cond_b

        # Each term is P(bin, label) * log(P(label | bin) / P(label));
        # terms with zero joint probability are skipped (limit of x*log(x))
        if cond_b > 0:
            MI = MI + b_cond*p_b*np.log(cond_b/p_b)
        if cond_ub > 0:
            MI = MI + ub_cond*p_ub*np.log(cond_ub/p_ub)
    return MI
def argmax_MI(data, labels, n_feats, bins):
    """
    Returns the n_feats words that share the most information with the
    binary label

    Parameters
    ----------
    data : numpy array
        2D array, one row per sample and one column per word
    labels : numpy array of booleans
    n_feats : int
        number of high-information words to yield
    bins : list
        Discretization bins for probability computation

    Returns
    -------
    max_MI_idx : numpy array of int
        indices of the (at most) n_feats words with the highest mutual
        information, sorted by decreasing mutual information
    MI : numpy array of float
        the mutual information of those words, in the same order
    """
    n, p = data.shape
    MI = np.zeros(p)
    for word_idx in range(p):
        MI[word_idx] = get_MI(data, labels, word_idx, bins)
    # argsort is ascending: reverse it and keep exactly the first n_feats
    # entries (the previous slice kept n_feats + 1 of them)
    max_MI_idx = np.argsort(MI)[::-1][:n_feats]
    return max_MI_idx, MI[max_MI_idx]
def MI_dimRed(data, labels, n_feats, bins):
    """
    Reduces the dimensionality of a bag-of-words representation based on
    mutual information

    Parameters
    ----------
    data : numpy array
        2D array, one row per sample and one column per word
    labels : numpy array of booleans
    n_feats : int
        number of high-information words to yield
    bins : list
        Discretization bins for probability computation

    Returns
    -------
    data_lowdim : numpy array
        reduced BoW representation: the columns of data selected by
        argmax_MI, in the order it ranked them
    idx : numpy array of int
        indices of the selected columns in the original data
    MI_ranked : numpy array of float
        mutual information of the selected columns, in the same order
    """
    # The ranking must be computed against `labels`; the previous code passed
    # an undefined name (`features`) and raised a NameError
    idx, MI_ranked = argmax_MI(data, labels, n_feats, bins)
    data_lowdim = np.take(data, idx, axis=1)
    return data_lowdim, idx, MI_ranked