## MyDataPy
## Data processing shelf
## Simon Lebastard - Nov 2018
## External requirements ###########################
import numpy as np
import pandas as pd
# Identification with Google account to access data
#from google.colab import auth
#auth.authenticate_user()
#
## This shelf requires gspread. To install:
## !pip install --upgrade -q gspread
#import gspread
#from oauth2client.client import GoogleCredentials
#
#gc = gspread.authorize(GoogleCredentials.get_application_default())
## Internal requirements ##########################
###################################################
###############################
# Data loading and formatting #
###############################
def load_data(dsID, set_type='tr', folder_name='data'):
    """
    Loads a dataset from a folder name and a dataset number

    Parameters
    ----------
    dsID : int
        the dataset number. Input files are expected to be named
        'X<set_type><dsID>.csv' (plus 'Y<set_type><dsID>.csv' for the labels),
        e.g. 'Xtr0.csv' and 'Ytr0.csv'
    set_type : str
        tag inserted in the file names (default 'tr'). The label file is only
        read when set_type is 'tr'
    folder_name : str
        folder where your data is stored

    Returns
    -------
    pandas dataframe
        a 'Sequence' column read as text and, when set_type is 'tr', the
        columns of the label file joined on the index, which starts from 0

    ToDo
    ----
    allow for this function to take as input any file name, with a default
    convention name
    """
    file_suffix = set_type + str(dsID) + '.csv'

    Xdata_file = folder_name + '/X' + file_suffix
    # `np.unicode_` was removed in NumPy 2.0; the builtin `str` requests the
    # same text dtype from pandas on every NumPy version.
    X = pd.read_csv(Xdata_file, header=None, names=['Sequence'],
                    dtype={'Sequence': str})

    if set_type != 'tr':
        # No label file is read for the other set types
        return X

    Ydata_file = folder_name + '/Y' + file_suffix
    Y = pd.read_csv(Ydata_file, index_col=0, dtype={'Bound': np.dtype(bool)})
    # Shift the label ids by 1000*dsID so that they line up with the 0-based
    # row index of X before the column-wise concatenation.
    Y.index = Y.index - 1000 * dsID
    return pd.concat([X, Y], axis=1)
def data_normalization(data, offset_column=False):
    """
    Performs column-wise data normalization (zero mean, unit standard deviation)

    Parameters
    ----------
    data : numpy array
        2-D array; statistics are computed along axis 0 (one mean / std per column)
    offset_column : boolean
        true if you want a column of ones appended as the last column of your data

    Returns
    -------
    normalized data, with an extra trailing column of ones when offset_column
    is True (in that case the result is a numpy array built by np.hstack)

    Notes
    -----
    Columns with zero standard deviation (constant columns) are mapped to 0
    instead of producing NaN from a 0/0 division.
    """
    d_mean = np.mean(data, axis=0)
    d_std = np.std(data, axis=0)
    # A constant column has std == 0; divide by 1 there so it becomes all zeros
    safe_std = np.where(d_std == 0, 1.0, d_std)
    data = (data - d_mean) / safe_std
    if offset_column:
        data = np.hstack((data, np.ones((len(data), 1))))
    return data
#####################################
# Weighting different classifiers #
# to potentially do better than all #
#####################################
def voting(preds, wghts, stochastic=False):
    """
    Produces a label prediction from many predictors

    Parameters
    ----------
    preds : 2-D array of predictions
        one row per sample and one column per predictor: the weighted average
        is taken along axis 1, so the number of columns must match len(wghts)
    wghts : float array
        confidence weights given to the respective predictors
    stochastic : boolean
        if you set this to be True, each consensus prediction is drawn from a
        Bernoulli distribution whose parameter is the weighted vote (the votes
        must then lie in [0, 1])

    Returns
    -------
    integer array with one label prediction per row of preds
    """
    votes = np.average(preds, axis=1, weights=wghts)
    if stochastic:
        return np.random.binomial(1, p=votes).astype(int)
    # Deterministic consensus: label 1 only on a strict weighted majority,
    # so an exact tie (vote == 0.5) yields 0.
    return (votes > 0.5).astype(int)
##########################################
# Mutual-information based dim reduction #
##########################################
def get_MI(data, labels, word_idx, bins):
    """
    Returns the mutual information between a word and a binary label

    Parameters
    ----------
    data : numpy array
        2-D array, one row per sample and one column per word
    labels : numpy array of booleans
        one label per row of data; entries equal to 1 (True) form the "bound"
        class, every other entry is counted as "unbound"
    word_idx : int
        the index corresponding to the word you wish to compute MI for. You must have defined a table mapping word_idxs to words before you can use this function
    bins : list
        discretization bins for probability computation; each bin is the set
        of values (anything accepted by np.isin) that fall into it

    Returns
    -------
    float
        mutual information (natural log) between word and binary label.
        0.0 when all labels belong to a single class, since a constant label
        carries no information.
    """
    labels = np.asarray(labels).ravel()
    n = data.shape[0]
    column = data[:, word_idx]

    is_bound = (labels == 1)
    n_b = int(np.count_nonzero(is_bound))
    n_ub = n - n_b
    if n_b == 0 or n_ub == 0:
        # Only one class present: avoids dividing by an empty class count
        return 0.0

    p_b = float(n_b) / n
    p_ub = 1.0 - p_b

    MI = 0.0
    for abin in bins:
        in_bin = np.isin(column, abin)
        n_cond = int(np.count_nonzero(in_bin))
        if n_cond == 0:
            continue
        n_bin_b = int(np.count_nonzero(in_bin & is_bound))
        n_bin_ub = n_cond - n_bin_b

        # P(bin | class) for each class
        b_cond = float(n_bin_b) / n_b
        ub_cond = float(n_bin_ub) / n_ub
        # P(class | bin) for each class
        cond_b = float(n_bin_b) / n_cond
        cond_ub = float(n_bin_ub) / n_cond

        # Each term is P(bin, class) * log(P(class | bin) / P(class));
        # empty (bin, class) cells contribute nothing and are skipped so
        # that log(0) is never evaluated.
        if cond_b > 0:
            MI = MI + b_cond * p_b * np.log(cond_b / p_b)
        if cond_ub > 0:
            MI = MI + ub_cond * p_ub * np.log(cond_ub / p_ub)
    return float(MI)
def argmax_MI(data, labels, n_feats, bins):
    """
    Returns the n_feats words that share the most information with the binary label

    Parameters
    ----------
    data : numpy array
        2-D array, one column per word
    labels : numpy array of booleans
    n_feats : int
        number of high-information words to yield
    bins : list
        Discretization bins for probability computation

    Returns
    -------
    max_MI_idx : numpy array of ints
        column indices of the (at most) n_feats words with the highest mutual
        information, sorted by decreasing mutual information
    numpy array of floats
        the mutual information of those words, in the same order
    """
    n, p = data.shape
    MI = np.array(
        [get_MI(data, labels, word_idx, bins) for word_idx in range(p)],
        dtype=float,
    )
    # Descending order, then keep exactly n_feats entries (the former slice
    # `[-1:-(n_feats+2):-1]` returned one index too many).
    max_MI_idx = np.argsort(MI)[::-1][:n_feats]
    return max_MI_idx, MI[max_MI_idx]
def MI_dimRed(data, labels, n_feats, bins):
    """
    Reduces the dimensionality of a bag-of-words representation based on mutual information

    Parameters
    ----------
    data : numpy array
        2-D array, one row per sample and one column per word
    labels : numpy array of booleans
    n_feats : int
        number of high-information words to yield
    bins : list
        Discretization bins for probability computation

    Returns
    -------
    data_lowdim : numpy array
        reduced BoW representation: data restricted to the columns selected
        by argmax_MI, in ranking order
    idx : numpy array of ints
        indices of the selected columns in the original data
    MI_ranked : numpy array of floats
        mutual information of the selected columns, in the same order
    """
    # The ranking must be computed against the labels (the previous code
    # referenced an undefined name `features` here).
    idx, MI_ranked = argmax_MI(data, labels, n_feats, bins)
    data_lowdim = np.take(data, idx, axis=1)
    return data_lowdim, idx, MI_ranked