Source code for clusteror.core

'''
This module contains the ``Clusteror`` class, which encapsulates the raw data
to discover clusters from and the cleaned data for a clusteror to run on.

The clustering model encompasses two parts:

1. Neural network:
   Pre-training (often encountered in a Deep Learning context) is
   implemented so that the neural network maps the higher dimensional
   input data to a one dimensional representation. Ideally this mapping
   is one-to-one.
   A Denoising Autoencoder (DA) or Stacked Denoising Autoencoder (SDA) is
   implemented for this purpose.
2. One dimensional clustering model:
   A separate model segments the samples against the one dimensional
   representation. Two models are available in this class definition:
       * K-Means
       * Valley model

The pivotal idea here is that, given the neural network is a good one-to-one
mapper, the separate clustering model on the one dimensional representation
is equivalent to a clustering model on the original high dimensional data.

Note
----
The valley model is explained in detail in module ``clusteror.utils``.
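
Example
-------
A minimal end-to-end sketch (illustrative only; assumes ``df`` is a Pandas
DataFrame whose columns are already scaled into ``[-1, 1]``)::

    from clusteror.core import Clusteror

    clusteror = Clusteror(df)
    clusteror.cleaned_data = df
    clusteror.train_da_dim_reducer(batch_size=50, verbose=False)
    clusteror.reduce_to_one_dim()
    clusteror.train_valley(bins=100, contrast=0.3)
    clusteror.add_cluster()
    # cluster IDs are now in clusteror.raw_data['cluster']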
'''
import os
import sys
import json
import timeit
import warnings
import numpy as np
import pandas as pd
import pickle as pk
import theano
import theano.tensor as T
from sklearn.cluster import KMeans
from theano import function
from theano import shared
from theano.tensor.shared_randomstreams import RandomStreams
from .nn import dA
from .nn import SdA
from .settings import numpy_random_seed
from .settings import theano_random_seed
from .utils import find_local_extremes


class OutRangeError(Exception):
    '''
    Exception thrown when cleaned data goes beyond the range ``[-1, 1]``.
    '''
    pass

class Clusteror(object):
    '''
    ``Clusteror`` class can train neural networks *DA* or *SDA*, train
    taggers, or load saved models from files.

    Parameters
    ----------
    raw_data : Pandas DataFrame
        Dataframe read from the data source. It can be the original dataset
        without any preprocessing or with a certain level of manipulation
        for future analysis.

    Attributes
    ----------
    _raw_data : Pandas DataFrame
        Stores the original dataset. It is the dataset that later
        post-clustering performance analysis will be based on.
    _cleaned_data : Pandas DataFrame
        Preprocessed data. Does not necessarily have the same number of
        columns as ``_raw_data``, as a categorical column can derive
        multiple columns. Because the ``tanh`` function is used as the
        activation function for symmetry considerations, all columns should
        have values in the range ``[-1, 1]``, otherwise an
        ``OutRangeError`` will be raised.
    _network : str
        **da** for *DA*; **sda** for *SDA*. Facilitates functions called
        with one or the other algorithm.
    _da_dim_reducer : Theano function
        Keeps the Theano function obtained from the trained DA model.
        Reduces the dimension of the cleaned data down to one.
    _sda_dim_reducer : Theano function
        Keeps the Theano function obtained from the trained SDA model.
        Reduces the dimension of the cleaned data down to one.
    _one_dim_data : Numpy Array
        The dimension-reduced one dimensional data.
    _valley : Python function
        Trained valley model tagging samples with their one dimensional
        representation.
    _kmeans : Scikit-Learn K-Means model
        Trained K-Means model tagging samples with their one dimensional
        representation.
    _tagger : str
        Keeps a record of which tagger is implemented.
    _field_importance : List
        Keeps the list of coefficients that weight the clustering emphasis
        of each field.
    '''
    def __init__(self, raw_data):
        self._raw_data = raw_data.copy()

    @classmethod
    def from_csv(cls, filepath, **kwargs):
        '''
        Class method for directly reading a CSV file.

        Parameters
        ----------
        filepath : str
            Path to the CSV file.
        **kwargs : keyword arguments
            Other keyword arguments passed to ``pandas.read_csv``.
        '''
        raw_data = pd.read_csv(filepath, **kwargs)
        return cls(raw_data)

    @property
    def raw_data(self):
        '''
        Pandas DataFrame: For assigning new values to ``_raw_data``.
        '''
        return self._raw_data

    @raw_data.setter
    def raw_data(self, raw_data):
        self._raw_data = raw_data

    @property
    def cleaned_data(self):
        '''
        Pandas DataFrame: For assigning the cleaned dataframe to
        ``_cleaned_data``.
        '''
        return self._cleaned_data

    @cleaned_data.setter
    def cleaned_data(self, cleaned_data):
        self._cleaned_data = cleaned_data

    @property
    def da_dim_reducer(self):
        '''
        Theano function: Function that reduces dataset dimension.
        Attribute ``_network`` is given **da** to designate the method of
        the autoencoder as ``DA``.
        '''
        return self._da_dim_reducer

    @da_dim_reducer.setter
    def da_dim_reducer(self, da_dim_reducer):
        self._da_dim_reducer = da_dim_reducer
        self._network = 'da'

    @property
    def sda_dim_reducer(self):
        '''
        Theano function: Function that reduces dataset dimension.
        Attribute ``_network`` is given **sda** to designate the method of
        the autoencoder as ``SDA``.
        '''
        return self._sda_dim_reducer

    @sda_dim_reducer.setter
    def sda_dim_reducer(self, sda_dim_reducer):
        self._sda_dim_reducer = sda_dim_reducer
        self._network = 'sda'

    @property
    def one_dim_data(self):
        '''
        Numpy Array: Stores the output of the neural network, which has
        dimension one.
        '''
        return self._one_dim_data

    @one_dim_data.setter
    def one_dim_data(self, one_dim_data):
        self._one_dim_data = one_dim_data

    @property
    def valley(self):
        '''
        Python function: Trained on the dimension-reduced one dimensional
        data; segregates subjects into concentrations of existence in
        subsets of ``[-1, 1]`` by locating the "valleys" in the
        distribution landscape. ``_tagger`` is given **valley** to
        facilitate follow-up usages.
        '''
        return self._valley

    @valley.setter
    def valley(self, valley):
        self._valley = valley
        self._tagger = 'valley'

    @property
    def kmeans(self):
        '''
        Scikit-Learn K-Means model: Trained on the dimension-reduced one
        dimensional data; segregates subjects into concentrations of
        existence in subsets of ``[-1, 1]`` with the K-Means algorithm.
        ``_tagger`` is given **kmeans** to facilitate follow-up usages.
        '''
        return self._kmeans

    @kmeans.setter
    def kmeans(self, kmeans):
        self._kmeans = kmeans
        self._tagger = 'kmeans'

    @property
    def tagger(self):
        '''
        str: Names the tagger when necessary, which facilitates, e.g.,
        prefixing the filepath.
        '''
        return self._tagger

    @tagger.setter
    def tagger(self, tagger):
        self._tagger = tagger

    @property
    def field_importance(self):
        '''
        List: Significance given to each field when the neural network is
        trained. Fields with a larger number will be given more attention.

        Note
        ----
        The importance is only meaningful relative to other fields. If no
        values are specified, all fields are treated equally.

        Parameters
        ----------
        field_importance : List or Dict, default None (List of Ones)
            * If a list is designated, all fields should be assigned an
              importance, viz, the length of the list should be equal to
              the length of the features training the neural network.
            * It can also be given as a dict. In such a case, the fields
              can be selectively given a value. The dict key is the field
              name and the value is the importance. Fields not included
              will be initiated with the default value one. A warning will
              be issued when a key is not on the list of field names,
              mostly because of a typo.
        '''
        return self._field_importance

    @field_importance.setter
    def field_importance(self, field_importance):
        n_fields = self._cleaned_data.shape[1]
        if isinstance(field_importance, list):
            assert len(field_importance) == n_fields
            self._field_importance = field_importance
        elif isinstance(field_importance, dict):
            self._field_importance = [1] * n_fields
            columns = self._cleaned_data.columns.tolist()
            for field, importance in field_importance.items():
                try:
                    index = columns.index(field)
                    self._field_importance[index] = importance
                except ValueError:
                    msg = '{} isn\'t in fields'.format(field)
                    warnings.warn(msg)

    def _check_cleaned_data(self):
        '''
        Checks on cleaned data before any work is done. This list of checks
        can be extended when more checks should be included.
        '''
        cleaned_data_info = (
            'Need to first assign your cleaned data to attribute '
            '"_cleaned_data"'
        )
        assert self._cleaned_data is not None, cleaned_data_info
        if (self._cleaned_data.max() > 1).any():
            raise OutRangeError('Maximum should be less than or equal to 1.')
        if (self._cleaned_data.min() < -1).any():
            raise OutRangeError(
                'Minimum should be greater than or equal to -1.'
            )

    def _check_network(self):
        '''
        Checks if the network has been correctly set up.
        '''
        network_info = (
            'Clusteror needs to know which network to use in '
            'attribute "_network"'
        )
        assert self._network is not None, network_info
        info = 'Train {} with {} or load it first!'
        if self._network == 'da':
            info = info.format('DA', '"train_da_dim_reducer"')
            assert self._da_dim_reducer is not None, info
        elif self._network == 'sda':
            info = info.format('SDA', '"train_sda_dim_reducer"')
            assert self._sda_dim_reducer is not None, info

    def _prepare_network_training(self, batch_size):
        '''
        Preparations needed to kick off training neural networks.

        Parameters
        ----------
        batch_size : int
            Size of each training batch. Necessary to derive the number of
            batches.
        '''
        self.np_rs = np.random.RandomState(numpy_random_seed)
        self.theano_rs = RandomStreams(self.np_rs.randint(theano_random_seed))
        # compute number of minibatches for training, validation and testing
        self.data = np.asarray(self._cleaned_data, dtype=theano.config.floatX)
        self.train_set = shared(value=self.data, borrow=True)
        # compute number of minibatches for training
        # needs one more batch if residual is non-zero
        # e.g. 5 rows with batch size 2 needs 5 // 2 + 1
        self.n_train_batches = (
            self.data.shape[0] // batch_size +
            int(self.data.shape[0] % batch_size > 0)
        )

    def _pretraining_early_stopping(
            self,
            train_func,
            n_train_batches,
            min_epochs,
            patience,
            patience_increase,
            improvement_threshold,
            verbose,
            **kwargs
            ):
        '''
        Scheme of early stopping if no substantial improvement can be
        observed.

        Parameters
        ----------
        train_func : Theano function
            Function that takes in the training set and updates internal
            parameters, in this case the weights and biases in the neural
            network, and returns the evaluation of the cost function after
            each training step.
        n_train_batches : int
            Number of training batches derived from the total number of
            training samples and the batch size.
        min_epochs : int
            The minimum number of training epochs to run. It can be
            exceeded depending on the setup of patience and ad-hoc training
            progress.
        patience : int
            True number of training epochs to run if larger than
            ``min_epochs``. Note it is potentially increased during the
            training if the cost is better than the expectation from the
            current cost.
        patience_increase : int
            Coefficient used to increase patience against the number of
            epochs that have been run.
        improvement_threshold : float, between 0 and 1
            Minimum improvement considered substantial, i.e. the ratio of
            the new cost over the existing lowest cost must be lower than
            this value.
        verbose : boolean
            Prints out training information at each epoch if true.
        **kwargs : keyword arguments
            All keyword arguments are passed on to ``train_func``.
        '''
        n_epochs = 0
        done_looping = False
        check_frequency = min(min_epochs, patience // 3)
        best_cost = np.inf
        assert improvement_threshold > 0 and improvement_threshold < 1
        start_time = timeit.default_timer()
        while (n_epochs < min_epochs) or (not done_looping):
            n_epochs += 1
            # go through the training set
            c = []
            for minibatch_index in range(n_train_batches):
                c.append(train_func(minibatch_index, **kwargs))
            cost = np.mean(c)
            if verbose:
                print(
                    'Training epoch {n_epochs}, '.format(n_epochs=n_epochs) +
                    'cost {cost}.'.format(cost=cost)
                )
            if n_epochs % check_frequency == 0:
                # check cost every check_frequency epochs
                if cost < best_cost:
                    benchmark_better_cost = best_cost * improvement_threshold
                    if cost < benchmark_better_cost:
                        # increase patience if cost improves a lot
                        # the increase is a multiple of the epochs that
                        # have been run
                        patience = max(patience, n_epochs * patience_increase)
                        if verbose:
                            print(
                                'Epoch {n_epochs},'.format(
                                    n_epochs=n_epochs
                                ) +
                                ' patience increased to {patience}'.format(
                                    patience=patience
                                )
                            )
                    best_cost = cost
            if n_epochs > patience:
                done_looping = True
        end_time = timeit.default_timer()
        if verbose:
            training_time = (end_time - start_time)
            sys.stderr.write(
                os.path.split(__file__)[1] +
                ' ran for {time:.2f}m\n'.format(time=training_time / 60.)
            )
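
    # Illustrative note on the early-stopping bookkeeping above (numbers are
    # hypothetical): with patience=60, patience_increase=2 and
    # improvement_threshold=0.98, a cost at epoch 40 that falls below
    # 0.98 * best_cost lifts patience to max(60, 40 * 2) = 80, so training
    # keeps going until the epoch count exceeds the (possibly re-raised)
    # patience.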

    def train_da_dim_reducer(
            self,
            field_importance=None,
            batch_size=50,
            corruption_level=0.3,
            learning_rate=0.002,
            min_epochs=200,
            patience=60,
            patience_increase=2,
            improvement_threshold=0.98,
            verbose=False,
            ):
        '''
        Trains a ``DA`` neural network.

        Parameters
        ----------
        field_importance : List or Dict, default None (List of Ones)
            * If a list is designated, all fields should be assigned an
              importance, viz, the length of the list should be equal to
              the length of the features training the neural network.
            * It can also be given as a dict. In such a case, the fields
              can be selectively given a value. The dict key is the field
              name and the value is the importance. Fields not included
              will be initiated with the default value one. A warning will
              be issued when a key is not on the list of field names,
              mostly because of a typo.
        batch_size : int
            Size of each training batch. Necessary to derive the number of
            batches.
        corruption_level : float, between 0 and 1
            Dropout rate in reading input, a typical practice in deep
            learning to avoid overfitting.
        learning_rate : float
            Step size for the gradient descent algorithm.
        min_epochs : int
            The minimum number of training epochs to run. It can be
            exceeded depending on the setup of patience and ad-hoc training
            progress.
        patience : int
            True number of training epochs to run if larger than
            ``min_epochs``. Note it is potentially increased during the
            training if the cost is better than the expectation from the
            current cost.
        patience_increase : int
            Coefficient used to increase patience against the number of
            epochs that have been run.
        improvement_threshold : float, between 0 and 1
            Minimum improvement considered substantial, i.e. the ratio of
            the new cost over the existing lowest cost must be lower than
            this value.
        verbose : boolean, default False
            Prints out training information at each epoch if true.
        '''
        self._network = 'da'
        # note .field_importance goes through the property setter, which
        # transforms the format of the input
        self.field_importance = field_importance
        self._check_cleaned_data()
        self._prepare_network_training(batch_size=batch_size)
        # allocate symbolic variables for the data
        # index to a [mini]batch
        index = T.lscalar('index')
        x = T.matrix('x')
        da = dA(
            n_visible=self.data.shape[1],
            n_hidden=1,
            np_rs=self.np_rs,
            theano_rs=self.theano_rs,
            field_importance=field_importance,
            input_data=x,
        )
        cost, updates = da.get_cost_updates(
            corruption_level=corruption_level,
            learning_rate=learning_rate
        )
        train_da = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: self.train_set[index * batch_size: (index + 1) * batch_size]
            }
        )
        self._pretraining_early_stopping(
            train_func=train_da,
            n_train_batches=self.n_train_batches,
            min_epochs=min_epochs,
            patience=patience,
            patience_increase=patience_increase,
            improvement_threshold=improvement_threshold,
            verbose=verbose
        )
        self.da = da
        self._da_dim_reducer = function([x], da.get_hidden_values(x))
        self.da_reconstruct = function(
            [x],
            da.get_reconstructed_input(da.get_hidden_values(x))
        )
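
    # Illustrative call (column names are hypothetical; assumes
    # ``cleaned_data`` has already been assigned):
    #
    #     clusteror.train_da_dim_reducer(
    #         field_importance={'spend': 3, 'visits': 2},
    #         batch_size=50,
    #         corruption_level=0.3,
    #         verbose=True,
    #     )
    #
    # Columns not named in the dict keep the default importance of one.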

    def train_sda_dim_reducer(
            self,
            field_importance=None,
            batch_size=50,
            hidden_layers_sizes=[20],
            corruption_levels=[0.3],
            learning_rate=0.002,
            min_epochs=200,
            patience=60,
            patience_increase=2,
            improvement_threshold=0.98,
            verbose=False
            ):
        '''
        Trains an ``SDA`` neural network.

        Parameters
        ----------
        field_importance : List or Dict, default None (List of Ones)
            * If a list is designated, all fields should be assigned an
              importance, viz, the length of the list should be equal to
              the length of the features training the neural network.
            * It can also be given as a dict. In such a case, the fields
              can be selectively given a value. The dict key is the field
              name and the value is the importance. Fields not included
              will be initiated with the default value one. A warning will
              be issued when a key is not on the list of field names,
              mostly because of a typo.
        batch_size : int
            Size of each training batch. Necessary to derive the number of
            batches.
        hidden_layers_sizes : List of ints
            Number of neurons in the hidden layers (all but the input
            layer).
        corruption_levels : List of floats, between 0 and 1
            Dropout rates in reading input, a typical practice in deep
            learning to avoid overfitting.
        learning_rate : float
            Step size for the gradient descent algorithm.
        min_epochs : int
            The minimum number of training epochs to run. It can be
            exceeded depending on the setup of patience and ad-hoc training
            progress.
        patience : int
            True number of training epochs to run if larger than
            ``min_epochs``. Note it is potentially increased during the
            training if the cost is better than the expectation from the
            current cost.
        patience_increase : int
            Coefficient used to increase patience against the number of
            epochs that have been run.
        improvement_threshold : float, between 0 and 1
            Minimum improvement considered substantial, i.e. the ratio of
            the new cost over the existing lowest cost must be lower than
            this value.
        verbose : boolean, default False
            Prints out training information at each epoch if true.
        '''
        # note .field_importance goes through the property setter, which
        # transforms the format of the input
        self.field_importance = field_importance
        assert hidden_layers_sizes is not None
        assert isinstance(corruption_levels, list)
        assert len(hidden_layers_sizes) == len(corruption_levels)
        self._network = 'sda'
        self._check_cleaned_data()
        self._prepare_network_training(batch_size=batch_size)
        # for the purpose of this exercise, restrict the final layer to 1d
        hidden_layers_sizes.append(1)
        corruption_levels.append(0)
        x = T.matrix('x')
        sda = SdA(
            n_ins=self.data.shape[1],
            hidden_layers_sizes=hidden_layers_sizes,
            np_rs=self.np_rs,
            theano_rs=self.theano_rs,
            field_importance=field_importance,
            input_data=x
        )
        pretraining_fns = sda.pretraining_functions(
            train_set=self.train_set,
            batch_size=batch_size
        )
        for ind in range(sda.n_layers):
            self._pretraining_early_stopping(
                train_func=pretraining_fns[ind],
                n_train_batches=self.n_train_batches,
                min_epochs=min_epochs,
                patience=patience,
                patience_increase=patience_increase,
                improvement_threshold=improvement_threshold,
                verbose=verbose,
                corruption_level=corruption_levels[ind],
                learning_rate=learning_rate
            )
        self.sda = sda
        self._sda_dim_reducer = function([x], sda.get_final_hidden_layer(x))
        self.sda_reconstruct = function(
            [x],
            sda.get_first_reconstructed_input(sda.get_final_hidden_layer(x))
        )
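
    # Illustrative call (layer sizes are hypothetical): two hidden layers of
    # 20 and 10 neurons; the final one-neuron layer and its zero corruption
    # level are appended automatically inside the method.
    #
    #     clusteror.train_sda_dim_reducer(
    #         hidden_layers_sizes=[20, 10],
    #         corruption_levels=[0.3, 0.2],
    #         min_epochs=200,
    #         verbose=True,
    #     )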

    def _prefix_filepath(self, prefix_type, filepath):
        '''
        Prefixes a filepath with the type stored in the file.

        Examples
        --------
        >>> clusteror._prefix_filepath('network', 'a/b')
        'a/da_b'

        Note
        ----
        Only the filename part is prefixed if there are directories in the
        path.

        Parameters
        ----------
        prefix_type : str
            The type prefixing the filepath.
        filepath : str
            Filepath to be prefixed.

        Returns
        -------
        Prefixed filepath.
        '''
        filepath_list = list(os.path.split(filepath))
        # look up the private attribute, e.g. _network or _tagger
        filepath_list[-1] = (
            getattr(self, '_' + prefix_type) + '_' + filepath_list[-1]
        )
        filepath = os.path.join(*filepath_list)
        return filepath

    def save_dim_reducer(
            self,
            filepath='dim_reducer.pk',
            include_network=False
            ):
        '''
        Saves the dimension reducer from the neural network training.

        Parameters
        ----------
        filepath : str
            Filename to store the dimension reducer.
        include_network : boolean
            If true, prefix the filepath with the network type.
        '''
        self._check_network()
        if include_network:
            filepath = self._prefix_filepath('network', filepath)
        with open(filepath, 'wb') as f:
            if self._network == 'da':
                pk.dump(self._da_dim_reducer, f)
            elif self._network == 'sda':
                pk.dump(self._sda_dim_reducer, f)

    def load_dim_reducer(self, filepath='dim_reducer.pk'):
        '''
        Loads a saved dimension reducer. The network type needs to be named
        first.

        Parameters
        ----------
        filepath : str
            File path to the saved dimension reducer.
        '''
        assert self._network is not None
        with open(filepath, 'rb') as f:
            if self._network == 'da':
                self._da_dim_reducer = pk.load(f)
            elif self._network == 'sda':
                self._sda_dim_reducer = pk.load(f)
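
    # Illustrative save/load round trip (file names are hypothetical;
    # loading requires the network type to be known beforehand):
    #
    #     clusteror.save_dim_reducer('dim_reducer.pk', include_network=True)
    #     # ... later, on a fresh Clusteror instance ...
    #     fresh._network = 'da'
    #     fresh.load_dim_reducer('da_dim_reducer.pk')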

    def reduce_to_one_dim(self):
        '''
        Reduces the dimension of the input dataset to one before the
        tagging in the next step. The input of the Theano function is the
        cleaned data and the output is one dimensional data stored in
        ``_one_dim_data``.
        '''
        self._check_cleaned_data()
        self._check_network()
        if self._network == 'da':
            self._one_dim_data = self._da_dim_reducer(self._cleaned_data)
        elif self._network == 'sda':
            self._one_dim_data = self._sda_dim_reducer(self._cleaned_data)
        self._one_dim_data = self._one_dim_data[:, 0]

    def _check_one_dim_data(self):
        '''
        Checks if ``one_dim_data`` exists. Gives error info if not.
        '''
        one_dim_data_info = 'Get reduced one dimensional data first!'
        assert self._one_dim_data is not None, one_dim_data_info

    def train_valley(self, bins=100, contrast=0.3):
        '''
        Trains the ability to cut the universe of samples into clusters
        based on how the dimension-reduced dataset assembles in a
        histogram. Unlike K-Means, there is no need to preset the number of
        clusters.

        Parameters
        ----------
        bins : int
            Number of bins to aggregate the one dimensional data.
        contrast : float, between 0 and 1
            Threshold used to define local minima and local maxima.
            Detailed explanation in ``utils.find_local_extremes``.

        Note
        ----
        When getting only one cluster, check the distribution of
        ``one_dim_data``. Likely the data points flock too close to each
        other. Try increasing ``bins`` first. If that does not work, try
        different neural networks with more or fewer layers and more or
        fewer neurons.
        '''
        bins = np.linspace(-1, 1, bins + 1)
        # use the left point of each bin to name the bin
        left_points = np.asarray(bins[:-1])
        self._check_one_dim_data()
        cuts = pd.cut(self._one_dim_data, bins=bins)
        bin_counts = cuts.describe().reset_index().loc[:, 'counts']
        local_min_inds, local_mins, local_max_inds, local_maxs = (
            find_local_extremes(bin_counts, contrast)
        )
        self.trained_bins = left_points[local_min_inds].tolist() + [1]
        if self.trained_bins[0] != -1:
            self.trained_bins = [-1] + self.trained_bins

        def valley(one_dim_data):
            cuts = pd.cut(
                one_dim_data,
                bins=self.trained_bins,
                labels=list(range(len(self.trained_bins) - 1))
            )
            return cuts.get_values()

        self._valley = valley
        self._tagger = 'valley'
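
    # Illustrative outcome (numbers are hypothetical): if valleys are found
    # at -0.2 and 0.5 in the histogram, then trained_bins == [-1, -0.2, 0.5, 1]
    # and the valley tagger maps each one dimensional value to label 0, 1 or
    # 2 depending on which bin it falls into.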

    def _check_tagger(self):
        '''
        Checks tagger existence. Gives error info if not.
        '''
        tagger_info = 'Clusteror needs to know which tagger to use'
        assert self._tagger is not None, tagger_info
        info = 'Train {} with {} or load it first'
        if self._tagger == 'valley':
            info = info.format('"valley"', '"train_valley"')
            assert self._valley is not None, info
        elif self._tagger == 'kmeans':
            info = info.format('"kmeans"', '"train_kmeans"')
            assert self._kmeans is not None, info

    def save_valley(self, filepath, include_taggername=False):
        '''
        Saves the valley tagger.

        Parameters
        ----------
        filepath : str
            File path to save the tagger to.
        include_taggername : boolean, default False
            Include the **valley_** prefix in the filename if true.
        '''
        self._check_tagger()
        if include_taggername:
            filepath = self._prefix_filepath('tagger', filepath)
        with open(filepath, 'w') as f:
            json.dump(self.trained_bins, f)

    def load_valley(self, filepath):
        '''
        Loads a saved valley tagger from a file and recreates the valley
        function from the saved parameters.

        Parameters
        ----------
        filepath : str
            File path to the file saving the valley tagger.
        '''
        with open(filepath, 'r') as f:
            self.trained_bins = json.load(f)

        def valley(one_dim_data):
            cuts = pd.cut(
                one_dim_data,
                bins=self.trained_bins,
                labels=list(range(len(self.trained_bins) - 1))
            )
            return cuts.get_values()

        self._valley = valley
        self._tagger = 'valley'

    def train_kmeans(self, n_clusters=10, **kwargs):
        '''
        Trains a K-Means model on top of the one dimensional data derived
        from the dimension reducers.

        Parameters
        ----------
        n_clusters : int
            The number of clusters required to start a K-Means learning.
        **kwargs : keyword arguments
            Any other keyword arguments passed on to the Scikit-Learn
            K-Means model.
        '''
        self._check_one_dim_data()
        self._kmeans = KMeans(n_clusters=n_clusters, **kwargs)
        self._kmeans.fit(self._one_dim_data.reshape(-1, 1))
        self._tagger = 'kmeans'
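
    # Illustrative call (arguments are hypothetical); extra keyword
    # arguments go straight to sklearn.cluster.KMeans:
    #
    #     clusteror.train_kmeans(n_clusters=5, n_init=20, random_state=0)
    #     clusteror.add_cluster()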

    def save_kmeans(self, filepath, include_taggername=False):
        '''
        Saves the K-Means model to the named file path, optionally with a
        prefix to indicate this saves a K-Means model.

        Parameters
        ----------
        filepath : str
            File path for saving the model.
        include_taggername : boolean, default False
            Include the **kmeans_** prefix in the filename if true.
        '''
        self._check_tagger()
        if include_taggername:
            filepath = self._prefix_filepath('tagger', filepath)
        with open(filepath, 'wb') as f:
            pk.dump(self._kmeans, f)

    def load_kmeans(self, filepath):
        '''
        Loads a saved K-Means tagger from a file.

        Parameters
        ----------
        filepath : str
            File path to the file saving the K-Means tagger.
        '''
        with open(filepath, 'rb') as f:
            self._kmeans = pk.load(f)
        self._tagger = 'kmeans'

    def add_cluster(self):
        '''
        Tags each sample according to its reduced one dimensional value.
        Adds an extra column **'cluster'** to ``raw_data``, giving a
        zero-based cluster ID.
        '''
        self._check_tagger()
        if self._tagger == 'valley':
            self.raw_data.loc[:, 'cluster'] = self._valley(self._one_dim_data)
        elif self._tagger == 'kmeans':
            self.raw_data.loc[:, 'cluster'] = (
                self._kmeans.predict(self._one_dim_data.reshape(-1, 1))
            )