Source code for diamond.glms.cumulative_logistic

import logging
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.special import expit
from diamond.glms.glm import GLM
from diamond.solvers.diamond_cumulative_logistic import FixedHessianSolverCumulative

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)


class CumulativeLogisticRegression(GLM):
    """
    Cumulative logistic regression model with arbitrary crossed
    random effects and known covariances
    """
    def __init__(self, train_df, priors_df, copy=False, test_df=None):
        super(CumulativeLogisticRegression, self).__init__(
            train_df, priors_df, copy, test_df)
        self.solver = FixedHessianSolverCumulative()
        self.H_main = None
        self.H_inter_LUs = {}
        self.J = None
        self.response = None  # n x J matrix of response counts
    def initialize(self, formula, **kwargs):
        r"""
        Get ready to fit the model by parsing the formula, checking priors,
        and creating design, penalty, and Hessian matrices

        Args:
            formula (string): R-style formula expressing the model to fit.
                eg. :math:`y \sim 1 + x + (1 + x | group)`
        Keyword Args:
            kwargs: additional arguments to pass to solver.fit method
        """
        super(CumulativeLogisticRegression, self).initialize(formula, **kwargs)
        # the ordinal model uses separate solvers for main/interaction
        # effects; consequently, the main design matrix is stored as its
        # own object rather than alongside the grouping designs
        self.grouping_designs.pop('main', None)
    def fit(self, formula, **kwargs):
        r"""
        Fit the model specified by formula and training data

        Args:
            formula (string): R-style formula expressing the model to fit.
                eg. :math:`y \sim 1 + x + (1 + x | group)`
        Keyword Args:
            intercepts : array-like, optional. Initial values for intercepts. \
                Must be monotonically increasing and \
                have length == number of response levels minus one
            main : array-like, optional. Initial values for main effects. \
                Must have length == number of main effects specified in formula
            kwargs : additional arguments passed to solver.fit
        Returns:
            dict of parameter estimates with keys "intercepts", "main",
            and one key for each grouping factor
        """
        self.initialize(formula, **kwargs)
        # set initial parameter values
        self.effects['intercepts'] = kwargs.get(
            'intercepts', np.linspace(-1.0, 1.0, self.J - 1))
        self.effects['main'] = kwargs.get('main', np.zeros(self.num_main))
        for g in self.grouping_designs:
            self.effects[g] = np.zeros(self.grouping_designs[g].shape[1])
        self.effects = self.solver.fit(Y=self.response,
                                       main_design=self.main_design,
                                       inter_designs=self.grouping_designs,
                                       H_inter_LUs=self.H_inter_LUs,
                                       penalty_matrices=self.sparse_inv_covs,
                                       effects=self.effects,
                                       **kwargs)
        self._create_output_dict()
        return self.results_dict
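    # A minimal usage sketch, not part of the original module. The data
    # files, column names ('rating', 'x', 'user'), and comments below are
    # hypothetical; priors_df is assumed to hold the known random-effect
    # covariances in diamond's expected format.
    #
    #   import pandas as pd
    #   from diamond.glms.cumulative_logistic import CumulativeLogisticRegression
    #
    #   train_df = pd.read_csv('ratings.csv')   # columns: rating, x, user
    #   priors_df = pd.read_csv('priors.csv')   # known random-effect covariances
    #   model = CumulativeLogisticRegression(train_df, priors_df)
    #   results = model.fit('rating ~ 1 + x + (1 + x | user)')
    #   results['intercepts']                   # J - 1 increasing intercepts
    #   results['main']                         # DataFrame of main effects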
    def _create_response_matrix(self):
        LOGGER.info("Creating response matrix.")
        # self.response holds the response column name at this point;
        # it is replaced below by the n x J matrix of response counts
        df = pd.DataFrame({'index': self.train_df.index,
                           'y': self.train_df[self.response]})
        Y = pd.pivot_table(df,
                           index='index',
                           columns=['y'],
                           aggfunc=len,
                           fill_value=0).values
        self.response = Y
        self.J = self.response.shape[1]
        LOGGER.info("Created response matrix with shape (%d, %d)",
                    self.response.shape[0], self.response.shape[1])

    def _create_hessians(self):
        """
        Create fixed upper bounds on the Hessian matrices and
        precompute their sparse LU factorizations

        Args:
            None
        Returns:
            None
        """
        LOGGER.info("creating Hessians")
        for g in self.groupings.keys():
            X = self.grouping_designs[g]
            H = 0.5 * X.transpose().dot(X) + self.sparse_inv_covs[g].sparse_matrix
            self.H_inter_LUs[g] = sparse.linalg.splu(H.tocsc())

    def _create_output_dict(self):
        """
        Extract coefficients from the fitted model

        Args:
            None
        Returns:
            dictionary with keys "main", "intercepts", and one key \
            for each grouping factor. Values of the dictionary are dataframes
        """
        LOGGER.info("extracting coefficients")
        self.results_dict['intercepts'] = self.effects['intercepts']
        self.results_dict['main'] = pd.DataFrame({
            'variable': self.main_effects,
            'main_value': self.effects['main']})
        self._create_output_dict_inter()
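    # Why _create_hessians can fix the Hessian (a sketch, not in the
    # original source): for logistic-type links the likelihood's curvature
    # in the linear predictor is bounded, so a constant matrix of the form
    # 0.5 * X'X + penalty dominates the true Hessian at every iterate.
    # Factorizing that bound once with splu turns each subsequent update
    # into cheap triangular solves, e.g.:
    #
    #   lu = sparse.linalg.splu(H.tocsc())  # one-time factorization
    #   step = lu.solve(grad)               # reused at every iteration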
    def predict(self, new_df):
        """
        Use the estimated model to make predictions. \
        New levels of grouping factors are given fixed effects, \
        with zero random effects

        Args:
            new_df (DataFrame): data to make predictions on
        Returns:
            n x J matrix, where n is the number of rows \
            of new_df and J is the number \
            of possible response values. The (i, j) entry of \
            this matrix is the probability that observation i \
            realizes response level j.
        """
        eta = super(CumulativeLogisticRegression, self).predict(new_df)
        intercepts = self.effects['intercepts']
        J = self.J
        preds = np.zeros((len(eta), J))
        # cumulative link: P(y <= j) = expit(intercepts[j] + eta), so the
        # probability of each interior level is a difference of adjacent
        # cumulative probabilities
        preds[:, 0] = expit(intercepts[0] + eta)
        preds[:, J - 1] = 1.0 - expit(intercepts[J - 2] + eta)
        for j in range(1, J - 1):
            preds[:, j] = expit(intercepts[j] + eta) - expit(intercepts[j - 1] + eta)
        return preds
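# Worked sketch of the cumulative link used in predict above (illustrative
# numbers only): with J = 3 levels, intercepts = [-1.0, 1.0], and one
# observation with eta = 0.5,
#
#   P(y = 1) = expit(-1.0 + 0.5)                    ~ 0.378
#   P(y = 2) = expit(1.0 + 0.5) - expit(-1.0 + 0.5) ~ 0.440
#   P(y = 3) = 1.0 - expit(1.0 + 0.5)               ~ 0.182
#
# The J probabilities in each row sum to 1 by construction, because they
# telescope: they are differences of the increasing cumulative
# probabilities expit(intercepts[j] + eta).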