|
| 1 | +""" |
| 2 | +Diagnostics in probit regression. |
| 3 | + |
| 4 | +""" |
| 5 | +__author__ = ( |
| 6 | + |
| 7 | +) |
| 8 | + |
| 9 | +from math import sqrt, pi |
| 10 | + |
| 11 | +from libpysal.common import MISSINGVALUE |
| 12 | +import numpy as np |
| 13 | +import numpy.linalg as la |
| 14 | +import scipy.sparse as SP |
| 15 | +from scipy import stats |
| 16 | +from scipy.stats import norm |
| 17 | + |
| 18 | +__all__ = [ |
| 19 | + "pred_table", |
| 20 | + "probit_fit", |
| 21 | + "probit_lrtest", |
| 22 | + "mcfad_rho", |
| 23 | + "probit_ape", |
| 24 | + "sp_tests", |
| 25 | + "moran_KP", |
| 26 | +] |
| 27 | + |
| 28 | + |
| 29 | +def pred_table(reg): |
| 30 | + """ |
| 31 | + Calculates a table comparing predicted to actual outcomes for a |
| 32 | + discrete choice model |
| 33 | +
|
| 34 | + Parameters |
| 35 | + ---------- |
| 36 | + reg : regression object |
| 37 | + output instance from a probit regression model |
| 38 | +
|
| 39 | + Returns |
| 40 | + ---------- |
| 41 | + predtab_vals : dictionary |
| 42 | + includes margins and cells of actual and predicted |
| 43 | + values for discrete choice model |
| 44 | + actpos : observed positives (=1) |
| 45 | + actneg : observed negatives (=0) |
| 46 | + predpos : predicted positives |
| 47 | + predneg : predicted negatives |
| 48 | + truepos : predicted 1 when actual = 1 |
| 49 | + falsepos : predicted 1 when actual = 0 |
| 50 | + trueneg : predicted 0 when actual = 0 |
| 51 | + falseneg : predicted 0 when actual = 1 |
| 52 | +
|
| 53 | + """ |
| 54 | + predtab_vals = {} |
| 55 | + pos = reg.y.sum() |
| 56 | + predtab_vals["actpos"] = int(pos) |
| 57 | + neg = reg.n - pos |
| 58 | + predtab_vals["actneg"] = int(neg) |
| 59 | + act1 = (reg.y == 1) * 1 |
| 60 | + act0 = (reg.y == 0) * 1 |
| 61 | + ppos = reg.predybin.sum() |
| 62 | + predtab_vals["predpos"] = ppos |
| 63 | + pneg = reg.n - ppos |
| 64 | + predtab_vals["predneg"] = pneg |
| 65 | + pred1 = (reg.predybin == 1) * 1 |
| 66 | + pred0 = (reg.predybin == 0) * 1 |
| 67 | + truep = (pred1 * act1) * 1 |
| 68 | + predtab_vals["truepos"] = truep.sum() |
| 69 | + truen = (pred0 * act0) * 1 |
| 70 | + predtab_vals["trueneg"] = truen.sum() |
| 71 | + fpos = (pred1 * act0) * 1 |
| 72 | + predtab_vals["falsepos"] = fpos.sum() |
| 73 | + fneg = (pred0 * act1) * 1 |
| 74 | + predtab_vals["falseneg"] = fneg.sum() |
| 75 | + |
| 76 | + return predtab_vals |
| 77 | + |
| 78 | + |
| 79 | +def probit_fit(reg): |
| 80 | + """ |
| 81 | + Various measures of fit for discrete choice models, derived from the |
| 82 | + prediction table (pred_table) |
| 83 | + |
| 84 | + Parameters |
| 85 | + ---------- |
| 86 | + reg : regression object |
| 87 | + output instance from a probit regression model |
| 88 | + must contain predtable attribute |
| 89 | +
|
| 90 | + Returns |
| 91 | + ---------- |
| 92 | + prob_fit : a dictionary containing various measures of fit |
| 93 | + TPR : true positive rate (sensitivity, recall, hit rate) |
| 94 | + TNR : true negative rate (specificity, selectivity) |
| 95 | + PREDPC : accuracy, percent correctly predicted |
| 96 | + BA : balanced accuracy |
| 97 | + |
| 98 | + """ |
| 99 | + |
| 100 | + prob_fit = {} |
| 101 | + prob_fit["TPR"] = 100.0 * reg.predtable["truepos"] / reg.predtable["actpos"] |
| 102 | + prob_fit["TNR"] = 100.0 * reg.predtable["trueneg"] / reg.predtable["actneg"] |
| 103 | + prob_fit["BA"] = (prob_fit["TPR"] + prob_fit["TNR"])/2.0 |
| 104 | + prob_fit["PREDPC"] = 100.0 * (reg.predtable["truepos"] + reg.predtable["trueneg"]) / reg.n |
| 105 | + |
| 106 | + return prob_fit |
| 107 | + |
| 108 | +def probit_lrtest(regprob): |
| 109 | + """ |
| 110 | + Likelihood ratio test statistic for probit model |
| 111 | +
|
| 112 | + Parameters |
| 113 | + ---------- |
| 114 | + regprob : probit regression object |
| 115 | +
|
| 116 | + Returns |
| 117 | + ------- |
| 118 | +
|
| 119 | + likratio : dictionary |
| 120 | + contains the statistic for the null model (L0), the LR test(likr), |
| 121 | + the degrees of freedom (df) and the p-value (pvalue) |
| 122 | + L0 : float |
| 123 | + log likelihood of null model |
| 124 | + likr : float |
| 125 | + likelihood ratio statistic |
| 126 | + df : integer |
| 127 | + degrees of freedom |
| 128 | + p-value : float |
| 129 | + p-value |
| 130 | + """ |
| 131 | + |
| 132 | + likratio = {} |
| 133 | + P = np.mean(regprob.y) |
| 134 | + L0 = regprob.n * (P * np.log(P) + (1 - P) * np.log(1 - P)) |
| 135 | + likratio["L0"] = L0 |
| 136 | + LR = -2.0 * (L0 - regprob.logl) |
| 137 | + likratio["likr"] = LR |
| 138 | + likratio["df"] = regprob.k |
| 139 | + pval = stats.chisqprob(LR, regprob.k) |
| 140 | + likratio["p-value"] = pval |
| 141 | + |
| 142 | + return likratio |
| 143 | + |
| 144 | +def mcfad_rho(regprob): |
| 145 | + """ |
| 146 | + McFadden's rho measure of fit |
| 147 | +
|
| 148 | + Parameters |
| 149 | + --------- |
| 150 | + regprob : probit regression object |
| 151 | +
|
| 152 | + Returns |
| 153 | + ------- |
| 154 | + rho : McFadden's rho (1 - L/L0) |
| 155 | + |
| 156 | + """ |
| 157 | + |
| 158 | + rho = 1.0 - (regprob.logl / regprob.L0) |
| 159 | + return rho |
| 160 | + |
| 161 | +def probit_ape(regprob): |
| 162 | + """ |
| 163 | + Average partial effects |
| 164 | +
|
| 165 | + Parameters |
| 166 | + ---------- |
| 167 | + regprob : probit regression object |
| 168 | +
|
| 169 | + Returns |
| 170 | + ------- |
| 171 | + tuple with: |
| 172 | + scale : the scale of the marginal effects, determined by regprob.scalem |
| 173 | + Default: 'phimean' (Mean of individual marginal effects) |
| 174 | + Alternative: 'xmean' (Marginal effects at variables mean) |
| 175 | + slopes : marginal effects or average partial effects (not for constant) |
| 176 | + slopes_vm : estimates of variance of marginal effects (not for constant) |
| 177 | + slopes_std_err : estimates of standard errors of marginal effects |
| 178 | + slopes_z_stat : tuple with z-statistics and p-values for marginal effects |
| 179 | + |
| 180 | + """ |
| 181 | + |
| 182 | + |
| 183 | + if regprob.scalem == "xmean": |
| 184 | + xmb = regprob.xmean.T @ regprob.betas |
| 185 | + scale = stats.norm.pdf(xmb) |
| 186 | + |
| 187 | + elif regprob.scalem == "phimean": |
| 188 | + scale = np.mean(regprob.phiy,axis=0) |
| 189 | + |
| 190 | + # average partial effects (no constant) |
| 191 | + slopes = (regprob.betas[1:,0] * scale).reshape(-1,1) |
| 192 | + |
| 193 | + # variance of partial effects |
| 194 | + xmb = regprob.xmean.T @ regprob.betas |
| 195 | + bxt = regprob.betas @ regprob.xmean.T |
| 196 | + dfdb = np.eye(regprob.k) - xmb * bxt |
| 197 | + slopes_vm = (scale ** 2) * ((dfdb @ regprob.vm) @ dfdb.T) |
| 198 | + |
| 199 | + # standard errors |
| 200 | + slopes_std_err = np.sqrt(slopes_vm[1:,1:].diagonal()).reshape(-1,1) |
| 201 | + |
| 202 | + # z-stats and p-values |
| 203 | + sl_zStat = slopes / slopes_std_err |
| 204 | + slopes_z_stat = [(sl_zStat[i,0],stats.norm.sf(abs(sl_zStat[i,0])) * 2) for i in range(len(slopes))] |
| 205 | + |
| 206 | + |
| 207 | + return (scale, slopes,slopes_vm[1:,1:],slopes_std_err,slopes_z_stat) |
| 208 | + |
| 209 | + |
| 210 | +def sp_tests(regprob=None, obj_list=None): |
| 211 | + """ |
| 212 | + Calculates tests for spatial dependence in Probit models |
| 213 | +
|
| 214 | + Parameters |
| 215 | + ---------- |
| 216 | + regprob : regression object from spreg |
| 217 | + output instance from a probit model |
| 218 | + obj_list : list |
| 219 | + list of regression elements from both libpysal and statsmodels' ProbitResults |
| 220 | + The list should be such as: |
| 221 | + [libpysal.weights, ProbitResults.fittedvalues, ProbitResults.resid_response, ProbitResults.resid_generalized] |
| 222 | + |
| 223 | + Returns |
| 224 | + ------- |
| 225 | + tuple with LM_Err, moran, ps as 2x1 arrays with statistic and p-value |
| 226 | + LM_Err: Pinkse |
| 227 | + moran : Kelejian-Prucha generalized Moran |
| 228 | + ps : Pinkse-Slade |
| 229 | +
|
| 230 | + Examples |
| 231 | + -------- |
| 232 | + The results of this function will be automatically added to the output of the probit model if using spreg. |
| 233 | + If using the Probit estimator from statsmodels, the user can call the function with the obj_list argument. |
| 234 | + The argument obj_list should be a list with the following elements, in this order: |
| 235 | + [libpysal.weights, ProbitResults.fittedvalues, ProbitResults.resid_response, ProbitResults.resid_generalized] |
| 236 | + The function will then return and print the results of the spatial diagnostics. |
| 237 | +
|
| 238 | + >>> import libpysal |
| 239 | + >>> import statsmodels.api as sm |
| 240 | + >>> import geopandas as gpd |
| 241 | + >>> from spreg.diagnostics_probit import sp_tests |
| 242 | +
|
| 243 | + >>> columb = libpysal.examples.load_example('Columbus') |
| 244 | + >>> dfs = gpd.read_file(columb.get_path("columbus.shp")) |
| 245 | + >>> w = libpysal.weights.Queen.from_dataframe(dfs) |
| 246 | + >>> w.transform='r' |
| 247 | +
|
| 248 | + >>> y = (dfs["CRIME"] > 40).astype(float) |
| 249 | + >>> X = dfs[["INC","HOVAL"]] |
| 250 | + >>> X = sm.add_constant(X) |
| 251 | +
|
| 252 | + >>> probit_mod = sm.Probit(y, X) |
| 253 | + >>> probit_res = probit_mod.fit(disp=False) |
| 254 | + >>> LM_err, moran, ps = sp_tests(obj_list=[w, probit_res.fittedvalues, probit_res.resid_response, probit_res.resid_generalized]) |
| 255 | + PROBIT MODEL DIAGNOSTICS FOR SPATIAL DEPENDENCE |
| 256 | + TEST DF VALUE PROB |
| 257 | + Kelejian-Prucha (error) 1 1.721 0.0852 |
| 258 | + Pinkse (error) 1 3.132 0.0768 |
| 259 | + Pinkse-Slade (error) 1 2.558 0.1097 |
| 260 | +
|
| 261 | + """ |
| 262 | + if regprob: |
| 263 | + w, Phi, phi, u_naive, u_gen, n = regprob.w, regprob.predy, regprob.phiy, regprob.u_naive, regprob.u_gen, regprob.n |
| 264 | + elif obj_list: |
| 265 | + w, fittedvalues, u_naive, u_gen = obj_list |
| 266 | + Phi = norm.cdf(fittedvalues) |
| 267 | + phi = norm.pdf(fittedvalues) |
| 268 | + n = w.n |
| 269 | + |
| 270 | + try: |
| 271 | + w = w.sparse |
| 272 | + except: |
| 273 | + w = w |
| 274 | + |
| 275 | + # Pinkse_error: |
| 276 | + Phi_prod = Phi * (1 - Phi) |
| 277 | + sig2 = np.sum((phi * phi) / Phi_prod) / n |
| 278 | + LM_err_num = np.dot(u_gen.T, (w * u_gen)) ** 2 |
| 279 | + trWW = np.sum((w * w).diagonal()) |
| 280 | + trWWWWp = trWW + np.sum((w * w.T).diagonal()) |
| 281 | + LM_err = float(1.0 * LM_err_num / (sig2 ** 2 * trWWWWp)) |
| 282 | + LM_err = np.array([LM_err, stats.chisqprob(LM_err, 1)]) |
| 283 | + # KP_error: |
| 284 | + moran = moran_KP(w, u_naive, Phi_prod) |
| 285 | + # Pinkse-Slade_error: |
| 286 | + u_std = u_naive / np.sqrt(Phi_prod) |
| 287 | + ps_num = np.dot(u_std.T, (w * u_std)) ** 2 |
| 288 | + trWpW = np.sum((w.T * w).diagonal()) |
| 289 | + ps = float(ps_num / (trWW + trWpW)) |
| 290 | + # chi-square instead of bootstrap. |
| 291 | + ps = np.array([ps, stats.chisqprob(ps, 1)]) |
| 292 | + |
| 293 | + if obj_list: |
| 294 | + from .output import _probit_out |
| 295 | + reg_simile = type('reg_simile', (object,), {})() |
| 296 | + reg_simile.Pinkse_error = LM_err |
| 297 | + reg_simile.KP_error = moran |
| 298 | + reg_simile.PS_error = ps |
| 299 | + print("PROBIT MODEL "+_probit_out(reg_simile, spat_diag=True, sptests_only=True)[1:]) |
| 300 | + |
| 301 | + return LM_err, moran, ps |
| 302 | + |
| 303 | +def moran_KP(w, u, sig2i): |
| 304 | + """ |
| 305 | + Calculates Kelejian-Prucha Moran-flavoured tests |
| 306 | +
|
| 307 | + Parameters |
| 308 | + ---------- |
| 309 | +
|
| 310 | + w : W |
| 311 | + PySAL weights instance aligned with y |
| 312 | + u : array |
| 313 | + nx1 array of naive residuals |
| 314 | + sig2i : array |
| 315 | + nx1 array of individual variance |
| 316 | +
|
| 317 | + Returns |
| 318 | + ------- |
| 319 | + moran : array, Kelejian-Prucha Moran's I with p-value |
| 320 | + """ |
| 321 | + try: |
| 322 | + w = w.sparse |
| 323 | + except: |
| 324 | + pass |
| 325 | + moran_num = np.dot(u.T, (w * u)) |
| 326 | + E = SP.lil_matrix(w.get_shape()) |
| 327 | + E.setdiag(sig2i.flat) |
| 328 | + E = E.asformat("csr") |
| 329 | + WE = w * E |
| 330 | + moran_den = np.sqrt(np.sum((WE * WE + (w.T * E) * WE).diagonal())) |
| 331 | + moran = float(1.0 * moran_num / moran_den) |
| 332 | + moran = np.array([moran, stats.norm.sf(abs(moran)) * 2.0]) |
| 333 | + return moran |
| 334 | + |
| 335 | + |
| 336 | +def _test(): |
| 337 | + import doctest |
| 338 | + |
| 339 | + doctest.testmod() |
| 340 | + |
| 341 | + |
| 342 | +if __name__ == "__main__": |
| 343 | + _test() |
0 commit comments