2016-05-04 8 views
0

Ich bin auf der Suche nach einer Implementierung der Yeo-Johnson power transformation in Python. Leider hat SciPy Bibliothek die Implementierung von Box-Cox (gilt nur für positive Zahlen)!Python-Bibliothek: Yeo-Johnson-Transformation

Im Moment verwende ich ein R-Paket (caret::preProcess) über RPY2; Allerdings hat es einen großen Leistungseinbruch.

Dank

PS: Aufruf caret :: Vorprozess von Python:

#!/usr/bin/env python 
from rpy2.robjects.functions import SignatureTranslatedFunction 
import rpy2.robjects as robjects 
from rpy2.robjects.packages import importr 

r_caret = importr("caret") 
r_stats = importr("stats") 

feature_in = list(range(-5, 5)) 

r_caret.preProcess = SignatureTranslatedFunction(r_caret.preProcess, {'r_lambda': 'lambda'}) 
r_scale = r_caret.preProcess(robjects.DataFrame({'a': robjects.FloatVector(feature_in)}), 
          method=robjects.StrVector(["center", "scale", "YeoJohnson"]), 
          r_lambda=robjects.IntVector([-10, 10, 0.1]), 
          na_remove=False) 
temp = r_stats.predict(r_scale, robjects.DataFrame({'a': robjects.FloatVector(feature_in)})) 
feature_out = np.array(temp.rx2(1), dtype=np.float64) 

print(feature_in) # output [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4] 
print(feature_out) # output [-1.46625361 -1.14751589 -0.82725836 -0.50508336 -0.1803113 0.14850228 0.48337191 0.8224039 1.16416754 1.5079769] 

Die Lösung ist wie die folgenden:

#!/usr/bin/env python 
# -*- coding: UTF-8 -*- 
import warnings 
import numpy as np 
import pandas as pd 
import sys 

__author__ = "Mohsen Mesgarpour" 
__copyright__ = "Copyright 2016, https://github.com/mesgarpour" 
__credits__ = ["Mohsen Mesgarpour"] 
__license__ = "GPL" 
__version__ = "1.0" 
__maintainer__ = "Mohsen Mesgarpour" 
__email__ = "[email protected]" 


class YeoJohnson: 
    """ 
    Computing Yeo-Johnson transofrmation, which is an extension of Box-Cox transformation 
    but can handle both positive and negative values. 

    References: 
    Weisberg, S. (2001). Yeo-Johnson Power Transformations. 
    Department of Applied Statistics, University of Minnesota. Retrieved June, 1, 2003. 
    https://www.stat.umn.edu/arc/yjpower.pdf 

    Adapted from CRAN - Package VGAM 
    """ 
    def fit(self, y, lmbda, derivative=0, epsilon=np.finfo(np.float).eps, inverse=False): 
     """ 
     :param y: The variable to be transformed (numeric array). 
     :param lmbda: The function's Lambda value (numeric value or array). 
     :param derivative: The derivative with respect to lambda 
     (non-negative integer; default: ordinary function evaluation). 
     :param epsilon: The lambda's tolerance (positive value). 
     :param inverse: The inverse transformation option (logical value). 
     :return: The Yeo-Johnson transformation or its inverse, or its derivatives with respect to lambda, of y. 
     """ 
     # Validate arguments 
     self.__validate(y, lmbda, derivative, epsilon, inverse) 

     # initialise 
     y = np.array(y, dtype=float) 
     result = y 
     if not (isinstance(lmbda, list) or isinstance(lmbda, np.ndarray)): 
      lmbda, y = np.broadcast_arrays(lmbda, y) 
      lmbda = np.array(lmbda, dtype=float) 
     l0 = np.abs(lmbda) > epsilon 
     l2 = np.abs(lmbda - 2) > epsilon 

     # Inverse 
     with warnings.catch_warnings(): # suppress warnings 
      warnings.simplefilter("ignore") 
      if inverse is True: 
       mask = np.where(((y >= 0) & l0) == True) 
       result[mask] = np.power(np.multiply(y[mask], lmbda[mask]) + 1, 1/lmbda[mask]) - 1 

       mask = np.where(((y >= 0) & ~l0) == True) 
       result[mask] = np.expm1(y[mask]) 

       mask = np.where(((y < 0) & l2) == True) 
       result[mask] = 1 - np.power(np.multiply(-(2 - lmbda[mask]), y[mask]) + 1, 1/(2 - lmbda[mask])) 

       mask = np.where(((y < 0) & ~l2) == True) 
       result[mask] = -np.expm1(-y[mask]) 

      # Derivative 
      else: 
       if derivative == 0: 
        mask = np.where(((y >= 0) & l0) == True) 
        result[mask] = np.divide(np.power(y[mask] + 1, lmbda[mask]) - 1, lmbda[mask]) 

        mask = np.where(((y >= 0) & ~l0) == True) 
        result[mask] = np.log1p(y[mask]) 

        mask = np.where(((y < 0) & l2) == True) 
        result[mask] = np.divide(-(np.power(-y[mask] + 1, 2 - lmbda[mask]) - 1), 2 - lmbda[mask]) 

        mask = np.where(((y < 0) & ~l2) == True) 
        result[mask] = -np.log1p(-y[mask]) 

       # Not Derivative 
       else: 
        p = self.fit(y, lmbda, derivative=derivative - 1, epsilon=epsilon, inverse=inverse) 

        mask = np.where(((y >= 0) & l0) == True) 
        result[mask] = np.divide(np.multiply(np.power(y[mask] + 1, lmbda[mask]), 
                 np.power(np.log1p(y[mask]), derivative)) - 
              np.multiply(derivative, p[mask]), lmbda[mask]) 

        mask = np.where(((y >= 0) & ~l0) == True) 
        result[mask] = np.divide(np.power(np.log1p(y[mask]), derivative + 1), derivative + 1) 

        mask = np.where(((y < 0) & l2) == True) 
        result[mask] = np.divide(-(np.multiply(np.power(-y[mask] + 1, 2 - lmbda[mask]), 
                    np.power(-np.log1p(-y[mask]), derivative)) - 
                 np.multiply(derivative, p[mask])), 2 - lmbda[mask]) 

        mask = np.where(((y < 0) & ~l2) == True) 
        result[mask] = np.divide(np.power(-np.log1p(-y[mask]), derivative + 1), derivative + 1) 

     return result 

    @staticmethod 
    def __validate(y, lmbda, derivative, epsilon, inverse): 
     try: 
      if not isinstance(y, (list, np.ndarray, pd.Series)): 
       raise Exception("Argument 'y' must be a list!") 
      if not isinstance(lmbda, (int, float, np.int, np.float)): 
       if not isinstance(lmbda, (list, np.ndarray, pd.Series)) or len(lmbda) != len(y): 
        raise Exception("Argument 'lmbda' must be a number " 
            "or a list, which its length matches 'y' argument!") 
      if not isinstance(derivative, (int, float, np.int, np.float)) or derivative < 0: 
       raise Exception("Argument 'derivative' must be a non-negative integer!") 
      if not isinstance(epsilon, (int, float, np.int, np.float)) or epsilon <= 0: 
       raise Exception("Argument 'epsilon' must be a positive number!") 
      if not isinstance(inverse, bool): 
       raise Exception("Argument 'inverse' must be boolean!") 
      if inverse is True and derivative != 0: 
       raise Exception("Argument 'derivative' must be zero " 
           "when argument 'inverse' is 'True'!") 
     except(): 
      sys.exit() 
+0

Wenn Sie von VGAM vs caret anrufen, erhalten Sie ein schnelleres Ergebnis (Ich würde mir vorstellen, aber einen Versuch wert) - kann nichts für Python direkt finden – Chris

+0

Ich werde es versuchen. Vielen Dank. –

+0

Die Lösung ist auf dem Gist hochgeladen: https://gist.github.com/mesgarpour/f24769cd186e2db853957b10ff6b7a95 –

Antwort

2

Diese Dinge recht einfach sind mit numpy DIY:

In [2]: def yj(lam, y): 
    lam, y = np.broadcast_arrays(lam, y) 
    l0 = lam == 0 
    l2 = lam == 2 
    result = np.empty_like(lam, dtype=float) 

    mask1 = (~l0) & (y >= 0) 
    ym, lm = y[mask1], lam[mask1] 
    result[mask1] = ((ym + 1.)**lm - 1.)/lm 

    mask2 = (l0) & (y >= 0) 
    result[mask2] = np.log1p(y[mask2]) 

    # TODO: handle two more cases here, this is similar 

    return result 

Beachten Sie, dass das obige eine ungetestete Demo ist, die dtypes nicht prüft, keine Sorgfalt für Randfälle (kleines y, kleines Lambda) usw. anwendet.

+0

Dieser Code schätzt nicht den Wert von "Lam" von Samples obwohl. – ogrisel