2016-05-03 18 views
0

Ich bin ziemlich neu und habe gesehen, dass andere denselben Fehler hatten, verstehe aber nicht, wie ich die dort genannten Lösungen umsetzen kann. Ich versuche, ein Random-Forest-Verfahren (maschinelles Lernen) mit der randomisierten Parametersuche (RandomizedSearchCV) von scikit-learn zu schreiben. Mit einer normalen Rastersuche (GridSearchCV) funktioniert es gut, mit der randomisierten Suche scheitert es jedoch mit einem seltsamen Fehler in der fit-Funktion von scikit-learn. Für Vorschläge, wie sich das beheben lässt, wäre ich sehr dankbar.

Titel: Python TypeError: range() integer end argument expected, got float. – bei der fit-Funktion

Hier ist ein Beispiel, das den Fehler anzeigt.

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

# Reproduction script for the TypeError shown in the traceback that follows it:
# a ten-fold cross validation in which every fold runs a RandomizedSearchCV over
# RandomForestRegressor hyper-parameters and appends the outcome to Training.csv.
data = pd.read_csv("data.csv", sep=",") 
# NOTE(review): `SubFeAll` is not defined anywhere in this snippet; presumably it
# should be `data` -- confirm against the full script.
data = SubFeAll.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor 
header = data.columns.values # Use the column headers as the descriptor labels 
data.head() # the returned preview is discarded here
 
# Set the numpy global random number seed (similar effect to random_state) 
np.random.seed(1) 
 
# Random Forest results initialised (not filled in within this snippet) 
RFr2 = [] 
RFmse = [] 
RFrmse = [] 
 
# Predictions results initialised (not filled in within this snippet) 
RFpredictions = [] 
 
# Fold counter; only read (never incremented) in the visible code 
metcount = 0 
 
# Give the array from pandas to numpy 
npArray = np.array(data) 
print header.shape 
# Column labels without the first and the last column 
npheader = np.array(header[1:-1]) 
# The shape tuple feeds both %d placeholders, so npArray must be two-dimensional 
print("Array shape X = %d, Y = %d " % (npArray.shape)) 
datax, datay = npArray.shape 
 
# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point 
names = npArray[:,0] 
X = npArray[:,1:-1].astype(float) 
y = npArray[:,-1] .astype(float) 
# Standardise the descriptor columns with sklearn.preprocessing.scale 
X = preprocessing.scale(X) 
print X.shape 
 
# Open output files 
train_name = "Training.csv" 
fi_name = "Feature_importance.csv" 
 
# Start Training.csv afresh ('w' truncates) and write its header lines. 
# The header text mentions three models; only the random forest appears in this snippet. 
with open(train_name,'w') as ftrain: 
     ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n") 
     ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n") 
     ftrain.write("Fold %d ,\n" %(metcount+1)) 
# Redundant: the with-block above has already closed the file 
ftrain.close() 
 
# Start Feature_importance.csv afresh and write its header line 
with open(fi_name,'w') as ffeatimp: 
     ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n") 
# Redundant: the with-block above has already closed the file 
ffeatimp.close() 
 
# Begin the K-fold cross validation over ten folds 
kf = KFold(datax, n_folds=10) 
print "------------------- Begining Ten Fold Cross Validation -------------------" 
for train, test in kf: 
    # train / test index X and y to build this fold's train and test subsets 
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test] 
    ytestdim = yTest.shape[0] 
    i = 0 
    # Append every true test value of this fold, rounded to two decimals, one per line 
    with open (train_name, 'a') as ftrain: 
     while i< ytestdim : 
       ftrain.write(str(round(yTest[i],2))+',\n') 
       i += 1 
    # Redundant: the with-block above has already closed the file 
    ftrain.close() 
 
    print "\n" 
    # random forest grid search parameters 
    print "------------------- Begining Random Forest Grid Search -------------------" 
    # NOTE: scipy.stats.expon is a continuous distribution, so the values sampled for 
    # "n_estimators" and "max_depth" are floats. The traceback printed after this script 
    # shows forest.py calling range() on the estimator count and rejecting a float. 
    # An integer-valued distribution (e.g. scipy.stats.randint) or a list of ints would 
    # avoid that -- suggestion only; left untouched so the error stays reproducible. 
    rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)} 
    rf = RandomForestRegressor(random_state=0,n_jobs=2) 
    # n_iter=20: twenty parameter settings are sampled from the distributions above 
    RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20) 
    start = time() 
    # This is the call that raises the TypeError (see traceback) 
    RfGridSearch.fit(XTrain,yTrain) 
 
    # Get best random forest parameters 
    # The message says "GridSearchCV" although the search object is a RandomizedSearchCV 
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_))) 
    # The comma makes RFtime a 2-tuple: (elapsed seconds, number of candidates) 
    RFtime = time() - start,len(RfGridSearch.grid_scores_) 
    # NOTE(review): `report` is not defined in this snippet -- presumably a helper 
    # from the full script; confirm. 
    report(RfGridSearch.grid_scores_) 
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators']) 
    ne = RfGridSearch.best_params_['n_estimators'] 
    print("max_features = %s " % RfGridSearch.best_params_['max_features']) 
    mf = RfGridSearch.best_params_['max_features'] 
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth']) 
    md = RfGridSearch.best_params_['max_depth'] 
    # Append the search summary for this fold. "Random Forest" is written without a 
    # newline, so it runs straight into the "RF search time" text that follows it. 
    with open (train_name, 'a') as ftrain: 
      ftrain.write("Random Forest") 
      ftrain.write("RF search time, %s ,\n" % (str(RFtime))) 
      ftrain.write("Number of Trees, %s ,\n" % str(ne)) 
      ftrain.write("Number of feature at split, %s ,\n" % str(mf)) 
      ftrain.write("Max depth of tree, %s ,\n" % str(md)) 
    # Redundant: the with-block above has already closed the file 
    ftrain.close() 

Der Fehler lautet wie folgt:
Traceback (most recent call last): 
    File "rgscv.py", line 81, in <module> 
    RfGridSearch.fit(XTrain,yTrain) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit 
    return self._fit(X, y, sampled_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit 
    for parameters in parameter_iterable 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch 
    job = ImmediateComputeBatch(batch) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__ 
    self.results = batch() 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score 
    estimator.fit(X_train, y_train, **fit_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit 
    for i in range(n_more_estimators): 
TypeError: range() integer end argument expected, got float. 

Zuerst dachte ich, ich hätte nur einen Parameter vergessen, aber genau dasselbe Verfahren scheint mit einer gewöhnlichen Rastersuche problemlos zu funktionieren. Der Code dafür steht unten. Kann mir jemand sagen, was diesen Fehler verursacht?

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

# Ten-fold cross validation in which every fold runs an exhaustive GridSearchCV
# over RandomForestRegressor hyper-parameters.  The true test values and the
# best parameters of each fold are appended to Training.csv.
#
# Every print call uses the single-argument parenthesised form, which behaves
# identically under the Python 2.7 interpreter this script targets.
data = pd.read_csv("data.csv", sep=",")
# Replace the NA values with the mean of the descriptor.  The mean is taken from
# `data` itself (the previous `SubFeAll` was never defined and raised NameError);
# numeric_only=True skips text columns such as the name labels.
data = data.fillna(data.mean(numeric_only=True))
header = data.columns.values  # Use the column headers as the descriptor labels

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised (not filled in within this snippet)
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised (not filled in within this snippet)
RFpredictions = []

# Fold counter; only read (never incremented) in the visible code
metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])
# The shape tuple feeds both %d placeholders, so npArray must be two-dimensional
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Split the data into: names labels of the molecules ; y the True results ;
# X the descriptors for each data point
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)
print(X.shape)

# Output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# Start both files afresh.  The with-blocks close the files on exit, so no
# explicit close() calls are needed.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print("------------------- Begining Ten Fold Cross Validation -------------------")
for train, test in kf:
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
    ytestdim = yTest.shape[0]

    # Append every true test value of this fold, rounded to two decimals
    with open(train_name, 'a') as ftrain:
        for target in yTest:
            ftrain.write(str(round(target, 2)) + ',\n')

    print("\n")
    print("------------------- Begining Random Forest Grid Search -------------------")
    # Random forest grid search parameters.  Every candidate for n_estimators
    # and max_depth is an int.  The randomized-search variant kept below for
    # comparison samples floats and therefore fails inside fit():
    #rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
    rfparamgrid = {
        "n_estimators": [10, 20, 25, 50, 100, 1000],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [1, 2, 3, 5, 7, 10],
    }
    rf = RandomForestRegressor(random_state=0, n_jobs=2)
    RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error')
    start = time()
    RfGridSearch.fit(XTrain, yTrain)
    # Measure once so that the printed and the logged duration agree
    search_seconds = time() - start
    n_candidates = len(RfGridSearch.grid_scores_)

    # Get best random forest parameters
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (search_seconds, n_candidates))
    # Kept as the (seconds, candidates) pair the log line has always contained
    RFtime = (search_seconds, n_candidates)
    # NOTE(review): `report` is not defined in this snippet -- presumably a helper
    # from the full script; confirm.
    report(RfGridSearch.grid_scores_)
    best_params = RfGridSearch.best_params_
    ne = best_params['n_estimators']
    mf = best_params['max_features']
    md = best_params['max_depth']
    print("n_estimators = %d " % ne)
    print("max_features = %s " % mf)
    print("max_depth = %d " % md)
    with open(train_name, 'a') as ftrain:
        # Terminated like every other row; it used to run into the next line
        ftrain.write("Random Forest,\n")
        ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))
+0

@coralv Er verwendet eindeutig eine Bibliothek; der Code liegt im site-packages-Verzeichnis. Stellen Sie keine unsinnigen Fragen. – Natecat

+1

Dieser Codeabschnitt ist die Standard-fit-Funktion aus der scikit-learn-Bibliothek. Das ist kein Code, den ich in irgendeiner Weise bearbeitet habe, und dieselbe Funktion wird in der GridSearchCV-Version erfolgreich verwendet, die wie erwartet funktioniert. – James

+0

[scipy.stats.expon](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html#scipy.stats.expon) scheint ein expon-Objekt zurückzugeben, das sich offenbar nicht wie eine Liste verhält, wie sie im zweiten Beispiel verwendet wird. Behebt es das Problem, wenn man stattdessen eine Liste verwendet? – Natecat

Antwort

1

Die Anzahl der Schätzer (n_estimators) muss eine ganze Zahl sein, Ihr Code erzeugt jedoch Gleitkommazahlen (floats). Erstellen Sie eine gültige Liste von n_estimators-Werten, die nur ganze Zahlen enthält, dann funktioniert es.

+0

Vielen Dank, damit ist das Problem gelöst. – James