Python TypeError: range() integer end argument expected, got float. with fit function

I am fairly new to this and have seen that others have had the same error, but I can't see how to implement their solutions. I am trying to write a random forest machine learning method with a randomized grid search from scikit-learn. It works fine with a standard grid search, but fails with a strange error in the scikit-learn fit function when I use the randomized grid search. Any suggestions on how to address this would be great.
Here is an example that shows the error.
import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold
data = pd.read_csv("data.csv", sep=",")
data = data.fillna(data.mean()) # replace the NA values with the mean of the descriptor
header = data.columns.values # Use the column headers as the descriptor labels
data.head()
# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)
# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []
# Predictions results initialised
RFpredictions = []
metcount = 0
# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
# Split the data into: names (labels of the molecules); y (the true results); X (the descriptors for each data point)
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1].astype(float)
X = preprocessing.scale(X)
print X.shape
# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code uses a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount+1))
ftrain.close()
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
ffeatimp.close()
# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
ytestdim = yTest.shape[0]
i = 0
with open (train_name, 'a') as ftrain:
while i< ytestdim :
ftrain.write(str(round(yTest[i],2))+',\n')
i += 1
ftrain.close()
print "\n"
# random forest grid search parameters
print "------------------- Begining Random Forest Grid Search -------------------"
rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
rf = RandomForestRegressor(random_state=0,n_jobs=2)
RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20)
start = time()
RfGridSearch.fit(XTrain,yTrain)
# Get best random forest parameters
print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
RFtime = time() - start,len(RfGridSearch.grid_scores_)
report(RfGridSearch.grid_scores_)
print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
ne = RfGridSearch.best_params_['n_estimators']
print("max_features = %s " % RfGridSearch.best_params_['max_features'])
mf = RfGridSearch.best_params_['max_features']
print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
md = RfGridSearch.best_params_['max_depth']
with open (train_name, 'a') as ftrain:
ftrain.write("Random Forest")
ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
ftrain.write("Number of Trees, %s ,\n" % str(ne))
ftrain.write("Number of feature at split, %s ,\n" % str(mf))
ftrain.write("Max depth of tree, %s ,\n" % str(md))
ftrain.close()
The error given is below:
Traceback (most recent call last):
File "rgscv.py", line 81, in <module>
RfGridSearch.fit(XTrain,yTrain)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit
return self._fit(X, y, sampled_params)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
self.results = batch()
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit
for i in range(n_more_estimators):
TypeError: range() integer end argument expected, got float.
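For reference, the failing step can be reproduced on its own. A minimal sketch (Python 2.7, matching the traceback above; not part of the original script) of how a value sampled from expon interacts with range():

import scipy.stats

# Sampling a continuous distribution such as expon yields a float, not an int
n_est = scipy.stats.expon(scale=100).rvs()
print n_est          # e.g. 73.558...

# Python 2.7's range() rejects a float end argument, which is what
# RandomForestRegressor.fit() hits internally via range(n_more_estimators)
range(float(n_est))  # TypeError: range() integer end argument expected, got float.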
At first I thought I had just missed a parameter, but exactly the same procedure with a straightforward grid search seems to work without any problem. The code for that is below. Can anyone suggest what is causing this error?
import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold
data = pd.read_csv("data.csv", sep=",")
data = data.fillna(data.mean()) # replace the NA values with the mean of the descriptor
header = data.columns.values # Use the column headers as the descriptor labels
data.head()
# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)
# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []
# Predictions results initialised
RFpredictions = []
metcount = 0
# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
# Split the data into: names (labels of the molecules); y (the true results); X (the descriptors for each data point)
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1].astype(float)
X = preprocessing.scale(X)
print X.shape
# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code uses a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount+1))
ftrain.close()
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
ffeatimp.close()
# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
ytestdim = yTest.shape[0]
i = 0
with open (train_name, 'a') as ftrain:
while i< ytestdim :
ftrain.write(str(round(yTest[i],2))+',\n')
i += 1
ftrain.close()
print "\n"
# random forest grid search parameters
print "------------------- Begining Random Forest Grid Search -------------------"
#rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
rfparamgrid = {"n_estimators": [10, 20, 25, 50, 100, 1000], "max_features": ["auto", "sqrt", "log2"], "max_depth": [1,2,3,5,7,10]}
rf = RandomForestRegressor(random_state=0,n_jobs=2)
RfGridSearch = GridSearchCV(rf,param_grid=rfparamgrid,scoring='mean_squared_error')
start = time()
RfGridSearch.fit(XTrain,yTrain)
# Get best random forest parameters
print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
RFtime = time() - start,len(RfGridSearch.grid_scores_)
report(RfGridSearch.grid_scores_)
print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
ne = RfGridSearch.best_params_['n_estimators']
print("max_features = %s " % RfGridSearch.best_params_['max_features'])
mf = RfGridSearch.best_params_['max_features']
print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
md = RfGridSearch.best_params_['max_depth']
with open (train_name, 'a') as ftrain:
ftrain.write("Random Forest")
ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
ftrain.write("Number of Trees, %s ,\n" % str(ne))
ftrain.write("Number of feature at split, %s ,\n" % str(mf))
ftrain.write("Max depth of tree, %s ,\n" % str(md))
ftrain.close()
@coralv He is clearly using a library; it's in the site-packages dir. Don't ask nonsensical questions. – Natecat
That section of code is the standard fit function from the scikit-learn library. It is not code I have edited at all, and the same function is used successfully in the GridSearchCV version, which works as expected. – James
[scipy.stats.expon](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html#scipy.stats.expon) appears to return an expon object, which does not seem to work like the list in the second example. Does changing it to a list fix it? – Natecat
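A sketch of that suggestion in code, assuming the integer-valued parameters may instead be drawn from scipy.stats.randint, which samples integers in [low, high); the bounds below are illustrative, not from the original post:

import scipy.stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import RandomizedSearchCV

# randint keeps n_estimators and max_depth integer-valued;
# max_features stays a categorical list, which is sampled as-is
rfparamgrid = {"n_estimators": scipy.stats.randint(10, 1000),
               "max_features": ["auto", "sqrt", "log2"],
               "max_depth": scipy.stats.randint(1, 100)}

rf = RandomForestRegressor(random_state=0, n_jobs=2)
RfGridSearch = RandomizedSearchCV(rf, param_distributions=rfparamgrid,
                                  scoring='mean_squared_error', n_iter=20)
# RfGridSearch.fit(XTrain, yTrain) should now draw ints and avoid the TypeError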