A good cross-validation score but a very poor roc_auc score

I am very new to this, so any kind of information would be helpful; apologies if this is a very trivial question. I am working on a medium-sized dataset with many nulls. We have applied many models, and the cv-skf score for k = 10 exceeded 0.85, but the roc_auc score is around 0.5. I am using sklearn. Below is the code snippet.
# Imports needed by the snippet below
import pandas as pd
import numpy as np
from numpy import array, array_equal
from itertools import combinations
from sklearn import cross_validation as cv
from sklearn import metrics
import xgboost as xgb

train_dataset = pd.read_csv('./input/train.csv', index_col='ID')
test_dataset = pd.read_csv('./input/test.csv', index_col='ID')
#print_shapes()
# How many nulls are there in the datasets?
nulls_train = train_dataset.isnull().sum().sum()
nulls_test = test_dataset.isnull().sum().sum()
#print('There are {} nulls in TRAIN and {} nulls in TEST dataset.'.format(nulls_train, nulls_test))
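# A hypothetical per-column view, in case the distribution of nulls matters:
# train_dataset.isnull().sum().sort_values(ascending=False).head()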
# Remove constant features
def identify_constant_features(dataframe):
    count_uniques = dataframe.apply(lambda x: len(x.unique()))
    constants = count_uniques[count_uniques == 1].index.tolist()
    return constants
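# Sketch of an equivalent vectorized check (assumes pandas >= 0.20, where
# DataFrame.nunique exists; dropna=False mirrors unique(), which counts NaN):
# constants = dataframe.columns[dataframe.nunique(dropna=False) == 1].tolist()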
constant_features_train = set(identify_constant_features(train_dataset))
#print('There were {} constant features in TRAIN dataset.'.format(len(constant_features_train)))
# Drop the constant features
train_dataset.drop(constant_features_train, inplace=True, axis=1)
#print_shapes()
# Remove equals features
def identify_equal_features(dataframe):
    features_to_compare = list(combinations(dataframe.columns.tolist(), 2))
    equal_features = []
    for compare in features_to_compare:
        is_equal = array_equal(dataframe[compare[0]], dataframe[compare[1]])
        if is_equal:
            equal_features.append(list(compare))
    return equal_features
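# A possibly faster alternative for wide frames (an assumption, not benchmarked):
# duplicate columns can be flagged by transposing and using duplicated():
# equal_cols = dataframe.columns[dataframe.T.duplicated()].tolist()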
equal_features_train = identify_equal_features(train_dataset)
#print('There were {} pairs of equal features in TRAIN dataset.'.format(len(equal_features_train)))
# Remove the second feature of each pair.
features_to_drop = array(equal_features_train)[:,1]
train_dataset.drop(features_to_drop, axis=1, inplace=True)
#print_shapes()
# Define the model variables.
y_name = 'TARGET'
feature_names = train_dataset.columns.tolist()
feature_names.remove(y_name)
X = train_dataset[feature_names]
y = train_dataset[y_name]
# Save the features selected for later use.
pd.Series(feature_names).to_csv('features_selected_step1.csv', index=False)
#print('Features selected\n{}'.format(feature_names))
# Proportion of classes
y.value_counts()/len(y)
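# The same proportions directly: y.value_counts(normalize=True)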
skf = cv.StratifiedKFold(y, n_folds=10, shuffle=True)
score_metric = 'roc_auc'
scores = {}
def score_model(model):
    return cv.cross_val_score(model, X, y, cv=skf, scoring=score_metric)
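# Example usage (hypothetical dictionary key):
# scores['xgb'] = score_model(xgb.XGBClassifier())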
clfxgb = xgb.XGBClassifier()
clfxgb = clfxgb.fit(X, y)
probxgb = clfxgb.predict(X)
# print('XGB', np.shape(probxgb))
print(metrics.roc_auc_score(y, probxgb))
Output -
0.502140359687
For cv-skf -
cv.cross_val_score(xgb.XGBClassifier(), X, y, cv=skf, scoring=score_metric)
Output - array([ 0.83124251, 0.84162387, 0.83580491])
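For reference, a minimal toy sketch (hypothetical data, unrelated to the actual dataset) of how metrics.roc_auc_score reacts to hard 0/1 labels versus predicted probabilities; as far as I understand, cross_val_score with scoring='roc_auc' calls predict_proba internally, while the snippet above passes the output of predict:

import numpy as np
from sklearn import metrics

y_true = np.array([0, 0, 0, 0, 1, 1])
y_prob = np.array([0.1, 0.2, 0.3, 0.4, 0.35, 0.9])  # predicted P(y = 1)
y_hard = (y_prob > 0.5).astype(int)                 # thresholded 0/1 labels

print(metrics.roc_auc_score(y_true, y_prob))  # scores the full ranking
print(metrics.roc_auc_score(y_true, y_hard))  # only sees the 0/1 decisions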
We submit the CSV file as -
test_dataset.drop(constant_features_train, inplace=True, axis=1)
test_dataset.drop(features_to_drop, axis=1, inplace=True)
print(test_dataset.shape)
X_SubTest = test_dataset
df_test = pd.read_csv('./input/test.csv')
id_test = df_test['ID']
predTest = clfxgb.predict(X_SubTest)  # clfxgb is the classifier fitted above
submission = pd.DataFrame({"ID":id_test, "TARGET":predTest})
submission.to_csv("submission_svm_23-3.csv", index=False)
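If the submission is scored by AUC, submitting probabilities instead of hard labels may be closer to what is expected; a minimal sketch of that variant (assuming clfxgb from above is the final model, with a hypothetical output filename):

prob_test = clfxgb.predict_proba(X_SubTest)[:, 1]  # P(TARGET = 1) for each row
submission = pd.DataFrame({"ID": id_test, "TARGET": prob_test})
submission.to_csv("submission_xgb_proba_23-3.csv", index=False)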