本文整理汇总了Python中tests.pyunit_utils.locate函数的典型用法代码示例。如果您正苦于以下问题：Python locate函数的具体用法？Python locate怎么用？Python locate使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了locate函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: wide_dataset_large
def wide_dataset_large():
    """Fit a binomial GLM on the Arcene training files and check validation AUC.

    Labels and features are read with NumPy; a label of -1 is remapped to 0
    and every other label to 1.  The response is stacked in front of the
    features, so it ends up as column 0 of each H2OFrame.  The test passes
    when the AUC on the validation frame is above 0.5.
    """

    def load_arcene_frame(labels_file, features_file):
        # Build one H2OFrame with the response first, followed by the features.
        labels = np.genfromtxt(pyunit_utils.locate(labels_file), delimiter=' ')
        labels = np.where(labels == -1, 0, 1)
        features = np.genfromtxt(pyunit_utils.locate(features_file), delimiter=' ')
        return h2o.H2OFrame(np.column_stack((labels, features)).tolist())

    print("Reading in Arcene training data for binomial modeling.")
    train_frame = load_arcene_frame("smalldata/arcene/arcene_train_labels.labels",
                                    "smalldata/arcene/arcene_train.data")

    print("Run model on 3250 columns of Arcene with strong rules off.")
    glm_model = h2o.glm(x=train_frame[1:3250],
                        y=train_frame[0].asfactor(),
                        family="binomial",
                        lambda_search=False,
                        alpha=[1])

    print("Test model on validation set.")
    valid_frame = load_arcene_frame("smalldata/arcene/arcene_valid_labels.labels",
                                    "smalldata/arcene/arcene_valid.data")
    # The prediction frame is not inspected; the call only exercises predict().
    glm_model.predict(valid_frame)

    print("Check performance of predictions.")
    metrics = glm_model.model_performance(valid_frame)

    print("Check that prediction AUC better than guessing (0.5).")
    assert metrics.auc() > 0.5, "predictions should be better then pure chance"
开发者ID:ndjido,项目名称:h2o-3,代码行数:25,代码来源:pyunit_DEPRECATED_wide_dataset_largeGLM.py
示例2: fiftycatRF
def fiftycatRF():
    """Train a random forest on the 50-category train file and score the test file.

    The model is fitted on columns ``x1`` and ``x2`` against the factor
    response ``y``; predictions, the performance summary and the confusion
    matrix for the test file are then displayed.
    """
    # Per the original notes, the training file only covers cat1 through cat45.
    train_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv")
    training = h2o.import_file(path=train_path)
    training["y"] = training["y"].asfactor()

    rf_model = h2o.random_forest(x=training[["x1", "x2"]],
                                 y=training["y"],
                                 ntrees=50,
                                 max_depth=20,
                                 nbins=500)

    # Per the original notes, the test file covers all of cat1 through cat50.
    test_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv")
    holdout = h2o.import_file(path=test_path)

    predictions = rf_model.predict(holdout)
    predictions.head()

    # Show the performance summary, then print the confusion matrix.
    metrics = rf_model.model_performance(holdout)
    metrics.show()
    print(metrics.confusion_matrix())
开发者ID:Kaushik512,项目名称:h2o-3,代码行数:34,代码来源:pyunit_DEPRECATED_fiftycatRF.py
示例3: wide_dataset_large
def wide_dataset_large():
    """Fit a binomial GLM on the wide Arcene data and check its validation AUC.

    Labels and features are read with NumPy; a label of -1 is remapped to 0
    and every other label to 1.  Features are transposed into column lists and
    the label list is placed first, so the response is column 0 of each frame.
    The test passes when the AUC on the validation frame is above 0.5.
    """
    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    xtrain = np.transpose(trainDataFeatures).tolist()
    ytrain = trainDataResponse.tolist()
    # NOTE(review): other examples use H2OFrame.from_python; confirm which
    # spelling the targeted h2o version provides before renaming.
    trainData = h2o.H2OFrame.fromPython([ytrain] + xtrain)
    trainData[0] = trainData[0].asfactor()

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=False, alpha=1)
    # Pass a concrete list of column indices: on Python 3 a bare range() is a
    # lazy object, whereas the sibling tests hand train() a list.
    model.train(x=list(range(1, 3250)), y=0, training_frame=trainData)

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(pyunit_utils.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    xvalid = np.transpose(validDataFeatures).tolist()
    yvalid = validDataResponse.tolist()
    validData = h2o.H2OFrame.fromPython([yvalid] + xvalid)
    # The prediction frame is not inspected; the call only exercises predict().
    model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better then pure chance"
开发者ID:Vishnu24,项目名称:h2o-3,代码行数:29,代码来源:pyunit_wide_dataset_glm_large.py
示例4: user
def user():
    """Walk through basic H2OFrame usage on the first four iris columns.

    Exercises indexing, slicing, arithmetic between frames and constants,
    column means, a lookup with a misspelt column name, and construction of a
    small frame from an ordered dict.  Statement order is kept as in the
    original because the notes below rely on when references are dropped.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    iris.head()
    print(iris[0].names)          # header of the first column
    print(iris[2, 0])             # row 2 of column 0, by index
    print(iris[2, "sepal_len"])   # the same cell, by column name
    (iris[0] + 2).show()          # add a constant to every element
    (iris[0] + iris[1]).show()    # element-wise sum of two columns
    sum(iris).show()
    print(iris["sepal_len"].mean())
    print()

    print("Rows 50 through 77 in the `sepal_len` column")
    iris[50:78, "sepal_len"].show()   # slice end is exclusive, so rows 50..77
    print()
    iris["sepal_len"].show()
    print(iris[50:78, ["sepal_len", "sepal_wid"]].show())
    iris.show()

    print("The column means: ")
    print(iris.mean())
    print()

    try:
        # Misspelt column name; the lookup is expected to raise.
        print(iris["Sepal_len"].dim)
    except Exception:
        pass  # Expected error

    iris_again = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))[0:4]
    combined = iris + iris_again
    # Original note: "d=c+..." keeps the internal expression for `combined`
    # alive until `kept_alive` goes out of scope, even after `combined` is
    # dropped below.
    kept_alive = combined + combined + sum(iris)
    shifted = combined + iris + 1
    shifted.show()
    combined.show()
    combined = None
    # Original note: the internal "ExprNode(c=a+b)" is not dead at this point.
    print(1 + (iris[0] + iris_again[1]).mean())

    import collections
    small = h2o.H2OFrame(collections.OrderedDict({"A": [1, 2, 3], "B": [4, 5, 6]}))
    small.show()
    small.describe()
    small.head()
    small[0].show()
    print(small[1, 0])
    small[0:2, 0].show()

    iris[0:51, 0].show()
开发者ID:StevenLOL,项目名称:h2o-3,代码行数:60,代码来源:pyunit_user.py
示例5: checkpoint_new_category_in_predictor
def checkpoint_new_category_in_predictor():
    """Check deep-learning checkpoint restarts when a predictor gains new categories.

    A model is trained on the setosa/versicolor file and continued from its
    checkpoint on a second upload of the same file.  Continuing from the
    checkpoint on the virginica file must raise ``EnvironmentError``.
    Predicting on the virginica file with the continued model is then
    exercised.
    """
    first_upload = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    second_upload = h2o.upload_file(pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    virginica = h2o.upload_file(pyunit_utils.locate("smalldata/iris/virginica.csv"))

    print("checkpoint_new_category_in_predictor-1")
    base_model = H2ODeepLearningEstimator(epochs=100)
    base_model.train(x=[0, 1, 2, 4], y=3, training_frame=first_upload)

    continued_model = H2ODeepLearningEstimator(epochs=200, checkpoint=base_model.model_id)
    continued_model.train(x=[0, 1, 2, 4], y=3, training_frame=second_upload)

    print("checkpoint_new_category_in_predictor-2")
    # Continuing with an expanded categorical predictor domain should fail.
    try:
        rejected_model = H2ODeepLearningEstimator(epochs=200, checkpoint=base_model.model_id)
        rejected_model.train(x=[0, 1, 2, 4], y=3, training_frame=virginica)
    except EnvironmentError:
        pass
    else:
        assert False, "Expected continued model-building to fail with new categories introduced in predictor"

    print("checkpoint_new_category_in_predictor-3")
    # Predicting on rows with the expanded categorical domain is only exercised.
    continued_model.predict(virginica)

    print("checkpoint_new_category_in_predictor-4")
开发者ID:StevenLOL,项目名称:h2o-3,代码行数:27,代码来源:pyunit_checkpoint_new_category_in_predictorDL.py
示例6: xgboost_insurance_gaussian_small
def xgboost_insurance_gaussian_small():
    """Compare 2-tree and 10-tree gaussian XGBoost models on the insurance data.

    Each model must produce one prediction row per test row, and the 2-tree
    model's MSE must be higher than the 10-tree model's.
    """
    assert H2OXGBoostEstimator.available()

    training_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_train1.csv"))
    test_frame = h2o.import_file(pyunit_utils.locate("smalldata/testng/insurance_validation1.csv"))
    x = ['Age', 'District']
    y = 'Claims'

    def fit_and_check(tree_count):
        # Train with the given tree count and verify the prediction row count.
        estimator = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.7,
                                        booster='gbtree', seed=1, ntrees=tree_count,
                                        distribution='gaussian')
        estimator.train(x=x, y=y, training_frame=training_frame)
        predicted = estimator.predict(test_frame)
        assert predicted.nrows == test_frame.nrows
        return estimator

    few_trees_model = fit_and_check(2)
    many_trees_model = fit_and_check(10)

    # The model with fewer trees is expected to have the higher error.
    assert few_trees_model.mse() > many_trees_model.mse()
开发者ID:StevenLOL,项目名称:h2o-3,代码行数:27,代码来源:pyunit_insurance_gaussian_small.py
示例7: table_check
def table_check():
    """Exercise ``H2OFrame.table`` on the prostate, iris and cars files.

    Prints dense and non-dense two-column tables for prostate, checks that
    each of the first three rows of the iris class table has a count of 50,
    compares dense and non-dense two-column iris tables, and checks the counts
    for column 2 of the cars file against a ``Counter`` over the raw values.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for column_pair in (['AGE', 'RACE'], ['RACE', 'AGE']):
        for dense in (True, False):
            print(prostate[column_pair].table(dense=dense).head().as_data_frame(True))

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    # Single-column table: column 1 of the result holds the counts.
    class_counts = iris["C5"].table()
    for row in range(3):
        assert class_counts[row, 1] == 50, \
            "Expected 50 of {0}, but got {1}".format(class_counts[row, 0], class_counts[row, 1])

    # Two-column tables built by passing the second column as an argument.
    dense_table = iris["C1"].table(iris["C5"])
    wide_table = iris["C1"].table(iris["C5"], dense=False)

    # Both layouts must report the same count for C1 == 5 and Iris-setosa.
    wide_cell = wide_table[wide_table['C1'] == 5, 'Iris-setosa']
    dense_cell = dense_table[(dense_table['C1'] == 5) & (dense_table['C5'] == 'Iris-setosa'), 'Counts']
    assert (wide_cell == dense_cell).all()

    # The one-argument form must agree with tabulating a two-column frame.
    assert (dense_table == iris[["C1", "C5"]].table()).all()
    assert (wide_table == iris[["C1", "C5"]].table(dense=False)).all()

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    raw_rows = cars[2].table().as_data_frame()
    # Skip the first row, then map each value to its integer count.
    actual = {key: int(count) for key, count in dict(raw_rows[1:]).items()}
    expected = Counter(itertools.chain(*cars[2].as_data_frame()[1:]))
    assert actual == expected, "Expected {} for table counts but got {}".format(expected, actual)
开发者ID:Kaushik512,项目名称:h2o-3,代码行数:34,代码来源:pyunit_table.py
示例8: smallcat_gbm
def smallcat_gbm():
    """Fit one-tree, depth-one GBMs with H2O and scikit-learn on the alphabet data.

    Per the original notes, the file has 26 categories A to Z, where A, C, E,
    G, ... perfectly predict y = 1 and B, D, F, H, ... perfectly predict
    y = 0.  No comparison between the two models is made here.
    """
    csv_path = pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv")
    alphabet = h2o.import_file(path=csv_path)
    alphabet["y"] = alphabet["y"].asfactor()

    def letter_code(raw):
        # Take the text between the first pair of double quotes and return
        # its code point, so scikit-learn gets a numeric feature.
        return ord(raw.decode().split("\"")[1])

    # Prepare the same file for scikit-learn, skipping the header row.
    sci_data = np.loadtxt(pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
                          delimiter=',', skiprows=1, converters={0: letter_code})
    sci_response = sci_data[:, 1]
    sci_features = sci_data[:, 0]

    # H2O model.
    h2o_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                           ntrees=1,
                                           max_depth=1,
                                           nbins=100)
    h2o_gbm.train(x="X", y="y", training_frame=alphabet)
    h2o_gbm.show()

    # scikit-learn model; the single feature is reshaped into a column.
    sci_gbm = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
    sci_gbm.fit(sci_features[:, np.newaxis], sci_response)
开发者ID:AllCodeNoGyaan,项目名称:h2o-3,代码行数:28,代码来源:pyunit_smallcat_gbm.py
示例9: glrm_catagorical_bug_fix
def glrm_catagorical_bug_fix():
    """Train a rank-4 GLRM on one airlines file and predict on the other.

    NOTE(review): the model is trained on ``AirlinesTest.csv.zip`` and scored
    on ``AirlinesTrain.csv.zip``; presumably deliberate so that scoring sees
    enum levels missing from training - confirm against PUBDEV-5776.
    """
    fit_frame = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    score_frame = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    glrm = H2OGeneralizedLowRankEstimator(k=4)
    glrm.train(x=fit_frame.names, training_frame=fit_frame)

    print(glrm.predict(score_frame))
开发者ID:michalkurka,项目名称:h2o-3,代码行数:7,代码来源:pyunit_PUBDEV_5776_glrm_fix_new_enum_level.py
示例10: dim_checks
def dim_checks():
    """Check H2OFrame dimensions against NumPy for the same prostate CSV.

    Compares the full frame, a single-column slice, and the row count of the
    result of applying ``&`` to that column.
    """
    csv_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    csv_array = np.loadtxt(pyunit_utils.locate("smalldata/logreg/prostate.csv"), delimiter=',', skiprows=1)

    # Whole-frame dimensions.
    h2o_rows, h2o_cols = csv_frame.dim
    np_rows, np_cols = csv_array.shape
    print('The dimensions of h2o frame is: {0} x {1}'.format(h2o_rows, h2o_cols))
    print('The dimensions of numpy array is: {0} x {1}'.format(np_rows, np_cols))
    assert [h2o_rows, h2o_cols] == [np_rows, np_cols], "expected equal number of columns and rows"

    # Single-column slice: the H2O side should report exactly one column.
    column_frame = csv_frame[4]
    column_array = csv_array[:, 4]
    h2o_rows, h2o_cols = column_frame.dim
    np_rows = column_array.shape[0]
    print('The dimensions of h2o column slice is: {0} x {1}'.format(h2o_rows, h2o_cols))
    print('The dimensions of numpy array column slice is: {0} x 1'.format(np_rows))
    assert [h2o_rows, h2o_cols] == [np_rows, 1], "expected equal number of columns and rows"

    # Applying an operator must leave the row count unchanged.
    anded_column = column_frame & 5
    assert anded_column.nrow == h2o_rows, "expected the number of rows to remain unchanged"
开发者ID:AllCodeNoGyaan,项目名称:h2o-3,代码行数:34,代码来源:pyunit_dim.py
示例11: link_functions_gaussian
def link_functions_gaussian():
    """Compare H2O and statsmodels gaussian GLMs that use the identity link.

    Both libraries are fitted on the zipped prostate data.  The ratio of
    residual to null deviance from H2O may exceed the statsmodels ratio by
    less than 0.01; the check is one-sided, so a lower H2O ratio also passes.
    """
    print("Read in prostate data.")
    archive_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=archive_path)
    h2o_data.head()

    # Read the CSV out of the archive for statsmodels.  Context managers close
    # the archive and the member stream, and `.values` replaces
    # `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    with zipfile.ZipFile(archive_path) as archive:
        with archive.open("prostate_complete.csv") as csv_file:
            sm_data = pd.read_csv(csv_file).values
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity", alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features,
                      family=sm.families.Gaussian(sm.families.links.identity)).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = old_div(h2o_model.residual_deviance(), h2o_model.null_deviance())
    sm_deviance = old_div(sm_model.deviance, sm_model.null_deviance)
    # Deliberately one-sided: H2O may be better, but not noticeably worse.
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
开发者ID:AllCodeNoGyaan,项目名称:h2o-3,代码行数:25,代码来源:pyunit_link_functions_gaussian_glm.py
示例12: fiftycatGBM
def fiftycatGBM():
    """Train a bernoulli GBM on the 50-category train file and score the test file.

    The model is fitted on columns ``x1`` and ``x2`` against the factor
    response ``y``.  Predictions are shown, and the confusion matrix and AUC
    are computed for the test file but not asserted on.
    """
    # Per the original notes, the training file only covers cat1 through cat45.
    train_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_train.csv")
    training = h2o.import_file(path=train_path)
    training["y"] = training["y"].asfactor()

    gbm_model = h2o.gbm(x=training[["x1", "x2"]],
                        y=training["y"],
                        distribution="bernoulli",
                        ntrees=10,
                        max_depth=5,
                        nbins=20)
    gbm_model.show()

    # Per the original notes, the test file covers all of cat1 through cat50.
    test_path = pyunit_utils.locate("smalldata/gbm_test/50_cattest_test.csv")
    holdout = h2o.import_file(path=test_path)

    scored = gbm_model.predict(holdout)
    scored.show()

    # Compute the confusion matrix and AUC; neither value is checked.
    metrics = gbm_model.model_performance(holdout)
    holdout_cm = metrics.confusion_matrix()
    holdout_auc = metrics.auc()
开发者ID:Kaushik512,项目名称:h2o-3,代码行数:33,代码来源:pyunit_DEPRECATED_fiftycatGBM.py
示例13: xgboost_milsongs_gaussian_medium
def xgboost_milsongs_gaussian_medium():
    """Compare 2-tree and 10-tree gaussian XGBoost models on the milsongs data.

    Column 0 is the response and all remaining columns are predictors.  Each
    model must produce one prediction row per test row, and the 2-tree
    model's MSE must be higher than the 10-tree model's.
    """
    assert H2OXGBoostEstimator.available()

    training_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    test_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    x = list(range(1, training_frame.ncol))
    y = 0

    def fit_and_check(tree_count):
        # Train with the given tree count and verify the prediction row count.
        estimator = H2OXGBoostEstimator(training_frame=training_frame, learn_rate=0.3,
                                        booster='gbtree', seed=1, ntrees=tree_count,
                                        distribution='gaussian')
        estimator.train(x=x, y=y, training_frame=training_frame)
        predicted = estimator.predict(test_frame)
        assert predicted.nrows == test_frame.nrows
        return estimator

    few_trees_model = fit_and_check(2)
    many_trees_model = fit_and_check(10)

    # The model with fewer trees is expected to have the higher error.
    assert few_trees_model.mse() > many_trees_model.mse()
开发者ID:StevenLOL,项目名称:h2o-3,代码行数:27,代码来源:pyunit_milsongs_gaussian_medium.py
示例14: export_file
def export_file():
    """Export GLM predictions to CSV and check they round-trip through pandas.

    A binomial GLM is trained on the rows of the prostate data whose uniform
    random draw is above 0.2 and scored on the remaining rows.  The prediction
    frame is exported under a random file name in the ``results`` directory,
    read back with pandas, and compared with the frame's own data-frame
    conversion.
    """
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    # Convert these columns to factors, one at a time and in this order.
    for column in (1, 3, 4, 5, 8):
        prostate[column] = prostate[column].asfactor()

    split_draw = prostate.runif()
    train_part = prostate[split_draw > 0.2, :]
    test_part = prostate[split_draw <= 0.2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=list(range(2, prostate.ncol)), y=1, training_frame=train_part)
    predicted = glm.predict(test_part)

    def random_token(size=6, chars=string.ascii_uppercase + string.digits):
        # Random name prefix made of upper-case letters and digits.
        return "".join(random.choice(chars) for _ in range(size))

    file_name = random_token() + "_prediction.csv"
    results_dir = pyunit_utils.locate("results")
    export_path = results_dir + "/" + file_name
    h2o.export_file(predicted, export_path)

    from_disk = pd.read_csv(export_path)
    print(from_disk.head())
    from_h2o = predicted.as_data_frame(True)
    print(from_h2o.head())

    # The exported file and the in-memory conversion must be identical.
    assert_frame_equal(from_disk, from_h2o)
开发者ID:h2oai,项目名称:h2o-3,代码行数:33,代码来源:pyunit_export_file.py
示例15: anomaly
def anomaly():
    """Train a small autoencoder on MNIST and run anomaly scoring on the test set.

    Only the first 784 columns are kept as predictors.  The results of
    ``anomaly`` and ``predict`` are computed but not checked.
    """
    print("Deep Learning Anomaly Detection MNIST")

    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/train.csv.gz"))
    test = h2o.import_file(pyunit_utils.locate("bigdata/laptop/mnist/test.csv.gz"))

    predictors = list(range(784))
    resp = 784  # response column index (the digit, per the original notes); unused

    # Unsupervised: keep only the predictor columns.
    train = train[predictors]
    test = test[predictors]

    # 1) Learn what is normal: fit the autoencoder on the training frame.
    autoencoder = H2OAutoEncoderEstimator(activation="Tanh",
                                          hidden=[2],
                                          l1=1e-5,
                                          ignore_const_cols=False,
                                          epochs=1)
    autoencoder.train(x=predictors, training_frame=train)

    # 2) Detect outliers: per the original notes, this is the per-row
    #    reconstruction error (MSE) for the test frame.
    reconstruction_error = autoencoder.anomaly(test)

    # 3) Reconstruct the test frame by passing it through the narrow network.
    reconstructed = autoencoder.predict(test)
开发者ID:AllCodeNoGyaan,项目名称:h2o-3,代码行数:31,代码来源:pyunit_anomaly_deeplearning_large.py
示例16: import_multi
def import_multi():
    """Import two airlines CSV files in one call and check the combined row count."""
    csv_paths = [
        pyunit_utils.locate("smalldata/testng/airlines_train.csv"),
        pyunit_utils.locate("smalldata/testng/airlines_test.csv"),
    ]
    combined = h2o.import_file(path=csv_paths)
    # Expected total is the sum of the two per-file counts used in the original.
    assert combined.nrows == 24421 + 2691
开发者ID:StevenLOL,项目名称:h2o-3,代码行数:7,代码来源:pyunit_import_multi.py
示例17: stackedensemble_metalearner_seed_test
def stackedensemble_metalearner_seed_test():
    """Check that the stacked-ensemble seed controls the GBM metalearner result.

    Two ensembles built with the same seed must give metalearners with the
    same training RMSE; two built with different seeds must not.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Every column except the response is a predictor.
    y = "response"
    x = train.columns
    x.remove(y)

    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    nfolds = 3
    # Sampling settings handed to the GBM metalearner.
    gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Base learners: cross-validated GBM and random forest that keep their
    # cross-validation predictions.
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    def metalearners_for(seeds):
        # Build all ensembles first, then train them, then fetch their
        # metalearners - the same ordering as the original straight-line code.
        ensembles = [H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf],
                                                 metalearner_algorithm="gbm",
                                                 metalearner_params=gbm_params,
                                                 seed=seed)
                     for seed in seeds]
        for ensemble_model in ensembles:
            ensemble_model.train(x=x, y=y, training_frame=train)
        return [h2o.get_model(ensemble_model.metalearner()['name']) for ensemble_model in ensembles]

    # Same seed twice.
    meta_a, meta_b = metalearners_for((55555, 55555))
    assert meta_a.rmse(train=True) == meta_b.rmse(train=True), "RMSE should match if same seed"

    # Two different seeds.
    meta_c, meta_d = metalearners_for((55555, 98765))
    assert meta_c.rmse(train=True) != meta_d.rmse(train=True), "RMSE should NOT match if diff seed"
开发者ID:StevenLOL,项目名称:h2o-3,代码行数:60,代码来源:pyunit_stackedensemble_seed.py
示例18: iris_h2o_vs_sciKmeans
def iris_h2o_vs_sciKmeans():
    """Compare k-means cluster centres from H2O and scikit-learn on iris.

    Both implementations start from the same three user-supplied points and
    run without standardisation on the first four columns.  Every coordinate
    of every centre must agree to within 1e-10.
    """
    iris_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_sci = np.genfromtxt(pyunit_utils.locate("smalldata/iris/iris.csv"), delimiter=',')
    iris_sci = iris_sci[:, 0:4]

    # Shared starting centres so both runs are deterministic and comparable.
    s = [[4.9, 3.0, 1.4, 0.2],
         [5.6, 2.5, 3.9, 1.1],
         [6.5, 3.0, 5.2, 2.0]]
    start = h2o.H2OFrame(s)

    h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start, standardize=False)

    sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
    sci_km.fit(iris_sci)

    print("Cluster centers from H2O:")
    h2o_centers = h2o_km.centers()
    print(h2o_centers)

    print("Cluster centers from scikit:")
    sci_centers = sci_km.cluster_centers_.tolist()
    print(sci_centers)

    for hcenter, scenter in zip(h2o_centers, sci_centers):
        for hpoint, spoint in zip(hcenter, scenter):
            # Compare the absolute difference: without abs() an H2O coordinate
            # far below the scikit one would still pass the check.
            assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
开发者ID:Kaushik512,项目名称:h2o-3,代码行数:33,代码来源:pyunit_DEPRECATED_iris_h2o_vs_sciKmeans.py
示例19: col_names_check
def col_names_check():
    """Check column names and types for imported and Python-built H2OFrames.

    Covers a CSV with a header, a CSV without one (names C1..C5), frames built
    from random NumPy data with and without explicit names and types, and
    frames built from a dict with and without a ``column_names`` override.
    """
    header_names = ["sepal_len", "sepal_wid", "petal_len", "petal_wid", "class"]
    iris_wheader = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    assert iris_wheader.col_names == header_names, \
        "Expected {0} for column names but got {1}".format(header_names, iris_wheader.col_names)

    default_names = ["C1", "C2", "C3", "C4", "C5"]
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    assert iris.col_names == default_names, \
        "Expected {0} for column names but got {1}".format(default_names, iris.col_names)

    # Explicit names and types on a frame built from random data.
    random_rows = list(zip(*np.random.randn(100, 4).tolist()))
    frame = h2o.H2OFrame.from_python(random_rows, column_names=list("ABCD"), column_types=["enum"] * 4)
    frame.head()
    assert frame.col_names == list("ABCD"), \
        "Expected {} for column names but got {}".format(list("ABCD"), frame.col_names)
    assert list(frame.types.values()) == ["enum"] * 4, \
        "Expected {} for column types but got {}".format(["enum"] * 4, frame.types)

    # No names or types given: expect C1..C4 and real columns.
    frame = h2o.H2OFrame(list(zip(*np.random.randn(100, 4).tolist())))
    frame.head()
    assert frame.col_names == ["C1", "C2", "C3", "C4"], \
        "Expected {} for column names but got {}".format(["C1", "C2", "C3", "C4"], frame.col_names)
    assert list(frame.types.values()) == ["real"] * 4, \
        "Expected {} for column types but got {}".format(["real"] * 4, frame.types)

    # The dict key becomes the column name.
    frame = h2o.H2OFrame({'B': ['a', 'a', 'b', 'NA', 'NA']})
    frame.head()
    assert frame.col_names == ["B"], "Expected {} for column names but got {}".format(["B"], frame.col_names)

    # An explicit column_names argument overrides the dict key.
    frame = h2o.H2OFrame.from_python({'B': ['a', 'a', 'b', 'NA', 'NA']}, column_names=["X"])
    frame.head()
    assert frame.col_names == ["X"], "Expected {} for column names but got {}".format(["X"], frame.col_names)
开发者ID:ZivkoKrstic,项目名称:h2o-3,代码行数:29,代码来源:pyunit_colnames.py
示例20: shuffling_large
def shuffling_large():
    """Check that GLM coefficients are unaffected by retraining and by shuffling.

    Three binomial GLMs with lambda search are trained: two on the original
    Arcene CSV and one on the shuffled CSV.  The coefficient table of the
    first model is compared with each of the other two.
    """
    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    reference_rows = h2o_model._model_json['output']['coefficients_table'].cell_values
    _assert_coefficient_rows_match(
        reference_rows, h2o_model_2._model_json['output']['coefficients_table'].cell_values)
    _assert_coefficient_rows_match(
        reference_rows, h2o_model_s._model_json['output']['coefficients_table'].cell_values)


def _assert_coefficient_rows_match(rows_a, rows_b, tolerance=5e-10):
    """Assert two coefficient tables have equal length and matching entries.

    Positions 1 and 2 of each row are compared: their types must match and,
    when they are floats, their values must agree to within ``tolerance``.
    """
    # zip() stops at the shorter input, so check the row counts explicitly;
    # otherwise a model that dropped coefficients would pass unnoticed.
    assert len(rows_a) == len(rows_b), "coefficient tables should have the same number of rows"
    for row_a, row_b in zip(rows_a, rows_b):
        assert (type(row_a[1]) is type(row_b[1])) and (type(row_a[2]) is type(row_b[2])), \
            "coefficients should be the same type"
        for position in (1, 2):
            if isinstance(row_a[position], float):
                assert abs(row_a[position] - row_b[position]) < tolerance, "coefficients should be equal"
开发者ID:AllCodeNoGyaan,项目名称:h2o-3,代码行数:35,代码来源:pyunit_shuffling_glm_large.py
注：本文中的tests.pyunit_utils.locate函数示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论