本文整理汇总了Python中utils.make_classification_data函数的典型用法代码示例。如果您正苦于以下问题：Python make_classification_data函数的具体用法？Python make_classification_data怎么用？Python make_classification_data使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了make_classification_data函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_merge_missing_labels
def test_merge_missing_labels():
    """
    Test to ensure that labels are successfully copied when merging
    a labeled featureset with one that was created without labels.
    """
    # settings common to both feature sets
    shared_kwargs = dict(num_examples=100,
                         num_features=4,
                         num_labels=3,
                         train_test_ratio=1.0)

    # one feature set with labels ...
    labeled_fs, _ = make_classification_data(**shared_kwargs)

    # ... and a differently-prefixed one with no labels specified
    unlabeled_fs, _ = make_classification_data(feature_prefix='g',
                                               empty_labels=True,
                                               **shared_kwargs)

    # merge in both orders; either way the labels of the labeled
    # feature set must come out of the merge unchanged
    merged_both_ways = (labeled_fs + unlabeled_fs, unlabeled_fs + labeled_fs)
    for merged_fs in merged_both_ways:
        assert_array_equal(merged_fs.labels, labeled_fs.labels)
开发者ID:BK-University,项目名称:skll,代码行数:26,代码来源:test_featureset.py
示例2: test_subtract
def test_subtract():
    """
    Test to ensure that subtracting one featureset from another removes
    the shared features and leaves the labels alone.
    """
    # the feature set we subtract from: four features
    minuend_fs, _ = make_classification_data(num_examples=100,
                                             num_features=4,
                                             num_labels=2,
                                             train_test_ratio=1.0,
                                             random_state=1234)

    # the feature set being subtracted: two features that share their
    # names with the first set but carry different values
    subtrahend_fs, _ = make_classification_data(num_examples=100,
                                                num_features=2,
                                                num_labels=2,
                                                train_test_ratio=1.0,
                                                random_state=5678)

    # the features present in the second set should disappear from the
    # first one; nothing else should change
    difference_fs = minuend_fs - subtrahend_fs

    # labels are untouched by the subtraction
    assert_array_equal(difference_fs.labels, minuend_fs.labels)

    # exactly two feature columns remain ...
    num_remaining = difference_fs.features.shape[1]
    eq_(num_remaining, 2)

    # ... and they are the third and fourth features
    remaining_names = np.array(difference_fs.vectorizer.feature_names_)
    assert_array_equal(remaining_names, ['f03', 'f04'])
开发者ID:BK-University,项目名称:skll,代码行数:32,代码来源:test_featureset.py
示例3: check_print_model_weights
def check_print_model_weights(task='classification'):
    """
    Train a small model, save it, run the ``print_model_weights`` entry
    point on the saved file and compare the printed intercept and
    feature weights against the values stored on the trained model.

    ``task`` picks the data/learner pair: ``'classification'`` trains a
    ``LogisticRegression`` learner; any other value trains a
    ``LinearRegression`` learner on regression data.
    """
    is_classification = task == 'classification'

    def weights_sorted_by_name(rows, name_column):
        # every row is tab-separated with the weight in column 0 and the
        # feature name in `name_column`; return the weights ordered by
        # feature name
        named_weights = [(cells[name_column], safe_float(cells[0]))
                         for cells in (row.split('\t') for row in rows)]
        return [weight for _, weight in sorted(named_weights)]

    # create the data and train the matching learner
    if is_classification:
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # persist the trained model
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # run print_model_weights' main() with both output streams swapped
    # for in-memory buffers so that what it prints can be inspected
    argv = [model_file]
    err = ''
    try:
        saved_stderr, saved_stdout = sys.stderr, sys.stdout
        sys.stderr = err_buffer = StringIO()
        sys.stdout = out_buffer = StringIO()
        pmw.main(argv)
        out = out_buffer.getvalue()
        err = err_buffer.getvalue()
    finally:
        # always put the real streams back and surface anything that
        # was written to stderr
        sys.stderr = saved_stderr
        sys.stdout = saved_stdout
        print(err)

    # parse the captured output and compare it with the model
    if is_classification:
        # the first output line is skipped; the next non-empty line
        # carries the intercept in its first tab-separated column and
        # the feature name sits in the third column of the other rows
        rows = [row for row in out.split('\n')[1:] if row]
        intercept = safe_float(rows[0].split('\t')[0])
        feature_values = weights_sorted_by_name(rows[1:], 2)
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        # the first non-empty line has the intercept after an '=' sign
        # and the feature name sits in the second column of the rest
        rows = [row for row in out.split('\n') if row]
        intercept = safe_float(rows[0].split('=')[1])
        feature_values = weights_sorted_by_name(rows[1:], 1)
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
开发者ID:MechCoder,项目名称:skll,代码行数:60,代码来源:test_utilities.py
示例4: test_string_feature
def test_string_feature():
    """
    Test to make sure that a string-valued feature is expanded into
    one binary feature per possible value.
    """
    # Ask for 4 features, one of which is string-valued and can take
    # six values ('a' through 'f'). The resulting featureset should
    # therefore contain 3 numeric + 6 binary columns.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # 100 examples by 9 columns
    eq_(fs.features.shape, (100, 9))

    # three plain names followed by one name per string value
    numeric_names = ['f01', 'f02', 'f03']
    binary_names = ['f04={}'.format(value) for value in 'abcdef']
    eq_(fs.vectorizer.feature_names_, numeric_names + binary_names)

    # every stored value in the last six columns must be 1
    binary_columns = list(range(3, 9))
    assert_array_equal(fs.features[:, binary_columns].data, 1)
开发者ID:BK-University,项目名称:skll,代码行数:27,代码来源:test_featureset.py
示例5: test_learner_api_load_into_existing_instance
def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` replaces an existing instance with the
    saved learner, matching what `Learner.from_file()` produces.
    """
    # train a LinearSVC learner on some data (the test split is unused)
    learner1 = Learner('LinearSVC')
    train_fs, _ = make_classification_data(num_examples=200,
                                           num_features=5,
                                           use_feature_hashing=False,
                                           non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # overwrite that instance in place with a different saved learner;
    # the model file name depends on the major Python version
    model_name = 'test_load_saved_model.{}.model'.format(sys.version_info[0])
    other_model_file = join(_my_dir, 'other', model_name)
    learner1.load(other_model_file)

    # load the same file into a fresh instance via the class method
    learner2 = Learner.from_file(other_model_file)

    # the two instances should now agree on these attributes
    for attribute in ('model_type', 'model_params', 'model_kwargs'):
        eq_(getattr(learner1, attribute), getattr(learner2, attribute))
开发者ID:EducationalTestingService,项目名称:skll,代码行数:27,代码来源:test_classification.py
示例6: check_train_and_score_function
def check_train_and_score_function(model_type):
    """
    Check that `_train_and_score()` yields the same train and test scores
    as training and evaluating a second learner by hand.

    ``model_type`` of ``'classifier'`` selects ``LogisticRegression`` with
    the ``accuracy`` metric; anything else selects ``Ridge`` with
    ``pearson``.
    """
    # build the train and test feature sets
    train_fs, test_fs = make_classification_data(num_examples=500,
                                                 train_test_ratio=0.7,
                                                 num_features=5,
                                                 use_feature_hashing=False,
                                                 non_negative=True)

    # pick the estimator and the metric together
    if model_type == 'classifier':
        estimator_name, metric = 'LogisticRegression', 'accuracy'
    else:
        estimator_name, metric = 'Ridge', 'pearson'

    # scores as computed by the function under test
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1,
                                                 train_fs,
                                                 test_fs,
                                                 metric)

    # reference scores: same learner type, trained without grid search
    # and without shuffling, then evaluated on each split; the metric
    # value is read from the last element of the evaluation results
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_results = learner2.evaluate(train_fs, output_metrics=[metric])
    train_score2 = train_results[-1][metric]
    test_results = learner2.evaluate(test_fs, output_metrics=[metric])
    test_score2 = test_results[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
开发者ID:EducationalTestingService,项目名称:skll,代码行数:29,代码来源:test_classification.py
示例7: check_filter_labels
def check_filter_labels(inverse=False):
    """
    Check that filtering a featureset by label keeps exactly the right
    instances.

    With ``inverse=False`` only instances labeled 0, 1 or 2 should remain;
    with ``inverse=True`` only instances with any other label should
    remain.

    The expected ids are computed from the featureset *before* it is
    filtered. Deriving them from the already-filtered featureset would
    only show that the surviving labels are acceptable, not that every
    instance that should have survived actually did.
    """
    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with 0, 1 and 2 labels
    labels_to_filter = [0, 1, 2]

    # work out, on the unfiltered data, which ids are supposed to remain
    has_listed_label = np.in1d(fs.labels, labels_to_filter)
    if inverse:
        keep_mask = np.logical_not(has_listed_label)
    else:
        keep_mask = has_listed_label
    expected_ids = fs.ids[np.where(keep_mask)]

    # do the actual filtering (modifies `fs` in place)
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # exactly the expected instances are left ...
    assert_array_equal(fs.ids, np.array(expected_ids))

    # ... and every remaining label is on the correct side of the filter
    assert_array_equal(np.in1d(fs.labels, labels_to_filter), not inverse)

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
开发者ID:BK-University,项目名称:skll,代码行数:26,代码来源:test_featureset.py
示例8: make_single_file_featureset_data
def make_single_file_featureset_data():
    """
    Write out the training and test files used by the tests that check
    whether specifying train_file and test_file actually works, plus a
    second test file that only contains a subset of the features.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=False)

    # training feature set
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    NDJWriter(train_path, train_fs).write()

    # full test feature set
    full_test_path = join(_my_dir, 'test', 'test_single_file.jsonlines')
    NDJWriter(full_test_path, test_fs).write()

    # narrow the test feature set down to two features (fewer than the
    # training set has) and write that variant out as well
    test_fs.filter(features=['f01', 'f02'])
    subset_test_path = join(_my_dir,
                            'test',
                            'test_single_file_subset.jsonlines')
    NDJWriter(subset_test_path, test_fs).write()
开发者ID:EducationalTestingService,项目名称:skll,代码行数:26,代码来源:test_classification.py
示例9: test_skll_convert_libsvm_map
def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for
    skll_convert
    """
    other_dir = join(_my_dir, 'other')

    # create some simple classification data and write it out as a
    # libsvm file
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)
    orig_libsvm_file = join(other_dir, 'test_skll_convert_libsvm_map.libsvm')
    LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True).write()

    # work on an independent copy in which the first two columns trade
    # places: first in the vectorizer's vocabulary ...
    swapped_fs = copy.deepcopy(orig_fs)
    vocabulary = swapped_fs.vectorizer.vocabulary_
    del vocabulary['f01']
    del vocabulary['f02']
    vocabulary['f01'] = 1
    vocabulary['f02'] = 0

    # ... and then in the feature matrix itself
    first_column = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = first_column

    # write the swapped feature set out as a MegaM file
    swapped_megam_file = join(other_dir, 'test_skll_convert_libsvm_map.megam')
    MegaMWriter(swapped_megam_file, swapped_fs, quiet=True).write()

    # run skll_convert, reusing the feature mapping found in the first
    # libsvm file, to produce a new libsvm file
    # NOTE(review): the input handed to skll_convert here is
    # `orig_libsvm_file`; `swapped_megam_file` written above is never
    # passed to the converter -- confirm that this is intended.
    converted_libsvm_file = join(other_dir,
                                 'test_skll_convert_libsvm_map2.libsvm')
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', orig_libsvm_file,
                        converted_libsvm_file]

    # call skll_convert's main function with stderr captured
    err = ''
    try:
        saved_stderr = sys.stderr
        sys.stderr = err_buffer = StringIO()
        sk.main(skll_convert_cmd)
        err = err_buffer.getvalue()
    finally:
        # restore the real stderr and surface whatever was captured
        sys.stderr = saved_stderr
        print(err)

    # read the converted file back in; it must equal the original
    # feature set
    converted_fs = LibSVMReader(converted_libsvm_file, quiet=True).read()
    eq_(orig_fs, converted_fs)
开发者ID:MechCoder,项目名称:skll,代码行数:59,代码来源:test_utilities.py
示例10: check_generate_predictions_console
def check_generate_predictions_console(use_threshold=False):
    """
    Check that the ``generate_predictions`` entry point prints the same
    predictions that the learner produces through the API.

    When ``use_threshold`` is true the learner is created with
    ``probability=True``, the API predictions are binarized with a
    threshold of 0.6 and the same threshold is passed on the command line
    via ``-t``.
    """
    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # build the argument list for main() from generate_predictions.py;
    # the option and its value are separate argv entries, exactly as a
    # shell would deliver them, rather than one '-t 0.6' token that only
    # parses thanks to attached-value handling and float() tolerating
    # the leading space
    generate_cmd = []
    if use_threshold:
        generate_cmd.extend(['-t', str(threshold)])
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        # one integer prediction per printed line
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        # always restore the real streams and surface captured stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
开发者ID:MechCoder,项目名称:skll,代码行数:57,代码来源:test_utilities.py
示例11: test_custom_learner_model_loading
def test_custom_learner_model_loading():
    """
    Check that a saved custom-learner model, once loaded again, yields
    the same predictions as the run that trained and saved it.
    """
    num_labels = 10

    # the last label gets weight 0.5; the other labels share the
    # remaining 0.5 equally
    minority_weight = 0.5 / (num_labels - 1)
    class_weights = [minority_weight] * (num_labels - 1) + [0.5]

    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # write the training and the test feature sets to disk
    data_file_name = 'test_model_custom_learner.jsonlines'
    NDJWriter(join(_my_dir, 'train', data_file_name), train_fs).write()
    NDJWriter(join(_my_dir, 'test', data_file_name), test_fs).write()

    # first run: train the custom model and save it
    save_template = join(_my_dir,
                         'configs',
                         'test_model_save_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(save_template), quiet=True)

    # pull the predictions of the first run into memory, then remove
    # the file so that the second run has to produce it again
    outprefix = 'test_model_custom_learner'
    pred_file_name = ('{}_{}_CustomLogisticRegressionWrapper.predictions'
                      .format(outprefix, outprefix))
    pred_file = join(_my_dir, 'output', pred_file_name)
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # second run: load the saved model and predict again
    load_template = join(_my_dir,
                         'configs',
                         'test_model_load_custom_learner.template.cfg')
    run_configuration(fill_in_config_paths(load_template),
                      overwrite=False,
                      quiet=True)

    # the regenerated predictions must match the earlier ones
    preds2 = read_predictions(pred_file)
    assert_array_equal(preds1, preds2)
开发者ID:BK-University,项目名称:skll,代码行数:54,代码来源:test_custom_learner.py
示例12: test_merge_different_vectorizers
def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers

    The merge is expected to raise a ``ValueError``. That expectation is
    asserted explicitly here, so the test passes when the error is raised
    and fails with an ``AssertionError`` when the merge goes through.
    """
    # create a featureset each with a DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)

    # This should raise a ValueError
    try:
        fs1 + fs2
    except ValueError:
        # the merge was rejected, which is the expected outcome
        return
    raise AssertionError('merging featuresets with different vectorizers '
                         'did not raise a ValueError')
开发者ID:BK-University,项目名称:skll,代码行数:20,代码来源:test_featureset.py
示例13: test_length
def test_length():
    """
    Test whether len() on a featureset returns the number of instances
    """
    expected_length = 100

    # build a featureset that holds all of the requested examples
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)
    actual_length = len(fs)
    eq_(actual_length, expected_length)
开发者ID:BK-University,项目名称:skll,代码行数:12,代码来源:test_featureset.py
示例14: test_empty_labels
def test_empty_labels():
    """
    Test to check that a featureset created without labels reports
    NaN for every label
    """
    # ask for a feature set whose labels are left empty
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    label_is_nan = np.isnan(fs.labels)
    assert label_is_nan.all()
开发者ID:BK-University,项目名称:skll,代码行数:12,代码来源:test_featureset.py
示例15: test_write_hashed_featureset
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    # NOTE(review): nothing in this function asserts that the write
    # fails; presumably the expected exception is declared outside of
    # it (e.g. by a decorator) -- confirm.
    hashed_fs, _ = make_classification_data(num_examples=100,
                                            num_features=4,
                                            use_feature_hashing=True,
                                            feature_bins=2,
                                            random_state=1234)

    # attempt to write the hashed featureset to the output directory
    output_path = join(_my_dir, 'output', 'foo.jsonlines')
    NDJWriter(output_path, hashed_fs).write()
开发者ID:EducationalTestingService,项目名称:skll,代码行数:12,代码来源:test_featureset.py
示例16: check_learner_api_grid_search_no_objective
def check_learner_api_grid_search_no_objective(task='train'):
    """
    Call ``train()`` (when ``task`` is ``'train'``) or ``cross_validate()``
    (for any other value) on a ``LogisticRegression`` learner without
    passing a grid search objective.
    """
    # NOTE(review): judging by the name, the call is presumably expected
    # to fail because no objective is given; that expectation is not
    # asserted inside this function -- confirm where it is declared.
    train_fs, _ = make_classification_data(num_examples=500,
                                           train_test_ratio=0.7,
                                           num_features=5,
                                           use_feature_hashing=False,
                                           non_negative=True)
    learner = Learner('LogisticRegression')

    # pick the method under test; its return value is not needed
    run_task = learner.train if task == 'train' else learner.cross_validate
    run_task(train_fs)
开发者ID:EducationalTestingService,项目名称:skll,代码行数:13,代码来源:test_classification.py
示例17: test_all_new_labels_in_test
def test_all_new_labels_in_test():
    """
    Test classification when every label in the test set is unseen

    This is a generator test: each yielded tuple is a check function
    followed by its arguments.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)

    # shift every test label by 3 so that none of them occurs in the
    # training data
    test_fs.labels = test_fs.labels + 3

    svc_learner = Learner('SVC')
    svc_learner.train(train_fs, grid_search=False)
    results = svc_learner.evaluate(test_fs)

    # six labels overall, of which 3, 4 and 5 are the unseen ones
    yield check_results_with_unseen_labels, results, 6, [3, 4, 5]

    # the second element of the results is expected to be (almost) 0
    yield assert_almost_equal, results[1], 0
开发者ID:EducationalTestingService,项目名称:skll,代码行数:14,代码来源:test_classification.py
示例18: test_merge_different_hashers
def test_merge_different_hashers():
    """
    Test to ensure rejection of merging featuresets with different FeatureHashers

    The merge is expected to raise a ``ValueError``. That expectation is
    asserted explicitly here, so the test passes when the error is raised
    and fails with an ``AssertionError`` when the merge goes through.
    """
    # create a feature set with 4 feature hashing bins
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=4)

    # create a second feature set with 3 feature hashing bins
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=3)

    # This should raise a ValueError
    try:
        fs1 + fs2
    except ValueError:
        # the merge was rejected, which is the expected outcome
        return
    raise AssertionError('merging featuresets with different '
                         'FeatureHashers did not raise a ValueError')
开发者ID:BK-University,项目名称:skll,代码行数:23,代码来源:test_featureset.py
示例19: test_new_labels_in_test_set
def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.

    This is a generator test: each yielded tuple is a check function
    followed by its arguments.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)

    # relabel the last three test instances with a label (3) that is
    # not among the three labels requested above
    test_fs.labels[-3:] = 3

    svc_learner = Learner('SVC')
    svc_learner.train(train_fs, grid_search=False)
    results = svc_learner.evaluate(test_fs)

    # four labels overall, of which 3 is the unseen one
    yield check_results_with_unseen_labels, results, 4, [3]

    # the second element of the results is expected to be about 0.3
    yield assert_almost_equal, results[1], 0.3
开发者ID:EducationalTestingService,项目名称:skll,代码行数:14,代码来源:test_classification.py
示例20: check_predict
def check_predict(model, use_feature_hashing=False):
    """
    Check that the predict task runs and produces exactly one prediction
    per sample in the test set.

    ``model`` is read for ``_estimator_type`` (to choose between random
    regression and classification data) and for ``__name__`` (to create
    the learner). ``use_feature_hashing`` is forwarded to the data
    generator, except for ``MultinomialNB``.
    """
    if model._estimator_type == 'regressor':
        # regression data comes back as a 3-tuple; the last item is
        # not needed here
        train_fs, test_fs, _ = make_regression_data(
            use_feature_hashing=use_feature_hashing,
            feature_bins=5)
    else:
        if model.__name__ == 'MultinomialNB':
            # feature hashing will not work for Naive Bayes since it
            # requires non-negative feature values
            data_kwargs = dict(use_feature_hashing=False,
                               non_negative=True)
        else:
            data_kwargs = dict(use_feature_hashing=use_feature_hashing,
                               feature_bins=25)
        train_fs, test_fs = make_classification_data(**data_kwargs)

    # create the learner for the given model and train it without
    # grid search
    learner = Learner(model.__name__)
    learner.train(train_fs, grid_search=False)

    # predict on the test set; there must be as many predictions as
    # there are rows in the test feature matrix
    predictions = learner.predict(test_fs)
    num_test_samples = test_fs.features.shape[0]
    eq_(len(predictions), num_test_samples)
开发者ID:EducationalTestingService,项目名称:skll,代码行数:37,代码来源:test_classification.py
注：本文中的utils.make_classification_data函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论