本文整理汇总了Python中mrjob.job.MRJob类的典型用法代码示例。如果您正苦于以下问题:Python MRJob类的具体用法?Python MRJob怎么用?Python MRJob使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MRJob类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: reducer
def reducer(self, n, vars):
    """Merge per-mapper reservoir samples into one weighted global sample.

    Each mapper emits a JSON-encoded ``[item_count, sample_list]`` pair.
    The reducer weights every mapper's sample set by its share of the
    total item count, then yields that many samples from each set.

    Args:
        n: grouping key from the mappers (used only for grouping).
        vars: iterable of JSON strings, each decoding to
            ``[item_count, sample_list]``.

    Yields:
        ``(1, sample)`` pairs — approximately ``self.options.sample_size``
        of them in total (subject to rounding).
    """
    MRJob.set_status(self, "=============> reducer called")
    samples_from_mappers = []
    counts_from_mappers = []
    # First read all the counts from the different mappers so we know the
    # total number of items and can give each mapper's sample set its
    # appropriate weight.
    total_counts_from_mappers = 0
    for x in vars:
        # renamed from `input` to avoid shadowing the builtin
        record = json.loads(x)
        total_counts_from_mappers += record[0]
        counts_from_mappers.append(record[0])
        samples_from_mappers.append(record[1])
    # Now, based on how many items each mapper saw, draw a proportional
    # number of samples from that mapper's sample set.  zip() replaces
    # the original hand-maintained index counter.
    for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
        weight = count * 1.0 / total_counts_from_mappers
        number_of_needed_samples = int(round(weight * self.options.sample_size))
        for _ in range(number_of_needed_samples):
            yield 1, sample_set.pop()
开发者ID:ddehghan,项目名称:machine_learning_class,代码行数:28,代码来源:mrSample.py
示例2: main
def main(cl_args=None):
    """Entry point: terminate idle EMR clusters per command-line options.

    Args:
        cl_args: list of command-line arguments, or None for sys.argv.
    """
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # Translate the deprecated --max-hours-idle into minutes, but only
    # when --max-mins-idle was not given explicitly.
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60

    # This switch no longer has any effect; warn so users stop passing it.
    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
开发者ID:okomestudio,项目名称:mrjob,代码行数:29,代码来源:terminate_idle_clusters.py
示例3: test_cmd_line_options
def test_cmd_line_options(self):
    """When --partitioner is repeated, the last value wins."""
    args = [
        "--partitioner", "java.lang.Object",
        "--partitioner", "org.apache.hadoop.mapreduce.Partitioner",
    ]
    job = MRJob(args)
    # second option takes priority
    self.assertEqual(
        job.job_runner_kwargs()["partitioner"],
        "org.apache.hadoop.mapreduce.Partitioner")
开发者ID:ndimiduk,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例4: reducer_final
def reducer_final(self):
    """Emit each stratum's sample count together with its samples.

    Yields:
        ``(label, (count, samples))`` for every label accumulated in
        ``self.output``.
    """
    MRJob.set_status(self, "=============> reducer final called")
    for label, stratum_samples in self.output.items():
        yield label, (len(stratum_samples), stratum_samples)
开发者ID:marionleborgne,项目名称:machine_learning,代码行数:7,代码来源:mrStratify.py
示例5: test_spark
def test_spark(self):
    """Running with --spark dispatches to spark() with the I/O dirs."""
    mr_job = MRJob(["--spark", "input_dir", "output_dir"])
    mr_job.spark = MagicMock()
    mr_job.execute()
    mr_job.spark.assert_called_once_with("input_dir", "output_dir")
开发者ID:davidmarin,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例6: mapper_final
def mapper_final(self):
    """Emit this mapper's item count and samples as one JSON record.

    Yields:
        ``(1, json_string)`` where the string decodes to
        ``[self.count, self.samples]``; the shared key 1 routes all
        mapper outputs to a single reducer.
    """
    MRJob.set_status(self, "=============> mapper final called")
    yield 1, json.dumps([self.count, self.samples])
开发者ID:AshKash,项目名称:kit-sink,代码行数:7,代码来源:mrSample.py
示例7: test_bytes_value_protocol
def test_bytes_value_protocol(self):
    """BytesValueProtocol parses a line into (None, raw_bytes)."""
    mr_job = MRJob()
    mr_job.OUTPUT_PROTOCOL = BytesValueProtocol
    parsed = mr_job.parse_output_line(b'one two\n')
    self.assertEqual(parsed, (None, b'one two\n'))
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例8: test_spark_method
def test_spark_method(self):
    """Defining spark() yields exactly one spark step with no args."""
    job = MRJob(["--no-conf"])
    job.spark = MagicMock()
    self.assertEqual(job.steps(), [SparkStep(job.spark)])
    self.assertEqual(
        job._steps_desc(), [{"type": "spark", "spark_args": []}])
开发者ID:davidmarin,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例9: test_default_protocol
def test_default_protocol(self):
    """parse_output() reassembles split chunks into JSON key/value pairs."""
    job = MRJob()
    # Chunks deliberately split mid-line to exercise line reassembly.
    chunks = [b'1\t2', b'\n{"3": ', b'4}\t"fi', b've"\n']
    parsed = list(job.parse_output(iter(chunks)))
    self.assertEqual(parsed, [(1, 2), ({'3': 4}, 'five')])
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例10: main
def main(args):
    """Collect and report stats about active EMR jobflows.

    Args:
        args: command-line arguments (options only; positional args are
            rejected).

    Prints the collected stats as pretty text (with --pretty-print) or
    as a single JSON object.
    """
    # parse command-line args
    usage = '%prog [options]'
    # NOTE: the original built this with += lines that lacked trailing
    # spaces, producing run-together help text ("jobflowsand total...");
    # the implicit-concatenation literal below restores the spacing.
    description = (
        "Collect EMR stats from active jobflows. "
        "Active jobflows are those in states of: "
        "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
        "Collected stats include total number of active jobflows "
        "and total number of Amazon EC2 instances used to execute "
        "these jobflows. The instance counts are not separated by "
        "instance type."
    )
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)
    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)
    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)

    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
开发者ID:tempcyc,项目名称:mrjob,代码行数:31,代码来源:collect_emr_stats.py
示例11: main
def main(args=None):
    """Report EMR jobs that have run longer than the --min-hours cutoff.

    Args:
        args: command-line arguments, or None for sys.argv.
    """
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    # Apply exclusion filters only when some were given.
    if options.exclude:
        cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, cluster_summaries, min_time, now=now)

    _print_report(job_info)
开发者ID:Affirm,项目名称:mrjob,代码行数:27,代码来源:report_long_jobs.py
示例12: __init__
def __init__(self, *args, **kwargs):
    """Initialize the job, then fetch and prepare the KBA entity list.

    Downloads the entity-urlnames JSON from S3 at construction time and
    stores the prepared representations on the instance for later steps.
    """
    MRJob.__init__(self, *args, **kwargs)
    ## load entities from json file
    log("loading entity list")
    # NOTE(review): urllib.urlopen is Python 2 only; this snippet predates
    # Python 3's urllib.request.  Network access happens here, so job
    # construction fails if S3 is unreachable — presumably intentional.
    entities = json.load(urllib.urlopen("https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json"))
    self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
开发者ID:SHENbeyond,项目名称:kba-tools,代码行数:7,代码来源:toy_kba_mrjob.py
示例13: reducer
def reducer(self, n, vars):
    """Write a weighted sample drawn from all mappers to output.txt.

    Like the yielding variant, each mapper sends a JSON-encoded
    ``[item_count, sample_list]`` pair; this reducer instead writes the
    proportionally weighted samples to ``PROJECT_ROOT/output.txt``.

    Args:
        n: grouping key from the mappers (used only for grouping).
        vars: iterable of JSON strings, each decoding to
            ``[item_count, sample_list]``.
    """
    MRJob.set_status(self, "=============> reducer called")
    # Debug print converted from the Python 2 print statement to the
    # print() function used by the rest of this codebase.
    print("reducer:", vars)
    samples_from_mappers = []
    counts_from_mappers = []
    # First read all the counts from the different mappers so we know the
    # total number of items and can give each mapper's sample set its
    # appropriate weight.
    total_counts_from_mappers = 0
    for x in vars:
        # renamed from `input` to avoid shadowing the builtin
        record = json.loads(x)
        total_counts_from_mappers += record[0]
        counts_from_mappers.append(record[0])
        samples_from_mappers.append(record[1])
    # Based on how many items each mapper saw, write a proportional
    # number of its samples.  `with` guarantees the file is closed even
    # if an error occurs mid-write (the original could leak the handle).
    with open(os.path.join(PROJECT_ROOT, 'output.txt'), "w") as file_out:
        for count, sample_set in zip(counts_from_mappers,
                                     samples_from_mappers):
            weight = count * 1.0 / total_counts_from_mappers
            number_of_needed_samples = int(
                round(weight * self.options.sample_size))
            for _ in range(number_of_needed_samples):
                file_out.write(str(sample_set.pop()) + '\n')
    # Unreachable yield keeps this method a generator, which mrjob
    # expects of reducers even when all output goes to a file.
    if False:
        yield 1, 2
开发者ID:AshKash,项目名称:kit-sink,代码行数:35,代码来源:mrSample.py
示例14: test_wrong_type_of_step
def test_wrong_type_of_step(self):
    """Streaming runners raise TypeError on a job whose step is spark."""
    job = MRJob()
    job.spark = MagicMock()
    # None of the streaming substeps may run on a spark-step job.
    for run_substep in (job.run_mapper, job.run_combiner, job.run_reducer):
        self.assertRaises(TypeError, run_substep)
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例15: test_deprecated_mapper_final_positional_arg
def test_deprecated_mapper_final_positional_arg(self):
    """mr() accepts positional mapper_final with a warning, not both forms."""
    def mapper(k, v):
        pass

    def reducer(k, v):
        pass

    def mapper_final():
        pass

    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.job', stderr)
        step = MRJob.mr(mapper, reducer, mapper_final)

    # Positional mapper_final is allowed and equivalent to the keyword
    # form, but it logs a deprecation warning.
    expected = MRJob.mr(
        mapper=mapper, reducer=reducer, mapper_final=mapper_final)
    self.assertEqual(step, expected)
    self.assertIn('mapper_final should be specified', stderr.getvalue())

    # Supplying mapper_final both positionally and by keyword is an error.
    self.assertRaises(
        TypeError,
        MRJob.mr, mapper, reducer, mapper_final, mapper_final=mapper_final)
开发者ID:bchess,项目名称:mrjob,代码行数:31,代码来源:test_job.py
示例16: test_empty
def test_empty(self):
    """With no options, hadoop input/output formats default to None."""
    runner_kwargs = MRJob()._runner_kwargs()
    self.assertEqual(runner_kwargs['hadoop_input_format'], None)
    self.assertEqual(runner_kwargs['hadoop_output_format'], None)
开发者ID:okomestudio,项目名称:mrjob,代码行数:7,代码来源:test_job.py
示例17: test_mr
def test_mr(self):
    """MRJob.mr() matches stepdict() for every argument combination."""
    def mapper(k, v):
        pass

    def mapper_init():
        pass

    def mapper_final():
        pass

    def reducer(k, vs):
        pass

    def reducer_init():
        pass

    def reducer_final():
        pass

    init_final_kwargs = dict(
        mapper_init=mapper_init,
        mapper_final=mapper_final,
        reducer_init=reducer_init,
        reducer_final=reducer_final,
    )

    # make sure it returns the format we currently expect
    self.assertEqual(MRJob.mr(mapper, reducer),
                     stepdict(mapper, reducer))
    self.assertEqual(MRJob.mr(mapper, reducer, **init_final_kwargs),
                     stepdict(mapper, reducer, **init_final_kwargs))
    self.assertEqual(MRJob.mr(mapper),
                     stepdict(mapper))
开发者ID:DrMavenRebe,项目名称:mrjob,代码行数:35,代码来源:test_job.py
示例18: test_verbose
def test_verbose(self):
    """verbose=True lets DEBUG-level messages through to stderr."""
    with patch.object(sys, 'stderr', StringIO()) as stderr:
        MRJob.set_up_logging(verbose=True)
        logger = logging.getLogger('__main__')
        logger.info('INFO')
        logger.debug('DEBUG')
        self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
开发者ID:Yelp,项目名称:mrjob,代码行数:7,代码来源:test_launch.py
示例19: main
def main(cl_args=None):
    """Entry point for mrjob's spark-submit emulation.

    Args:
        cl_args: command-line arguments, or None for sys.argv.
    """
    parser = _make_arg_parser()
    options = parser.parse_args(cl_args)

    runner_class = _runner_class(options.runner or _DEFAULT_RUNNER)

    # With --help, or no script/jar to run, print usage and stop.
    if options.help or not options.script_or_jar:
        _print_help(options, runner_class)
        sys.exit(0)

    MRJob.set_up_logging(
        quiet=options.quiet,
        verbose=options.verbose,
    )

    # Assemble runner kwargs: per-runner opts, hard-coded overrides, a
    # dummy input path, and the single step built from the command line.
    kwargs = _get_runner_opt_kwargs(options, runner_class)
    kwargs.update(_HARD_CODED_OPTS)
    kwargs['input_paths'] = [os.devnull]
    kwargs['steps'] = [_get_step(options, parser, cl_args).description()]

    runner = runner_class(**kwargs)
    try:
        runner.run()
    finally:
        # Always clean up runner resources, even if the run fails.
        runner.cleanup()
开发者ID:Affirm,项目名称:mrjob,代码行数:30,代码来源:spark_submit.py
示例20: test_default_options
def test_default_options(self):
    """Default logging emits INFO to stderr but filters out DEBUG."""
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging()
            logger = logging.getLogger('__main__')
            logger.info('INFO')
            logger.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\n')
开发者ID:etiennebatise,项目名称:mrjob,代码行数:8,代码来源:test_launch.py
注:本文中的mrjob.job.MRJob类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论