本文整理汇总了Python中tests.mr_two_step_job.MRTwoStepJob类的典型用法代码示例。如果您正苦于以下问题:Python MRTwoStepJob类的具体用法?Python MRTwoStepJob怎么用?Python MRTwoStepJob使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MRTwoStepJob类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_failed_job
def test_failed_job(self):
    # A job that reaches the ERROR state should raise StepFailedException,
    # log the state transition, and still tear the cluster down on cleanup.
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    mr_job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        # capture mrjob.dataproc log output so we can assert on it below
        stderr = StringIO()
        log_to_stream('mrjob.dataproc', stderr)

        # drive the mock job through SETUP_DONE -> RUNNING -> ERROR
        self._dataproc_client.job_get_advances_states = (
            collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)

            # the ERROR transition should have been logged
            self.assertIn(' => ERROR\n', stderr.getvalue())

            cluster_id = runner.get_cluster_id()

    # job should get terminated (cleanup ran when the runner exited)
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'DELETING')
开发者ID:okomestudio,项目名称:mrjob,代码行数:25,代码来源:test_dataproc.py
示例2: test_attach_to_existing_job_flow
def test_attach_to_existing_job_flow(self):
    # When --emr-job-flow-id is given, the runner should attach to that
    # job flow instead of creating one, and skip the bootstrap script.
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    # canned output for step 1 of the existing job flow
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])
开发者ID:hblanks,项目名称:mrjob,代码行数:30,代码来源:emr_test.py
示例3: test_owner_and_label_switches
def test_owner_and_label_switches(self):
    """The generated job name should embed --label and --owner values."""
    switches = ["--no-conf", "--owner=ads", "--label=ads_chain"]
    job_name = MRTwoStepJob(switches).make_runner().get_job_name()
    name_match = JOB_NAME_RE.match(job_name)
    self.assertEqual(name_match.group(1), "ads_chain")
    self.assertEqual(name_match.group(2), "ads")
开发者ID:pyzen,项目名称:mrjob,代码行数:7,代码来源:test_runner.py
示例4: test_bootstrap_python_comes_before_bootstrap
def test_bootstrap_python_comes_before_bootstrap(self):
    """Python bootstrap commands must precede user --bootstrap commands."""
    job = MRTwoStepJob(['-r', 'dataproc', '--bootstrap', 'true'])
    with job.make_runner() as runner:
        expected = self.EXPECTED_BOOTSTRAP + [['true']]
        self.assertEqual(runner._bootstrap, expected)
开发者ID:Jeremyfanfan,项目名称:mrjob,代码行数:7,代码来源:test_dataproc.py
示例5: test_python_dash_v_as_python_bin
def test_python_dash_v_as_python_bin(self):
    # --python-bin may carry extra switches; "python -v" should spew
    # verbose import noise to task stderr yet still produce correct output.
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                           '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with mr_job.make_runner() as runner:
        runner.run()

        # expect python -v crud in stderr
        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                'import mrjob' in line or  # Python 2
                "import 'mrjob'" in line
                for line in lines))

        # re-open: the previous iteration exhausted the file
        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                '#' in line for line in lines))

        # should still get expected results
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            sorted([b'1\tnull\n', b'1\t"bar"\n']))
开发者ID:Affirm,项目名称:mrjob,代码行数:25,代码来源:test_local.py
示例6: test_dont_take_down_cluster_on_failure
def test_dont_take_down_cluster_on_failure(self):
    # When attaching to an existing cluster via --cluster-id, a failed
    # job must NOT terminate the cluster during cleanup.
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox()

    # drive the mock job through SETUP_DONE -> RUNNING -> ERROR
    self._dataproc_client.job_get_advances_states = collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR'])

    with mr_job.make_runner() as runner:
        self.assertIsInstance(runner, DataprocJobRunner)

        with logger_disabled('mrjob.dataproc'):
            self.assertRaises(StepFailedException, runner.run)

        # cluster should survive the failed job while runner is open
        cluster = self.get_cluster_from_runner(runner, cluster_id)
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'RUNNING')

    # job shouldn't get terminated by cleanup
    cluster = self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id]
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'RUNNING')
开发者ID:Jeremyfanfan,项目名称:mrjob,代码行数:26,代码来源:test_dataproc.py
示例7: test_failed_job
def test_failed_job(self):
    # A failing EMR job should raise, show the flow as FAILED while the
    # runner is open, then terminate the flow on cleanup.
    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path])
    mr_job.sandbox()

    self.add_mock_s3_data({'walrus': {}})
    # make step 0 of the mock job flow fail
    self.mock_emr_failures = {('j-MOCKJOBFLOW0', 0): None}

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        with logger_disabled('mrjob.emr'):
            assert_raises(Exception, runner.run)

        emr_conn = botoemr.EmrConnection()
        job_flow_id = runner.get_emr_job_flow_id()
        # advance the mock job flow until its state settles
        for i in range(10):
            emr_conn.simulate_progress(job_flow_id)

        job_flow = emr_conn.describe_jobflow(job_flow_id)
        assert_equal(job_flow.state, 'FAILED')

    # job should get terminated on cleanup
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')
开发者ID:boursier,项目名称:mrjob,代码行数:30,代码来源:emr_test.py
示例8: test_attach_to_existing_cluster
def test_attach_to_existing_cluster(self):
    # When --cluster-id is given, the runner should attach to that
    # cluster instead of creating one, and skip the bootstrap script.
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    stdin = BytesIO(b'foo\nbar\n')

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        self.put_job_output_parts(runner, [
            b'1\t"bar"\n1\t"foo"\n2\tnull\n'
        ])

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    self.assertEqual(sorted(results),
                     [(1, 'bar'), (1, 'foo'), (2, None)])
开发者ID:Jeremyfanfan,项目名称:mrjob,代码行数:32,代码来源:test_dataproc.py
示例9: test_owner_and_label_switches
def test_owner_and_label_switches(self):
    """The job key should embed the --label and --owner switch values."""
    job = MRTwoStepJob(['--no-conf', '--owner=ads', '--label=ads_chain'])
    key_match = _JOB_KEY_RE.match(job.make_runner().get_job_key())
    self.assertEqual(key_match.group(1), 'ads_chain')
    self.assertEqual(key_match.group(2), 'ads')
开发者ID:anirudhreddy92,项目名称:mrjob,代码行数:7,代码来源:test_runner.py
示例10: test_end_to_end
def test_end_to_end(self):
    # Full local-runner run: mixed input sources, output parsing, and
    # verification that the local tmp dir is cleaned up on exit.
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO("foo\nbar\n")

    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "w") as input_file:
        input_file.write("bar\nqux\n")

    input_gz_path = os.path.join(self.tmp_dir, "input.gz")
    input_gz = gzip.GzipFile(input_gz_path, "w")
    input_gz.write("foo\n")
    input_gz.close()

    mr_job = MRTwoStepJob(["-c", self.mrjob_conf_path, "-", input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        # tmp dir must exist while the runner is still open
        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])
开发者ID:ksho,项目名称:mrjob,代码行数:34,代码来源:local_test.py
示例11: test_end_to_end
def test_end_to_end(self):
    # Full inline-runner run: mixed input sources, output parsing, and
    # verification that the local tmp dir is cleaned up on exit.
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\n')

    input_gz_path = join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(
        ['--runner', 'inline', '-', input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        runner.run()

        results.extend(mr_job.parse_output(runner.cat_output()))

        # tmp dir must exist while the runner is still open
        local_tmp_dir = runner._get_local_tmp_dir()
        assert exists(local_tmp_dir)

    # make sure cleanup happens
    assert not exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
开发者ID:Yelp,项目名称:mrjob,代码行数:34,代码来源:test_inline.py
示例12: test_two_step_job
def test_two_step_job(self):
    # good all-around test. MRTwoStepJob's first step logs counters, but
    # its second step does not
    job = MRTwoStepJob(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        runner.run()

        counters = runner.counters()

        # should have two steps worth of counters, even though it runs as a
        # single Spark job
        self.assertEqual(len(counters), 2)

        # first step counters should be {'count': {'combiners': <int>}}
        self.assertEqual(sorted(counters[0]), ['count'])
        self.assertEqual(sorted(counters[0]['count']), ['combiners'])
        self.assertIsInstance(counters[0]['count']['combiners'], int)

        # second step counters should be empty
        self.assertEqual(counters[1], {})

    # self.log is a mock; join everything passed to log.info()
    log_output = '\n'.join(c[0][0] for c in self.log.info.call_args_list)
    log_lines = log_output.split('\n')

    # should log first step counters but not second step
    self.assertIn('Counters for step 1: 1', log_lines)
    self.assertIn('\tcount', log_output)
    self.assertNotIn('Counters for step 2', log_output)
开发者ID:Yelp,项目名称:mrjob,代码行数:30,代码来源:test_runner.py
示例13: test_bootstrap_python_switch
def test_bootstrap_python_switch(self):
    """--bootstrap-python should enable the default Python bootstrap."""
    job = MRTwoStepJob(["-r", "dataproc", "--bootstrap-python"])
    with job.make_runner() as runner:
        # the opt is on, and both bootstrap views match the expected commands
        self.assertEqual(runner._opts["bootstrap_python"], True)
        self.assertEqual(runner._bootstrap_python(), self.EXPECTED_BOOTSTRAP)
        self.assertEqual(runner._bootstrap, self.EXPECTED_BOOTSTRAP)
开发者ID:davidmarin,项目名称:mrjob,代码行数:7,代码来源:test_dataproc.py
示例14: test_owner_and_label_switches
def test_owner_and_label_switches(self):
    """The job name should pick up the --label and --owner switch values."""
    job = MRTwoStepJob(['--no-conf', '--owner=ads', '--label=ads_chain'])
    name_match = JOB_NAME_RE.match(job.make_runner().get_job_name())
    assert_equal(name_match.group(1), 'ads_chain')
    assert_equal(name_match.group(2), 'ads')
开发者ID:chomp,项目名称:mrjob,代码行数:7,代码来源:runner_test.py
示例15: test_missing_input
def test_missing_input(self):
    """Running against a nonexistent input path should raise IOError."""
    job = MRTwoStepJob(['-r', 'inline', '/some/bogus/file/path'])
    job.sandbox()
    with job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        self.assertRaises(IOError, runner.run)
开发者ID:Yelp,项目名称:mrjob,代码行数:7,代码来源:test_inline.py
示例16: test_default
def test_default(self):
    """bootstrap_python should default to True with the expected commands."""
    job = MRTwoStepJob(['-r', 'dataproc'])
    with job.make_runner() as runner:
        self.assertEqual(runner._opts['bootstrap_python'], True)
        # both the computed and final bootstrap lists match the default
        self.assertEqual(runner._bootstrap_python(), self.EXPECTED_BOOTSTRAP)
        self.assertEqual(runner._bootstrap, self.EXPECTED_BOOTSTRAP)
开发者ID:Jeremyfanfan,项目名称:mrjob,代码行数:8,代码来源:test_dataproc.py
示例17: test_streaming_step_not_okay
def test_streaming_step_not_okay(self):
    """_spark_script_args() should reject a streaming (non-Spark) step."""
    job = MRTwoStepJob()
    job.sandbox()
    with job.make_runner() as runner:
        self.assertRaises(TypeError, runner._spark_script_args, 0)
开发者ID:Yelp,项目名称:mrjob,代码行数:8,代码来源:test_runner.py
示例18: test_end_to_end
def test_end_to_end(self):
    # Full hadoop-runner run against the mock hadoop binary: mixed input
    # sources, HDFS layout checks, extra-arg passthrough, and cleanup.
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])

        # --hadoop-arg values should land in hadoop_extra_args, in order
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
开发者ID:nedrocks,项目名称:mrjob,代码行数:56,代码来源:hadoop_test.py
示例19: test_debugging_works
def test_debugging_works(self):
    """--enable-emr-debugging should prepend the debugging setup step."""
    job = MRTwoStepJob(['-r', 'emr', '-v',
                        '-c', self.mrjob_conf_path,
                        '--enable-emr-debugging'])
    job.sandbox()
    with job.make_runner() as runner:
        runner.run()
        flow = runner.make_emr_conn().describe_jobflow(
            runner._emr_job_flow_id)
        assert_equal(flow.steps[0].name, 'Setup Hadoop Debugging')
开发者ID:hblanks,项目名称:mrjob,代码行数:10,代码来源:emr_test.py
示例20: test_echo_as_steps_python_bin
def test_echo_as_steps_python_bin(self):
    """A bogus --steps-python-bin should make _get_steps() raise ValueError."""
    job = MRTwoStepJob(
        ["--steps", "--steps-python-bin", "echo", "--no-conf",
         "-r", "local"])
    job.sandbox()
    with job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        # MRTwoStepJob pre-populates _steps in the runner; clear it so the
        # runner actually shells out (to `echo`) to fetch the steps, which
        # then fails to parse.
        runner._steps = None
        self.assertRaises(ValueError, runner._get_steps)
开发者ID:alanhdu,项目名称:mrjob,代码行数:11,代码来源:test_local.py
注:本文中的tests.mr_two_step_job.MRTwoStepJob类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论