This page collects typical usage examples of the Python function mrjob.py2.to_string. If you have been wondering what exactly to_string does, how to call it, and what real-world uses look like, the hand-picked examples below should help.
Below are 20 code examples of to_string, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
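Before the examples: to_string is mrjob's small Python 2/3 compatibility helper. Roughly speaking (this is a behavioral sketch, not mrjob's exact implementation), it decodes bytes to a native str (UTF-8 assumed) and passes str values through unchanged:

from mrjob.py2 import to_string

to_string(b'counter_name')    # -> 'counter_name' (bytes are decoded)
to_string('already a str')    # -> 'already a str' (str passes through)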
Example 1: _parse_counters_0_18
def _parse_counters_0_18(counter_string):
    # 0.18 counters look like this:
    # GroupName.CounterName:Value,Group1.Crackers:3,Group2.Nerf:243,...
    # (finditer() never returns None, so materialize the matches and
    # test for emptiness instead)
    matches = list(_COUNTER_RE_0_18.finditer(counter_string))
    if not matches:
        log.warning("Cannot parse Hadoop counter string: %s" % counter_string)

    for m in matches:
        yield (to_string(m.group("group")),
               to_string(m.group("name")),
               int(m.group("value")))
Developer ID: tempcyc, Project: mrjob, Lines of code: 9, Source file: parse.py
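A quick usage sketch for the generator above. The regex is a hypothetical stand-in for mrjob's internal _COUNTER_RE_0_18, written only to mirror the GroupName.CounterName:Value format from the comment; it assumes the function above and mrjob's to_string are in scope.

import re
from mrjob.py2 import to_string

# hypothetical stand-in for mrjob's internal _COUNTER_RE_0_18
_COUNTER_RE_0_18 = re.compile(
    br'(?P<group>[^,.]+)\.(?P<name>[^,:]+):(?P<value>\d+)')

for group, name, value in _parse_counters_0_18(b'Group1.Crackers:3,Group2.Nerf:243'):
    print(group, name, value)   # Group1 Crackers 3, then Group2 Nerf 243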
Example 2: invoke_hadoop
def invoke_hadoop(self, args, ok_returncodes=None, ok_stderr=None,
                  return_stdout=False):
    """Run the given hadoop command, raising an exception on non-zero
    return code. This only works for commands whose output we don't
    care about.

    Args:
    ok_returncodes -- a list/tuple/set of return codes we expect to
        get back from hadoop (e.g. [0, 1]). By default, we only expect 0.
        If we get an unexpected return code, we raise a CalledProcessError.
    ok_stderr -- don't log STDERR or raise CalledProcessError if stderr
        matches a regex in this list (even if the returncode is bad)
    return_stdout -- return the stdout from the hadoop command rather
        than logging it. If this is False, we return the returncode
        instead.
    """
    args = self._hadoop_bin + args

    log.debug('> %s' % cmd_line(args))

    proc = Popen(args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()

    log_func = log.debug if proc.returncode == 0 else log.error
    if not return_stdout:
        for line in BytesIO(stdout):
            log_func('STDOUT: ' + to_string(line.rstrip(b'\r\n')))

    # check if STDERR is okay
    stderr_is_ok = False
    if ok_stderr:
        for stderr_re in ok_stderr:
            if stderr_re.match(stderr):
                stderr_is_ok = True
                break

    if not stderr_is_ok:
        for line in BytesIO(stderr):
            log_func('STDERR: ' + to_string(line.rstrip(b'\r\n')))

    ok_returncodes = ok_returncodes or [0]

    if not stderr_is_ok and proc.returncode not in ok_returncodes:
        raise CalledProcessError(proc.returncode, args)

    if return_stdout:
        return stdout
    else:
        return proc.returncode
Developer ID: DanisHack, Project: mrjob, Lines of code: 49, Source file: hadoop.py
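A hedged usage sketch: fs here is a hypothetical runner/filesystem object exposing the invoke_hadoop() method above, and the "No such file" regex is illustrative rather than mrjob's actual _HADOOP_LS_NO_SUCH_FILE pattern.

import re
from io import BytesIO
from mrjob.py2 import to_string

# tolerate hadoop's complaint when the directory doesn't exist yet
stdout = fs.invoke_hadoop(
    ['fs', '-ls', '/user/hadoop/output'],
    return_stdout=True,
    ok_stderr=[re.compile(br'ls: .*No such file')])

for line in BytesIO(stdout):
    print(to_string(line.rstrip(b'\r\n')))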
Example 3: find_hadoop_java_stack_trace
def find_hadoop_java_stack_trace(lines):
    """Scan a log file or other iterable for a java stack trace from Hadoop,
    and return it as a list of lines (bytes).

    In logs from EMR, we find java stack traces in ``task-attempts/*/syslog``

    Sample stack trace::

        2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child
        java.lang.OutOfMemoryError: Java heap space
                at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:270)
                at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:332)
                at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:147)
                at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:238)
                at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:255)
                at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:86)
                at org.apache.hadoop.mapred.Merger$MergeQueue.merge(Merger.java:377)
                at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)
                at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:277)
                at org.apache.hadoop.mapred.TaskTracker$Child.main(TaskTracker.java:2216)

    (We omit the "Error running child" line from the results)
    """
    for line in lines:
        if line.rstrip(b'\r\n').endswith(b"Error running child"):
            st_lines = []
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if not line.startswith(b'        at '):
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    else:
        return None
Developer ID: Roguelazer, Project: mrjob, Lines of code: 35, Source file: parse.py
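For intuition, a minimal sketch of calling the function above (assumed in scope). Two details matter: it needs a shared iterator rather than a list, because it consumes lines from nested loops, and stack frames must carry the eight-space "at ..." indent shown in the docstring. The input lines are made up:

log_lines = iter([
    b'2010-07-27 18:25:48,397 WARN org.apache.hadoop.mapred.TaskTracker (main): Error running child\n',
    b'java.lang.OutOfMemoryError: Java heap space\n',
    b'        at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)\n',
    b'2010-07-27 18:25:49,001 INFO some unrelated line\n',
])
find_hadoop_java_stack_trace(log_lines)
# -> ['java.lang.OutOfMemoryError: Java heap space\n',
#     '        at org.apache.hadoop.mapred.Merger.merge(Merger.java:58)\n']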
Example 4: _run_job_in_hadoop
def _run_job_in_hadoop(self):
    self._counters = []

    for step_num in range(self._num_steps()):
        log.debug("running step %d of %d" % (step_num + 1, self._num_steps()))

        step_args = self._args_for_step(step_num)

        log.debug("> %s" % cmd_line(step_args))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen
            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            self._process_stderr_from_streaming(step_proc.stderr)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                log.error("STDOUT: " + to_string(line.strip(b"\n")))

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                with os.fdopen(master_fd, "rb") as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    self._process_stderr_from_streaming(master)
                    _, returncode = os.waitpid(pid, 0)

        if returncode == 0:
            # parsing needs step number for whole job
            self._fetch_counters([step_num + self._start_step_num])
            # printing needs step number relevant to this run of mrjob
            self.print_counters([step_num + 1])
        else:
            msg = "Job failed with return code %d: %s" % (returncode, step_args)
            log.error(msg)

            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append("Probable cause of failure (from %s):" %
                                 cause["log_file_uri"])
                cause_msg.extend(line.strip("\n") for line in cause["lines"])
                if cause["input_uri"]:
                    cause_msg.append("(while reading from %s)" %
                                     cause["input_uri"])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += "\n" + "\n".join(cause_msg) + "\n"

            raise CalledProcessError(returncode, step_args)
Developer ID: senseb, Project: mrjob, Lines of code: 60, Source file: hadoop.py
Example 5: _process_stderr_from_streaming
def _process_stderr_from_streaming(self, stderr):

    def treat_eio_as_eof(iter):
        # on Linux, the PTY gives us a specific IOError when the
        # child process exits, rather than EOF
        while True:
            try:
                yield next(iter)  # okay for StopIteration to bubble up
            except IOError as e:
                if e.errno == errno.EIO:
                    return
                else:
                    raise

    for line in treat_eio_as_eof(stderr):
        line = HADOOP_STREAMING_OUTPUT_RE.match(line).group(2)
        log.info("HADOOP: " + to_string(line))

        if b"Streaming Job Failed!" in line:
            raise Exception(line)

        # The job identifier is printed to stderr. We only want to parse it
        # once because we know how many steps we have and just want to know
        # what Hadoop thinks the first step's number is.
        m = HADOOP_JOB_TIMESTAMP_RE.match(line)
        if m and self._job_timestamp is None:
            self._job_timestamp = m.group("timestamp")
            self._start_step_num = int(m.group("step_num"))
Developer ID: ZhouYunan, Project: mrjob, Lines of code: 27, Source file: hadoop.py
Example 6: _cat_log
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        for line in fs.cat(path):
            yield to_string(line)
    except IOError as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Developer ID: parastoo-62, Project: mrjob, Lines of code: 8, Source file: interpret.py
Example 7: stderr_to_log
def stderr_to_log(lines):
    for line in lines:
        line = to_string(line)
        if _HADOOP_NON_LOG_LINE_RE.match(line):
            # use error because this is usually "Streaming Command Failed!"
            _log_line_from_hadoop(line, level=logging.ERROR)
        else:
            yield line
Developer ID: BeeswaxIO, Project: mrjob, Lines of code: 8, Source file: hadoop.py
Example 8: parse_mr_job_stderr
def parse_mr_job_stderr(stderr, counters=None):
    """Parse counters and status messages out of MRJob output.

    :param stderr: a filehandle, a list of lines (bytes), or bytes
    :param counters: Counters so far, to update; a map from group (string)
                     to counter name (string) to count.

    Returns a dictionary with the keys *counters*, *statuses*, *other*:

    - *counters*: counters so far; same format as above
    - *statuses*: a list of status messages encountered
    - *other*: lines (strings) that aren't either counters or status messages
    """
    # For the corresponding code in Hadoop Streaming, see ``incrCounter()`` in
    # http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/streaming/src/java/org/apache/hadoop/streaming/PipeMapRed.java?view=markup # noqa
    if isinstance(stderr, bytes):
        stderr = BytesIO(stderr)

    if counters is None:
        counters = {}
    statuses = []
    other = []

    for line in stderr:
        m = _COUNTER_RE.match(line.rstrip(b'\r\n'))
        if m:
            group, counter, amount_str = m.groups()

            # don't leave these as bytes on Python 3
            group = to_string(group)
            counter = to_string(counter)

            counters.setdefault(group, {})
            counters[group].setdefault(counter, 0)
            counters[group][counter] += int(amount_str)
            continue

        m = _STATUS_RE.match(line.rstrip(b'\r\n'))
        if m:
            # don't leave as bytes on Python 3
            statuses.append(to_string(m.group(1)))
            continue

        other.append(to_string(line))

    return {'counters': counters, 'statuses': statuses, 'other': other}
Developer ID: Roguelazer, Project: mrjob, Lines of code: 46, Source file: parse.py
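parse_mr_job_stderr consumes the reporter protocol that Hadoop Streaming understands (the "reporter:counter:group,counter,amount" and "reporter:status:message" lines handled in the code above). A small, self-contained usage sketch:

from mrjob.parse import parse_mr_job_stderr

data = (b'reporter:counter:wordcount,lines,8\n'
        b'reporter:status:processing split 2\n'
        b'ordinary log chatter\n')

result = parse_mr_job_stderr(data)
# result['counters'] == {'wordcount': {'lines': 8}}
# result['statuses'] == ['processing split 2']
# result['other'] == ['ordinary log chatter\n']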
Example 9: _cat_log
def _cat_log(fs, path):
    """fs.cat() the given log, converting lines to strings, and logging
    errors."""
    try:
        if not fs.exists(path):
            return
        for line in fs.cat(path):
            yield to_string(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
Developer ID: davidmarin, Project: mrjob, Lines of code: 10, Source file: wrap.py
Example 10: yield_lines
def yield_lines():
    try:
        for line in stderr:
            yield to_string(line)
    except IOError as e:
        # this is just the PTY's way of saying goodbye
        if e.errno == errno.EIO:
            return
        else:
            raise
Developer ID: imtiaz39, Project: mrjob, Lines of code: 10, Source file: step.py
Example 11: ls
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1 3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_string(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
Developer ID: kodizant, Project: mrjob, Lines of code: 55, Source file: hadoop.py
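The path-detection trick above scans for the HH:MM time field and takes everything after it, which tolerates the differing column counts of HDFS and S3 listings. A standalone sketch of just that heuristic, using the HDFS sample line from the comment:

line = b'-rw-r--r--   3 dave users 3276 2010-01-13 14:00 /foo/bar'
fields = line.split(b' ')

path_index = None
for index, field in enumerate(fields):
    # a field like b'14:00' is 5 bytes long with b':' in the middle
    if len(field) == 5 and field[2:3] == b':':
        path_index = index + 1

print(line.split(b' ', path_index)[-1])   # b'/foo/bar'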
Example 12: find_python_traceback
def find_python_traceback(lines):
    """Scan a log file or other iterable for a Python traceback,
    and return it as a list of lines (bytes).

    In logs from EMR, we find python tracebacks in ``task-attempts/*/stderr``
    """
    # Essentially, we detect the start of the traceback, and continue
    # until we find a non-indented line, with some special rules for
    # exceptions from subprocesses.

    # Lines to pass back representing entire error found
    all_tb_lines = []

    # This is used to store a working list of lines in a single traceback
    tb_lines = []

    # This is used to store a working list of non-traceback lines between
    # the current traceback and the previous one
    non_tb_lines = []

    # Track whether or not we are in a traceback rather than consuming the
    # iterator
    in_traceback = False

    for line in lines:
        # don't return bytes in Python 3
        line = to_string(line)

        if in_traceback:
            tb_lines.append(line)

            # If no indentation, this is the last line of the traceback
            if line.lstrip() == line:
                in_traceback = False

                if line.startswith('subprocess.CalledProcessError'):
                    # CalledProcessError may mean that the subprocess printed
                    # errors to stderr which we can show the user
                    all_tb_lines += non_tb_lines

                all_tb_lines += tb_lines

                # Reset all working lists
                tb_lines = []
                non_tb_lines = []
        else:
            if line.startswith('Traceback (most recent call last):'):
                tb_lines.append(line)
                in_traceback = True
            else:
                non_tb_lines.append(line)

    if all_tb_lines:
        return all_tb_lines
    else:
        return None
Developer ID: Roguelazer, Project: mrjob, Lines of code: 55, Source file: parse.py
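A minimal sketch of feeding the function a made-up traceback (assuming it is importable from mrjob.parse in the mrjob version these snippets come from). The leading chatter line is dropped because the final exception is not a subprocess.CalledProcessError:

from mrjob.parse import find_python_traceback

lines = iter([
    b'pre-traceback log chatter\n',
    b'Traceback (most recent call last):\n',
    b'  File "mr_word_count.py", line 10, in <module>\n',
    b'    main()\n',
    b'ValueError: bad input\n',
])
find_python_traceback(lines)
# -> ['Traceback (most recent call last):\n',
#     '  File "mr_word_count.py", line 10, in <module>\n',
#     '    main()\n',
#     'ValueError: bad input\n']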
Example 13: cleanup
def cleanup():
    # this does sometimes happen; see #1396
    for line in cat_proc.stderr:
        log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

    cat_proc.stdout.close()
    cat_proc.stderr.close()

    returncode = cat_proc.wait()

    if returncode != 0:
        raise IOError("Could not stream %s" % filename)
Developer ID: davidmarin, Project: mrjob, Lines of code: 12, Source file: hadoop.py
Example 14: ssh_terminate_single_job
def ssh_terminate_single_job(ssh_bin, address, ec2_key_pair_file):
    """Terminate the only job running the Hadoop cluster with master node
    *address* using 'hadoop job -kill JOB_ID'. Raise :py:class:`IOError` if
    some other error occurred.

    :param ssh_bin: Path to ``ssh`` binary
    :param address: Address of your job's master node (obtained via
                    :py:meth:`boto.emr.EmrConnection.describe_jobflow`)
    :param ec2_key_pair_file: Path to the key pair file (argument to ``-i``)

    :return: string output of the kill command, or ``None`` if there was no
             job to terminate
    """
    job_list_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])))
    job_list_lines = job_list_out.splitlines()

    def job_list_output_error():
        raise IOError('Could not read results of "hadoop job -list" and so'
                      ' could not terminate job:\n%s' % job_list_out)

    num_jobs_match = HADOOP_JOB_LIST_NUM_RE.match(job_list_lines[0])
    if not num_jobs_match:
        job_list_output_error()

    if int(num_jobs_match.group(1)) > 1:
        raise IOError('More than one job is running; unclear which one to'
                      ' terminate, so not terminating any jobs')

    if int(num_jobs_match.group(1)) == 0:
        return None

    job_info_match = HADOOP_JOB_LIST_INFO_RE.match(job_list_lines[2])
    if not job_info_match:
        job_list_output_error()

    job_id = to_string(job_info_match.group(1))

    job_kill_out = to_string(check_output(*ssh_run(
        ssh_bin, address, ec2_key_pair_file,
        ['hadoop', 'job', '-kill', job_id])))

    return job_kill_out
Developer ID: DanisHack, Project: mrjob, Lines of code: 40, Source file: ssh.py
Example 15: _run_on_all_nodes
def _run_on_all_nodes(runner, output_dir, cmd_args, print_stderr=True):
    """Given an :py:class:`EMRJobRunner`, run the command specified by
    *cmd_args* on all nodes in the cluster and save the stdout and stderr of
    each run to subdirectories of *output_dir*.

    You should probably have run :py:meth:`_enable_slave_ssh_access()` on the
    runner before calling this function.
    """
    master_addr = runner._address_of_master()
    addresses = [master_addr]

    ssh_bin = runner._opts['ssh_bin']
    ec2_key_pair_file = runner._opts['ec2_key_pair_file']

    keyfile = None
    slave_addrs = runner.fs.ssh_slave_hosts(master_addr)

    if slave_addrs:
        addresses += ['%s!%s' % (master_addr, slave_addr)
                      for slave_addr in slave_addrs]
        # copying key file like a boss (name of keyfile doesn't really matter)
        keyfile = 'mrboss-%s.pem' % random_identifier()
        _ssh_copy_key(ssh_bin, master_addr, ec2_key_pair_file, keyfile)

    for addr in addresses:
        stdout, stderr = _ssh_run_with_recursion(
            ssh_bin,
            addr,
            ec2_key_pair_file,
            keyfile,
            cmd_args,
        )

        if print_stderr:
            print('---')
            print('Command completed on %s.' % addr)
            print(to_string(stderr), end=' ')

        if '!' in addr:
            base_dir = os.path.join(output_dir, 'slave ' + addr.split('!')[1])
        else:
            base_dir = os.path.join(output_dir, 'master')

        if not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(os.path.join(base_dir, 'stdout'), 'wb') as f:
            f.write(stdout)

        with open(os.path.join(base_dir, 'stderr'), 'wb') as f:
            f.write(stderr)
Developer ID: davidmarin, Project: mrjob, Lines of code: 52, Source file: mrboss.py
Example 16: _ssh_slave_addresses
def _ssh_slave_addresses(ssh_bin, master_address, ec2_key_pair_file):
    """Get the IP addresses of the slave nodes. Fails silently because it
    makes testing easier and if things are broken they will fail before this
    function is called.
    """
    if not ec2_key_pair_file or not os.path.exists(ec2_key_pair_file):
        return []  # this is a testing environment

    cmd = "hadoop dfsadmin -report | grep ^Name | cut -f2 -d: | cut -f2 -d' '"
    args = ['bash -c "%s"' % cmd]
    ips = to_string(_check_output(
        *_ssh_run(ssh_bin, master_address, ec2_key_pair_file, args)))
    return [ip for ip in ips.split('\n') if ip]
Developer ID: pieces201020, Project: mrjob, Lines of code: 13, Source file: ssh.py
Example 17: _parse_counters_0_20
def _parse_counters_0_20(counter_string):
    # 0.20 counters look like this:
    # {(groupid)(groupname)[(counterid)(countername)(countervalue)][...]...}
    groups = _GROUP_RE_0_20.findall(counter_string)
    if not groups:
        log.warning('Cannot parse Hadoop counter string: %s' % counter_string)

    for group_id, group_name, counter_str in groups:
        matches = _COUNTER_RE_0_20.findall(counter_str)

        try:
            group_name = counter_unescape(group_name)
        except ValueError:
            log.warning("Could not decode group name %r" % group_name)
            group_name = to_string(group_name)

        for counter_id, counter_name, counter_value in matches:
            try:
                counter_name = counter_unescape(counter_name)
            except ValueError:
                log.warning("Could not decode counter name %r" % counter_name)
                counter_name = to_string(counter_name)

            yield group_name, counter_name, int(counter_value)
Developer ID: Roguelazer, Project: mrjob, Lines of code: 24, Source file: parse.py
Example 18: get_hadoop_version
def get_hadoop_version(self):
    """Invoke the hadoop executable to determine its version"""
    # mkdir() needs this
    if not self._hadoop_version:
        stdout = self.invoke_hadoop(['version'], return_stdout=True)
        if stdout:
            first_line = stdout.split(b'\n')[0]
            m = _HADOOP_VERSION_RE.match(first_line)
            if m:
                self._hadoop_version = to_string(m.group('version'))
                log.info("Using Hadoop version %s" % self._hadoop_version)
            else:
                raise Exception('Unable to determine Hadoop version.')
    return self._hadoop_version
Developer ID: kodizant, Project: mrjob, Lines of code: 15, Source file: hadoop.py
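The version is pulled from the first line of "hadoop version" output. A hypothetical stand-in for _HADOOP_VERSION_RE (the real pattern lives inside mrjob) makes the byte-level matching concrete:

import re
from mrjob.py2 import to_string

# hypothetical stand-in for mrjob's _HADOOP_VERSION_RE
_HADOOP_VERSION_RE = re.compile(br'^Hadoop (?P<version>\S+)')

m = _HADOOP_VERSION_RE.match(b'Hadoop 2.7.3')
print(to_string(m.group('version')))   # 2.7.3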
Example 19: find_input_uri_for_mapper
def find_input_uri_for_mapper(lines):
    """Scan a log file or other iterable for the path of an input file
    for the first mapper on Hadoop. Just returns the path, or None if
    no match.

    In logs from EMR, we find these lines in ``task-attempts/*/syslog``

    Matching log lines look like::

        2010-07-27 17:54:54,344 INFO org.apache.hadoop.fs.s3native.NativeS3FileSystem (main): Opening 's3://yourbucket/logs/2010/07/23/log2-00077.gz' for reading
    """
    val = None
    for line in lines:
        match = _OPENING_FOR_READING_RE.match(line)
        if match:
            val = to_string(match.group(1))
    return val
Developer ID: Roguelazer, Project: mrjob, Lines of code: 17, Source file: parse.py
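A usage sketch built from the sample log line in the docstring (assuming the function is importable from mrjob.parse in the mrjob version these snippets come from):

from mrjob.parse import find_input_uri_for_mapper

lines = [
    b"2010-07-27 17:54:54,344 INFO org.apache.hadoop.fs.s3native."
    b"NativeS3FileSystem (main): Opening "
    b"'s3://yourbucket/logs/2010/07/23/log2-00077.gz' for reading\n",
]
print(find_input_uri_for_mapper(lines))
# s3://yourbucket/logs/2010/07/23/log2-00077.gz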
Example 20: find_job_log_multiline_error
def find_job_log_multiline_error(lines):
    """Scan a log file for an arbitrary multi-line error. Return it as a list
    of lines, or None if nothing was found.

    Here is an example error::

        MapAttempt TASK_TYPE="MAP" TASKID="task_201106280040_0001_m_000218" TASK_ATTEMPT_ID="attempt_201106280040_0001_m_000218_5" TASK_STATUS="FAILED" FINISH_TIME="1309246900665" HOSTNAME="/default-rack/ip-10-166-239-133.us-west-1.compute.internal" ERROR="Error initializing attempt_201106280040_0001_m_000218_5:
        java.io.IOException: Cannot run program "bash": java.io.IOException: error=12, Cannot allocate memory
                at java.lang.ProcessBuilder.start(ProcessBuilder.java:460)
                at org.apache.hadoop.util.Shell.runCommand(Shell.java:149)
                at org.apache.hadoop.util.Shell.run(Shell.java:134)
                at org.apache.hadoop.fs.DF.getAvailable(DF.java:73)
                at org.apache.hadoop.fs.LocalDirAllocator$AllocatorPerContext.getLocalPathForWrite(LocalDirAllocator.java:296)
                at org.apache.hadoop.fs.LocalDirAllocator.getLocalPathForWrite(LocalDirAllocator.java:124)
                at org.apache.hadoop.mapred.TaskTracker.localizeJob(TaskTracker.java:648)
                at org.apache.hadoop.mapred.TaskTracker.startNewTask(TaskTracker.java:1320)
                at org.apache.hadoop.mapred.TaskTracker.offerService(TaskTracker.java:956)
                at org.apache.hadoop.mapred.TaskTracker.run(TaskTracker.java:1357)
                at org.apache.hadoop.mapred.TaskTracker.main(TaskTracker.java:2361)
        Caused by: java.io.IOException: java.io.IOException: error=12, Cannot allocate memory
                at java.lang.UNIXProcess.<init>(UNIXProcess.java:148)
                at java.lang.ProcessImpl.start(ProcessImpl.java:65)
                at java.lang.ProcessBuilder.start(ProcessBuilder.java:453)
                ... 10 more
        "

    The first line returned will only include the text after ``ERROR="``, and
    we discard the final line, which contains only ``"``.

    These errors are parsed from jobs/\*.jar.
    """
    for line in lines:
        m = _MULTILINE_JOB_LOG_ERROR_RE.match(line)
        if m:
            st_lines = []
            if m.group('first_line'):
                st_lines.append(m.group('first_line'))
            for line in lines:
                st_lines.append(line)
                for line in lines:
                    if line.strip() == b'"':
                        break
                    st_lines.append(line)
                return [to_string(line) for line in st_lines]
    return None
Developer ID: Roguelazer, Project: mrjob, Lines of code: 45, Source file: parse.py
Note: The mrjob.py2.to_string examples on this page were collected by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by various programmers, and copyright remains with the original authors; consult each project's License before distributing or using the code. Do not reproduce this page without permission.