本文整理汇总了Python中mrjob.util.read_file函数的典型用法代码示例。如果您正苦于以下问题:Python read_file函数的具体用法?Python read_file怎么用?Python read_file使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了read_file函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_cat_compressed_stream
def test_cat_compressed_stream(self):
    """read_file() should decompress .gz and .bz2 data passed as a fileobj.

    Fix: compressed data must be written as bytes and read back through a
    binary-mode file object — writing str to GzipFile and opening the
    compressed file in text mode fails on Python 3 (cf. the bytes-based
    sibling tests in this suite).
    """
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\nbar\n')
    input_gz.close()

    # restrict a file object to only the read() method
    class OnlyReadWrapper(object):
        def __init__(self, fp):
            self.fp = fp

        def read(self, *args, **kwargs):
            return self.fp.read(*args, **kwargs)

    output = []
    # binary mode: gzip data must not be run through text decoding
    with open(input_gz_path, 'rb') as f:
        for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
            output.append(line)
    self.assertEqual(output, [b'foo\n', b'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
    input_bz2.write(b'bar\nbar\nfoo\n')
    input_bz2.close()

    output = []
    for line in read_file(input_bz2_path, fileobj=open(input_bz2_path, 'rb')):
        output.append(line)
    self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
开发者ID:ENuge,项目名称:mrjob,代码行数:32,代码来源:test_util.py
示例2: _cat_file
def _cat_file(self, filename):
    """Return an iterator over the lines of *filename*.

    URIs are streamed from HDFS via ``hadoop fs -cat``; local paths are
    delegated to the superclass implementation.
    """
    if is_uri(filename):
        # stream from HDFS
        cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            # yield stdout lazily; error handling below only runs once
            # the consumer has exhausted stdout
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            # wait() only after both pipes are drained, to avoid deadlock
            returncode = cat_proc.wait()

            if returncode != 0:
                raise CalledProcessError(returncode, cat_args)

        return read_file(filename, stream())
    else:
        # read from local filesystem
        return super(HadoopJobRunner, self)._cat_file(filename)
开发者ID:BrandonHaynes,项目名称:mrjob,代码行数:25,代码来源:hadoop.py
示例3: test_read_file_uncompressed_stream
def test_read_file_uncompressed_stream(self):
    """Reading a plain file through an explicit fileobj yields its lines."""
    path = os.path.join(self.tmp_dir, 'input')
    with open(path, 'w') as writer:
        writer.write('bar\nfoo\n')

    lines = [line for line in read_file(path, fileobj=open(path))]
    self.assertEqual(lines, ['bar\n', 'foo\n'])
开发者ID:dataartisan,项目名称:mrjob,代码行数:10,代码来源:test_util.py
示例4: test_read_uncompressed_file
def test_read_uncompressed_file(self):
    """read_file() on a plain file yields one bytestring per line."""
    path = os.path.join(self.tmp_dir, 'input')
    with open(path, 'wb') as writer:
        writer.write(b'bar\nfoo\n')

    self.assertEqual(list(read_file(path)), [b'bar\n', b'foo\n'])
开发者ID:anirudhreddy92,项目名称:mrjob,代码行数:10,代码来源:test_util.py
示例5: test_read_gz_file
def test_read_gz_file(self):
    """read_file() transparently gunzips .gz files."""
    gz_path = os.path.join(self.tmp_dir, "input.gz")
    with gzip.GzipFile(gz_path, "wb") as gz_file:
        gz_file.write(b"foo\nbar\n")

    self.assertEqual(list(read_file(gz_path)), [b"foo\n", b"bar\n"])
开发者ID:tempcyc,项目名称:mrjob,代码行数:11,代码来源:test_util.py
示例6: test_read_bz2_file
def test_read_bz2_file(self):
    """read_file() transparently decompresses .bz2 files."""
    bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    with bz2.BZ2File(bz2_path, 'wb') as bz2_file:
        bz2_file.write(b'bar\nbar\nfoo\n')

    self.assertEqual(
        list(read_file(bz2_path)), [b'bar\n', b'bar\n', b'foo\n'])
开发者ID:anirudhreddy92,项目名称:mrjob,代码行数:11,代码来源:test_util.py
示例7: _cat_file
def _cat_file(self, filename):
    """Return a line iterator over the S3 key backing *filename*."""
    # stream lines from the s3 key
    key = self.get_s3_key(filename)

    # pull the key's contents into an in-memory file object, then rewind
    buf = StringIO()
    key.get_file(buf)
    buf.seek(0)

    chunks = read_file(s3_key_to_uri(key), fileobj=buf)
    return buffer_iterator_to_line_iterator(chunks)
开发者ID:duedil-ltd,项目名称:mrjob,代码行数:11,代码来源:s3.py
示例8: _cat_file
def _cat_file(self, gcs_uri):
    """Yield lines from a GCS object, buffered through a local temp file."""
    tmp_fd, tmp_path = tempfile.mkstemp()
    with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
        self._download_io(gcs_uri, tmp_fileobj)

        # rewind so read_file() sees the whole download
        tmp_fileobj.seek(0)

        for chunk in read_file(
                gcs_uri, fileobj=tmp_fileobj, yields_lines=False):
            yield chunk
开发者ID:Jeremyfanfan,项目名称:mrjob,代码行数:12,代码来源:gcs.py
示例9: test_read_gz_file_from_fileobj
def test_read_gz_file_from_fileobj(self):
    """read_file() should gunzip a .gz file supplied via a read()-only fileobj.

    Fix: write bytes and open the compressed file in binary mode — writing
    str to GzipFile and text-mode reads fail on Python 3 (matches the
    bytes-based bz2 sibling test in this suite).
    """
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\nbar\n')
    input_gz.close()

    output = []
    # binary mode: gzip data must not be run through text decoding
    with open(input_gz_path, 'rb') as f:
        for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
            output.append(line)
    self.assertEqual(output, [b'foo\n', b'bar\n'])
开发者ID:Asana,项目名称:mrjob,代码行数:12,代码来源:test_util.py
示例10: test_read_bz2_file_from_fileobj
def test_read_bz2_file_from_fileobj(self):
    """read_file() should decompress a .bz2 file supplied via fileobj."""
    bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    with bz2.BZ2File(bz2_path, 'wb') as bz2_file:
        bz2_file.write(b'bar\nbar\nfoo\n')

    with open(bz2_path, 'rb') as fp:
        lines = list(read_file(bz2_path, fileobj=OnlyReadWrapper(fp)))

    self.assertEqual(lines, [b'bar\n', b'bar\n', b'foo\n'])
开发者ID:anirudhreddy92,项目名称:mrjob,代码行数:12,代码来源:test_util.py
示例11: test_cat_compressed_stream
def test_cat_compressed_stream(self):
    """Compressed files passed as plain fileobjs are still decompressed."""
    gz_path = os.path.join(self.tmp_dir, "input.gz")
    gz_file = gzip.GzipFile(gz_path, "w")
    gz_file.write("foo\nbar\n")
    gz_file.close()

    self.assertEqual(
        list(read_file(gz_path, fileobj=open(gz_path))),
        ["foo\n", "bar\n"])

    bz2_path = os.path.join(self.tmp_dir, "input.bz2")
    bz2_file = bz2.BZ2File(bz2_path, "w")
    bz2_file.write("bar\nbar\nfoo\n")
    bz2_file.close()

    self.assertEqual(
        list(read_file(bz2_path, fileobj=open(bz2_path))),
        ["bar\n", "bar\n", "foo\n"])
开发者ID:bchess,项目名称:mrjob,代码行数:22,代码来源:test_util.py
示例12: _cat_file
def _cat_file(self, filename):
    """Cat a remote file over SSH and return an iterator of its lines."""
    match = SSH_URI_RE.match(filename)
    host = match.group('hostname') or self._address_of_master()

    # '!' marks a hop through an intermediate host, which needs a named key
    if '!' in host and self.ssh_key_name is None:
        raise ValueError('ssh_key_name must not be None')

    output = ssh_cat(
        self._ssh_bin,
        host,
        self._ec2_key_pair_file,
        match.group('filesystem_path'),
        self.ssh_key_name,
    )
    return read_file(filename, fileobj=StringIO(output))
开发者ID:Anihc,项目名称:mrjob,代码行数:13,代码来源:ssh.py
示例13: _cat_file
def _cat_file(self, filename):
    """Cat *filename* over SSH and return an iterator over its lines."""
    match = _SSH_URI_RE.match(filename)
    host = match.group('hostname') or self._address_of_master()
    keyfile = self._key_filename_for(host)

    output = _ssh_cat(
        self._ssh_bin,
        host,
        self._ec2_key_pair_file,
        match.group('filesystem_path'),
        keyfile,
    )
    return read_file(filename, fileobj=BytesIO(output))
开发者ID:Dean838,项目名称:mrjob,代码行数:14,代码来源:ssh.py
示例14: _cat_file
def _cat_file(self, filename):
    """Return a line iterator over *filename*, streamed from HDFS.

    Runs ``hadoop fs -cat``; *cleanup* runs once stdout is exhausted,
    draining stderr and checking the exit code.
    """
    # stream from HDFS
    cat_args = self._hadoop_bin + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    def cleanup():
        # there shouldn't be any stderr
        for line in cat_proc.stderr:
            log.error('STDERR: ' + line)

        # only wait() after pipes are drained, to avoid deadlock
        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)

    return read_file(filename, cat_proc.stdout, cleanup=cleanup)
开发者ID:Python-Z,项目名称:mrjob,代码行数:18,代码来源:hadoop.py
示例15: test_dont_split_gz
def test_dont_split_gz(self):
    """A .gz input must land in a single split covering the whole file."""
    contents_gz = ['bar\n', 'qux\n', 'foo\n', 'bar\n', 'qux\n', 'foo\n']
    contents_normal = ['foo\n', 'bar\n', 'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write(''.join(contents_gz))
    input_gz.close()

    input_path2 = os.path.join(self.tmp_dir, 'input2')
    with open(input_path2, 'w') as input_file:
        input_file.write(''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        lines = list(read_file(file_name))

        # make sure the input_gz split got its entire contents
        if file_name == input_gz_path:
            self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content),
                     all_contents_sorted)
开发者ID:DrMavenRebe,项目名称:mrjob,代码行数:43,代码来源:test_local.py
示例16: test_read_large_bz2_file
def test_read_large_bz2_file(self):
    """Round-trip 100k random lines through bz2 to exercise chunked reads."""
    # catch incorrect use of bz2 library (Issue #814)
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')

    # can't just repeat same value, because we need the file to be
    # compressed! 50000 lines is too few to catch the bug.
    random.seed(0)
    for _ in xrange(100000):
        input_bz2.write('%016x\n' % random.randint(0, 2 ** 64 - 1))
    input_bz2.close()

    # re-seed so the reader regenerates the exact same sequence of values
    random.seed(0)
    num_lines = 0
    for line in read_file(input_bz2_path):
        self.assertEqual(line, '%016x\n' % random.randint(0, 2 ** 64 - 1))
        num_lines += 1
    self.assertEqual(num_lines, 100000)
开发者ID:Asana,项目名称:mrjob,代码行数:20,代码来源:test_util.py
示例17: test_read_large_bz2_file
def test_read_large_bz2_file(self):
    """Round-trip 100k random lines through bz2 (Issue #814 regression)."""
    bz2_path = os.path.join(self.tmp_dir, "input.bz2")
    bz2_file = bz2.BZ2File(bz2_path, "wb")

    # can't just repeat same value, because we need the file to be
    # compressed! 50000 lines is too few to catch the bug.
    with random_seed(0):
        for _ in range(100000):
            bz2_file.write((random_identifier() + "\n").encode("ascii"))
    bz2_file.close()

    # now expect to read back the same bytes
    with random_seed(0):
        line_count = 0
        for line in read_file(bz2_path):
            self.assertEqual(
                line, (random_identifier() + "\n").encode("ascii"))
            line_count += 1

        self.assertEqual(line_count, 100000)
开发者ID:tempcyc,项目名称:mrjob,代码行数:21,代码来源:test_util.py
示例18: _cat_file
def _cat_file(self, filename):
    """Return a line iterator over *filename*, streamed from HDFS.

    Runs ``hadoop fs -cat``; *cleanup* drains stderr, closes both pipes,
    and checks the exit code once stdout has been consumed.
    """
    # stream from HDFS
    cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    def cleanup():
        # stderr output does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        # only wait() after pipes are closed/drained
        returncode = cat_proc.wait()

        if returncode != 0:
            raise IOError("Could not stream %s" % filename)

    return read_file(filename, cat_proc.stdout, cleanup=cleanup)
开发者ID:davidmarin,项目名称:mrjob,代码行数:21,代码来源:hadoop.py
示例19: gz_test
def gz_test(self, dir_path_name):
    """Verify a .gz input ends up in exactly one whole-file split.

    *dir_path_name* is the directory in which the input fixtures are
    created.
    """
    contents_gz = [b"bar\n", b"qux\n", b"foo\n", b"bar\n", b"qux\n", b"foo\n"]
    contents_normal = [b"foo\n", b"bar\n", b"bar\n"]
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = os.path.join(dir_path_name, "input.gz")
    input_gz = gzip.GzipFile(input_gz_path, "wb")
    input_gz.write(b"".join(contents_gz))
    input_gz.close()

    input_path2 = os.path.join(dir_path_name, "input2")
    with open(input_path2, "wb") as input_file:
        input_file.write(b"".join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info["orig_name"] == input_gz_path:
            self.assertEqual(split_info["start"], 0)
            self.assertEqual(split_info["length"], os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        lines = list(read_file(file_name))

        # make sure the input_gz split got its entire contents
        if file_name == input_gz_path:
            self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)
开发者ID:alanhdu,项目名称:mrjob,代码行数:40,代码来源:test_local.py
示例20: _cat_file
def _cat_file(self, filename):
    """Stream lines directly from the S3 key behind *filename*."""
    # stream lines from the s3 key
    key = self.get_s3_key(filename)
    chunks = read_file(s3_key_to_uri(key), fileobj=key)
    return buffer_iterator_to_line_iterator(chunks)
开发者ID:inncapsule,项目名称:mrjob,代码行数:5,代码来源:s3.py
注:本文中的mrjob.util.read_file函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论