This article compiles typical usage examples of the open function from the Python module pydoop.hdfs. If you have been wondering what pydoop.hdfs.open does, how to call it, or what real-world usage looks like, the hand-picked examples below should help.
The following shows 20 code examples of the open function, sorted by popularity by default.
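Before the project examples, here is a minimal sketch of the basic write-then-read round trip with pydoop.hdfs.open. The path below is a placeholder, and the sketch assumes pydoop can reach a configured default filesystem (HDFS or local):

import pydoop.hdfs as hdfs

path = "/tmp/pydoop_open_demo.txt"  # placeholder path, adjust for your cluster

# write text to the file ("wt" for text mode; use "w"/"wb" for bytes)
with hdfs.open(path, "wt") as f:
    f.write("hello from pydoop\n")

# read it back
with hdfs.open(path, "rt") as f:
    print(f.read())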
Example 1: open
def open(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        with hdfs.open(test_path, "w") as f:
            f.write(self.data)
        f.fs.close()  # release the filesystem handle opened by hdfs.open
        with hdfs.open(test_path) as f:
            self.assertEqual(f.read(), self.data)
        f.fs.close()
Author: ZEMUSHKA, Project: pydoop, Lines: 8, Source: test_hdfs.py
Example 2: dump
def dump(self):
    for test_path in self.hdfs_paths[0], self.local_paths[0]:
        hdfs.dump(self.data, test_path)
        with hdfs.open(test_path) as fi:
            rdata = fi.read()
        fi.fs.close()
        self.assertEqual(rdata, self.data)
Author: ZEMUSHKA, Project: pydoop, Lines: 7, Source: test_hdfs.py
Example 3: xml_from_hdfs
def xml_from_hdfs(url):
    with hdfs.open(url, "r") as f:
        lines = f.read().strip().split('\n')
    docs, doc = [], None
    for line in lines:
        if line.startswith('<doc'):
            doc = line
        elif line.startswith('</doc>'):
            docs.append(doc + line)
        else:
            # line = line.replace('&', '').replace('"', "'")
            doc += line.replace('"', "'")
    for doc in docs:
        dom = bs(doc).find('doc')
        doc = {}
        try:
            doc['id'] = dom.attrs['id']
            doc['url'] = dom.attrs['url']
            doc['title'] = dom.attrs['title']
        except AttributeError:
            continue
        doc['content'] = dom.text
        doc['md5'] = hashlib.md5(str(doc)).hexdigest()
        yield doc
Author: legendlee1314, Project: ooni, Lines: 25, Source: hdfs2mongo_distributed.py
Example 4: map
def map(self, ctx):
    p = BioImgPlane(ctx.value)
    pixels = p.get_xy()
    bn = '%s-z%04d-c%04d-t%04d.npy' % (p.name, p.z, p.c, p.t)
    fn = hdfs.path.join(self.out_dir, p.name, bn)
    with hdfs.open(fn, 'w') as fo:
        np.save(fo, pixels)
    ctx.emit(fn, '%s\t%s' % (p.dimension_order, pixels.shape))
Author: IDR, Project: pydoop-features, Lines: 8, Source: try_input_format.py
Example 5: __init__
def __init__(self, ctx):
    super(AvroReader, self).__init__(ctx)
    isplit = ctx.input_split
    self.region_start = isplit.offset
    self.region_end = isplit.offset + isplit.length
    self.reader = SeekableDataFileReader(hdfs.open(isplit.filename),
                                         DatumReader())
    self.reader.align_after(isplit.offset)
Author: CynthiaYiqingHuang, Project: pydoop, Lines: 8, Source: avrolib.py
Example 6: put
def put(self):
    src = hdfs.path.split(self.local_paths[0])[-1]
    dest = self.hdfs_paths[0]
    with open(src, "w") as f:
        f.write(self.data)
    hdfs.put(src, dest)
    with hdfs.open(dest) as fi:
        rdata = fi.read()
    self.assertEqual(rdata, self.data)
Author: ZEMUSHKA, Project: pydoop, Lines: 9, Source: test_hdfs.py
Example 7: __init__
def __init__(self, context):
    super(Reader, self).__init__()
    self.isplit = pp.InputSplit(context.getInputSplit())
    self.file = hdfs.open(self.isplit.filename)
    self.file.seek(self.isplit.offset)
    self.bytes_read = 0
    if self.isplit.offset > 0:
        discarded = self.file.readline()  # read by the reader of the previous split
        self.bytes_read += len(discarded)
Author: ilveroluca, Project: pydoop, Lines: 9, Source: wordcount-rr.py
Example 8: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    outfn = context.get_default_work_file()
    self.logger.info("writing to %s", outfn)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.sep = jc.get("mapreduce.output.textoutputformat.separator", "\t")
    self.file = hdfs.open(outfn, "wt", user=hdfs_user)
Author: crs4, Project: pydoop, Lines: 9, Source: map_only_python_writer.py
Example 9: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = logging.getLogger("Writer")
    jc = context.getJobConf()
    jc_configure_int(self, jc, "mapred.task.partition", "part")
    jc_configure(self, jc, "mapred.work.output.dir", "outdir")
    jc_configure(self, jc, "mapred.textoutputformat.separator", "sep", "\t")
    jc_configure(self, jc, "pydoop.hdfs.user", "hdfs_user", None)
    self.outfn = "%s/part-%05d" % (self.outdir, self.part)
    self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Author: ilveroluca, Project: pydoop, Lines: 10, Source: wordcount-full.py
Example 10: json_from_hdfs
def json_from_hdfs(url):
    assert hdfs.path.isdir(url)
    file_lists = hdfs.ls(url)
    for fi in file_lists:
        with hdfs.open(fi, "r") as f:
            items = f.read().strip().split('\n')
            for it in items:
                it = loads(it)
                it['md5'] = hashlib.md5(str(it)).hexdigest()
                yield it
Author: legendlee1314, Project: ooni, Lines: 10, Source: hdfs2mongo.py
Example 11: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "w", user=hdfs_user)
    self.sep = jc.get("mapred.textoutputformat.separator", "\t")
Author: CynthiaYiqingHuang, Project: pydoop, Lines: 10, Source: wordcount_full.py
Example 12: _choose_break_points
def _choose_break_points(cls, args):
    n_records, n_breakpoints, path = args
    block_size = n_records * RECORD_LENGTH
    with hdfs.open(path, 'r') as f:
        data = f.read(block_size)
    assert len(data) == block_size
    step = max(n_records // n_breakpoints, 1)
    keys = sorted([data[k:k + KEY_LENGTH]
                   for k in range(0, block_size, RECORD_LENGTH)])
    return [_ for _ in it.islice(keys, step, n_records, step)]
Author: elzaggo, Project: pydoop, Lines: 10, Source: pterasort.py
Example 13: processLine
def processLine(myfile, topic):
    with hdfs.open(myfile["name"]) as handle:
        for i, line in enumerate(handle):
            # strip line
            line = line.strip()
            # Submit data (my function)
            submitLine(topic, line, trials=3)
            if i % 20000 == 0 and i != 0:
                logger.info("%s lines submitted for %s" % (i, myfile["name"]))
Author: bunop, Project: ccc-capstone, Lines: 11, Source: kafka-producer.py
Example 14: __init__
def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.logger.debug('created hdfs file %s', outfn)
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')
Author: wtj, Project: pydoop, Lines: 11, Source: avrolib.py
Example 15: __init__
def __init__(self, context):
    super(Writer, self).__init__(context)
    self.logger = LOGGER.getChild("Writer")
    jc = context.job_conf
    part = jc.get_int("mapred.task.partition")
    out_dir = jc["mapred.work.output.dir"]
    self.logger.debug("part: %d", part)
    self.logger.debug("outdir: %s", out_dir)
    outfn = "%s/part-%05d" % (out_dir, part)
    hdfs_user = jc.get("pydoop.hdfs.user", None)
    self.file = hdfs.open(outfn, "wb", user=hdfs_user)
Author: elzaggo, Project: pydoop, Lines: 11, Source: ioformats.py
Example 16: mapper
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
Author: manics, Project: pydoop-features, Lines: 13, Source: features.py
Example 17: collect_output
def collect_output(mr_out_dir, out_file=None):
    """
    Return all mapreduce output in ``mr_out_dir``.

    Append the output to ``out_file`` if provided. Otherwise, return
    the result as a single string (it is the caller's responsibility to
    ensure that the amount of data retrieved fits into memory).
    """
    if out_file is None:
        output = []
        for fn in iter_mr_out_files(mr_out_dir):
            with hdfs.open(fn, "rt") as f:
                output.append(f.read())
        return "".join(output)
    else:
        block_size = 16777216
        with open(out_file, 'a') as o:
            for fn in iter_mr_out_files(mr_out_dir):
                with hdfs.open(fn) as f:
                    data = f.read(block_size)
                    while len(data) > 0:
                        o.write(data)
                        data = f.read(block_size)
Author: crs4, Project: pydoop, Lines: 23, Source: hadut.py
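Since the source file above is pydoop's hadut.py, the function is reachable as pydoop.hadut.collect_output; a minimal, hedged usage sketch (the job output directory below is a placeholder) could look like this:

import pydoop.hadut as hadut

mr_out_dir = "/user/me/job_output"  # placeholder MapReduce output directory

# pull the whole job output into memory as a single string
text = hadut.collect_output(mr_out_dir)

# or stream it into a local file instead of holding it in memory
hadut.collect_output(mr_out_dir, out_file="job_output.txt")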
Example 18: read
def read(readFlag):
    print(readFlag)
    if readFlag == True:
        targetFile = config.targetFile.strip()
        targetDirectory = config.targetDirectory.strip()
        targetPath = config.targetPath
        print(targetPath)
        # instantiate hadoop
        hdfs.hdfs()
        # read from hadoop
        fileToRead = hdfs.open(targetPath)
        print(fileToRead.read())
Author: davedwards, Project: beautiful-data, Lines: 15, Source: hadoopReader.py
Example 19: main
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
Author: elzaggo, Project: pydoop, Lines: 16, Source: checkrecords.py
Example 20: check_transpose
def check_transpose(mr_out_dir):
    output = []
    for fn in hadut.iter_mr_out_files(mr_out_dir):
        with hdfs.open(fn, "rt") as f:
            for line in f:
                row = line.rstrip().split("\t")
                index = int(row.pop(0))
                output.append((index, row))
    output = [_[1] for _ in sorted(output)]
    exp_output = []
    in_fn = os.path.join(THIS_DIR, "data", "transpose_input", "matrix.txt")
    with open(in_fn) as f:
        for line in f:
            for i, item in enumerate(line.split()):
                try:
                    exp_output[i].append(item)
                except IndexError:
                    exp_output.append([item])
    return output == exp_output
Author: crs4, Project: pydoop, Lines: 19, Source: check.py
Note: The pydoop.hdfs.open examples in this article were collected by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and any further use should follow the corresponding project's license. Please do not reproduce without permission.