This article collects typical usage examples of the pyarrow.parquet.write_table function in Python. If you have been wondering what exactly write_table does, how it is called, or what real-world write_table code looks like, the hand-picked examples below should help.
The following 20 write_table code examples are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
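Before the collected examples, here is a minimal, self-contained sketch of the pattern they all share: build a pyarrow.Table (here from a pandas DataFrame), hand it to pq.write_table, then read it back with pq.read_table. The file name example_roundtrip.parquet and the sample columns are illustrative choices for this sketch, not taken from any of the projects below.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build an Arrow table from a small pandas DataFrame.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
table = pa.Table.from_pandas(df)

# Write it out; compression is optional ('snappy' is a common choice).
pq.write_table(table, 'example_roundtrip.parquet', compression='snappy')

# Read it back and convert to pandas to verify the roundtrip.
df_read = pq.read_table('example_roundtrip.parquet').to_pandas()
assert df_read.equals(df)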
Example 1: test_pandas_parquet_configuration_options
def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Author: kiril-me | Project: arrow | Lines: 34 | Source: test_parquet.py
Example 2: test_read_multiple_parquet_files
def test_read_multiple_parquet_files(self):
    import pyarrow.parquet as pq

    nfiles = 10
    size = 5

    tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())
    self.hdfs.mkdir(tmpdir)

    test_data = []
    paths = []
    for i in range(nfiles):
        df = test_parquet._test_dataframe(size, seed=i)
        df['index'] = np.arange(i * size, (i + 1) * size)

        # Hack so that we don't have a dtype cast in v1 files
        df['uint32'] = df['uint32'].astype(np.int64)

        path = pjoin(tmpdir, '{0}.parquet'.format(i))

        table = pa.Table.from_pandas(df, preserve_index=False)
        with self.hdfs.open(path, 'wb') as f:
            pq.write_table(table, f)

        test_data.append(table)
        paths.append(path)

    result = self.hdfs.read_parquet(tmpdir)
    expected = pa.concat_tables(test_data)

    pdt.assert_frame_equal(result.to_pandas()
                           .sort_values(by='index').reset_index(drop=True),
                           expected.to_pandas())
Author: NonVolatileComputing | Project: arrow | Lines: 35 | Source: test_hdfs.py
Example 3: test_read_single_row_group
def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, row_group_size=N // K,
                   compression='snappy', version='2.0')
    buf.seek(0)

    pf = pq.ParquetFile(buf)
    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols)
                  for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
Author: StevenMPhillips | Project: arrow | Lines: 26 | Source: test_parquet.py
Example 4: test_pandas_parquet_1_0_rountrip
def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Author: kiril-me | Project: arrow | Lines: 29 | Source: test_parquet.py
Example 5: _write_table
def _write_table(table, path, **kwargs):
    import pyarrow.parquet as pq

    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)

    pq.write_table(table, path, **kwargs)
    return table
Author: NonVolatileComputing | Project: arrow | Lines: 8 | Source: test_parquet.py
Example 6: make_sample_file
def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')
    buf.seek(0)
    return pq.ParquetFile(buf)
Author: kiril-me | Project: arrow | Lines: 8 | Source: test_parquet.py
Example 7: test_pandas_parquet_2_0_rountrip
def test_pandas_parquet_2_0_rountrip(tmpdir):
    df = alltypes_sample(size=10000)
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Author: kiril-me | Project: arrow | Lines: 9 | Source: test_parquet.py
Example 8: test_pandas_parquet_native_file_roundtrip
def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Author: apache | Project: arrow | Lines: 9 | Source: test_parquet.py
Example 9: test_column_of_lists
def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
                                       schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Author: kiril-me | Project: arrow | Lines: 9 | Source: test_parquet.py
Example 10: _write_partition_pyarrow
def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)
    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)
    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            # write_metadata does not accept a compression kwarg,
            # so drop it before forwarding the remaining options
            kwargs.pop('compression', None)
            parquet.write_metadata(t.schema, fil, **kwargs)
Author: postelrich | Project: dask | Lines: 13 | Source: parquet.py
Example 11: read_parquet
def read_parquet(fn):
    """Read a parquet file, keep its first three columns, and write them back out."""
    print("Loading parquet file: %s..." % fn)
    tbl = pq.read_table(fn)
    df = tbl.to_pandas()
    d = df.iloc[:, 0:3]
    table = pa.Table.from_pandas(d)
    pq.write_table(table, 'example.parquet')
Author: teckoo | Project: HTMLCSS_Javascript_cookbook | Lines: 14 | Source: file_io.py
Example 12: test_min_chunksize
def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
Author: StevenMPhillips | Project: arrow | Lines: 14 | Source: test_parquet.py
Example 13: test_client
def test_client(tmpdir, data):
    # construct with a path to a file
    d = tmpdir / 'pq'
    d.mkdir()

    for k, v in data.items():
        f = d / "{}.parquet".format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    c = ParquetClient(tmpdir)
    assert c.list_databases() == ['pq']
    assert c.database().pq.list_tables() == ['close', 'open']
Author: cloudera | Project: ibis | Lines: 14 | Source: test_parquet.py
Example 14: test_pandas_column_selection
def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Author: kiril-me | Project: arrow | Lines: 14 | Source: test_parquet.py
Example 15: test_fastparquet_read_with_hdfs
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()

    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
Author: dremio | Project: arrow | Lines: 14 | Source: parquet_interop.py
Example 16: parquet
def parquet(tmpdir, data):
    pa = pytest.importorskip('pyarrow')
    import pyarrow.parquet as pq  # noqa: E402
    from ibis.file.parquet import ParquetClient

    # create single files
    d = tmpdir.mkdir('pq')

    for k, v in data.items():
        f = d / '{}.parquet'.format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    return ParquetClient(tmpdir).database()
Author: cloudera | Project: ibis | Lines: 14 | Source: conftest.py
Example 17: test_read_no_metadata
def test_read_no_metadata(tmpdir, engine):
    # use pyarrow.parquet to create a parquet file without
    # pandas metadata
    pa = pytest.importorskip("pyarrow")
    import pyarrow.parquet as pq

    tmp = str(tmpdir) + "table.parq"
    table = pa.Table.from_arrays([pa.array([1, 2, 3]),
                                  pa.array([3, 4, 5])],
                                 names=['A', 'B'])
    pq.write_table(table, tmp)

    result = dd.read_parquet(tmp, engine=engine)
    expected = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    assert_eq(result, expected)
Author: caseyclements | Project: dask | Lines: 14 | Source: test_parquet.py
Example 18: test_single_pylist_column_roundtrip
def test_single_pylist_column_roundtrip(tmpdir):
    for dtype in [int, float]:
        filename = tmpdir.join('single_{}_column.parquet'
                               .format(dtype.__name__))
        # A single column, so a single name is supplied
        data = [pa.from_pylist(list(map(dtype, range(5))))]
        table = pa.Table.from_arrays(data, names=['a'], name='table_name')
        pq.write_table(table, filename.strpath)
        table_read = pq.read_table(filename.strpath)

        for col_written, col_read in zip(table.itercolumns(),
                                         table_read.itercolumns()):
            assert col_written.name == col_read.name
            assert col_read.data.num_chunks == 1

            data_written = col_written.data.chunk(0)
            data_read = col_read.data.chunk(0)
            assert data_written.equals(data_read)
Author: kiril-me | Project: arrow | Lines: 15 | Source: test_parquet.py
Example 19: test_multithreaded_read
def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    table1 = pq.read_table(buf, nthreads=4)

    buf.seek(0)
    table2 = pq.read_table(buf, nthreads=1)

    assert table1.equals(table2)
Author: kiril-me | Project: arrow | Lines: 15 | Source: test_parquet.py
Example 20: parquet
def parquet(tables, data_directory, ignore_missing_dependency, **params):
    try:
        import pyarrow as pa  # noqa: F401
        import pyarrow.parquet as pq  # noqa: F401
    except ImportError:
        msg = 'PyArrow dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    data_directory = Path(data_directory)
    for table, df in read_tables(tables, data_directory):
        arrow_table = pa.Table.from_pandas(df)
        target_path = data_directory / '{}.parquet'.format(table)
        pq.write_table(arrow_table, str(target_path))
Author: cloudera | Project: ibis | Lines: 17 | Source: datamgr.py
Note: the pyarrow.parquet.write_table examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Please do not reproduce without permission.