
Python parquet.write_table Function Code Examples


This article collects typical usage examples of the write_table function from the Python module pyarrow.parquet. If you have been wondering what write_table does, how to call it, or what it looks like in real code, the curated examples below should help.



Twenty write_table code examples are shown below, ordered by popularity by default.
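Before turning to the collected examples, here is a minimal, self-contained sketch of the workflow they all share: convert a pandas DataFrame to an Arrow table with pa.Table.from_pandas, persist it with pq.write_table, and read it back with pq.read_table. The file name roundtrip_demo.parquet and the toy DataFrame are arbitrary choices for illustration.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build a small DataFrame and convert it to an Arrow table.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
table = pa.Table.from_pandas(df)

# Write the table to a Parquet file; compression is optional.
pq.write_table(table, 'roundtrip_demo.parquet', compression='snappy')

# Read the file back and convert to pandas to check the round trip.
df_read = pq.read_table('roundtrip_demo.parquet').to_pandas()
assert df_read.equals(df)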

Example 1: test_pandas_parquet_configuration_options

def test_pandas_parquet_configuration_options(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)

    for use_dictionary in [True, False]:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       use_dictionary=use_dictionary)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)

    for compression in ['NONE', 'SNAPPY', 'GZIP']:
        pq.write_table(arrow_table, filename.strpath,
                       version="2.0",
                       compression=compression)
        table_read = pq.read_table(filename.strpath)
        df_read = table_read.to_pandas()
        pdt.assert_frame_equal(df, df_read)
Author: kiril-me, Project: arrow, Lines: 34, Source: test_parquet.py


Example 2: test_read_multiple_parquet_files

    def test_read_multiple_parquet_files(self):
        import pyarrow.parquet as pq

        nfiles = 10
        size = 5

        tmpdir = pjoin(self.tmp_path, 'multi-parquet-' + guid())

        self.hdfs.mkdir(tmpdir)

        test_data = []
        paths = []
        for i in range(nfiles):
            df = test_parquet._test_dataframe(size, seed=i)

            df['index'] = np.arange(i * size, (i + 1) * size)

            # Hack so that we don't have a dtype cast in v1 files
            df['uint32'] = df['uint32'].astype(np.int64)

            path = pjoin(tmpdir, '{0}.parquet'.format(i))

            table = pa.Table.from_pandas(df, preserve_index=False)
            with self.hdfs.open(path, 'wb') as f:
                pq.write_table(table, f)

            test_data.append(table)
            paths.append(path)

        result = self.hdfs.read_parquet(tmpdir)
        expected = pa.concat_tables(test_data)

        pdt.assert_frame_equal(result.to_pandas()
                               .sort_values(by='index').reset_index(drop=True),
                               expected.to_pandas())
Author: NonVolatileComputing, Project: arrow, Lines: 35, Source: test_hdfs.py


Example 3: test_read_single_row_group

def test_read_single_row_group():
    # ARROW-471
    N, K = 10000, 4
    df = alltypes_sample(size=N)

    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, row_group_size=N // K,
                   compression='snappy', version='2.0')

    buf.seek(0)

    pf = pq.ParquetFile(buf)

    assert pf.num_row_groups == K

    row_groups = [pf.read_row_group(i) for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df, result.to_pandas())

    cols = df.columns[:2]
    row_groups = [pf.read_row_group(i, columns=cols)
                  for i in range(K)]
    result = pa.concat_tables(row_groups)
    pdt.assert_frame_equal(df[cols], result.to_pandas())
Author: StevenMPhillips, Project: arrow, Lines: 26, Source: test_parquet.py


Example 4: test_pandas_parquet_1_0_rountrip

def test_pandas_parquet_1_0_rountrip(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16),
        'uint32': np.arange(size, dtype=np.uint32),
        'uint64': np.arange(size, dtype=np.uint64),
        'int8': np.arange(size, dtype=np.int8),
        'int16': np.arange(size, dtype=np.int16),
        'int32': np.arange(size, dtype=np.int32),
        'int64': np.arange(size, dtype=np.int64),
        'float32': np.arange(size, dtype=np.float32),
        'float64': np.arange(size, dtype=np.float64),
        'bool': np.random.randn(size) > 0,
        'str': [str(x) for x in range(size)],
        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
        'empty_str': [''] * size
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath, version="1.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()

    # We pass uint32_t as int64_t if we write Parquet version 1.0
    df['uint32'] = df['uint32'].values.astype(np.int64)

    pdt.assert_frame_equal(df, df_read)
Author: kiril-me, Project: arrow, Lines: 29, Source: test_parquet.py


Example 5: _write_table

def _write_table(table, path, **kwargs):
    import pyarrow.parquet as pq

    if isinstance(table, pd.DataFrame):
        table = pa.Table.from_pandas(table)

    pq.write_table(table, path, **kwargs)
    return table
Author: NonVolatileComputing, Project: arrow, Lines: 8, Source: test_parquet.py


Example 6: make_sample_file

def make_sample_file(df):
    a_table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    return pq.ParquetFile(buf)
Author: kiril-me, Project: arrow, Lines: 8, Source: test_parquet.py


Example 7: test_pandas_parquet_2_0_rountrip

def test_pandas_parquet_2_0_rountrip(tmpdir):
    df = alltypes_sample(size=10000)

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Author: kiril-me, Project: arrow, Lines: 9, Source: test_parquet.py


Example 8: test_pandas_parquet_native_file_roundtrip

def test_pandas_parquet_native_file_roundtrip(tmpdir):
    df = _test_dataframe(10000)
    arrow_table = A.from_pandas_dataframe(df)
    imos = paio.InMemoryOutputStream()
    pq.write_table(arrow_table, imos, version="2.0")
    buf = imos.get_result()
    reader = paio.BufferReader(buf)
    df_read = pq.read_table(reader).to_pandas()
    pdt.assert_frame_equal(df, df_read)
Author: apache, Project: arrow, Lines: 9, Source: test_parquet.py


Example 9: test_column_of_lists

def test_column_of_lists(tmpdir):
    df, schema = dataframe_with_arrays()

    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema)
    pq.write_table(arrow_table, filename.strpath, version="2.0")
    table_read = pq.read_table(filename.strpath)
    df_read = table_read.to_pandas()
    pdt.assert_frame_equal(df, df_read)
Author: kiril-me, Project: arrow, Lines: 9, Source: test_parquet.py


Example 10: _write_partition_pyarrow

def _write_partition_pyarrow(df, open_with, filename, write_index,
                             metadata_path=None, **kwargs):
    import pyarrow as pa
    from pyarrow import parquet
    t = pa.Table.from_pandas(df, preserve_index=write_index)

    with open_with(filename, 'wb') as fil:
        parquet.write_table(t, fil, **kwargs)

    if metadata_path is not None:
        with open_with(metadata_path, 'wb') as fil:
            kwargs.pop('compression', None)
            parquet.write_metadata(t.schema, fil, **kwargs)
Author: postelrich, Project: dask, Lines: 13, Source: parquet.py


Example 11: read_parquet

def read_parquet(fn):
  """ read a parquet file with pyarrow and write its first three columns back out """
  print("Loading parquet file: %s..." % fn)
  tbl = pq.read_table(fn)
  df = tbl.to_pandas()
  d = df.iloc[:, 0:3]

  table = pa.Table.from_pandas(d)
  pq.write_table(table, 'example.parquet')
Author: teckoo, Project: HTMLCSS_Javascript_cookbook, Lines: 14, Source: file_io.py


Example 12: test_min_chunksize

def test_min_chunksize():
    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
    table = pa.Table.from_pandas(data.reset_index())

    buf = io.BytesIO()
    pq.write_table(table, buf, chunk_size=-1)

    buf.seek(0)
    result = pq.read_table(buf)

    assert result.equals(table)

    with pytest.raises(ValueError):
        pq.write_table(table, buf, chunk_size=0)
Author: StevenMPhillips, Project: arrow, Lines: 14, Source: test_parquet.py


Example 13: test_client

def test_client(tmpdir, data):

    # construct with a path to a file
    d = tmpdir / 'pq'
    d.mkdir()

    for k, v in data.items():
        f = d / "{}.parquet".format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    c = ParquetClient(tmpdir)
    assert c.list_databases() == ['pq']
    assert c.database().pq.list_tables() == ['close', 'open']
Author: cloudera, Project: ibis, Lines: 14, Source: test_parquet.py


Example 14: test_pandas_column_selection

def test_pandas_column_selection(tmpdir):
    size = 10000
    np.random.seed(0)
    df = pd.DataFrame({
        'uint8': np.arange(size, dtype=np.uint8),
        'uint16': np.arange(size, dtype=np.uint16)
    })
    filename = tmpdir.join('pandas_rountrip.parquet')
    arrow_table = pa.Table.from_pandas(df)
    pq.write_table(arrow_table, filename.strpath)
    table_read = pq.read_table(filename.strpath, columns=['uint8'])
    df_read = table_read.to_pandas()

    pdt.assert_frame_equal(df[['uint8']], df_read)
Author: kiril-me, Project: arrow, Lines: 14, Source: test_parquet.py


Example 15: test_fastparquet_read_with_hdfs

def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()

    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)

    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)

    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
Author: dremio, Project: arrow, Lines: 14, Source: parquet_interop.py


Example 16: parquet

def parquet(tmpdir, data):
    pa = pytest.importorskip('pyarrow')
    import pyarrow.parquet as pq  # noqa: E402
    from ibis.file.parquet import ParquetClient

    # create single files
    d = tmpdir.mkdir('pq')

    for k, v in data.items():
        f = d / '{}.parquet'.format(k)
        table = pa.Table.from_pandas(v)
        pq.write_table(table, str(f))

    return ParquetClient(tmpdir).database()
Author: cloudera, Project: ibis, Lines: 14, Source: conftest.py


Example 17: test_read_no_metadata

def test_read_no_metadata(tmpdir, engine):
    # use pyarrow.parquet to create a parquet file without
    # pandas metadata
    pa = pytest.importorskip("pyarrow")
    import pyarrow.parquet as pq
    tmp = str(tmpdir) + "table.parq"

    table = pa.Table.from_arrays([pa.array([1, 2, 3]),
                                  pa.array([3, 4, 5])],
                                 names=['A', 'B'])
    pq.write_table(table, tmp)
    result = dd.read_parquet(tmp, engine=engine)
    expected = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    assert_eq(result, expected)
Author: caseyclements, Project: dask, Lines: 14, Source: test_parquet.py


Example 18: test_single_pylist_column_roundtrip

def test_single_pylist_column_roundtrip(tmpdir):
    for dtype in [int, float]:
        filename = tmpdir.join('single_{}_column.parquet'
                               .format(dtype.__name__))
        data = [pa.from_pylist(list(map(dtype, range(5))))]
        table = pa.Table.from_arrays(data, names=('a',), name='table_name')
        pq.write_table(table, filename.strpath)
        table_read = pq.read_table(filename.strpath)
        for col_written, col_read in zip(table.itercolumns(),
                                         table_read.itercolumns()):
            assert col_written.name == col_read.name
            assert col_read.data.num_chunks == 1
            data_written = col_written.data.chunk(0)
            data_read = col_read.data.chunk(0)
            assert data_written.equals(data_read)
Author: kiril-me, Project: arrow, Lines: 15, Source: test_parquet.py


Example 19: test_multithreaded_read

def test_multithreaded_read():
    df = alltypes_sample(size=10000)

    table = pa.Table.from_pandas(df, timestamps_to_ms=True)

    buf = io.BytesIO()
    pq.write_table(table, buf, compression='SNAPPY', version='2.0')

    buf.seek(0)
    table1 = pq.read_table(buf, nthreads=4)

    buf.seek(0)
    table2 = pq.read_table(buf, nthreads=1)

    assert table1.equals(table2)
Author: kiril-me, Project: arrow, Lines: 15, Source: test_parquet.py


Example 20: parquet

def parquet(tables, data_directory, ignore_missing_dependency, **params):
    try:
        import pyarrow as pa  # noqa: F401
        import pyarrow.parquet as pq  # noqa: F401
    except ImportError:
        msg = 'PyArrow dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    data_directory = Path(data_directory)
    for table, df in read_tables(tables, data_directory):
        arrow_table = pa.Table.from_pandas(df)
        target_path = data_directory / '{}.parquet'.format(table)
        pq.write_table(arrow_table, str(target_path))
Author: cloudera, Project: ibis, Lines: 17, Source: datamgr.py



Note: The pyarrow.parquet.write_table examples above were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are drawn from open-source projects contributed by their authors; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's License. Do not reproduce without permission.

