parquet

Posted by neverset on June 12, 2023

The Parquet format is a compressed, column-oriented file format that stores data efficiently.
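
As a quick illustration, here is a minimal sketch that round-trips a small pandas DataFrame through a Parquet file; the file name example.parquet and the sample data are arbitrary choices.

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# build a small DataFrame and convert it to an Arrow table
df = pd.DataFrame({"city": ["Paris", "Lyon"], "population": [2_161_000, 513_000]})
table = pa.Table.from_pandas(df)

# write with snappy compression (pyarrow's default codec) and read it back
pq.write_table(table, "example.parquet", compression="snappy")
round_tripped = pq.read_table("example.parquet").to_pandas()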

get metadata

get row/column/schema

import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# first_pq is the path to an existing Parquet file
ts = pq.read_metadata(first_pq)
ts.num_rows, ts.num_columns, pq.read_schema(first_pq)
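
Beyond the row and column counts, the FileMetaData object returned by pq.read_metadata also exposes the writer and the row-group layout; a small sketch of what can be printed from it:

print(ts.created_by)      # library and version that wrote the file
print(ts.num_row_groups)  # number of row groups in the file
print(ts.format_version)  # Parquet format version, e.g. '2.6'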

get statistics

A row group in the Parquet file format is a horizontal partition of rows that is stored together as a unit; within each row group, every column is stored as its own chunk, which enables efficient querying and processing.
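
The row-group size can be controlled at write time and each group's metadata inspected afterwards. A sketch, assuming table is an Arrow table such as the one built in the first example; the limit of 1000 rows per group is chosen purely for illustration:

# split the table into row groups of at most 1000 rows
pq.write_table(table, "grouped.parquet", row_group_size=1000)

meta = pq.read_metadata("grouped.parquet")
for i in range(meta.num_row_groups):
    rg = meta.row_group(i)
    print(i, rg.num_rows, rg.total_byte_size)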

# collect per-column statistics from the first row group
beautiful_df = pd.DataFrame()
first_row_group = ts.row_group(0)
for nm in range(first_row_group.num_columns):
    column = first_row_group.column(nm)
    path_in_schema = column.path_in_schema
    compressed_size = column.total_compressed_size
    stats = column.statistics  # may be None if the writer did not record statistics
    min_value = stats.min
    max_value = stats.max
    physical_type = stats.physical_type
    beautiful_df[path_in_schema] = pd.Series([physical_type, min_value, max_value, compressed_size])
df = beautiful_df.T
df.columns = ['DTYPE', 'Min', 'Max', 'Compressed_Size_(Bytes)']
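
The loop above looks only at the first row group; for files with several, the per-column figures can be aggregated across all groups. A minimal sketch:

# sum compressed sizes per column across every row group
total_compressed = {}
for rg_idx in range(ts.num_row_groups):
    rg = ts.row_group(rg_idx)
    for col_idx in range(rg.num_columns):
        col = rg.column(col_idx)
        total_compressed[col.path_in_schema] = (
            total_compressed.get(col.path_in_schema, 0) + col.total_compressed_size
        )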

get partitions

def get_all_partitions(path):
    """Collect Hive-style partition keys and values (key=value directories) under path."""
    partitions = {}
    for _, dirnames, _ in os.walk(path):
        for dirname in dirnames:
            # only key=value directories are partition levels
            if '=' not in dirname:
                continue
            key, value = dirname.split('=', 1)
            partitions.setdefault(key, set()).add(value)
    return {key: sorted(values) for key, values in partitions.items()}
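
For example, on a hypothetical dataset laid out as dataset/year=2022/month=01/part-0.parquet, the function returns the distinct values found for each key:

partitions = get_all_partitions("dataset")  # "dataset" is an assumed directory
print(partitions)
# e.g. {'month': ['01', '02'], 'year': ['2021', '2022']}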