The Parquet format is a compressed and efficient way to store data.
get metadata
Get the row count, column count, and schema
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Load the file-level metadata of the Parquet file at `first_pq`.
# NOTE(review): `first_pq` is not defined in this section — presumably set in an earlier cell.
ts=pq.read_metadata(first_pq)
# Bare expression evaluating to (row count, column count, schema). Presumably meant to be
# shown as notebook cell output; as a plain script statement the result is discarded.
ts.num_rows, ts.num_columns, pq.read_schema(first_pq)
Get statistics
A row group in the Parquet file format is a collection of rows that are stored together as a unit and divided into per-column chunks for efficient querying and processing.
# Build a per-column summary (physical type, min, max, compressed size) from
# the Parquet footer metadata held in `ts`.
#
# Column-chunk metadata hangs off each row group rather than off the
# file-level metadata object, so every row group is visited and its values are
# folded into a single entry per column path.
stat_labels = ['DTYPE', 'Min', 'Max', 'Compressed_Size_(KO)']
per_column = {}  # column path -> [physical type, min, max, compressed size]
for rg_index in range(ts.num_row_groups):
    row_group = ts.row_group(rg_index)
    for col_index in range(row_group.num_columns):
        chunk = row_group.column(col_index)
        entry = per_column.setdefault(
            chunk.path_in_schema, [chunk.physical_type, None, None, 0]
        )
        # NOTE(review): total_compressed_size is reported in bytes even though
        # the column label says "KO"; values are kept unconverted here.
        entry[3] += chunk.total_compressed_size

        stats = chunk.statistics
        # A writer may omit statistics, or min/max, for a given chunk.
        if stats is None or not stats.has_min_max:
            continue
        if entry[1] is None or stats.min < entry[1]:
            entry[1] = stats.min
        if entry[2] is None or stats.max > entry[2]:
            entry[2] = stats.max

# Same layout as before: one column per Parquet column, rows 0-3 holding
# type / min / max / size. Built in one shot instead of column-by-column.
beautiful_df = pd.DataFrame(per_column)
# Transposed, labelled view: one row per Parquet column.
df = pd.DataFrame.from_dict(per_column, orient='index', columns=stat_labels)
Get partitions
def get_all_partitions(path):
    """Collect Hive-style partition keys and values found under ``path``.

    Directories named ``key=value`` are treated as partitions. The walk
    descends only into such directories, so unrelated folders (and anything
    below them) are ignored instead of raising ``IndexError``. Every branch is
    visited, so values that exist only under some sibling partitions are
    included as well.

    Args:
        path: Root directory of the partitioned dataset.

    Returns:
        A dict mapping each partition key to the sorted list of its distinct
        values (as strings). Keys appear in the order they are first met
        going down the tree. Empty if no partition directory is found.
    """
    found = {}
    for _, dirnames, _ in os.walk(path):
        # Prune in place: os.walk (top-down) only recurses into what is left.
        dirnames[:] = [name for name in dirnames if '=' in name]
        for name in dirnames:
            # partition() splits on the first '=' only, so values that
            # themselves contain '=' are kept whole.
            key, _sep, value = name.partition('=')
            found.setdefault(key, set()).add(value)
    return {key: sorted(values) for key, values in found.items()}