Dump pgstac to parquet
In [1]:
Copied!
!uv pip install psycopg psycopg[binary] fsspec datetime pathlib
!uv pip install -e ../..
!uv pip install memory_profiler
%load_ext memory_profiler
!uv pip install psycopg psycopg[binary] fsspec datetime pathlib
!uv pip install -e ../..
!uv pip install memory_profiler
%load_ext memory_profiler
Using Python 3.10.17 environment at: /home/bitner/data/stac-geoparquet/.venv Audited 5 packages in 3ms Using Python 3.10.17 environment at: /home/bitner/data/stac-geoparquet/.venv Resolved 19 packages in 360ms Prepared 1 package in 381ms Uninstalled 1 package in 0.67ms Installed 1 package in 0.60ms.dev5+g889c95c.d20250626 (from ~ stac-geoparquet==0.7.1.dev5+g889c95c.d20250626 (from file:///home/bitner/data/stac-geoparquet) Using Python 3.10.17 environment at: /home/bitner/data/stac-geoparquet/.venv Audited 1 package in 2ms
In [2]:
Copied!
import logging
import os
from typing import Any

from stac_geoparquet.pgstac_reader import (
    pgstac_dsn,
    pgstac_to_arrow,
    pgstac_to_iter,
    pgstac_to_parquet,
    sync_pgstac_to_parquet,
)
# Lower the root logger's threshold to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Connection string for the PgSTAC database (pgstac test items derived from naip).
# Set the PGSTAC_DSN environment variable to target another database without
# editing the notebook; the fallback is the local test database with
# placeholder credentials.
db = pgstac_dsn(
    os.environ.get(
        "PGSTAC_DSN", "postgres://username:password@localhost:5439/postgis"
    ),
    # PostgreSQL reads a unit-less statement_timeout as milliseconds (5 minutes).
    statement_timeout=300000,
)
# NOTE: displaying the DSN echoes the password into the saved cell output;
# clear the output before sharing when real credentials are in use.
db
import logging
from typing import Any
from stac_geoparquet.pgstac_reader import (
pgstac_dsn,
pgstac_to_arrow,
pgstac_to_iter,
)
# Lower the root logger's threshold to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Connection string for the PgSTAC database (pgstac test items derived from naip).
# Set the PGSTAC_DSN environment variable to target another database without
# editing the notebook; the fallback is the local test database with
# placeholder credentials.
db = pgstac_dsn(
    os.environ.get(
        "PGSTAC_DSN", "postgres://username:password@localhost:5439/postgis"
    ),
    # PostgreSQL reads a unit-less statement_timeout as milliseconds (5 minutes).
    statement_timeout=300000,
)
# NOTE: displaying the DSN echoes the password into the saved cell output;
# clear the output before sharing when real credentials are in use.
db
Out[2]:
"user=username password=password dbname=postgis host=localhost port=5439 options=' -c search_path=pgstac,public -c statement_timeout=300000'"
Create Functions to Modify Each Item
In [3]:
Copied!
def inject_links(item: dict[str, Any]) -> dict[str, Any]:
    """Replace the item's ``links`` with Planetary Computer API links.

    Builds the ``collection``, ``parent``, ``root``, ``self`` and ``preview``
    links from ``item["collection"]`` and ``item["id"]``. Both values are
    percent-encoded before being placed in a URL, so identifiers containing
    spaces, ``&``, ``/`` and similar characters still yield valid links.

    Args:
        item: STAC item dict; must contain ``"collection"`` and ``"id"``.

    Returns:
        The same ``item`` object, mutated in place. Any existing ``links``
        entry is overwritten.

    Raises:
        KeyError: If ``"collection"`` or ``"id"`` is missing.
    """
    from urllib.parse import quote

    # safe="" also encodes "/" so an identifier can never add path segments.
    collection = quote(str(item["collection"]), safe="")
    item_id = quote(str(item["id"]), safe="")

    stac_root = "https://planetarycomputer.microsoft.com/api/stac/v1/"
    collection_href = f"{stac_root}collections/{collection}"

    item["links"] = [
        {
            "rel": "collection",
            "type": "application/json",
            "href": collection_href,
        },
        {
            "rel": "parent",
            "type": "application/json",
            "href": collection_href,
        },
        {
            "rel": "root",
            "type": "application/json",
            "href": stac_root,
        },
        {
            "rel": "self",
            "type": "application/geo+json",
            "href": f"{collection_href}/items/{item_id}",
        },
        {
            "rel": "preview",
            "href": f"https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection={collection}&item={item_id}",  # noqa: E501
            "title": "Map of item",
            "type": "text/html",
        },
    ]
    return item
def inject_assets(item: dict[str, Any], render_config: str) -> dict[str, Any]:
    """Add Planetary Computer ``tilejson`` and ``rendered_preview`` assets.

    The collection and item ids are percent-encoded before being placed in
    the query string, so identifiers containing spaces, ``&`` and similar
    characters still yield valid URLs. An item without an ``"assets"``
    mapping gets an empty one created instead of raising ``KeyError``.

    Args:
        item: STAC item dict; must contain ``"collection"`` and ``"id"``.
        render_config: Query-string fragment (for example
            ``"render=myrenderconfig"``) appended verbatim to both URLs.

    Returns:
        The same ``item`` object, mutated in place. Other assets are kept.

    Raises:
        KeyError: If ``"collection"`` or ``"id"`` is missing.
    """
    from urllib.parse import quote

    collection = quote(str(item["collection"]), safe="")
    item_id = quote(str(item["id"]), safe="")
    # Both assets share the same query string; only the endpoint differs.
    query = f"collection={collection}&item={item_id}&{render_config}"
    data_api = "https://planetarycomputer.microsoft.com/api/data/v1/item/"

    assets = item.setdefault("assets", {})
    assets["tilejson"] = {
        "href": f"{data_api}tilejson.json?{query}",
        "roles": ["tiles"],
        "title": "TileJSON with default rendering",
        "type": "application/json",
    }
    assets["rendered_preview"] = {
        "href": f"{data_api}preview.png?{query}",
        "rel": "preview",
        "roles": ["overview"],
        "title": "Rendered preview",
        "type": "image/png",
    }
    return item
def naip_year_to_int(item: dict[str, Any]) -> dict[str, Any]:
    """Cast a string-valued ``naip:year`` property to ``int``, in place.

    Items without the property, or whose value is not a string, are
    returned untouched.
    """
    properties = item["properties"]
    if "naip:year" not in properties:
        return item

    year = properties["naip:year"]
    if isinstance(year, str):
        properties["naip:year"] = int(year)
    return item
# Query-string fragment that inject_assets appends to the asset URLs.
render_config = "render=myrenderconfig"


def clean_item(item: dict[str, Any]) -> dict[str, Any]:
    """Prepare a single item for export.

    Adds the rendering assets, then rewrites the links, and finally casts
    ``naip:year`` to an int. ``render_config`` is looked up in the global
    namespace each time the function is called.
    """
    with_assets = inject_assets(item, render_config)
    with_links = inject_links(with_assets)
    return naip_year_to_int(with_links)
def inject_links(item: dict[str, Any]) -> dict[str, Any]:
    """Replace the item's ``links`` with Planetary Computer API links.

    Builds the ``collection``, ``parent``, ``root``, ``self`` and ``preview``
    links from ``item["collection"]`` and ``item["id"]``. Both values are
    percent-encoded before being placed in a URL, so identifiers containing
    spaces, ``&``, ``/`` and similar characters still yield valid links.

    Args:
        item: STAC item dict; must contain ``"collection"`` and ``"id"``.

    Returns:
        The same ``item`` object, mutated in place. Any existing ``links``
        entry is overwritten.

    Raises:
        KeyError: If ``"collection"`` or ``"id"`` is missing.
    """
    from urllib.parse import quote

    # safe="" also encodes "/" so an identifier can never add path segments.
    collection = quote(str(item["collection"]), safe="")
    item_id = quote(str(item["id"]), safe="")

    stac_root = "https://planetarycomputer.microsoft.com/api/stac/v1/"
    collection_href = f"{stac_root}collections/{collection}"

    item["links"] = [
        {
            "rel": "collection",
            "type": "application/json",
            "href": collection_href,
        },
        {
            "rel": "parent",
            "type": "application/json",
            "href": collection_href,
        },
        {
            "rel": "root",
            "type": "application/json",
            "href": stac_root,
        },
        {
            "rel": "self",
            "type": "application/geo+json",
            "href": f"{collection_href}/items/{item_id}",
        },
        {
            "rel": "preview",
            "href": f"https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection={collection}&item={item_id}",  # noqa: E501
            "title": "Map of item",
            "type": "text/html",
        },
    ]
    return item
def inject_assets(item: dict[str, Any], render_config: str) -> dict[str, Any]:
    """Add Planetary Computer ``tilejson`` and ``rendered_preview`` assets.

    The collection and item ids are percent-encoded before being placed in
    the query string, so identifiers containing spaces, ``&`` and similar
    characters still yield valid URLs. An item without an ``"assets"``
    mapping gets an empty one created instead of raising ``KeyError``.

    Args:
        item: STAC item dict; must contain ``"collection"`` and ``"id"``.
        render_config: Query-string fragment (for example
            ``"render=myrenderconfig"``) appended verbatim to both URLs.

    Returns:
        The same ``item`` object, mutated in place. Other assets are kept.

    Raises:
        KeyError: If ``"collection"`` or ``"id"`` is missing.
    """
    from urllib.parse import quote

    collection = quote(str(item["collection"]), safe="")
    item_id = quote(str(item["id"]), safe="")
    # Both assets share the same query string; only the endpoint differs.
    query = f"collection={collection}&item={item_id}&{render_config}"
    data_api = "https://planetarycomputer.microsoft.com/api/data/v1/item/"

    assets = item.setdefault("assets", {})
    assets["tilejson"] = {
        "href": f"{data_api}tilejson.json?{query}",
        "roles": ["tiles"],
        "title": "TileJSON with default rendering",
        "type": "application/json",
    }
    assets["rendered_preview"] = {
        "href": f"{data_api}preview.png?{query}",
        "rel": "preview",
        "roles": ["overview"],
        "title": "Rendered preview",
        "type": "image/png",
    }
    return item
def naip_year_to_int(item: dict[str, Any]) -> dict[str, Any]:
    """Cast a string-valued ``naip:year`` property to ``int``, in place.

    Items without the property, or whose value is not a string, are
    returned untouched.
    """
    properties = item["properties"]
    if "naip:year" not in properties:
        return item

    year = properties["naip:year"]
    if isinstance(year, str):
        properties["naip:year"] = int(year)
    return item
# Query-string fragment that inject_assets appends to the asset URLs.
render_config = "render=myrenderconfig"


def clean_item(item: dict[str, Any]) -> dict[str, Any]:
    """Prepare a single item for export.

    Adds the rendering assets, then rewrites the links, and finally casts
    ``naip:year`` to an int. ``render_config`` is looked up in the global
    namespace each time the function is called.
    """
    with_assets = inject_assets(item, render_config)
    with_links = inject_links(with_assets)
    return naip_year_to_int(with_links)
Demonstrate Injecting Additional Links and Assets.
In [4]:
Copied!
# Build an iterator over the PgSTAC items; clean_item is applied to each row.
items = pgstac_to_iter(db, row_func=clean_item)
# Fetch only the first item and show its id.
print(next(items)["id"])
# Build an iterator over the PgSTAC items; clean_item is applied to each row.
items = pgstac_to_iter(db, row_func=clean_item)
# Fetch only the first item and show its id.
print(next(items)["id"])
INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:With no filter, fetching all items INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection pgstac-test-item-0089
In [ ]:
Copied!
# Re-create the item iterator with the same rendering configuration.
render_config = "render=myrenderconfig"
items = pgstac_to_iter(db, row_func=clean_item)
# Left disabled: converting the iterator to Arrow batches
# (parse_stac_items_to_arrow is not imported in this notebook).
# batches = parse_stac_items_to_arrow(items=items, chunk_size=100000, schema="ChunksToDisk", tmpdir='/tmp/pqtest')
# batches
# Re-create the item iterator with the same rendering configuration.
render_config = "render=myrenderconfig"
items = pgstac_to_iter(db, row_func=clean_item)
# Left disabled: converting the iterator to Arrow batches
# (parse_stac_items_to_arrow is not imported in this notebook).
# batches = parse_stac_items_to_arrow(items=items, chunk_size=100000, schema="ChunksToDisk", tmpdir='/tmp/pqtest')
# batches
Inject Additional Links and Assets, Convert to RecordBatchReader, and Dump to Parquet. (The data must be fetched again because the iterator created above has already been partially consumed.)
In [6]:
Copied!
# Query PgSTAC again and convert the cleaned items to Arrow.
arrow = pgstac_to_arrow(db, row_func=clean_item)
# Query PgSTAC again and convert the cleaned items to Arrow.
arrow = pgstac_to_arrow(db, row_func=clean_item)
INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 0.0 | CPU_USER_TIME: 2.830 | RSS(MB):234.64 | USS(MB):149.66 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:With no filter, fetching all items INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 65536
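Dump PgSTAC to Parquet and compare peak memory use. `pgstac_to_parquet` writes every item to a single Parquet file, while `sync_pgstac_to_parquet` dumps any partition in PgSTAC that has been updated after a given time, in order to incrementally dump only new records to Parquet. Each call is wrapped in `%memit` to report its peak memory for the given `schema` and `chunk_size`.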
In [13]:
Copied!
%memit pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4.parquet",row_func=clean_item,chunk_size=100000,schema="FullFile")
%memit pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4.parquet",row_func=clean_item,chunk_size=100000,schema="FullFile")
INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d6584fb530> /tmp/pgstactoparquet4.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 77.4 | CPU_USER_TIME: 415.880 | RSS(MB):3739.94 | USS(MB):17.41 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:With no filter, fetching all items INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 500101 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 94.3 | CPU_USER_TIME: 514.360 | RSS(MB):4861.70 | USS(MB):1911.30 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 99.4 | CPU_USER_TIME: 515.470 | RSS(MB):4177.33 | USS(MB):1227.85 peak memory: 9443.96 MiB, increment: 5703.12 MiB
In [14]:
Copied!
%memit pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4.parquet",row_func=clean_item,chunk_size=100000,schema="FirstBatch")
%memit pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4.parquet",row_func=clean_item,chunk_size=100000,schema="FirstBatch")
INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d683122df0> /tmp/pgstactoparquet4.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 75.5 | CPU_USER_TIME: 515.950 | RSS(MB):4160.68 | USS(MB):17.38 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:With no filter, fetching all items INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 97.8 | CPU_USER_TIME: 534.780 | RSS(MB):4893.75 | USS(MB):1538.61 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 101.0 | CPU_USER_TIME: 534.780 | RSS(MB):4893.75 | USS(MB):1538.72 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 1 | CPU%: 86.2 | CPU_USER_TIME: 554.770 | RSS(MB):5241.51 | USS(MB):2151.86 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 2 | CPU%: 98.3 | CPU_USER_TIME: 574.000 | RSS(MB):5342.93 | USS(MB):2281.09 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 3 | CPU%: 97.8 | CPU_USER_TIME: 593.000 | RSS(MB):5536.80 | USS(MB):2483.91 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 4 | CPU%: 98.0 | CPU_USER_TIME: 612.280 | RSS(MB):5302.53 | USS(MB):2256.39 INFO:stac_geoparquet.arrow._batch:Items Length: 101 INFO:stac_geoparquet.arrow._api:Batch 5 | CPU%: 96.3 | CPU_USER_TIME: 612.680 | RSS(MB):4647.70 | USS(MB):1601.68 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 109.0 | CPU_USER_TIME: 612.690 | RSS(MB):4530.96 | USS(MB):1485.08 peak memory: 6063.12 MiB, increment: 1897.59 MiB
In [15]:
Copied!
%memit pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4.parquet",row_func=clean_item,chunk_size=100000,schema="ChunksToDisk")
%memit pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4.parquet",row_func=clean_item,chunk_size=100000,schema="ChunksToDisk")
INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d65934f270> /tmp/pgstactoparquet4.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 71.6 | CPU_USER_TIME: 613.140 | RSS(MB):4517.27 | USS(MB):17.35 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:With no filter, fetching all items INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 97.7 | CPU_USER_TIME: 632.030 | RSS(MB):5198.04 | USS(MB):1543.83 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 1 | CPU%: 88.7 | CPU_USER_TIME: 652.160 | RSS(MB):5467.35 | USS(MB):2081.73 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 2 | CPU%: 97.7 | CPU_USER_TIME: 671.460 | RSS(MB):5326.94 | USS(MB):1996.55 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 3 | CPU%: 97.7 | CPU_USER_TIME: 690.050 | RSS(MB):5421.30 | USS(MB):2133.34 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 4 | CPU%: 98.0 | CPU_USER_TIME: 709.160 | RSS(MB):5335.68 | USS(MB):2072.63 INFO:stac_geoparquet.arrow._batch:Items Length: 101 INFO:stac_geoparquet.arrow._api:Batch 5 | CPU%: 101.6 | CPU_USER_TIME: 709.310 | RSS(MB):4542.91 | USS(MB):1279.75 INFO:stac_geoparquet.arrow._api:Created Dataset | CPU%: 98.6 | CPU_USER_TIME: 709.310 | RSS(MB):4542.91 | USS(MB):1279.80 INFO:stac_geoparquet.arrow._api:Created Batches | CPU%: 106.9 | CPU_USER_TIME: 709.310 | RSS(MB):4545.54 | USS(MB):1551.15 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 1052.5 | CPU_USER_TIME: 710.120 | RSS(MB):4820.96 | USS(MB):2387.61 INFO:stac_geoparquet.arrow._api:Written to 
parquet | CPU%: 219.3 | CPU_USER_TIME: 712.090 | RSS(MB):4677.41 | USS(MB):2531.95 peak memory: 6064.57 MiB, increment: 1547.29 MiB
In [9]:
Copied!
%memit sync_pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4",row_func=clean_item,schema="FirstBatch",chunk_size=100000,)
%memit sync_pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4",row_func=clean_item,schema="FirstBatch",chunk_size=100000,)
INFO:stac_geoparquet.pgstac_reader:Syncing PgSTAC partitions that have been updated since None to /tmp/pgstactoparquet4 on filesystem <pyarrow._fs.LocalFileSystem object at 0x73d683122af0>. INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d683122af0> /tmp/pgstactoparquet4/pgstac-test-collection/items_20110701_20110801.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 97.4 | CPU_USER_TIME: 15.270 | RSS(MB):1090.88 | USS(MB):14.47 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:Using Collection pgstac-test-collection, Start 2011-07-31 00:00:00+00:00, End 2011-07-31 00:00:00.000001+00:00 INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 12 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 50.8 | CPU_USER_TIME: 15.290 | RSS(MB):1090.88 | USS(MB):17.34 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 112.9 | CPU_USER_TIME: 15.300 | RSS(MB):1082.53 | USS(MB):18.93 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 95.0 | CPU_USER_TIME: 15.300 | RSS(MB):1083.68 | USS(MB):20.73 INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d683122af0> /tmp/pgstactoparquet4/pgstac-test-collection/items_20110801_20110901.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 84.3 | CPU_USER_TIME: 15.300 | RSS(MB):1083.68 | USS(MB):20.79 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:Using Collection pgstac-test-collection, Start 2011-08-01 00:00:00+00:00, End 2011-08-25 00:00:00.000001+00:00 INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection 
INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 97.6 | CPU_USER_TIME: 33.800 | RSS(MB):2413.20 | USS(MB):1520.11 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 99.5 | CPU_USER_TIME: 33.800 | RSS(MB):2413.20 | USS(MB):1520.11 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 1 | CPU%: 98.0 | CPU_USER_TIME: 53.650 | RSS(MB):2898.96 | USS(MB):2006.03 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 2 | CPU%: 98.0 | CPU_USER_TIME: 72.560 | RSS(MB):3177.39 | USS(MB):2284.49 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 3 | CPU%: 98.0 | CPU_USER_TIME: 91.220 | RSS(MB):3233.25 | USS(MB):2340.50 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 4 | CPU%: 97.7 | CPU_USER_TIME: 110.270 | RSS(MB):3274.36 | USS(MB):2381.65 INFO:stac_geoparquet.arrow._batch:Items Length: 89 INFO:stac_geoparquet.arrow._api:Batch 5 | CPU%: 100.6 | CPU_USER_TIME: 110.630 | RSS(MB):2482.80 | USS(MB):1590.07 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 106.4 | CPU_USER_TIME: 110.640 | RSS(MB):2374.66 | USS(MB):1481.93 peak memory: 3892.30 MiB, increment: 2801.70 MiB
In [10]:
Copied!
%memit sync_pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4",row_func=clean_item,schema="FirstBatch",chunk_size=10000,)
%memit sync_pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4",row_func=clean_item,schema="FirstBatch",chunk_size=10000,)
INFO:stac_geoparquet.pgstac_reader:Syncing PgSTAC partitions that have been updated since None to /tmp/pgstactoparquet4 on filesystem <pyarrow._fs.LocalFileSystem object at 0x73d6f0a2b970>. INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d6f0a2b970> /tmp/pgstactoparquet4/pgstac-test-collection/items_20110701_20110801.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 1.6 | CPU_USER_TIME: 111.130 | RSS(MB):2367.04 | USS(MB):17.21 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:Using Collection pgstac-test-collection, Start 2011-07-31 00:00:00+00:00, End 2011-07-31 00:00:00.000001+00:00 INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 12 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 70.2 | CPU_USER_TIME: 111.150 | RSS(MB):2367.04 | USS(MB):20.29 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 107.0 | CPU_USER_TIME: 111.150 | RSS(MB):2367.04 | USS(MB):20.61 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 82.0 | CPU_USER_TIME: 111.150 | RSS(MB):2367.04 | USS(MB):21.19 INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d6f0a2b970> /tmp/pgstactoparquet4/pgstac-test-collection/items_20110801_20110901.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 116.8 | CPU_USER_TIME: 111.160 | RSS(MB):2367.04 | USS(MB):21.24 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:Using Collection pgstac-test-collection, Start 2011-08-01 00:00:00+00:00, End 2011-08-25 00:00:00.000001+00:00 INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection 
INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 94.4 | CPU_USER_TIME: 113.180 | RSS(MB):2240.66 | USS(MB):322.53 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 106.9 | CPU_USER_TIME: 113.180 | RSS(MB):2240.66 | USS(MB):322.56 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 1 | CPU%: 98.2 | CPU_USER_TIME: 115.170 | RSS(MB):2274.42 | USS(MB):403.96 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 2 | CPU%: 98.3 | CPU_USER_TIME: 117.140 | RSS(MB):2262.11 | USS(MB):411.58 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 3 | CPU%: 97.7 | CPU_USER_TIME: 119.120 | RSS(MB):2290.82 | USS(MB):450.43 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 4 | CPU%: 98.6 | CPU_USER_TIME: 121.100 | RSS(MB):2267.00 | USS(MB):431.16 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 5 | CPU%: 97.9 | CPU_USER_TIME: 123.080 | RSS(MB):2297.81 | USS(MB):463.98 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 6 | CPU%: 98.0 | CPU_USER_TIME: 125.100 | RSS(MB):2274.87 | USS(MB):442.65 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 7 | CPU%: 98.2 | CPU_USER_TIME: 127.610 | RSS(MB):2302.46 | USS(MB):472.01 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 8 | CPU%: 98.0 | CPU_USER_TIME: 129.620 | RSS(MB):2279.35 | USS(MB):449.18 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 9 | CPU%: 98.3 | CPU_USER_TIME: 131.670 | RSS(MB):2304.64 | USS(MB):474.97 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 10 | CPU%: 98.3 | CPU_USER_TIME: 133.660 | RSS(MB):2286.43 | USS(MB):457.12 INFO:stac_geoparquet.arrow._batch:Items 
Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 11 | CPU%: 97.7 | CPU_USER_TIME: 135.640 | RSS(MB):2297.56 | USS(MB):469.68 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 12 | CPU%: 98.0 | CPU_USER_TIME: 137.670 | RSS(MB):2283.80 | USS(MB):455.92 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 13 | CPU%: 98.7 | CPU_USER_TIME: 139.670 | RSS(MB):2306.25 | USS(MB):478.57 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 14 | CPU%: 97.8 | CPU_USER_TIME: 141.640 | RSS(MB):2293.17 | USS(MB):465.45 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 15 | CPU%: 98.3 | CPU_USER_TIME: 143.710 | RSS(MB):2310.23 | USS(MB):483.40 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 16 | CPU%: 98.0 | CPU_USER_TIME: 145.820 | RSS(MB):2301.02 | USS(MB):474.16 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 17 | CPU%: 97.6 | CPU_USER_TIME: 147.810 | RSS(MB):2312.17 | USS(MB):485.46 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 18 | CPU%: 98.3 | CPU_USER_TIME: 149.870 | RSS(MB):2302.50 | USS(MB):475.83 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 19 | CPU%: 98.9 | CPU_USER_TIME: 152.470 | RSS(MB):2314.25 | USS(MB):487.85 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 20 | CPU%: 98.6 | CPU_USER_TIME: 154.490 | RSS(MB):2306.08 | USS(MB):479.67 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 21 | CPU%: 98.1 | CPU_USER_TIME: 156.520 | RSS(MB):2292.57 | USS(MB):477.74 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 22 | CPU%: 98.0 | CPU_USER_TIME: 158.510 | RSS(MB):2288.41 | USS(MB):473.64 
INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 23 | CPU%: 98.0 | CPU_USER_TIME: 160.520 | RSS(MB):2299.28 | USS(MB):484.64 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 24 | CPU%: 98.1 | CPU_USER_TIME: 162.520 | RSS(MB):2291.36 | USS(MB):476.71 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 25 | CPU%: 98.3 | CPU_USER_TIME: 164.660 | RSS(MB):2308.28 | USS(MB):493.66 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 26 | CPU%: 98.6 | CPU_USER_TIME: 166.740 | RSS(MB):2308.25 | USS(MB):493.62 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 27 | CPU%: 97.9 | CPU_USER_TIME: 168.810 | RSS(MB):2316.27 | USS(MB):501.77 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 28 | CPU%: 98.2 | CPU_USER_TIME: 170.850 | RSS(MB):2311.48 | USS(MB):497.14 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 29 | CPU%: 98.3 | CPU_USER_TIME: 172.850 | RSS(MB):2321.90 | USS(MB):507.63 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 30 | CPU%: 97.8 | CPU_USER_TIME: 174.850 | RSS(MB):2312.47 | USS(MB):498.09 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 31 | CPU%: 98.7 | CPU_USER_TIME: 177.450 | RSS(MB):2320.68 | USS(MB):506.31 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 32 | CPU%: 98.2 | CPU_USER_TIME: 179.460 | RSS(MB):2315.55 | USS(MB):501.27 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 33 | CPU%: 98.3 | CPU_USER_TIME: 181.500 | RSS(MB):2322.30 | USS(MB):508.03 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 34 | CPU%: 97.5 | CPU_USER_TIME: 183.520 | RSS(MB):2319.16 
| USS(MB):505.05 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 35 | CPU%: 97.8 | CPU_USER_TIME: 185.520 | RSS(MB):2325.51 | USS(MB):511.52 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 36 | CPU%: 98.2 | CPU_USER_TIME: 187.520 | RSS(MB):2294.58 | USS(MB):480.61 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 37 | CPU%: 97.8 | CPU_USER_TIME: 189.530 | RSS(MB):2309.05 | USS(MB):495.05 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 38 | CPU%: 98.4 | CPU_USER_TIME: 191.560 | RSS(MB):2309.23 | USS(MB):495.41 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 39 | CPU%: 98.2 | CPU_USER_TIME: 193.590 | RSS(MB):2320.14 | USS(MB):506.25 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 40 | CPU%: 98.0 | CPU_USER_TIME: 195.640 | RSS(MB):2320.82 | USS(MB):507.06 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 41 | CPU%: 98.1 | CPU_USER_TIME: 197.670 | RSS(MB):2304.74 | USS(MB):491.09 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 42 | CPU%: 98.7 | CPU_USER_TIME: 200.180 | RSS(MB):2307.84 | USS(MB):494.23 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 43 | CPU%: 98.1 | CPU_USER_TIME: 202.210 | RSS(MB):2312.16 | USS(MB):498.64 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 44 | CPU%: 97.9 | CPU_USER_TIME: 204.230 | RSS(MB):2310.00 | USS(MB):496.58 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 45 | CPU%: 97.8 | CPU_USER_TIME: 206.240 | RSS(MB):2315.50 | USS(MB):502.09 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 46 | CPU%: 98.0 | CPU_USER_TIME: 208.220 | 
RSS(MB):2314.43 | USS(MB):501.19 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 47 | CPU%: 98.1 | CPU_USER_TIME: 210.270 | RSS(MB):2316.80 | USS(MB):503.65 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 48 | CPU%: 97.8 | CPU_USER_TIME: 212.450 | RSS(MB):2313.78 | USS(MB):500.62 INFO:stac_geoparquet.arrow._batch:Items Length: 10000 INFO:stac_geoparquet.arrow._api:Batch 49 | CPU%: 98.6 | CPU_USER_TIME: 214.520 | RSS(MB):2319.65 | USS(MB):506.48 INFO:stac_geoparquet.arrow._batch:Items Length: 89 INFO:stac_geoparquet.arrow._api:Batch 50 | CPU%: 96.2 | CPU_USER_TIME: 214.580 | RSS(MB):2294.87 | USS(MB):481.66 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 96.7 | CPU_USER_TIME: 214.590 | RSS(MB):2278.00 | USS(MB):464.80 peak memory: 2367.04 MiB, increment: 0.00 MiB
In [12]:
Copied!
%memit sync_pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4",row_func=clean_item,schema="ChunksToDisk",chunk_size=100000,)
%memit sync_pgstac_to_parquet(db,output_path="/tmp/pgstactoparquet4",row_func=clean_item,schema="ChunksToDisk",chunk_size=100000,)
INFO:stac_geoparquet.pgstac_reader:Syncing PgSTAC partitions that have been updated since None to /tmp/pgstactoparquet4 on filesystem <pyarrow._fs.LocalFileSystem object at 0x73d6f0a2b0b0>. INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d6f0a2b0b0> /tmp/pgstactoparquet4/pgstac-test-collection/items_20110701_20110801.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 81.7 | CPU_USER_TIME: 315.610 | RSS(MB):2365.64 | USS(MB):17.47 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of Items INFO:stac_geoparquet.pgstac_reader:Using Collection pgstac-test-collection, Start 2011-07-31 00:00:00+00:00, End 2011-07-31 00:00:00.000001+00:00 INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 12 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 70.2 | CPU_USER_TIME: 315.630 | RSS(MB):2363.93 | USS(MB):22.02 INFO:stac_geoparquet.arrow._api:Created Dataset | CPU%: 98.4 | CPU_USER_TIME: 315.630 | RSS(MB):2364.82 | USS(MB):22.96 INFO:stac_geoparquet.arrow._api:Created Batches | CPU%: 113.5 | CPU_USER_TIME: 315.640 | RSS(MB):2365.57 | USS(MB):24.07 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 101.0 | CPU_USER_TIME: 315.640 | RSS(MB):2364.84 | USS(MB):24.63 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 118.5 | CPU_USER_TIME: 315.650 | RSS(MB):2365.84 | USS(MB):27.12 INFO:stac_geoparquet.arrow._api:Saving STAC Items to Parquet INFO:stac_geoparquet.arrow._api:Exporting PgSTAC to <pyarrow._fs.LocalFileSystem object at 0x73d6f0a2b0b0> /tmp/pgstactoparquet4/pgstac-test-collection/items_20110801_20110901.parquet INFO:stac_geoparquet.arrow._api:parse_stac_items_to_arrow start | CPU%: 116.7 | CPU_USER_TIME: 315.660 | RSS(MB):2365.84 | USS(MB):27.25 INFO:stac_geoparquet.pgstac_reader:Fetching Data from PGStac Into an Iterator of 
Items INFO:stac_geoparquet.pgstac_reader:Using Collection pgstac-test-collection, Start 2011-08-01 00:00:00+00:00, End 2011-08-25 00:00:00.000001+00:00 INFO:stac_geoparquet.pgstac_reader:Getting Base Item for pgstac-test-collection INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 0 | CPU%: 97.5 | CPU_USER_TIME: 334.690 | RSS(MB):3238.19 | USS(MB):1553.66 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 1 | CPU%: 98.1 | CPU_USER_TIME: 355.080 | RSS(MB):3698.38 | USS(MB):2071.08 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 2 | CPU%: 97.7 | CPU_USER_TIME: 374.470 | RSS(MB):3679.55 | USS(MB):2055.06 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 3 | CPU%: 97.8 | CPU_USER_TIME: 393.430 | RSS(MB):3831.19 | USS(MB):2207.02 INFO:stac_geoparquet.arrow._batch:Items Length: 100000 INFO:stac_geoparquet.arrow._api:Batch 4 | CPU%: 97.7 | CPU_USER_TIME: 412.900 | RSS(MB):3559.70 | USS(MB):1935.41 INFO:stac_geoparquet.arrow._batch:Items Length: 89 INFO:stac_geoparquet.arrow._api:Batch 5 | CPU%: 99.4 | CPU_USER_TIME: 413.040 | RSS(MB):2865.91 | USS(MB):1241.62 INFO:stac_geoparquet.arrow._api:Created Dataset | CPU%: 104.9 | CPU_USER_TIME: 413.040 | RSS(MB):2865.91 | USS(MB):1241.63 INFO:stac_geoparquet.arrow._api:Created Batches | CPU%: 93.4 | CPU_USER_TIME: 413.040 | RSS(MB):2867.16 | USS(MB):1243.35 INFO:stac_geoparquet.arrow._api:Parsed to arrow | CPU%: 518.2 | CPU_USER_TIME: 413.220 | RSS(MB):3093.38 | USS(MB):2069.73 INFO:stac_geoparquet.arrow._api:Written to parquet | CPU%: 248.1 | CPU_USER_TIME: 415.410 | RSS(MB):3757.96 | USS(MB):2576.59 peak memory: 4464.94 MiB, increment: 2099.30 MiB
In [ ]:
Copied!