Python Scripts as DuckDB Table Functions
DuckDB's read_json_auto() can fetch data from URLs directly. But what about paginated APIs? GraphQL endpoints? Authenticated requests? HTML scraping?
Using the shellfs extension and PEP 723 inline scripts, you can call Python scripts as table functions with almost no setup: two community extensions and a single macro.
The Pattern
INSTALL shellfs FROM community;
INSTALL arrow FROM community;
LOAD shellfs;
LOAD arrow;
CREATE OR REPLACE MACRO uv_run(script, args := '') AS TABLE
SELECT * FROM read_arrow('uv run ' || script || ' ' || args || ' |');
Now any Python script that emits Arrow IPC to stdout becomes a table function:
SELECT * FROM uv_run('fetch_weather.py', '--lat 37.77 --lon -122.42');
Writing Scripts
Scripts use PEP 723 inline metadata for dependencies. No virtualenv, no requirements.txt. Just run with uv run:
# fetch_weather.py
#!/usr/bin/env -S uv run
# /// script
# requires-python = ">=3.11"
# dependencies = ["polars", "pyarrow", "httpx"]
# ///
import polars as pl
import pyarrow as pa
import httpx
import sys
import argparse
def emit(df):
    # Convert the Polars DataFrame to Arrow and stream it to stdout as Arrow IPC.
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
parser = argparse.ArgumentParser()
parser.add_argument("--lat", type=float, required=True)
parser.add_argument("--lon", type=float, required=True)
args = parser.parse_args()
url = f"https://api.open-meteo.com/v1/forecast?latitude={args.lat}&longitude={args.lon}&daily=temperature_2m_max,temperature_2m_min&timezone=auto"
data = httpx.get(url).json()
df = pl.DataFrame({
    "date": data["daily"]["time"],
    "temp_max": data["daily"]["temperature_2m_max"],
    "temp_min": data["daily"]["temperature_2m_min"],
})
emit(df)
The emit() helper converts a Polars DataFrame into an Arrow IPC stream on stdout, which is exactly what read_arrow consumes on the other end of the pipe. That's it.
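If a script ends up with a pandas DataFrame instead, the same idea only needs pyarrow's pandas bridge. A minimal sketch (emit_pandas is an illustrative name, not something the macro requires):
import sys
import pandas as pd
import pyarrow as pa
def emit_pandas(df: pd.DataFrame):
    # pandas has no .to_arrow(), so go through pyarrow's converter instead.
    table = pa.Table.from_pandas(df, preserve_index=False)
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
Everything downstream is unchanged: read_arrow only sees the IPC stream, not the library that produced it.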
When Native DuckDB Falls Short
DuckDB's HTTP + JSON support handles simple cases:
-- This works fine
SELECT name.common as name, population
FROM read_json_auto('https://restcountries.com/v3.1/all?fields=name,population');
But real-world data fetching often needs more:
| Pattern | Why native DuckDB falls short |
|---|---|
| Pagination | Can't loop through pages |
| GraphQL | Requires POST requests |
| Fan-out | Can't make N concurrent requests (sketch after the GitHub example) |
| N+1 queries | Can't fetch a list, then details for each item |
| Auth headers | Can't read env vars or add Bearer tokens (sketch below) |
| HTML scraping | No DOM parser |
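The pagination, GraphQL, and scraping rows each get a full example below. Authenticated requests are the simplest of the lot; here is a minimal sketch, assuming a hypothetical fetch_api.py, an API_TOKEN environment variable, and a placeholder endpoint:
# fetch_api.py (illustrative)
#!/usr/bin/env -S uv run
# /// script
# dependencies = ["polars", "pyarrow", "httpx"]
# ///
import os
import sys
import httpx
import polars as pl
import pyarrow as pa
def emit(df):
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
# Read the token from the environment and attach it as a Bearer header --
# exactly the step that's awkward to do from SQL alone.
token = os.environ["API_TOKEN"]
resp = httpx.get(
    "https://api.example.com/v1/items",  # placeholder endpoint
    headers={"Authorization": f"Bearer {token}"},
)
emit(pl.DataFrame(resp.json()["items"]))
From DuckDB it behaves like any other script: SELECT * FROM uv_run('fetch_api.py').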
Practical Examples
Paginated GitHub Search
Fetch 100+ results from GitHub's search API, which returns 30 results per page by default:
# github_search.py
#!/usr/bin/env -S uv run
# /// script
# dependencies = ["polars", "pyarrow", "httpx"]
# ///
import polars as pl
import pyarrow as pa
import httpx
import sys
import argparse
def emit(df):
    # Same helper as before: stream the Arrow table to stdout.
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
parser = argparse.ArgumentParser()
parser.add_argument("--query", required=True)
parser.add_argument("--max-results", type=int, default=100)
args = parser.parse_args()
client = httpx.Client(headers={"Accept": "application/vnd.github.v3+json"})
rows, page = [], 1
# Keep requesting pages until we have enough rows or the results run out.
while len(rows) < args.max_results:
    resp = client.get(
        "https://api.github.com/search/repositories",
        params={"q": args.query, "per_page": 30, "page": page, "sort": "stars"},
    )
    items = resp.json().get("items", [])
    if not items:
        break
    for r in items:
        rows.append({"name": r["full_name"], "stars": r["stargazers_count"], "language": r["language"]})
        if len(rows) >= args.max_results:
            break
    page += 1
emit(pl.DataFrame(rows))
Query it:
SELECT name, stars, language
FROM uv_run('github_search.py', '--query "language:rust stars:>5000" --max-results 100')
ORDER BY stars DESC;
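Pagination is sequential; the fan-out and N+1 rows from the table above need concurrency, but the script shape is identical. A sketch using a thread pool, with placeholder endpoints and field names:
from concurrent.futures import ThreadPoolExecutor
import sys
import httpx
import polars as pl
import pyarrow as pa
def emit(df):
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
client = httpx.Client()  # httpx.Client can be shared across threads
ids = client.get("https://api.example.com/items").json()["ids"]  # placeholder list call
def fetch_detail(item_id):
    # One detail request per id; the pool runs up to 8 of these at a time.
    return client.get(f"https://api.example.com/items/{item_id}").json()
with ThreadPoolExecutor(max_workers=8) as pool:
    details = list(pool.map(fetch_detail, ids))
emit(pl.DataFrame(details))
The PEP 723 header and emit() helper stay the same as in the other scripts; only the fetching logic changes.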
GraphQL API (POST Request)
# graphql_countries.py
#!/usr/bin/env -S uv run
# /// script
# dependencies = ["polars", "pyarrow", "httpx"]
# ///
import polars as pl
import pyarrow as pa
import httpx
import sys
def emit(df):
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
query = """
{
countries {
name
capital
continent { name }
languages { name }
}
}
"""
resp = httpx.post("https://countries.trevorblades.com/graphql", json={"query": query})
data = resp.json()["data"]["countries"]
rows = [{
    "name": c["name"],
    "capital": c["capital"],
    "continent": c["continent"]["name"],
    "languages": ", ".join(l["name"] for l in c["languages"]),
} for c in data]
emit(pl.DataFrame(rows))
Query it:
SELECT name, capital, continent
FROM uv_run('graphql_countries.py')
WHERE continent = 'Europe';
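Parameterizing the query follows the same POST pattern: GraphQL variables just ride along in the JSON body. A small sketch, with the endpoint, schema, and --code flag as placeholders rather than part of the example above:
import argparse
import httpx
parser = argparse.ArgumentParser()
parser.add_argument("--code", required=True)  # e.g. "EU"
args = parser.parse_args()
query = """
query ($code: ID!) {
  continent(code: $code) {
    name
    countries { name capital }
  }
}
"""
# Standard GraphQL-over-HTTP: query text and variables share one JSON payload.
resp = httpx.post(
    "https://example.com/graphql",  # placeholder endpoint
    json={"query": query, "variables": {"code": args.code}},
)
countries = resp.json()["data"]["continent"]["countries"]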
Wikipedia Table Scraping
# wiki_tables.py
#!/usr/bin/env -S uv run
# /// script
# dependencies = ["polars", "pyarrow", "httpx", "pandas", "lxml"]
# ///
import polars as pl
import pyarrow as pa
import httpx
import pandas as pd
import sys
import argparse
import io
def emit(df):
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
parser = argparse.ArgumentParser()
parser.add_argument("--article", required=True)
parser.add_argument("--table-index", type=int, default=0)
args = parser.parse_args()
# Use Wikipedia's API to get rendered HTML
params = {"action": "parse", "page": args.article, "format": "json", "prop": "text"}
headers = {"User-Agent": "uv_run/1.0"}
resp = httpx.get("https://en.wikipedia.org/w/api.php", params=params, headers=headers)
html = resp.json()["parse"]["text"]["*"]
tables = pd.read_html(io.StringIO(html))
emit(pl.from_pandas(tables[args.table_index]))
Query it:
SELECT *
FROM uv_run('wiki_tables.py', '--article "List_of_countries_and_dependencies_by_population"')
LIMIT 10;
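Selecting tables by position is brittle if the article changes. pandas can match on the table's text instead; a small tweak to the last two lines of the script (the caption fragment here is illustrative):
# Pick the table whose contents match a caption or header fragment,
# rather than relying on its position in the page.
tables = pd.read_html(io.StringIO(html), match="Population")
emit(pl.from_pandas(tables[0]))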
Composing Multiple Sources
The real power is joining data from different APIs in pure SQL:
WITH quakes AS (
SELECT magnitude, latitude, longitude
FROM uv_run('earthquakes.py', '--min-magnitude 4.5 --days 30')
),
countries AS (
SELECT name, region, latitude as lat, longitude as lon
FROM uv_run('countries.py')
)
SELECT
c.region,
count(*) as quakes,
round(avg(q.magnitude), 2) as avg_magnitude
FROM quakes q
JOIN countries c ON (
abs(q.latitude - c.lat) < 10
AND abs(q.longitude - c.lon) < 15
)
GROUP BY c.region
ORDER BY quakes DESC;
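The earthquakes.py and countries.py scripts aren't shown here; each is just another small fetch-and-emit script. For example, earthquakes.py could be sketched against the USGS GeoJSON feed like this (the parameters and field mapping are assumptions, not part of the original):
# earthquakes.py (one possible implementation)
#!/usr/bin/env -S uv run
# /// script
# dependencies = ["polars", "pyarrow", "httpx"]
# ///
import argparse
import sys
from datetime import datetime, timedelta, timezone
import httpx
import polars as pl
import pyarrow as pa
def emit(df):
    table = df.to_arrow()
    with pa.ipc.new_stream(sys.stdout.buffer, table.schema) as w:
        w.write_table(table)
parser = argparse.ArgumentParser()
parser.add_argument("--min-magnitude", type=float, default=4.5)
parser.add_argument("--days", type=int, default=30)
args = parser.parse_args()
start = (datetime.now(timezone.utc) - timedelta(days=args.days)).date().isoformat()
resp = httpx.get(
    "https://earthquake.usgs.gov/fdsnws/event/1/query",
    params={"format": "geojson", "minmagnitude": args.min_magnitude, "starttime": start},
)
features = resp.json()["features"]
emit(pl.DataFrame([{
    "magnitude": f["properties"]["mag"],
    # GeoJSON stores coordinates as [longitude, latitude, depth].
    "longitude": f["geometry"]["coordinates"][0],
    "latitude": f["geometry"]["coordinates"][1],
} for f in features]))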
Each script manages its own dependencies. DuckDB handles the SQL. No shared virtualenv, no dependency conflicts.
Setup
Add to your ~/.duckdbrc:
INSTALL shellfs FROM community;
INSTALL arrow FROM community;
LOAD shellfs;
LOAD arrow;
CREATE OR REPLACE MACRO uv_run(script, args := '') AS TABLE
SELECT * FROM read_arrow('uv run ' || script || ' ' || args || ' |');
Requires uv installed (curl -LsSf https://astral.sh/uv/install.sh | sh).
Why This Works
- PEP 723: Dependencies declared inline, resolved by uv run on first execution, cached afterward
- shellfs: Pipes shell command output into DuckDB's file readers
- Arrow IPC: Binary format, zero parsing overhead, type preservation
- uv: Fast dependency resolution, no global pollution
Scripts stay small and focused. Each one does one thing: fetch data, transform it, emit it. DuckDB does the rest.