PDF to JSON
Convert any PDF into structured data with typed columns, tables, and sections. This API is asynchronous — submit a job, poll for status, then fetch the result.
Submit your PDF
POST a PDF to start an extraction job. The API returns immediately with a job ID while processing continues in the background. Multi-page documents are automatically split and extracted page-by-page with rolling context for accurate table continuation.
# Start an extraction job by uploading report.pdf as multipart form data.
curl -X POST https://api.contexa.works/api/v1/pdf \
  -H "x-rapidapi-key: YOUR_API_KEY" \
  -F "file=@report.pdf"
{
"jobId": "e5f6a7b8-...",
"status": "processing"
}curl -X POST https://api.contexa.works/api/v1/pdf \
-H "x-rapidapi-key: YOUR_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"base64": "'$(base64 -i report.pdf)'",
"pages": "1-5",
"callbackUrl": "https://your-app.com/webhooks/contexa"
}'Use the pages parameter to focus on specific pages (e.g. "1-5"). This speeds up extraction and reduces cost for large documents.
Poll for completion
Check the job status by polling with the job ID. The status will be processing, completed, or failed. Alternatively, skip polling entirely by providing a callbackUrl when you submit the job.
# Check the status of a job by its ID.
curl https://api.contexa.works/api/v1/jobs/e5f6a7b8-... \
  -H "x-rapidapi-key: YOUR_API_KEY"
{
"id": "e5f6a7b8-...",
"status": "completed",
"documentType": "pdf",
"fileName": "report.pdf",
"processingTimeMs": 4210,
"createdAt": "2024-06-15T10:30:00Z",
"completedAt": "2024-06-15T10:30:04Z"
}
Fetch the result
Once the job is completed, fetch the structured data. The response includes document metadata, sections, tables with typed columns, and key-value pairs.
# Fetch the structured result of a completed job.
curl https://api.contexa.works/api/v1/jobs/e5f6a7b8-.../result \
  -H "x-rapidapi-key: YOUR_API_KEY"
{
"id": "e5f6a7b8-...",
"status": "completed",
"processingTimeMs": 4210,
"data": {
"title": "Q4 2024 Financial Report",
"author": "Finance Team",
"date": "2024-12-31",
"documentType": "report",
"sections": [
{
"heading": "Revenue Summary",
"content": "Total revenue grew 12% year-over-year...",
"tables": [
{
"name": "Revenue by Region",
"columns": [
{ "name": "Region", "type": "string", "unit": null },
{ "name": "Q4 Revenue", "type": "currency", "unit": null },
{ "name": "YoY Growth", "type": "percentage", "unit": null }
],
"rows": [
["EMEA", 5200000, 14.2],
["North America", 4800000, 11.5],
["APAC", 3100000, 8.7]
]
}
]
}
],
"keyValuePairs": [
{ "key": "Report Period", "value": "Q4 2024" },
{ "key": "Currency", "value": "USD" }
],
"summary": "Q4 revenue totalled $13.1M across three regions..."
}
}
Units like pence, bps, or cents are extracted into each column's separate unit field (null in the example above).
const BASE = "https://api.contexa.works/api/v1"
// 1. Submit
const submit = await fetch(`${BASE}/pdf`, {
method: "POST",
headers: { "x-rapidapi-key": "YOUR_API_KEY" },
body: formData,
})
const { jobId } = await submit.json()
// 2. Poll until done
let job
do {
await new Promise(r => setTimeout(r, 2000))
const res = await fetch(`${BASE}/jobs/${jobId}`, {
headers: { "x-rapidapi-key": "YOUR_API_KEY" },
})
job = await res.json()
} while (job.status === "processing")
// 3. Fetch result
if (job.status === "completed") {
const res = await fetch(`${BASE}/jobs/${jobId}/result`, {
headers: { "x-rapidapi-key": "YOUR_API_KEY" },
})
const { data } = await res.json()
for (const section of data.sections) {
for (const table of section.tables) {
console.log(table.name, table.rows.length, "rows")
}
}
}import requests, time
# End-to-end example: submit report.pdf, poll the job until it finishes,
# then print the row count of every extracted table.
BASE = "https://api.contexa.works/api/v1"
headers = {"x-rapidapi-key": "YOUR_API_KEY"}

# 1. Submit
# Upload the PDF as multipart form data under the "file" field. The with-block
# closes the file handle once the upload has been sent.
with open("report.pdf", "rb") as pdf:
    r = requests.post(
        f"{BASE}/pdf",
        headers=headers,
        files={"file": pdf},
    )
r.raise_for_status()  # fail loudly on an HTTP error instead of a KeyError below
job_id = r.json()["jobId"]

# 2. Poll
# Wait 2 seconds between checks; stop once the job is "completed" or "failed".
while True:
    time.sleep(2)
    poll = requests.get(
        f"{BASE}/jobs/{job_id}",
        headers=headers,
    )
    poll.raise_for_status()
    status = poll.json()
    if status["status"] in ("completed", "failed"):
        break

# 3. Fetch result
# Only completed jobs are fetched here; a failed job prints nothing.
if status["status"] == "completed":
    fetched = requests.get(
        f"{BASE}/jobs/{job_id}/result",
        headers=headers,
    )
    fetched.raise_for_status()
    result = fetched.json()
    for section in result["data"]["sections"]:
        for table in section["tables"]:
            print(f"{table['name']}: {len(table['rows'])} rows")