# Source code for app.extractors.common
"""Shared extractor runtime primitives (schema + Outlines/OpenAI execution).
Authors:
Roger Erismann (https://hammerdirt.solutions), OpenAI Codex
Purpose:
Provide common extraction utilities used by all section extractors:
- strict model base config (`StrictBaseModel`)
- OpenAI schema normalization helper for Outlines compatibility
- prompt assembly + model execution via Outlines/OpenAI
Dependencies:
- `outlines` for structured generation wrapper
- `openai` SDK for model invocation
- `pydantic` for response-model validation
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any, Type, TypeVar
import outlines
from openai import OpenAI
from pydantic import BaseModel
from pydantic.config import ConfigDict
T = TypeVar("T", bound=BaseModel)
def _enforce_openai_schema(schema: dict[str, Any]) -> None:
    """Mutate a JSON schema in place into OpenAI-compatible required-property form.

    OpenAI `response_format` requires every declared property key to appear in
    `required`. This helper walks the schema recursively and, for every node
    that declares `properties`, overwrites `required` with the full list of
    property names (in declaration order).

    Nodes visited: `properties` values, the `anyOf` / `oneOf` / `allOf`
    branches, `prefixItems` entries, `items` (single schema or list form),
    and the `$defs` / `definitions` tables.

    Args:
        schema: JSON-schema node to normalize. Non-dict values are ignored so
            the function can be called blindly on any child value.

    Returns:
        None. The schema is modified in place.
    """
    if not isinstance(schema, dict):
        return

    if "$ref" in schema:
        # A reference node is reduced to the bare `$ref`; every sibling key
        # (description, default, ...) is dropped and nothing below is visited.
        ref = schema["$ref"]
        schema.clear()
        schema["$ref"] = ref
        return

    properties = schema.get("properties")
    if isinstance(properties, dict):
        schema["required"] = list(properties)
        for subschema in properties.values():
            _enforce_openai_schema(subschema)

    # List-valued keywords: each entry is itself a schema.
    for key in ("anyOf", "oneOf", "allOf", "prefixItems"):
        branches = schema.get(key)
        if isinstance(branches, list):
            for subschema in branches:
                _enforce_openai_schema(subschema)

    # `items` is either one schema or (tuple form) a list of schemas.
    items = schema.get("items")
    if isinstance(items, list):
        for subschema in items:
            _enforce_openai_schema(subschema)
    else:
        _enforce_openai_schema(items)

    # Definition tables map a name to a schema.
    for key in ("$defs", "definitions"):
        table = schema.get(key)
        if isinstance(table, dict):
            for subschema in table.values():
                _enforce_openai_schema(subschema)
# [docs]  (link marker left over from the rendered documentation page)
class StrictBaseModel(BaseModel):
    """Common strict base class for extractor response models.

    Configuration applied through ``model_config``:

    - ``extra="forbid"``: unknown keys are rejected.
    - ``json_schema_extra=_enforce_openai_schema``: the exported JSON schema
      is passed through `_enforce_openai_schema`.
    """

    model_config = ConfigDict(json_schema_extra=_enforce_openai_schema, extra="forbid")
def _read_text(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8, with surrounding whitespace removed."""
    raw_text = path.read_text(encoding="utf-8")
    trimmed = raw_text.strip()
    return trimmed